diff --git a/patch/npu.patch b/patch/npu.patch
index 14c9a2b015f9a880197f7275d29f92be65b92dc7..14c25d237195c40be1fdeb3639feecafe729d89f 100644
--- a/patch/npu.patch
+++ b/patch/npu.patch
@@ -1,6 +1,6 @@
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/CMakeLists.txt pytorch-develop/aten/CMakeLists.txt
 --- pytorch-v1.5.0/aten/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/CMakeLists.txt	2021-07-13 15:30:57.594267657 +0800
++++ pytorch-develop/aten/CMakeLists.txt	2021-07-15 20:52:26.641436929 +0800
 @@ -22,8 +22,10 @@
  set(ATen_CPU_INCLUDE)
  set(ATen_THIRD_PARTY_INCLUDE)
@@ -51,7 +51,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE)
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt pytorch-develop/aten/src/ATen/CMakeLists.txt
 --- pytorch-v1.5.0/aten/src/ATen/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/CMakeLists.txt	2021-07-13 15:30:57.594267657 +0800
++++ pytorch-develop/aten/src/ATen/CMakeLists.txt	2021-07-15 20:52:26.645437073 +0800
 @@ -67,6 +67,9 @@
  FILE(GLOB native_quantized_h "native/quantized/*.h" "native/quantized/cpu/*.h")
  FILE(GLOB native_cpu_h "native/cpu/*.h")
@@ -129,7 +129,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE)
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h
 --- pytorch-v1.5.0/aten/src/ATen/core/dispatch/DispatchTable.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h	2021-07-13 15:30:57.602267943 +0800
++++ pytorch-develop/aten/src/ATen/core/dispatch/DispatchTable.h	2021-07-15 20:52:26.649437216 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -170,7 +170,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/function_wrapper.py pytorch-develop/aten/src/ATen/function_wrapper.py
 --- pytorch-v1.5.0/aten/src/ATen/function_wrapper.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/function_wrapper.py	2021-07-13 15:30:57.610268230 +0800
++++ pytorch-develop/aten/src/ATen/function_wrapper.py	2021-07-15 20:52:26.657437502 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -354,7 +354,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          for option in declaration['options']:
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/gen.py pytorch-develop/aten/src/ATen/gen.py
 --- pytorch-v1.5.0/aten/src/ATen/gen.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/gen.py	2021-07-13 15:30:57.610268230 +0800
++++ pytorch-develop/aten/src/ATen/gen.py	2021-07-15 20:52:26.657437502 +0800
 @@ -1,3 +1,18 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -512,7 +512,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      generate_outputs()
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/cpu/Activation.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp	2021-07-13 15:30:57.622268661 +0800
++++ pytorch-develop/aten/src/ATen/native/cpu/Activation.cpp	2021-07-15 20:52:26.669437932 +0800
 @@ -339,20 +339,20 @@
  
  void hardsigmoid_backward_kernel(TensorIterator& iter) {
@@ -540,7 +540,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    });
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp pytorch-develop/aten/src/ATen/native/Memory.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/Memory.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/Memory.cpp	2021-07-13 15:30:57.614268374 +0800
++++ pytorch-develop/aten/src/ATen/native/Memory.cpp	2021-07-15 20:52:26.665437789 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -595,7 +595,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        detail::computeStorageSize(self.sizes(), self.strides()),
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml pytorch-develop/aten/src/ATen/native/native_functions.yaml
 --- pytorch-v1.5.0/aten/src/ATen/native/native_functions.yaml	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/native_functions.yaml	2021-07-13 15:30:57.634269091 +0800
++++ pytorch-develop/aten/src/ATen/native/native_functions.yaml	2021-07-15 20:52:26.685438507 +0800
 @@ -1,6 +1,5 @@
  # See README.md in this directory for more guidance
  
@@ -5916,24 +5916,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: max_unpool2d.out(Tensor self, Tensor indices, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -6118,12 +7584,16 @@
-   dispatch:
-     CPU: reflection_pad2d_out_cpu
-     CUDA: reflection_pad2d_out_cuda
-+  npu_dispatch:
-+    NPU: reflection_pad2d_out_npu
- 
- - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor
-   python_module: nn
-   dispatch:
-     CPU: reflection_pad2d_cpu
-     CUDA: reflection_pad2d_cuda
-+  npu_dispatch:
-+    NPU: reflection_pad2d_npu
- 
- - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
-   python_module: nn
-@@ -6166,12 +7636,16 @@
+@@ -6166,12 +7632,16 @@
    dispatch:
      CPU: replication_pad2d_out_cpu
      CUDA: replication_pad2d_out_cuda
@@ -5950,7 +5933,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
    python_module: nn
-@@ -6214,12 +7688,16 @@
+@@ -6214,12 +7684,16 @@
    dispatch:
      CPU: upsample_linear1d_out_cpu
      CUDA: upsample_linear1d_out_cuda
@@ -5967,7 +5950,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: upsample_linear1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
    python_module: nn
-@@ -6232,12 +7710,16 @@
+@@ -6232,12 +7706,16 @@
    dispatch:
      CPU: upsample_linear1d_backward_cpu
      CUDA: upsample_linear1d_backward_cuda
@@ -5984,7 +5967,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
    python_module: nn
-@@ -6245,96 +7727,128 @@
+@@ -6245,96 +7723,128 @@
      CPU: upsample_bilinear2d_cpu
      CUDA: upsample_bilinear2d_cuda
      QuantizedCPU: quantized_upsample_bilinear2d_cpu
@@ -6113,7 +6096,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
    python_module: nn
-@@ -6342,24 +7856,32 @@
+@@ -6342,24 +7852,32 @@
      CPU: upsample_nearest2d_cpu
      CUDA: upsample_nearest2d_cuda
      QuantizedCPU: quantized_upsample_nearest2d_cpu
@@ -6146,7 +6129,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
    python_module: nn
-@@ -6367,38 +7889,52 @@
+@@ -6367,38 +7885,52 @@
      CPU: upsample_nearest3d_cpu
      CUDA: upsample_nearest3d_cuda
      QuantizedCPU: quantized_upsample_nearest3d_cpu
@@ -6199,7 +6182,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  # What's a thnn_conv_ versus a slow_conv_?
  #
-@@ -6423,24 +7959,32 @@
+@@ -6423,24 +7955,32 @@
    dispatch:
      CPU: slow_conv_transpose2d_out_cpu
      CUDA: slow_conv_transpose2d_out_cuda
@@ -6232,7 +6215,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -6468,21 +8012,29 @@
+@@ -6468,21 +8008,29 @@
  
  - func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
@@ -6262,7 +6245,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: thnn_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, *, Tensor(a!)? grad_input, Tensor(b!)? grad_weight, Tensor(c!)? grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
    python_module: nn
-@@ -6495,32 +8047,46 @@
+@@ -6495,32 +8043,46 @@
    dispatch:
      CPU: slow_conv2d_backward_cpu
      CUDA: legacy::cuda::_thnn_conv2d_backward
@@ -6309,7 +6292,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
    python_module: nn
-@@ -6553,12 +8119,16 @@
+@@ -6553,12 +8115,16 @@
    dispatch:
      CPU: slow_conv_dilated2d_cpu
      CUDA: slow_conv_dilated2d_cuda
@@ -6326,7 +6309,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
  - func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1) -> Tensor
    python_module: nn
-@@ -6577,57 +8147,405 @@
+@@ -6577,57 +8143,413 @@
    dispatch:
      CPU: col2im_out_cpu
      CUDA: col2im_out_cuda
@@ -6732,10 +6715,18 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +- func: npu_bert_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay) -> (Tensor(a!), Tensor(b!), Tensor(c!))
 +  npu_dispatch_only:
 +    NPU: bert_apply_adam_npu
++
++- func: npu_giou(Tensor self, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> Tensor
++  npu_dispatch_only:
++    NPU: giou_npu
++
++- func: npu_giou_backward(Tensor grad, Tensor bboxes, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> (Tensor, Tensor)
++  npu_dispatch_only:
++    NPU: giou_backward_npu
 \ No newline at end of file
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S
 --- pytorch-v1.5.0/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S	2021-07-13 15:30:57.674270525 +0800
++++ pytorch-develop/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-dq-aarch64-neon.S	2021-07-15 20:52:26.725439941 +0800
 @@ -659,14 +659,14 @@
  
      SUB x1, x1, 4
@@ -6761,7 +6752,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      CMP x1, 2
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp pytorch-develop/aten/src/ATen/native/TensorCompare.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/TensorCompare.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp	2021-07-13 15:30:57.618268517 +0800
++++ pytorch-develop/aten/src/ATen/native/TensorCompare.cpp	2021-07-15 20:52:26.669437932 +0800
 @@ -64,7 +64,7 @@
  
  Tensor isinf(const Tensor &self) {
@@ -6773,7 +6764,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    return AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "isinf", [&]() {
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp pytorch-develop/aten/src/ATen/native/TensorFactories.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/TensorFactories.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp	2021-07-13 15:30:57.618268517 +0800
++++ pytorch-develop/aten/src/ATen/native/TensorFactories.cpp	2021-07-15 20:52:26.669437932 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -6818,7 +6809,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp pytorch-develop/aten/src/ATen/native/TensorProperties.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/TensorProperties.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp	2021-07-13 15:30:57.618268517 +0800
++++ pytorch-develop/aten/src/ATen/native/TensorProperties.cpp	2021-07-15 20:52:26.669437932 +0800
 @@ -87,6 +87,7 @@
    if (self.is_contiguous(memory_format)) {
      return self;
@@ -6829,7 +6820,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        "preserve memory format is unsupported by the contiguous operator");
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp
 --- pytorch-v1.5.0/aten/src/ATen/native/UpSampleBicubic2d.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp	2021-07-13 15:30:57.622268661 +0800
++++ pytorch-develop/aten/src/ATen/native/UpSampleBicubic2d.cpp	2021-07-15 20:52:26.669437932 +0800
 @@ -26,7 +26,7 @@
          const scalar_t* in = &idata[output_y * input_width + output_x];
          scalar_t* out = &odata[output_y * output_width + output_x];
@@ -6841,7 +6832,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
            out += output_width * output_height;
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/native_parse.py pytorch-develop/aten/src/ATen/native_parse.py
 --- pytorch-v1.5.0/aten/src/ATen/native_parse.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/native_parse.py	2021-07-13 15:30:57.686270955 +0800
++++ pytorch-develop/aten/src/ATen/native_parse.py	2021-07-15 20:52:26.737440371 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -6879,7 +6870,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
                  msg = '''Exception raised in processing function:
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py pytorch-develop/aten/src/ATen/preprocess_declarations.py
 --- pytorch-v1.5.0/aten/src/ATen/preprocess_declarations.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/preprocess_declarations.py	2021-07-13 15:30:57.690271099 +0800
++++ pytorch-develop/aten/src/ATen/preprocess_declarations.py	2021-07-15 20:52:26.737440371 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -6911,7 +6902,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h pytorch-develop/aten/src/ATen/templates/TensorBody.h
 --- pytorch-v1.5.0/aten/src/ATen/templates/TensorBody.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/templates/TensorBody.h	2021-07-13 15:30:57.690271099 +0800
++++ pytorch-develop/aten/src/ATen/templates/TensorBody.h	2021-07-15 20:52:26.741440515 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -6944,7 +6935,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h pytorch-develop/aten/src/ATen/templates/TensorMethods.h
 --- pytorch-v1.5.0/aten/src/ATen/templates/TensorMethods.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h	2021-07-13 15:30:57.690271099 +0800
++++ pytorch-develop/aten/src/ATen/templates/TensorMethods.h	2021-07-15 20:52:26.741440515 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -6978,7 +6969,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/CMakeLists.txt pytorch-develop/aten/src/TH/CMakeLists.txt
 --- pytorch-v1.5.0/aten/src/TH/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/TH/CMakeLists.txt	2021-07-13 15:30:57.694271242 +0800
++++ pytorch-develop/aten/src/TH/CMakeLists.txt	2021-07-15 20:52:26.741440515 +0800
 @@ -48,6 +48,11 @@
    ${CMAKE_CURRENT_SOURCE_DIR}
  PARENT_SCOPE)
@@ -6993,7 +6984,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp pytorch-develop/aten/src/TH/generic/THStorage.cpp
 --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/TH/generic/THStorage.cpp	2021-07-13 15:30:57.694271242 +0800
++++ pytorch-develop/aten/src/TH/generic/THStorage.cpp	2021-07-15 20:52:26.745440658 +0800
 @@ -1,9 +1,32 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7102,7 +7093,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/aten/src/TH/generic/THStorage.h pytorch-develop/aten/src/TH/generic/THStorage.h
 --- pytorch-v1.5.0/aten/src/TH/generic/THStorage.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/aten/src/TH/generic/THStorage.h	2021-07-13 15:30:57.694271242 +0800
++++ pytorch-develop/aten/src/TH/generic/THStorage.h	2021-07-15 20:52:26.745440658 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7141,7 +7132,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/CMakeLists.txt pytorch-develop/c10/CMakeLists.txt
 --- pytorch-v1.5.0/c10/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/CMakeLists.txt	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/CMakeLists.txt	2021-07-15 20:52:26.761441232 +0800
 @@ -63,6 +63,14 @@
    message(STATUS "don't use NUMA")
  endif()
@@ -7170,7 +7161,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    # not checked in
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Backend.h pytorch-develop/c10/core/Backend.h
 --- pytorch-v1.5.0/c10/core/Backend.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/Backend.h	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/core/Backend.h	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7265,7 +7256,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.cpp pytorch-develop/c10/core/Device.cpp
 --- pytorch-v1.5.0/c10/core/Device.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/Device.cpp	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/core/Device.cpp	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7305,7 +7296,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        types.begin(),
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Device.h pytorch-develop/c10/core/Device.h
 --- pytorch-v1.5.0/c10/core/Device.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/Device.h	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/core/Device.h	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7340,7 +7331,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      return type_ == DeviceType::CPU;
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.cpp pytorch-develop/c10/core/DeviceType.cpp
 --- pytorch-v1.5.0/c10/core/DeviceType.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/DeviceType.cpp	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/core/DeviceType.cpp	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7380,7 +7371,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        return false;
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DeviceType.h pytorch-develop/c10/core/DeviceType.h
 --- pytorch-v1.5.0/c10/core/DeviceType.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/DeviceType.h	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/core/DeviceType.h	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7423,7 +7414,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  constexpr DeviceType kXLA = DeviceType::XLA;
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.cpp pytorch-develop/c10/core/DispatchKey.cpp
 --- pytorch-v1.5.0/c10/core/DispatchKey.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/DispatchKey.cpp	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/core/DispatchKey.cpp	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7455,7 +7446,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      case DispatchKey::TESTING_ONLY_GenericModeTensorId:
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/DispatchKey.h pytorch-develop/c10/core/DispatchKey.h
 --- pytorch-v1.5.0/c10/core/DispatchKey.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/DispatchKey.h	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/core/DispatchKey.h	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7487,7 +7478,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/Storage.h pytorch-develop/c10/core/Storage.h
 --- pytorch-v1.5.0/c10/core/Storage.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/Storage.h	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/core/Storage.h	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7521,7 +7512,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  };
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/StorageImpl.h pytorch-develop/c10/core/StorageImpl.h
 --- pytorch-v1.5.0/c10/core/StorageImpl.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/StorageImpl.h	2021-07-13 15:30:57.706271672 +0800
++++ pytorch-develop/c10/core/StorageImpl.h	2021-07-15 20:52:26.761441232 +0800
 @@ -1,12 +1,39 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7578,7 +7569,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorImpl.h pytorch-develop/c10/core/TensorImpl.h
 --- pytorch-v1.5.0/c10/core/TensorImpl.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/TensorImpl.h	2021-07-13 15:30:57.710271816 +0800
++++ pytorch-develop/c10/core/TensorImpl.h	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7648,7 +7639,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/core/TensorOptions.h pytorch-develop/c10/core/TensorOptions.h
 --- pytorch-v1.5.0/c10/core/TensorOptions.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/core/TensorOptions.h	2021-07-13 15:30:57.710271816 +0800
++++ pytorch-develop/c10/core/TensorOptions.h	2021-07-15 20:52:26.761441232 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7689,7 +7680,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/c10/macros/Export.h pytorch-develop/c10/macros/Export.h
 --- pytorch-v1.5.0/c10/macros/Export.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/c10/macros/Export.h	2021-07-13 15:30:57.710271816 +0800
++++ pytorch-develop/c10/macros/Export.h	2021-07-15 20:52:26.765441375 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -7816,7 +7807,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -...
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/caffe2/CMakeLists.txt pytorch-develop/caffe2/CMakeLists.txt
 --- pytorch-v1.5.0/caffe2/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/caffe2/CMakeLists.txt	2021-07-13 15:30:57.718272102 +0800
++++ pytorch-develop/caffe2/CMakeLists.txt	2021-07-15 20:52:26.773441662 +0800
 @@ -32,6 +32,7 @@
    # Add source, includes, and libs to lists
    list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS})
@@ -7963,7 +7954,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    # Call again since Caffe2_HIP_INCLUDE is extended with ATen include dirs.
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.clang-format pytorch-develop/.clang-format
 --- pytorch-v1.5.0/.clang-format	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/.clang-format	2021-07-13 15:30:57.586267370 +0800
++++ pytorch-develop/.clang-format	2021-07-15 20:52:26.637436786 +0800
 @@ -84,5 +84,4 @@
  SpacesInSquareBrackets: false
  Standard:        Cpp11
@@ -7974,7 +7965,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 \ No newline at end of file
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/BuildVariables.cmake pytorch-develop/cmake/BuildVariables.cmake
 --- pytorch-v1.5.0/cmake/BuildVariables.cmake	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/BuildVariables.cmake	2021-07-13 15:30:57.830276118 +0800
++++ pytorch-develop/cmake/BuildVariables.cmake	2021-07-15 20:52:26.913446682 +0800
 @@ -11,6 +11,7 @@
  # CMakeLists.txt files under each folder respectively.
  set(Caffe2_CPU_SRCS)
@@ -8001,7 +7992,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  # symbols. However, if the lib is whole linked in caffe2 lib, we don't want
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Codegen.cmake pytorch-develop/cmake/Codegen.cmake
 --- pytorch-v1.5.0/cmake/Codegen.cmake	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/Codegen.cmake	2021-07-13 15:30:57.830276118 +0800
++++ pytorch-develop/cmake/Codegen.cmake	2021-07-15 20:52:26.913446682 +0800
 @@ -191,13 +191,14 @@
    file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp)
    file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp)
@@ -8032,7 +8023,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  endif()
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Dependencies.cmake pytorch-develop/cmake/Dependencies.cmake
 --- pytorch-v1.5.0/cmake/Dependencies.cmake	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/Dependencies.cmake	2021-07-13 15:30:57.830276118 +0800
++++ pytorch-develop/cmake/Dependencies.cmake	2021-07-15 20:52:26.917446825 +0800
 @@ -1509,6 +1509,13 @@
    ENDIF(NOT C_HAS_THREAD)
  endif()
@@ -8049,7 +8040,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  #
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/Summary.cmake pytorch-develop/cmake/Summary.cmake
 --- pytorch-v1.5.0/cmake/Summary.cmake	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/Summary.cmake	2021-07-13 15:30:57.830276118 +0800
++++ pytorch-develop/cmake/Summary.cmake	2021-07-15 20:52:26.917446825 +0800
 @@ -134,6 +134,7 @@
    if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
      message(STATUS "  SELECTED_OP_LIST    : ${SELECTED_OP_LIST}")
@@ -8060,7 +8051,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  endfunction()
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/cmake/TorchConfig.cmake.in pytorch-develop/cmake/TorchConfig.cmake.in
 --- pytorch-v1.5.0/cmake/TorchConfig.cmake.in	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/cmake/TorchConfig.cmake.in	2021-07-13 15:30:57.830276118 +0800
++++ pytorch-develop/cmake/TorchConfig.cmake.in	2021-07-15 20:52:26.917446825 +0800
 @@ -112,6 +112,11 @@
    list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES})
  endif()
@@ -8075,7 +8066,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@")
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/CMakeLists.txt pytorch-develop/CMakeLists.txt
 --- pytorch-v1.5.0/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/CMakeLists.txt	2021-07-13 15:30:57.590267513 +0800
++++ pytorch-develop/CMakeLists.txt	2021-07-15 20:52:26.637436786 +0800
 @@ -205,6 +205,10 @@
  option(USE_TBB "Use TBB" OFF)
  option(ONNX_ML "Enable traditional ONNX ML API." ON)
@@ -8142,7 +8133,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-braces")
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/.dockerignore pytorch-develop/.dockerignore
 --- pytorch-v1.5.0/.dockerignore	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/.dockerignore	2021-07-13 15:30:57.586267370 +0800
++++ pytorch-develop/.dockerignore	2021-07-15 20:52:26.637436786 +0800
 @@ -1,257 +1 @@
 -# READ THIS BEFORE YOU REFACTOR ME
 -#
@@ -8405,44 +8396,44 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 \ No newline at end of file
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/docs/make.bat pytorch-develop/docs/make.bat
 --- pytorch-v1.5.0/docs/make.bat	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/docs/make.bat	2021-07-13 15:30:57.834276262 +0800
++++ pytorch-develop/docs/make.bat	2021-07-15 20:52:26.925447111 +0800
 @@ -1,36 +1,36 @@
--@ECHO OFF
--
--pushd %~dp0
--
--REM Command file for Sphinx documentation
--
--if "%SPHINXBUILD%" == "" (
--        set SPHINXBUILD=sphinx-build
--)
--set SOURCEDIR=source
--set BUILDDIR=build
--set SPHINXPROJ=PyTorch
--
--if "%1" == "" goto help
--
--%SPHINXBUILD% >NUL 2>NUL
--if errorlevel 9009 (
--        echo.
--        echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
--        echo.installed, then set the SPHINXBUILD environment variable to point
--        echo.to the full path of the 'sphinx-build' executable. Alternatively you
--        echo.may add the Sphinx directory to PATH.
--        echo.
--        echo.If you don't have Sphinx installed, grab it from
--        echo.http://sphinx-doc.org/
--        exit /b 1
--)
--
--%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
--goto end
--
--:help
--%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
--
--:end
--popd
+-@ECHO OFF
+-
+-pushd %~dp0
+-
+-REM Command file for Sphinx documentation
+-
+-if "%SPHINXBUILD%" == "" (
+-        set SPHINXBUILD=sphinx-build
+-)
+-set SOURCEDIR=source
+-set BUILDDIR=build
+-set SPHINXPROJ=PyTorch
+-
+-if "%1" == "" goto help
+-
+-%SPHINXBUILD% >NUL 2>NUL
+-if errorlevel 9009 (
+-        echo.
+-        echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+-        echo.installed, then set the SPHINXBUILD environment variable to point
+-        echo.to the full path of the 'sphinx-build' executable. Alternatively you
+-        echo.may add the Sphinx directory to PATH.
+-        echo.
+-        echo.If you don't have Sphinx installed, grab it from
+-        echo.http://sphinx-doc.org/
+-        exit /b 1
+-)
+-
+-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+-goto end
+-
+-:help
+-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+-
+-:end
+-popd
 +@ECHO OFF
 +
 +pushd %~dp0
@@ -8494,7 +8485,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 \ No newline at end of file
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/requirements.txt pytorch-develop/requirements.txt
 --- pytorch-v1.5.0/requirements.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/requirements.txt	2021-07-13 15:30:57.850276836 +0800
++++ pytorch-develop/requirements.txt	2021-07-15 20:52:26.941447686 +0800
 @@ -4,4 +4,12 @@
  requests
  setuptools
@@ -8513,18 +8504,18 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 \ No newline at end of file
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install.bat pytorch-develop/scripts/appveyor/install.bat
 --- pytorch-v1.5.0/scripts/appveyor/install.bat	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/scripts/appveyor/install.bat	2021-07-13 15:30:57.850276836 +0800
++++ pytorch-develop/scripts/appveyor/install.bat	2021-07-15 20:52:26.941447686 +0800
 @@ -1,10 +1,10 @@
--:: Installation scripts for appveyor.
--
--@echo on
--
--if "%USE_CUDA%" == "ON" call %~dp0%install_cuda.bat
--
--:: Miniconda path for appveyor
--set PATH=C:\Miniconda-x64;C:\Miniconda-x64\Scripts;%PATH%
--:: Install numpy
--conda install -y numpy
+-:: Installation scripts for appveyor.
+-
+-@echo on
+-
+-if "%USE_CUDA%" == "ON" call %~dp0%install_cuda.bat
+-
+-:: Miniconda path for appveyor
+-set PATH=C:\Miniconda-x64;C:\Miniconda-x64\Scripts;%PATH%
+-:: Install numpy
+-conda install -y numpy
 +:: Installation scripts for appveyor.
 +
 +@echo on
@@ -8537,30 +8528,30 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +conda install -y numpy
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/appveyor/install_cuda.bat pytorch-develop/scripts/appveyor/install_cuda.bat
 --- pytorch-v1.5.0/scripts/appveyor/install_cuda.bat	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/scripts/appveyor/install_cuda.bat	2021-07-13 15:30:57.850276836 +0800
++++ pytorch-develop/scripts/appveyor/install_cuda.bat	2021-07-15 20:52:26.941447686 +0800
 @@ -1,22 +1,22 @@
--@echo on
--
--appveyor DownloadFile ^
--  https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/cuda_8.0.44_windows-exe ^
--  -FileName cuda_8.0.44_windows.exe
--appveyor Downloadfile ^
--  http://developer.download.nvidia.com/compute/redist/cudnn/v5.1/cudnn-8.0-windows10-x64-v5.1.zip ^
--  -FileName cudnn-8.0-windows10-x64-v5.1.zip
--
--cuda_8.0.44_windows.exe -s compiler_8.0 cublas_8.0 cublas_dev_8.0 cudart_8.0 curand_8.0 curand_dev_8.0 nvrtc_8.0 nvrtc_dev_8.0
--set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\libnvvp;%PATH%
--
--7z x cudnn-8.0-windows10-x64-v5.1.zip
--copy cuda\include\cudnn.h ^
--  "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include\"
--copy cuda\lib\x64\cudnn.lib ^
--  "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\lib\x64\"
--copy cuda\bin\cudnn64_5.dll ^
--  "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin\"
--
--:: Make sure that nvcc is working correctly.
--nvcc -V || exit /b
+-@echo on
+-
+-appveyor DownloadFile ^
+-  https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/cuda_8.0.44_windows-exe ^
+-  -FileName cuda_8.0.44_windows.exe
+-appveyor Downloadfile ^
+-  http://developer.download.nvidia.com/compute/redist/cudnn/v5.1/cudnn-8.0-windows10-x64-v5.1.zip ^
+-  -FileName cudnn-8.0-windows10-x64-v5.1.zip
+-
+-cuda_8.0.44_windows.exe -s compiler_8.0 cublas_8.0 cublas_dev_8.0 cudart_8.0 curand_8.0 curand_dev_8.0 nvrtc_8.0 nvrtc_dev_8.0
+-set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\libnvvp;%PATH%
+-
+-7z x cudnn-8.0-windows10-x64-v5.1.zip
+-copy cuda\include\cudnn.h ^
+-  "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include\"
+-copy cuda\lib\x64\cudnn.lib ^
+-  "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\lib\x64\"
+-copy cuda\bin\cudnn64_5.dll ^
+-  "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin\"
+-
+-:: Make sure that nvcc is working correctly.
+-nvcc -V || exit /b
 +@echo on
 +
 +appveyor DownloadFile ^
@@ -8585,92 +8576,92 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +nvcc -V || exit /b
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/build_windows.bat pytorch-develop/scripts/build_windows.bat
 --- pytorch-v1.5.0/scripts/build_windows.bat	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/scripts/build_windows.bat	2021-07-13 15:30:57.850276836 +0800
++++ pytorch-develop/scripts/build_windows.bat	2021-07-15 20:52:26.941447686 +0800
 @@ -1,84 +1,84 @@
--:: #############################################################################
--:: Example command to build on Windows.
--:: #############################################################################
--
--:: This script shows how one can build a Caffe2 binary for windows.
--
--@echo off
--setlocal
--
--SET ORIGINAL_DIR=%cd%
--SET CAFFE2_ROOT=%~dp0%..
--
--if NOT DEFINED BUILD_BINARY (
--  set BUILD_BINARY=OFF
--)
--
--if NOT DEFINED BUILD_SHARED_LIBS (
--  :: On CI, we test with BUILD_SHARED_LIBS=OFF.
--  :: By default, it will be BUILD_SHARED_LIBS=ON.
--  if NOT DEFINED BUILD_ENVIRONMENT (
--    set BUILD_SHARED_LIBS=OFF
--  )
--)
--
--IF NOT DEFINED BUILDING_WITH_TORCH_LIBS (
--  set BUILDING_WITH_TORCH_LIBS=OFF
--)
--
--if NOT DEFINED CAFFE2_STATIC_LINK_CUDA (
--  set CAFFE2_STATIC_LINK_CUDA=OFF
--)
--
--if NOT DEFINED CMAKE_BUILD_TYPE (
--  set CMAKE_BUILD_TYPE=Release
--)
--
--if NOT DEFINED ONNX_NAMESPACE (
--  set ONNX_NAMESPACE=onnx_c2
--)
--
--if NOT DEFINED TORCH_CUDA_ARCH_LIST (
--  set TORCH_CUDA_ARCH_LIST=5.0
--)
--
--if NOT DEFINED USE_CUDA (
--  set USE_CUDA=OFF
--)
--
--if NOT DEFINED USE_OBSERVERS (
--  set USE_OBSERVERS=OFF
--)
--
--if NOT DEFINED MSVC_Z7_OVERRIDE (
--  set MSVC_Z7_OVERRIDE=OFF
--)
--
--if NOT DEFINED CMAKE_GENERATOR (
--  set CMAKE_GENERATOR=Ninja
--)
--
--set CMAKE_VERBOSE_MAKEFILE=1
--
--:: Install pyyaml for Aten codegen
--pip install pyyaml ninja
--
--echo CAFFE2_ROOT=%CAFFE2_ROOT%
--echo CMAKE_GENERATOR=%CMAKE_GENERATOR%
--echo CMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE%
--
--:: Set up cmake. We will skip building the test files right now.
--pushd %CAFFE2_ROOT%
--python tools\build_libtorch.py || goto :label_error
--popd
--
--echo "Caffe2 built successfully"
--cd %ORIGINAL_DIR%
--endlocal
--exit /b 0
--
--:label_error
--echo "Caffe2 building failed"
--cd %ORIGINAL_DIR%
--endlocal
--exit /b 1
+-:: #############################################################################
+-:: Example command to build on Windows.
+-:: #############################################################################
+-
+-:: This script shows how one can build a Caffe2 binary for windows.
+-
+-@echo off
+-setlocal
+-
+-SET ORIGINAL_DIR=%cd%
+-SET CAFFE2_ROOT=%~dp0%..
+-
+-if NOT DEFINED BUILD_BINARY (
+-  set BUILD_BINARY=OFF
+-)
+-
+-if NOT DEFINED BUILD_SHARED_LIBS (
+-  :: On CI, we test with BUILD_SHARED_LIBS=OFF.
+-  :: By default, it will be BUILD_SHARED_LIBS=ON.
+-  if NOT DEFINED BUILD_ENVIRONMENT (
+-    set BUILD_SHARED_LIBS=OFF
+-  )
+-)
+-
+-IF NOT DEFINED BUILDING_WITH_TORCH_LIBS (
+-  set BUILDING_WITH_TORCH_LIBS=OFF
+-)
+-
+-if NOT DEFINED CAFFE2_STATIC_LINK_CUDA (
+-  set CAFFE2_STATIC_LINK_CUDA=OFF
+-)
+-
+-if NOT DEFINED CMAKE_BUILD_TYPE (
+-  set CMAKE_BUILD_TYPE=Release
+-)
+-
+-if NOT DEFINED ONNX_NAMESPACE (
+-  set ONNX_NAMESPACE=onnx_c2
+-)
+-
+-if NOT DEFINED TORCH_CUDA_ARCH_LIST (
+-  set TORCH_CUDA_ARCH_LIST=5.0
+-)
+-
+-if NOT DEFINED USE_CUDA (
+-  set USE_CUDA=OFF
+-)
+-
+-if NOT DEFINED USE_OBSERVERS (
+-  set USE_OBSERVERS=OFF
+-)
+-
+-if NOT DEFINED MSVC_Z7_OVERRIDE (
+-  set MSVC_Z7_OVERRIDE=OFF
+-)
+-
+-if NOT DEFINED CMAKE_GENERATOR (
+-  set CMAKE_GENERATOR=Ninja
+-)
+-
+-set CMAKE_VERBOSE_MAKEFILE=1
+-
+-:: Install pyyaml for Aten codegen
+-pip install pyyaml ninja
+-
+-echo CAFFE2_ROOT=%CAFFE2_ROOT%
+-echo CMAKE_GENERATOR=%CMAKE_GENERATOR%
+-echo CMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE%
+-
+-:: Set up cmake. We will skip building the test files right now.
+-pushd %CAFFE2_ROOT%
+-python tools\build_libtorch.py || goto :label_error
+-popd
+-
+-echo "Caffe2 built successfully"
+-cd %ORIGINAL_DIR%
+-endlocal
+-exit /b 0
+-
+-:label_error
+-echo "Caffe2 building failed"
+-cd %ORIGINAL_DIR%
+-endlocal
+-exit /b 1
 +:: #############################################################################
 +:: Example command to build on Windows.
 +:: #############################################################################
@@ -8757,25 +8748,25 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +exit /b 1
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/scripts/proto.ps1 pytorch-develop/scripts/proto.ps1
 --- pytorch-v1.5.0/scripts/proto.ps1	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/scripts/proto.ps1	2021-07-13 15:30:57.850276836 +0800
++++ pytorch-develop/scripts/proto.ps1	2021-07-15 20:52:26.941447686 +0800
 @@ -1,17 +1,17 @@
--param(
--  [string]$protoc,
--  [string]$srcdir,
--  [string]$unprocessed,
--  [string]$processed,
--  [string]$out
--)
--$ErrorActionPreference = "Stop"
--Get-Content $unprocessed | % {$_ -Replace "caffe2/proto/caffe2.proto", "caffe2.proto"} | Set-Content $processed
--Add-Content -Path $processed -Value "option optimize_for = LITE_RUNTIME;`n" -NoNewline
--$dir = (Get-Item $processed).DirectoryName
--
--copy $srcdir/caffe2/proto/caffe2.proto $srcdir/caffe2.proto
--Add-Content -Path $srcdir/caffe2.proto -Value "option optimize_for = LITE_RUNTIME;`n" -NoNewline
--
--$cmd = "$protoc -I${dir} --cpp_out=$out $processed"
--Invoke-Expression $cmd
+-param(
+-  [string]$protoc,
+-  [string]$srcdir,
+-  [string]$unprocessed,
+-  [string]$processed,
+-  [string]$out
+-)
+-$ErrorActionPreference = "Stop"
+-Get-Content $unprocessed | % {$_ -Replace "caffe2/proto/caffe2.proto", "caffe2.proto"} | Set-Content $processed
+-Add-Content -Path $processed -Value "option optimize_for = LITE_RUNTIME;`n" -NoNewline
+-$dir = (Get-Item $processed).DirectoryName
+-
+-copy $srcdir/caffe2/proto/caffe2.proto $srcdir/caffe2.proto
+-Add-Content -Path $srcdir/caffe2.proto -Value "option optimize_for = LITE_RUNTIME;`n" -NoNewline
+-
+-$cmd = "$protoc -I${dir} --cpp_out=$out $processed"
+-Invoke-Expression $cmd
 +param(
 +  [string]$protoc,
 +  [string]$srcdir,
@@ -8795,7 +8786,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +Invoke-Expression $cmd
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/setup.py pytorch-develop/setup.py
 --- pytorch-v1.5.0/setup.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/setup.py	2021-07-13 15:30:57.850276836 +0800
++++ pytorch-develop/setup.py	2021-07-15 20:52:26.941447686 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -8894,7 +8885,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
                  'python/serialized_test/data/operator_test/*.zip',
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/derivatives.yaml pytorch-develop/tools/autograd/derivatives.yaml
 --- pytorch-v1.5.0/tools/autograd/derivatives.yaml	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/derivatives.yaml	2021-07-13 15:30:58.990317711 +0800
++++ pytorch-develop/tools/autograd/derivatives.yaml	2021-07-15 20:52:28.085488704 +0800
 @@ -107,6 +107,10 @@
  #
  # NB: The parameter names here MUST be consistent with the parameter names
@@ -8951,7 +8942,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  # The above backward definitions are equivalent to the definitions below.  Why do we bundle
  # everything up?  It's because it's more convenient to define double backwards
  # when there is a single function that manages everything.
-@@ -1630,3 +1643,52 @@
+@@ -1630,3 +1643,55 @@
  
  - name: nonzero(Tensor self) -> Tensor
    output_differentiability: [False]
@@ -9004,11 +8995,14 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +- name: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
 +  input, weight: npu_linear_backward(grad, input, weight)
 +  bias: maybe_multiply(grad, 1)
++
++- name: npu_giou(Tensor self, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> Tensor
++  self, gtboxes: npu_giou_backward(grad, self, gtboxes, trans, is_cross, mode)
 \ No newline at end of file
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/dump_utils.py pytorch-develop/tools/autograd/dump_utils.py
 --- pytorch-v1.5.0/tools/autograd/dump_utils.py	1970-01-01 08:00:00.000000000 +0800
-+++ pytorch-develop/tools/autograd/dump_utils.py	2021-07-13 15:30:58.990317711 +0800
-@@ -0,0 +1,114 @@
++++ pytorch-develop/tools/autograd/dump_utils.py	2021-07-15 20:52:28.085488704 +0800
+@@ -0,0 +1,115 @@
 +# Copyright (c) 2021 Huawei Technologies Co., Ltd
 +# All rights reserved.
 +#
@@ -9121,11 +9115,12 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +  "pin_memory",
 +  "to_device",
 +  "numpy_T",
-+  "slice_Tensor"
++  "slice_Tensor",
++  "select_int"
 +]
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py pytorch-develop/tools/autograd/gen_autograd_functions.py
 --- pytorch-v1.5.0/tools/autograd/gen_autograd_functions.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/gen_autograd_functions.py	2021-07-13 15:30:58.990317711 +0800
++++ pytorch-develop/tools/autograd/gen_autograd_functions.py	2021-07-15 20:52:28.085488704 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2021 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9311,7 +9306,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_python_functions.py pytorch-develop/tools/autograd/gen_python_functions.py
 --- pytorch-v1.5.0/tools/autograd/gen_python_functions.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/gen_python_functions.py	2021-07-13 15:30:58.990317711 +0800
++++ pytorch-develop/tools/autograd/gen_python_functions.py	2021-07-15 20:52:28.085488704 +0800
 @@ -1,3 +1,20 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9353,7 +9348,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
              'value': argname,
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/gen_variable_type.py pytorch-develop/tools/autograd/gen_variable_type.py
 --- pytorch-v1.5.0/tools/autograd/gen_variable_type.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/gen_variable_type.py	2021-07-13 15:30:58.990317711 +0800
++++ pytorch-develop/tools/autograd/gen_variable_type.py	2021-07-15 20:52:28.085488704 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2021 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -9526,7 +9521,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/Functions.cpp pytorch-develop/tools/autograd/templates/Functions.cpp
 --- pytorch-v1.5.0/tools/autograd/templates/Functions.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/Functions.cpp	2021-07-13 15:30:58.990317711 +0800
++++ pytorch-develop/tools/autograd/templates/Functions.cpp	2021-07-15 20:52:28.085488704 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2021 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9606,7 +9601,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    auto sparse = sparse_.coalesce();
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp pytorch-develop/tools/autograd/templates/python_torch_functions.cpp
 --- pytorch-v1.5.0/tools/autograd/templates/python_torch_functions.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp	2021-07-13 15:30:58.990317711 +0800
++++ pytorch-develop/tools/autograd/templates/python_torch_functions.cpp	2021-07-15 20:52:28.085488704 +0800
 @@ -22,7 +22,7 @@
  #include "torch/csrc/autograd/generated/variable_factories.h"
  #include "torch/csrc/utils/structseq.h"
@@ -9690,7 +9685,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp pytorch-develop/tools/autograd/templates/python_variable_methods.cpp
 --- pytorch-v1.5.0/tools/autograd/templates/python_variable_methods.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp	2021-07-13 15:30:58.990317711 +0800
++++ pytorch-develop/tools/autograd/templates/python_variable_methods.cpp	2021-07-15 20:52:28.085488704 +0800
 @@ -15,7 +15,13 @@
  #include "torch/csrc/cuda/Stream.h"
  #include "torch/csrc/cuda/Event.h"
@@ -9777,7 +9772,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    {"has_names", (PyCFunction)THPVariable_has_names, METH_NOARGS, NULL},
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp pytorch-develop/tools/autograd/templates/VariableType.cpp
 --- pytorch-v1.5.0/tools/autograd/templates/VariableType.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/VariableType.cpp	2021-07-13 15:30:58.990317711 +0800
++++ pytorch-develop/tools/autograd/templates/VariableType.cpp	2021-07-15 20:52:28.085488704 +0800
 @@ -1,7 +1,27 @@
 +// Copyright (c) 2021 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9808,7 +9803,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/autograd/templates/VariableType.h pytorch-develop/tools/autograd/templates/VariableType.h
 --- pytorch-v1.5.0/tools/autograd/templates/VariableType.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/autograd/templates/VariableType.h	2021-07-13 15:30:58.990317711 +0800
++++ pytorch-develop/tools/autograd/templates/VariableType.h	2021-07-15 20:52:28.085488704 +0800
 @@ -1,3 +1,20 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -9840,7 +9835,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    const at::Tensor & unpack(const Tensor & t, const char * name, int pos);
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/tools/build_variables.bzl pytorch-develop/tools/build_variables.bzl
 --- pytorch-v1.5.0/tools/build_variables.bzl	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/tools/build_variables.bzl	2021-07-13 15:30:58.994317854 +0800
++++ pytorch-develop/tools/build_variables.bzl	2021-07-15 20:52:28.085488704 +0800
 @@ -46,6 +46,7 @@
      "torch/csrc/autograd/functions/utils.cpp",
      "torch/csrc/autograd/input_buffer.cpp",
@@ -9926,7 +9921,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -def grad(outputs: _TensorOrTensors, inputs: _TensorOrTensors, grad_outputs: Optional[_TensorOrTensors]=..., retain_graph: Optional[bool]=..., create_graph: bool=..., only_inputs: bool=..., allow_unused: bool=...) -> Tuple[Tensor, ...]: ...
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/autograd/profiler.py pytorch-develop/torch/autograd/profiler.py
 --- pytorch-v1.5.0/torch/autograd/profiler.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/autograd/profiler.py	2021-07-13 15:30:58.998317998 +0800
++++ pytorch-develop/torch/autograd/profiler.py	2021-07-15 20:52:28.093488991 +0800
 @@ -1,8 +1,25 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -10399,7 +10394,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      return ''.join(result)
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/CMakeLists.txt pytorch-develop/torch/CMakeLists.txt
 --- pytorch-v1.5.0/torch/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/CMakeLists.txt	2021-07-13 15:30:58.994317854 +0800
++++ pytorch-develop/torch/CMakeLists.txt	2021-07-15 20:52:28.089488848 +0800
 @@ -97,6 +97,7 @@
      ${TORCH_SRC_DIR}/csrc/tensor/python_tensor.cpp
      ${TORCH_SRC_DIR}/csrc/utils.cpp
@@ -10431,7 +10426,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  endif()
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/engine.cpp pytorch-develop/torch/csrc/autograd/engine.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/engine.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/engine.cpp	2021-07-13 15:30:59.010318428 +0800
++++ pytorch-develop/torch/csrc/autograd/engine.cpp	2021-07-15 20:52:28.101489278 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10554,7 +10549,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        auto event = c10::Event{c10::DeviceType::CUDA};
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp pytorch-develop/torch/csrc/autograd/functions/tensor.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/functions/tensor.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp	2021-07-13 15:30:59.010318428 +0800
++++ pytorch-develop/torch/csrc/autograd/functions/tensor.cpp	2021-07-15 20:52:28.101489278 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10586,7 +10581,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
            /*non_blocking=*/false,
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/init.cpp pytorch-develop/torch/csrc/autograd/init.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/init.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/init.cpp	2021-07-13 15:30:59.010318428 +0800
++++ pytorch-develop/torch/csrc/autograd/init.cpp	2021-07-15 20:52:28.101489278 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10629,7 +10624,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    m.def("_enable_profiler", enableProfiler);
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp pytorch-develop/torch/csrc/autograd/input_buffer.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/input_buffer.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp	2021-07-13 15:30:59.010318428 +0800
++++ pytorch-develop/torch/csrc/autograd/input_buffer.cpp	2021-07-15 20:52:28.101489278 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10681,7 +10676,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    auto& old_var = buffer[pos];
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp pytorch-develop/torch/csrc/autograd/profiler.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/profiler.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/profiler.cpp	2021-07-13 15:30:59.010318428 +0800
++++ pytorch-develop/torch/csrc/autograd/profiler.cpp	2021-07-15 20:52:28.105489421 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -10877,7 +10872,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  CUDAStubs::~CUDAStubs() = default;
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/profiler.h pytorch-develop/torch/csrc/autograd/profiler.h
 --- pytorch-v1.5.0/torch/csrc/autograd/profiler.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/profiler.h	2021-07-13 15:30:59.010318428 +0800
++++ pytorch-develop/torch/csrc/autograd/profiler.h	2021-07-15 20:52:28.105489421 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11002,7 +10997,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp pytorch-develop/torch/csrc/autograd/python_variable.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/python_variable.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/python_variable.cpp	2021-07-13 15:30:59.010318428 +0800
++++ pytorch-develop/torch/csrc/autograd/python_variable.cpp	2021-07-15 20:52:28.105489421 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11056,7 +11051,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    {"is_complex", (getter)THPVariable_is_complex, nullptr, nullptr, nullptr},
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/python_variable_indexing.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp	2021-07-13 15:30:59.010318428 +0800
++++ pytorch-develop/torch/csrc/autograd/python_variable_indexing.cpp	2021-07-15 20:52:28.105489421 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11097,7 +11092,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h
 --- pytorch-v1.5.0/torch/csrc/autograd/utils/wrap_outputs.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h	2021-07-13 15:30:59.010318428 +0800
++++ pytorch-develop/torch/csrc/autograd/utils/wrap_outputs.h	2021-07-15 20:52:28.105489421 +0800
 @@ -168,6 +168,45 @@
    return r.release();
  }
@@ -11146,7 +11141,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    if (!r) throw python_error();
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp
 --- pytorch-v1.5.0/torch/csrc/autograd/VariableTypeManual.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp	2021-07-13 15:30:59.006318284 +0800
++++ pytorch-develop/torch/csrc/autograd/VariableTypeManual.cpp	2021-07-15 20:52:28.101489278 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11180,7 +11175,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    if (!t.defined()) {
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp pytorch-develop/torch/csrc/distributed/c10d/comm.cpp
 --- pytorch-v1.5.0/torch/csrc/distributed/c10d/comm.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp	2021-07-13 15:30:59.014318571 +0800
++++ pytorch-develop/torch/csrc/distributed/c10d/comm.cpp	2021-07-15 20:52:28.105489421 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11286,7 +11281,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    while (!in_flight.empty()) {
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp pytorch-develop/torch/csrc/distributed/c10d/init.cpp
 --- pytorch-v1.5.0/torch/csrc/distributed/c10d/init.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp	2021-07-13 15:30:59.014318571 +0800
++++ pytorch-develop/torch/csrc/distributed/c10d/init.cpp	2021-07-15 20:52:28.109489564 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11343,7 +11338,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        .def("is_success", &::c10d::ProcessGroup::Work::isSuccess)
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp
 --- pytorch-v1.5.0/torch/csrc/distributed/c10d/reducer.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp	2021-07-13 15:30:59.014318571 +0800
++++ pytorch-develop/torch/csrc/distributed/c10d/reducer.cpp	2021-07-15 20:52:28.109489564 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11468,7 +11463,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  }
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp pytorch-develop/torch/csrc/DynamicTypes.cpp
 --- pytorch-v1.5.0/torch/csrc/DynamicTypes.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/DynamicTypes.cpp	2021-07-13 15:30:58.998317998 +0800
++++ pytorch-develop/torch/csrc/DynamicTypes.cpp	2021-07-15 20:52:28.093488991 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11517,7 +11512,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      return it->second;
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Generator.cpp pytorch-develop/torch/csrc/Generator.cpp
 --- pytorch-v1.5.0/torch/csrc/Generator.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/Generator.cpp	2021-07-13 15:30:58.998317998 +0800
++++ pytorch-develop/torch/csrc/Generator.cpp	2021-07-15 20:52:28.093488991 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11585,7 +11580,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  #endif 
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/serialization.cpp pytorch-develop/torch/csrc/generic/serialization.cpp
 --- pytorch-v1.5.0/torch/csrc/generic/serialization.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/generic/serialization.cpp	2021-07-13 15:30:59.018318714 +0800
++++ pytorch-develop/torch/csrc/generic/serialization.cpp	2021-07-15 20:52:28.109489564 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11685,7 +11680,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/Storage.cpp pytorch-develop/torch/csrc/generic/Storage.cpp
 --- pytorch-v1.5.0/torch/csrc/generic/Storage.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/generic/Storage.cpp	2021-07-13 15:30:59.018318714 +0800
++++ pytorch-develop/torch/csrc/generic/Storage.cpp	2021-07-15 20:52:28.109489564 +0800
 @@ -1,7 +1,25 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11764,7 +11759,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
        for (Py_ssize_t i = 0; i < length; i++) {
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp pytorch-develop/torch/csrc/generic/StorageMethods.cpp
 --- pytorch-v1.5.0/torch/csrc/generic/StorageMethods.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp	2021-07-13 15:30:59.018318714 +0800
++++ pytorch-develop/torch/csrc/generic/StorageMethods.cpp	2021-07-15 20:52:28.109489564 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11812,7 +11807,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    {"_write_file", (PyCFunction)THPStorage_(writeFile), METH_VARARGS, nullptr},
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/Module.cpp pytorch-develop/torch/csrc/Module.cpp
 --- pytorch-v1.5.0/torch/csrc/Module.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/Module.cpp	2021-07-13 15:30:58.998317998 +0800
++++ pytorch-develop/torch/csrc/Module.cpp	2021-07-15 20:52:28.093488991 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -11956,7 +11951,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) {
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp pytorch-develop/torch/csrc/tensor/python_tensor.cpp
 --- pytorch-v1.5.0/torch/csrc/tensor/python_tensor.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp	2021-07-13 15:30:59.038319432 +0800
++++ pytorch-develop/torch/csrc/tensor/python_tensor.cpp	2021-07-15 20:52:28.133490425 +0800
 @@ -1,18 +1,35 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12333,7 +12328,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +} // namespace torch
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.cpp pytorch-develop/torch/csrc/utils/init.cpp
 --- pytorch-v1.5.0/torch/csrc/utils/init.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/init.cpp	2021-07-13 15:30:59.038319432 +0800
++++ pytorch-develop/torch/csrc/utils/init.cpp	2021-07-15 20:52:28.133490425 +0800
 @@ -1,6 +1,10 @@
  #include <ATen/core/ivalue.h>
  #include <torch/csrc/utils/init.h>
@@ -12421,7 +12416,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  } // namespace torch
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/init.h pytorch-develop/torch/csrc/utils/init.h
 --- pytorch-v1.5.0/torch/csrc/utils/init.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/init.h	2021-07-13 15:30:59.038319432 +0800
++++ pytorch-develop/torch/csrc/utils/init.h	2021-07-15 20:52:28.133490425 +0800
 @@ -8,4 +8,7 @@
  void initThroughputBenchmarkBindings(PyObject* module);
  
@@ -12432,7 +12427,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  } // namespace torch
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h pytorch-develop/torch/csrc/utils/python_arg_parser.h
 --- pytorch-v1.5.0/torch/csrc/utils/python_arg_parser.h	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/python_arg_parser.h	2021-07-13 15:30:59.038319432 +0800
++++ pytorch-develop/torch/csrc/utils/python_arg_parser.h	2021-07-15 20:52:28.133490425 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12467,7 +12462,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    return at::Device(device_str);
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp pytorch-develop/torch/csrc/utils/tensor_layouts.cpp
 --- pytorch-v1.5.0/torch/csrc/utils/tensor_layouts.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp	2021-07-13 15:30:59.038319432 +0800
++++ pytorch-develop/torch/csrc/utils/tensor_layouts.cpp	2021-07-15 20:52:28.133490425 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12498,7 +12493,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    registerLayoutObject((THPLayout*)strided_layout, at::Backend::QuantizedCPU);
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp pytorch-develop/torch/csrc/utils/tensor_new.cpp
 --- pytorch-v1.5.0/torch/csrc/utils/tensor_new.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/tensor_new.cpp	2021-07-13 15:30:59.038319432 +0800
++++ pytorch-develop/torch/csrc/utils/tensor_new.cpp	2021-07-15 20:52:28.133490425 +0800
 @@ -1,3 +1,19 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12634,7 +12629,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    } else if(expected_layout == c10::kSparse) {
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp pytorch-develop/torch/csrc/utils/tensor_types.cpp
 --- pytorch-v1.5.0/torch/csrc/utils/tensor_types.cpp	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/csrc/utils/tensor_types.cpp	2021-07-13 15:30:59.038319432 +0800
++++ pytorch-develop/torch/csrc/utils/tensor_types.cpp	2021-07-15 20:52:28.133490425 +0800
 @@ -1,58 +1,91 @@
 +// Copyright (c) 2020 Huawei Technologies Co., Ltd
 +// Copyright (c) 2019, Facebook CORPORATION. 
@@ -12847,7 +12842,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -def get_rng_state(): ...
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributed/distributed_c10d.py pytorch-develop/torch/distributed/distributed_c10d.py
 --- pytorch-v1.5.0/torch/distributed/distributed_c10d.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/distributed/distributed_c10d.py	2021-07-13 15:30:59.042319575 +0800
++++ pytorch-develop/torch/distributed/distributed_c10d.py	2021-07-15 20:52:28.133490425 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -12928,148 +12923,148 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/distributions/von_mises.py pytorch-develop/torch/distributions/von_mises.py
 --- pytorch-v1.5.0/torch/distributions/von_mises.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/distributions/von_mises.py	2021-07-13 15:30:59.042319575 +0800
++++ pytorch-develop/torch/distributions/von_mises.py	2021-07-15 20:52:28.137490568 +0800
 @@ -1,140 +1,140 @@
--from __future__ import absolute_import, division, print_function
--
--import math
--
--import torch
--import torch.jit
--from torch.distributions import constraints
--from torch.distributions.distribution import Distribution
--from torch.distributions.utils import broadcast_all, lazy_property
--
--
--def _eval_poly(y, coef):
--    coef = list(coef)
--    result = coef.pop()
--    while coef:
--        result = coef.pop() + y * result
--    return result
--
--
--_I0_COEF_SMALL = [1.0, 3.5156229, 3.0899424, 1.2067492, 0.2659732, 0.360768e-1, 0.45813e-2]
--_I0_COEF_LARGE = [0.39894228, 0.1328592e-1, 0.225319e-2, -0.157565e-2, 0.916281e-2,
--                  -0.2057706e-1, 0.2635537e-1, -0.1647633e-1, 0.392377e-2]
--_I1_COEF_SMALL = [0.5, 0.87890594, 0.51498869, 0.15084934, 0.2658733e-1, 0.301532e-2, 0.32411e-3]
--_I1_COEF_LARGE = [0.39894228, -0.3988024e-1, -0.362018e-2, 0.163801e-2, -0.1031555e-1,
--                  0.2282967e-1, -0.2895312e-1, 0.1787654e-1, -0.420059e-2]
--
--_COEF_SMALL = [_I0_COEF_SMALL, _I1_COEF_SMALL]
--_COEF_LARGE = [_I0_COEF_LARGE, _I1_COEF_LARGE]
--
--
--def _log_modified_bessel_fn(x, order=0):
--    """
--    Returns ``log(I_order(x))`` for ``x > 0``,
--    where `order` is either 0 or 1.
--    """
--    assert order == 0 or order == 1
--
--    # compute small solution
--    y = (x / 3.75)
--    y = y * y
--    small = _eval_poly(y, _COEF_SMALL[order])
--    if order == 1:
--        small = x.abs() * small
--    small = small.log()
--
--    # compute large solution
--    y = 3.75 / x
--    large = x - 0.5 * x.log() + _eval_poly(y, _COEF_LARGE[order]).log()
--
--    result = torch.where(x < 3.75, small, large)
--    return result
--
--
--@torch.jit.script
--def _rejection_sample(loc, concentration, proposal_r, x):
--    done = torch.zeros(x.shape, dtype=torch.bool, device=loc.device)
--    while not done.all():
--        u = torch.rand((3,) + x.shape, dtype=loc.dtype, device=loc.device)
--        u1, u2, u3 = u.unbind()
--        z = torch.cos(math.pi * u1)
--        f = (1 + proposal_r * z) / (proposal_r + z)
--        c = concentration * (proposal_r - f)
--        accept = ((c * (2 - c) - u2) > 0) | ((c / u2).log() + 1 - c >= 0)
--        if accept.any():
--            x = torch.where(accept, (u3 - 0.5).sign() * f.acos(), x)
--            done = done | accept
--    return (x + math.pi + loc) % (2 * math.pi) - math.pi
--
--
--class VonMises(Distribution):
--    """
--    A circular von Mises distribution.
--
--    This implementation uses polar coordinates. The ``loc`` and ``value`` args
--    can be any real number (to facilitate unconstrained optimization), but are
--    interpreted as angles modulo 2 pi.
--
--    Example::
--        >>> m = dist.VonMises(torch.tensor([1.0]), torch.tensor([1.0]))
--        >>> m.sample() # von Mises distributed with loc=1 and concentration=1
--        tensor([1.9777])
--
--    :param torch.Tensor loc: an angle in radians.
--    :param torch.Tensor concentration: concentration parameter
--    """
--    arg_constraints = {'loc': constraints.real, 'concentration': constraints.positive}
--    support = constraints.real
--    has_rsample = False
--
--    def __init__(self, loc, concentration, validate_args=None):
--        self.loc, self.concentration = broadcast_all(loc, concentration)
--        batch_shape = self.loc.shape
--        event_shape = torch.Size()
--
--        # Parameters for sampling
--        tau = 1 + (1 + 4 * self.concentration ** 2).sqrt()
--        rho = (tau - (2 * tau).sqrt()) / (2 * self.concentration)
--        self._proposal_r = (1 + rho ** 2) / (2 * rho)
--
--        super(VonMises, self).__init__(batch_shape, event_shape, validate_args)
--
--    def log_prob(self, value):
--        log_prob = self.concentration * torch.cos(value - self.loc)
--        log_prob = log_prob - math.log(2 * math.pi) - _log_modified_bessel_fn(self.concentration, order=0)
--        return log_prob
--
--    @torch.no_grad()
--    def sample(self, sample_shape=torch.Size()):
--        """
--        The sampling algorithm for the von Mises distribution is based on the following paper:
--        Best, D. J., and Nicholas I. Fisher.
--        "Efficient simulation of the von Mises distribution." Applied Statistics (1979): 152-157.
--        """
--        shape = self._extended_shape(sample_shape)
--        x = torch.empty(shape, dtype=self.loc.dtype, device=self.loc.device)
--        return _rejection_sample(self.loc, self.concentration, self._proposal_r, x)
--
--    def expand(self, batch_shape):
--        try:
--            return super(VonMises, self).expand(batch_shape)
--        except NotImplementedError:
--            validate_args = self.__dict__.get('_validate_args')
--            loc = self.loc.expand(batch_shape)
--            concentration = self.concentration.expand(batch_shape)
--            return type(self)(loc, concentration, validate_args=validate_args)
--
--    @property
--    def mean(self):
--        """
--        The provided mean is the circular one.
--        """
--        return self.loc
--
--    @lazy_property
--    def variance(self):
--        """
--        The provided variance is the circular one.
--        """
--        return 1 - (_log_modified_bessel_fn(self.concentration, order=1) -
--                    _log_modified_bessel_fn(self.concentration, order=0)).exp()
+-from __future__ import absolute_import, division, print_function
+-
+-import math
+-
+-import torch
+-import torch.jit
+-from torch.distributions import constraints
+-from torch.distributions.distribution import Distribution
+-from torch.distributions.utils import broadcast_all, lazy_property
+-
+-
+-def _eval_poly(y, coef):
+-    coef = list(coef)
+-    result = coef.pop()
+-    while coef:
+-        result = coef.pop() + y * result
+-    return result
+-
+-
+-_I0_COEF_SMALL = [1.0, 3.5156229, 3.0899424, 1.2067492, 0.2659732, 0.360768e-1, 0.45813e-2]
+-_I0_COEF_LARGE = [0.39894228, 0.1328592e-1, 0.225319e-2, -0.157565e-2, 0.916281e-2,
+-                  -0.2057706e-1, 0.2635537e-1, -0.1647633e-1, 0.392377e-2]
+-_I1_COEF_SMALL = [0.5, 0.87890594, 0.51498869, 0.15084934, 0.2658733e-1, 0.301532e-2, 0.32411e-3]
+-_I1_COEF_LARGE = [0.39894228, -0.3988024e-1, -0.362018e-2, 0.163801e-2, -0.1031555e-1,
+-                  0.2282967e-1, -0.2895312e-1, 0.1787654e-1, -0.420059e-2]
+-
+-_COEF_SMALL = [_I0_COEF_SMALL, _I1_COEF_SMALL]
+-_COEF_LARGE = [_I0_COEF_LARGE, _I1_COEF_LARGE]
+-
+-
+-def _log_modified_bessel_fn(x, order=0):
+-    """
+-    Returns ``log(I_order(x))`` for ``x > 0``,
+-    where `order` is either 0 or 1.
+-    """
+-    assert order == 0 or order == 1
+-
+-    # compute small solution
+-    y = (x / 3.75)
+-    y = y * y
+-    small = _eval_poly(y, _COEF_SMALL[order])
+-    if order == 1:
+-        small = x.abs() * small
+-    small = small.log()
+-
+-    # compute large solution
+-    y = 3.75 / x
+-    large = x - 0.5 * x.log() + _eval_poly(y, _COEF_LARGE[order]).log()
+-
+-    result = torch.where(x < 3.75, small, large)
+-    return result
+-
+-
+-@torch.jit.script
+-def _rejection_sample(loc, concentration, proposal_r, x):
+-    done = torch.zeros(x.shape, dtype=torch.bool, device=loc.device)
+-    while not done.all():
+-        u = torch.rand((3,) + x.shape, dtype=loc.dtype, device=loc.device)
+-        u1, u2, u3 = u.unbind()
+-        z = torch.cos(math.pi * u1)
+-        f = (1 + proposal_r * z) / (proposal_r + z)
+-        c = concentration * (proposal_r - f)
+-        accept = ((c * (2 - c) - u2) > 0) | ((c / u2).log() + 1 - c >= 0)
+-        if accept.any():
+-            x = torch.where(accept, (u3 - 0.5).sign() * f.acos(), x)
+-            done = done | accept
+-    return (x + math.pi + loc) % (2 * math.pi) - math.pi
+-
+-
+-class VonMises(Distribution):
+-    """
+-    A circular von Mises distribution.
+-
+-    This implementation uses polar coordinates. The ``loc`` and ``value`` args
+-    can be any real number (to facilitate unconstrained optimization), but are
+-    interpreted as angles modulo 2 pi.
+-
+-    Example::
+-        >>> m = dist.VonMises(torch.tensor([1.0]), torch.tensor([1.0]))
+-        >>> m.sample() # von Mises distributed with loc=1 and concentration=1
+-        tensor([1.9777])
+-
+-    :param torch.Tensor loc: an angle in radians.
+-    :param torch.Tensor concentration: concentration parameter
+-    """
+-    arg_constraints = {'loc': constraints.real, 'concentration': constraints.positive}
+-    support = constraints.real
+-    has_rsample = False
+-
+-    def __init__(self, loc, concentration, validate_args=None):
+-        self.loc, self.concentration = broadcast_all(loc, concentration)
+-        batch_shape = self.loc.shape
+-        event_shape = torch.Size()
+-
+-        # Parameters for sampling
+-        tau = 1 + (1 + 4 * self.concentration ** 2).sqrt()
+-        rho = (tau - (2 * tau).sqrt()) / (2 * self.concentration)
+-        self._proposal_r = (1 + rho ** 2) / (2 * rho)
+-
+-        super(VonMises, self).__init__(batch_shape, event_shape, validate_args)
+-
+-    def log_prob(self, value):
+-        log_prob = self.concentration * torch.cos(value - self.loc)
+-        log_prob = log_prob - math.log(2 * math.pi) - _log_modified_bessel_fn(self.concentration, order=0)
+-        return log_prob
+-
+-    @torch.no_grad()
+-    def sample(self, sample_shape=torch.Size()):
+-        """
+-        The sampling algorithm for the von Mises distribution is based on the following paper:
+-        Best, D. J., and Nicholas I. Fisher.
+-        "Efficient simulation of the von Mises distribution." Applied Statistics (1979): 152-157.
+-        """
+-        shape = self._extended_shape(sample_shape)
+-        x = torch.empty(shape, dtype=self.loc.dtype, device=self.loc.device)
+-        return _rejection_sample(self.loc, self.concentration, self._proposal_r, x)
+-
+-    def expand(self, batch_shape):
+-        try:
+-            return super(VonMises, self).expand(batch_shape)
+-        except NotImplementedError:
+-            validate_args = self.__dict__.get('_validate_args')
+-            loc = self.loc.expand(batch_shape)
+-            concentration = self.concentration.expand(batch_shape)
+-            return type(self)(loc, concentration, validate_args=validate_args)
+-
+-    @property
+-    def mean(self):
+-        """
+-        The provided mean is the circular one.
+-        """
+-        return self.loc
+-
+-    @lazy_property
+-    def variance(self):
+-        """
+-        The provided variance is the circular one.
+-        """
+-        return 1 - (_log_modified_bessel_fn(self.concentration, order=1) -
+-                    _log_modified_bessel_fn(self.concentration, order=0)).exp()
 +from __future__ import absolute_import, division, print_function
 +
 +import math
@@ -13212,7 +13207,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 +                    _log_modified_bessel_fn(self.concentration, order=0)).exp()
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/__init__.py pytorch-develop/torch/__init__.py
 --- pytorch-v1.5.0/torch/__init__.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/__init__.py	2021-07-13 15:30:58.994317854 +0800
++++ pytorch-develop/torch/__init__.py	2021-07-15 20:52:28.089488848 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13255,7 +13250,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 \ No newline at end of file
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt pytorch-develop/torch/lib/c10d/CMakeLists.txt
 --- pytorch-v1.5.0/torch/lib/c10d/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/lib/c10d/CMakeLists.txt	2021-07-13 15:30:59.046319718 +0800
++++ pytorch-develop/torch/lib/c10d/CMakeLists.txt	2021-07-15 20:52:28.137490568 +0800
 @@ -28,6 +28,10 @@
    option(USE_C10D_NCCL "USE C10D NCCL" ON)
  endif()
@@ -13308,7 +13303,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
    copy_header(ProcessGroupMPI.hpp)
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt pytorch-develop/torch/lib/libshm/CMakeLists.txt
 --- pytorch-v1.5.0/torch/lib/libshm/CMakeLists.txt	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/lib/libshm/CMakeLists.txt	2021-07-13 15:30:59.046319718 +0800
++++ pytorch-develop/torch/lib/libshm/CMakeLists.txt	2021-07-15 20:52:28.141490712 +0800
 @@ -37,8 +37,11 @@
  SET_TARGET_PROPERTIES(shm PROPERTIES
    PREFIX "lib"
@@ -13365,7 +13360,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -_maybe_indices_t = _scalar_or_tuple_2_t[Tensor]
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/functional.py pytorch-develop/torch/nn/functional.py
 --- pytorch-v1.5.0/torch/nn/functional.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/functional.py	2021-07-13 15:30:59.050319862 +0800
++++ pytorch-develop/torch/nn/functional.py	2021-07-15 20:52:28.141490712 +0800
 @@ -1611,7 +1611,7 @@
      else:
          output = input.matmul(weight.t())
@@ -13388,7 +13383,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -from . import parallel as parallel
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/batchnorm.py pytorch-develop/torch/nn/modules/batchnorm.py
 --- pytorch-v1.5.0/torch/nn/modules/batchnorm.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/modules/batchnorm.py	2021-07-13 15:30:59.050319862 +0800
++++ pytorch-develop/torch/nn/modules/batchnorm.py	2021-07-15 20:52:28.145490855 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13420,7 +13415,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
              self.register_parameter('running_var', None)
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/module.py pytorch-develop/torch/nn/modules/module.py
 --- pytorch-v1.5.0/torch/nn/modules/module.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/modules/module.py	2021-07-13 15:30:59.050319862 +0800
++++ pytorch-develop/torch/nn/modules/module.py	2021-07-15 20:52:28.145490855 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -13563,7 +13558,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
                  return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format)
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/normalization.py pytorch-develop/torch/nn/modules/normalization.py
 --- pytorch-v1.5.0/torch/nn/modules/normalization.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/modules/normalization.py	2021-07-13 15:30:59.050319862 +0800
++++ pytorch-develop/torch/nn/modules/normalization.py	2021-07-15 20:52:28.145490855 +0800
 @@ -128,13 +128,14 @@
      """
      __constants__ = ['normalized_shape', 'eps', 'elementwise_affine']
@@ -13596,68 +13591,68 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          return '{normalized_shape}, eps={eps}, ' \
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in pytorch-develop/torch/nn/modules/transformer.pyi.in
 --- pytorch-v1.5.0/torch/nn/modules/transformer.pyi.in	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/modules/transformer.pyi.in	2021-07-13 15:30:59.054320005 +0800
++++ pytorch-develop/torch/nn/modules/transformer.pyi.in	2021-07-15 20:52:28.145490855 +0800
 @@ -1,60 +1,60 @@
--from ..init import xavier_uniform_
--from .activation import MultiheadAttention
--from .container import ModuleList
--from .dropout import Dropout
--from .linear import Linear
--from .module import Module
--from .normalization import LayerNorm
--from typing import Any, Optional
--
--class Transformer(Module):
--    encoder: Any = ...
--    decoder: Any = ...
--    d_model: Any = ...
--    nhead: Any = ...
--    def __init__(self, d_model: int = ..., nhead: int = ..., num_encoder_layers: int = ..., num_decoder_layers: int = ..., dim_feedforward: int = ..., dropout: float = ..., activation: str = ..., custom_encoder: Optional[Any] = ..., custom_decoder: Optional[Any] = ...) -> None: ...
--    def forward(self, src: Any, tgt: Any, src_mask: Optional[Any] = ..., tgt_mask: Optional[Any] = ..., memory_mask: Optional[Any] = ..., src_key_padding_mask: Optional[Any] = ..., tgt_key_padding_mask: Optional[Any] = ..., memory_key_padding_mask: Optional[Any] = ...): ...
--    def generate_square_subsequent_mask(self, sz: Any): ...
--
--class TransformerEncoder(Module):
--    layers: Any = ...
--    num_layers: Any = ...
--    norm: Any = ...
--    def __init__(self, encoder_layer: Any, num_layers: Any, norm: Optional[Any] = ...) -> None: ...
--    def forward(self, src: Any, mask: Optional[Any] = ..., src_key_padding_mask: Optional[Any] = ...): ...
--
--class TransformerDecoder(Module):
--    layers: Any = ...
--    num_layers: Any = ...
--    norm: Any = ...
--    def __init__(self, decoder_layer: Any, num_layers: Any, norm: Optional[Any] = ...) -> None: ...
--    def forward(self, tgt: Any, memory: Any, tgt_mask: Optional[Any] = ..., memory_mask: Optional[Any] = ..., tgt_key_padding_mask: Optional[Any] = ..., memory_key_padding_mask: Optional[Any] = ...): ...
--
--class TransformerEncoderLayer(Module):
--    self_attn: Any = ...
--    linear1: Any = ...
--    dropout: Any = ...
--    linear2: Any = ...
--    norm1: Any = ...
--    norm2: Any = ...
--    dropout1: Any = ...
--    dropout2: Any = ...
--    activation: Any = ...
--    def __init__(self, d_model: Any, nhead: Any, dim_feedforward: int = ..., dropout: float = ..., activation: str = ...) -> None: ...
--    def forward(self, src: Any, src_mask: Optional[Any] = ..., src_key_padding_mask: Optional[Any] = ...): ...
--
--class TransformerDecoderLayer(Module):
--    self_attn: Any = ...
--    multihead_attn: Any = ...
--    linear1: Any = ...
--    dropout: Any = ...
--    linear2: Any = ...
--    norm1: Any = ...
--    norm2: Any = ...
--    norm3: Any = ...
--    dropout1: Any = ...
--    dropout2: Any = ...
--    dropout3: Any = ...
--    activation: Any = ...
--    def __init__(self, d_model: Any, nhead: Any, dim_feedforward: int = ..., dropout: float = ..., activation: str = ...) -> None: ...
--    def forward(self, tgt: Any, memory: Any, tgt_mask: Optional[Any] = ..., memory_mask: Optional[Any] = ..., tgt_key_padding_mask: Optional[Any] = ..., memory_key_padding_mask: Optional[Any] = ...): ...
+-from ..init import xavier_uniform_
+-from .activation import MultiheadAttention
+-from .container import ModuleList
+-from .dropout import Dropout
+-from .linear import Linear
+-from .module import Module
+-from .normalization import LayerNorm
+-from typing import Any, Optional
+-
+-class Transformer(Module):
+-    encoder: Any = ...
+-    decoder: Any = ...
+-    d_model: Any = ...
+-    nhead: Any = ...
+-    def __init__(self, d_model: int = ..., nhead: int = ..., num_encoder_layers: int = ..., num_decoder_layers: int = ..., dim_feedforward: int = ..., dropout: float = ..., activation: str = ..., custom_encoder: Optional[Any] = ..., custom_decoder: Optional[Any] = ...) -> None: ...
+-    def forward(self, src: Any, tgt: Any, src_mask: Optional[Any] = ..., tgt_mask: Optional[Any] = ..., memory_mask: Optional[Any] = ..., src_key_padding_mask: Optional[Any] = ..., tgt_key_padding_mask: Optional[Any] = ..., memory_key_padding_mask: Optional[Any] = ...): ...
+-    def generate_square_subsequent_mask(self, sz: Any): ...
+-
+-class TransformerEncoder(Module):
+-    layers: Any = ...
+-    num_layers: Any = ...
+-    norm: Any = ...
+-    def __init__(self, encoder_layer: Any, num_layers: Any, norm: Optional[Any] = ...) -> None: ...
+-    def forward(self, src: Any, mask: Optional[Any] = ..., src_key_padding_mask: Optional[Any] = ...): ...
+-
+-class TransformerDecoder(Module):
+-    layers: Any = ...
+-    num_layers: Any = ...
+-    norm: Any = ...
+-    def __init__(self, decoder_layer: Any, num_layers: Any, norm: Optional[Any] = ...) -> None: ...
+-    def forward(self, tgt: Any, memory: Any, tgt_mask: Optional[Any] = ..., memory_mask: Optional[Any] = ..., tgt_key_padding_mask: Optional[Any] = ..., memory_key_padding_mask: Optional[Any] = ...): ...
+-
+-class TransformerEncoderLayer(Module):
+-    self_attn: Any = ...
+-    linear1: Any = ...
+-    dropout: Any = ...
+-    linear2: Any = ...
+-    norm1: Any = ...
+-    norm2: Any = ...
+-    dropout1: Any = ...
+-    dropout2: Any = ...
+-    activation: Any = ...
+-    def __init__(self, d_model: Any, nhead: Any, dim_feedforward: int = ..., dropout: float = ..., activation: str = ...) -> None: ...
+-    def forward(self, src: Any, src_mask: Optional[Any] = ..., src_key_padding_mask: Optional[Any] = ...): ...
+-
+-class TransformerDecoderLayer(Module):
+-    self_attn: Any = ...
+-    multihead_attn: Any = ...
+-    linear1: Any = ...
+-    dropout: Any = ...
+-    linear2: Any = ...
+-    norm1: Any = ...
+-    norm2: Any = ...
+-    norm3: Any = ...
+-    dropout1: Any = ...
+-    dropout2: Any = ...
+-    dropout3: Any = ...
+-    activation: Any = ...
+-    def __init__(self, d_model: Any, nhead: Any, dim_feedforward: int = ..., dropout: float = ..., activation: str = ...) -> None: ...
+-    def forward(self, tgt: Any, memory: Any, tgt_mask: Optional[Any] = ..., memory_mask: Optional[Any] = ..., tgt_key_padding_mask: Optional[Any] = ..., memory_key_padding_mask: Optional[Any] = ...): ...
 +from ..init import xavier_uniform_
 +from .activation import MultiheadAttention
 +from .container import ModuleList
@@ -13756,7 +13751,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -                  module_kwargs: Optional[Any] = ...) -> Tensor: ...
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/nn/parallel/distributed.py pytorch-develop/torch/nn/parallel/distributed.py
 --- pytorch-v1.5.0/torch/nn/parallel/distributed.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/nn/parallel/distributed.py	2021-07-13 15:30:59.054320005 +0800
++++ pytorch-develop/torch/nn/parallel/distributed.py	2021-07-15 20:52:28.145490855 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14107,7 +14102,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -def remove_weight_norm(module: T_module, name: str = ...) -> T_module: ...
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/onnx/symbolic_opset9.py pytorch-develop/torch/onnx/symbolic_opset9.py
 --- pytorch-v1.5.0/torch/onnx/symbolic_opset9.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/onnx/symbolic_opset9.py	2021-07-13 15:30:59.054320005 +0800
++++ pytorch-develop/torch/onnx/symbolic_opset9.py	2021-07-15 20:52:28.149490998 +0800
 @@ -1621,14 +1621,23 @@
          slices = [sym_help._slice_helper(g, w, axes=[0], starts=[x * n], ends=[y * n]) for x, y in intervals]
          return g.op('Concat', *slices, axis_i=0)
@@ -14185,7 +14180,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def __init__(self, params: _params_t, lr: float=..., lr_decay: float=..., weight_decay: float=..., initial_accumulator_value: float=...,  eps: float=...) -> None: ...
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/optim/adamax.py pytorch-develop/torch/optim/adamax.py
 --- pytorch-v1.5.0/torch/optim/adamax.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/optim/adamax.py	2021-07-13 15:30:59.058320149 +0800
++++ pytorch-develop/torch/optim/adamax.py	2021-07-15 20:52:28.149490998 +0800
 @@ -80,8 +80,8 @@
                      exp_inf.mul_(beta2).unsqueeze(0),
                      grad.abs().add_(eps).unsqueeze_(0)
@@ -14362,7 +14357,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=...) -> None: ...
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/serialization.py pytorch-develop/torch/serialization.py
 --- pytorch-v1.5.0/torch/serialization.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/serialization.py	2021-07-13 15:30:59.058320149 +0800
++++ pytorch-develop/torch/serialization.py	2021-07-15 20:52:28.153491142 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14446,7 +14441,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  def location_tag(storage):
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/storage.py pytorch-develop/torch/storage.py
 --- pytorch-v1.5.0/torch/storage.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/storage.py	2021-07-13 15:30:59.058320149 +0800
++++ pytorch-develop/torch/storage.py	2021-07-15 20:52:28.153491142 +0800
 @@ -7,6 +7,7 @@
  
  class _StorageBase(object):
@@ -14466,7 +14461,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
          else:
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/tensor.py pytorch-develop/torch/tensor.py
 --- pytorch-v1.5.0/torch/tensor.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/tensor.py	2021-07-13 15:30:59.058320149 +0800
++++ pytorch-develop/torch/tensor.py	2021-07-15 20:52:28.153491142 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14528,7 +14523,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      def __reversed__(self):
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_tensor_str.py pytorch-develop/torch/_tensor_str.py
 --- pytorch-v1.5.0/torch/_tensor_str.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/_tensor_str.py	2021-07-13 15:30:58.994317854 +0800
++++ pytorch-develop/torch/_tensor_str.py	2021-07-15 20:52:28.089488848 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14582,7 +14577,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
      has_default_dtype = self.dtype in (torch.get_default_dtype(), torch.int64, torch.bool)
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/dataloader.py pytorch-develop/torch/utils/data/dataloader.py
 --- pytorch-v1.5.0/torch/utils/data/dataloader.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/utils/data/dataloader.py	2021-07-13 15:30:59.062320292 +0800
++++ pytorch-develop/torch/utils/data/dataloader.py	2021-07-15 20:52:28.157491286 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14791,7 +14786,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -    def __init__(self, sampler: Sampler[int], batch_size: int, drop_last: bool) -> None: ...
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py pytorch-develop/torch/utils/data/_utils/pin_memory.py
 --- pytorch-v1.5.0/torch/utils/data/_utils/pin_memory.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/utils/data/_utils/pin_memory.py	2021-07-13 15:30:59.062320292 +0800
++++ pytorch-develop/torch/utils/data/_utils/pin_memory.py	2021-07-15 20:52:28.157491286 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
@@ -14852,7 +14847,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
 -
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/utils/__init__.py pytorch-develop/torch/utils/__init__.py
 --- pytorch-v1.5.0/torch/utils/__init__.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/utils/__init__.py	2021-07-13 15:30:59.062320292 +0800
++++ pytorch-develop/torch/utils/__init__.py	2021-07-15 20:52:28.153491142 +0800
 @@ -1,6 +1,7 @@
  from __future__ import absolute_import, division, print_function, unicode_literals
  
@@ -14863,7 +14858,7 @@ diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=
  def set_module(obj, mod):
 diff -Nur '--exclude=.git*' '--exclude=.jenkins' '--exclude=android' '--exclude=OWNERS' '--exclude=third_party' '--exclude=README*' -Nur pytorch-v1.5.0/torch/_utils.py pytorch-develop/torch/_utils.py
 --- pytorch-v1.5.0/torch/_utils.py	2021-04-10 18:39:32.000000000 +0800
-+++ pytorch-develop/torch/_utils.py	2021-07-13 15:30:58.998317998 +0800
++++ pytorch-develop/torch/_utils.py	2021-07-15 20:52:28.089488848 +0800
 @@ -1,3 +1,19 @@
 +# Copyright (c) 2020 Huawei Technologies Co., Ltd
 +# Copyright (c) 2019, Facebook CORPORATION. 
diff --git a/src/aten/src/ATen/native/native_functions.yaml b/src/aten/src/ATen/native/native_functions.yaml
index 30c7a8aeb19a82f8bffa37cd4947172f082db5be..74c22e5b3a4fcf502e8d595da222c51f34003de3 100644
--- a/src/aten/src/ATen/native/native_functions.yaml
+++ b/src/aten/src/ATen/native/native_functions.yaml
@@ -7584,16 +7584,12 @@
   dispatch:
     CPU: reflection_pad2d_out_cpu
     CUDA: reflection_pad2d_out_cuda
-  npu_dispatch:
-    NPU: reflection_pad2d_out_npu
 
 - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor
   python_module: nn
   dispatch:
     CPU: reflection_pad2d_cpu
     CUDA: reflection_pad2d_cuda
-  npu_dispatch:
-    NPU: reflection_pad2d_npu
 
 - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
   python_module: nn
@@ -8548,4 +8544,12 @@
 
 - func: npu_bert_apply_adam(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, Scalar lr, Scalar beta1, Scalar beta2, Scalar epsilon, Tensor grad, Scalar max_grad_norm, Scalar global_grad_norm, Scalar weight_decay) -> (Tensor(a!), Tensor(b!), Tensor(c!))
   npu_dispatch_only:
-    NPU: bert_apply_adam_npu
\ No newline at end of file
+    NPU: bert_apply_adam_npu
+
+- func: npu_giou(Tensor self, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> Tensor
+  npu_dispatch_only:
+    NPU: giou_npu
+
+- func: npu_giou_backward(Tensor grad, Tensor bboxes, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> (Tensor, Tensor)
+  npu_dispatch_only:
+    NPU: giou_backward_npu
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp b/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp
index 3e11e9de5ac5307c7a8fc7ceaec6b0dfebeb132a..f2defe0dedee2c153329f45e800a7a78035edc49 100644
--- a/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AddmvKernelNpu.cpp
@@ -1,96 +1,96 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-static void check_1d(const Tensor& t, const char* arg, const char* fn) {
-  TORCH_CHECK(t.dim() == 1, fn, ": Expected 1-D argument ", arg, ", but got ", t.dim(), "-D");
-}
-
-Tensor& addmv_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& mat,
-    const Tensor& vec,
-    Scalar beta,
-    Scalar alpha) {
-    
-  check_1d(vec, "vec", "addmv");
-  
-  Tensor mat1 = vec.unsqueeze(1);
-
-  // matmul mat*alpha
-  Tensor mat_alpha = at::mul(mat, alpha);
-
-  // matmul*alpha
-  Tensor mmMulResult = at::mm(mat_alpha, mat1);
-  
-  Tensor mmMulResult1 = mmMulResult.squeeze();
-
-  // calculate the output size
-  auto outputSize = addmv_npu_output_size(self, mat, vec, beta, alpha);
-
-  if (!result.sizes().equals(outputSize)) {
-    result.resize_(outputSize);
-  }
-  // matmul*alpha+self*beta
-  at::add_out(result, mmMulResult1, self, beta);
-
-  return result;
-}
-
-Tensor addmv_npu(
-    const Tensor& self,
-    const Tensor& mat,
-    const Tensor& vec,
-    Scalar beta,
-    Scalar alpha) {
-    
-  check_1d(vec, "vec", "addmv");
-  auto outputSize = addmv_npu_output_size(self, mat, vec, beta, alpha);
-  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
-  addmv_out_npu(result, self, mat, vec, beta, alpha);
-
-  return result;
-}
-
-Tensor& addmv_npu_(
-    Tensor& self,
-    const Tensor& mat,
-    const Tensor& vec,
-    Scalar beta,
-    Scalar alpha) {
-    
-  check_1d(vec, "vec", "addmv");
-  OpPreparation::CheckMemory({self, mat, vec}, {self});
-  if (!NpuUtils::check_match(&self)) {
-    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
-    Tensor result =
-        addmv_out_npu(contiguousSelf, contiguousSelf, mat, vec, beta, alpha);
-    NpuUtils::format_fresh_view(self, result);
-  } else {
-    addmv_out_npu(self, self, mat, vec, beta, alpha);
-  }
-  return self;
-}
-
-} // namespace native
-} // namespace at
-
-
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/NpuUtils.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+static void check_1d(const Tensor& t, const char* arg, const char* fn) {
+  TORCH_CHECK(t.dim() == 1, fn, ": Expected 1-D argument ", arg, ", but got ", t.dim(), "-D");
+}
+
+Tensor& addmv_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& mat,
+    const Tensor& vec,
+    Scalar beta,
+    Scalar alpha) {
+    
+  check_1d(vec, "vec", "addmv");
+  
+  Tensor mat1 = vec.unsqueeze(1);
+
+  // matmul mat*alpha
+  Tensor mat_alpha = at::mul(mat, alpha);
+
+  // matmul*alpha
+  Tensor mmMulResult = at::mm(mat_alpha, mat1);
+  
+  Tensor mmMulResult1 = mmMulResult.squeeze();
+
+  // calculate the output size
+  auto outputSize = addmv_npu_output_size(self, mat, vec, beta, alpha);
+
+  if (!result.sizes().equals(outputSize)) {
+    result.resize_(outputSize);
+  }
+  // matmul*alpha+self*beta
+  at::add_out(result, mmMulResult1, self, beta);
+
+  return result;
+}
+
+Tensor addmv_npu(
+    const Tensor& self,
+    const Tensor& mat,
+    const Tensor& vec,
+    Scalar beta,
+    Scalar alpha) {
+    
+  check_1d(vec, "vec", "addmv");
+  auto outputSize = addmv_npu_output_size(self, mat, vec, beta, alpha);
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
+  addmv_out_npu(result, self, mat, vec, beta, alpha);
+
+  return result;
+}
+
+Tensor& addmv_npu_(
+    Tensor& self,
+    const Tensor& mat,
+    const Tensor& vec,
+    Scalar beta,
+    Scalar alpha) {
+    
+  check_1d(vec, "vec", "addmv");
+  OpPreparation::CheckMemory({self, mat, vec}, {self});
+  if (!NpuUtils::check_match(&self)) {
+    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    Tensor result =
+        addmv_out_npu(contiguousSelf, contiguousSelf, mat, vec, beta, alpha);
+    NpuUtils::format_fresh_view(self, result);
+  } else {
+    addmv_out_npu(self, self, mat, vec, beta, alpha);
+  }
+  return self;
+}
+
+} // namespace native
+} // namespace at
+
+
diff --git a/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp b/src/aten/src/ATen/native/npu/ArgminKernelNpu.cpp
old mode 100644
new mode 100755
diff --git a/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp b/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp
index a8e38f109b1a515823350f9b1e2d9e3c9124c466..1dbfae12a22a94e82a59109e5993c3e261cd9f04 100644
--- a/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/AtanKernelNpu.cpp
@@ -1,52 +1,52 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at { 
-namespace native {
-using namespace at::native::npu;
-
-Tensor& atan_out_npu(Tensor& result, const Tensor& self) { 
-  OpCommand cmd;
-  cmd.Name("Atan")
-     .Input(self)
-     .Output(result)
-     .Run();
-  return result;  
-}
- 
-Tensor atan_npu(const Tensor& self) { 
-  Tensor result = OpPreparation::ApplyTensor(self);
-  //calculate the output result of the NPU 
-  atan_out_npu(result, self);  
-  return result; 
-} 
- 
-Tensor& atan_npu_(Tensor& self) { 
-  OpPreparation::CheckMemory({self}, {self});
-  if (!NpuUtils::check_match(&self)) { 
-    Tensor contiguousSelf = NpuUtils::format_contiguous(self); 
-    Tensor result = atan_out_npu(contiguousSelf, contiguousSelf); 
-    NpuUtils::format_fresh_view(self, result); 
-  } else {
-    atan_out_npu(self, self); 
-  }
-  return self;
-}
- 
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at { 
+namespace native {
+using namespace at::native::npu;
+
+Tensor& atan_out_npu(Tensor& result, const Tensor& self) { 
+  OpCommand cmd;
+  cmd.Name("Atan")
+     .Input(self)
+     .Output(result)
+     .Run();
+  return result;  
+}
+ 
+Tensor atan_npu(const Tensor& self) { 
+  Tensor result = OpPreparation::ApplyTensor(self);
+  //calculate the output result of the NPU 
+  atan_out_npu(result, self);  
+  return result; 
+} 
+ 
+Tensor& atan_npu_(Tensor& self) { 
+  OpPreparation::CheckMemory({self}, {self});
+  if (!NpuUtils::check_match(&self)) { 
+    Tensor contiguousSelf = NpuUtils::format_contiguous(self); 
+    Tensor result = atan_out_npu(contiguousSelf, contiguousSelf); 
+    NpuUtils::format_fresh_view(self, result); 
+  } else {
+    atan_out_npu(self, self); 
+  }
+  return self;
+}
+ 
 }} // namespace at::native
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp b/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp
index a01096cd57982596ce6166413db67531d2fc6258..cdb37c1ed487079d399f2c85bea72762e8723bae 100644
--- a/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BernoulliKernelNpu.cpp
@@ -1,101 +1,101 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& bernoulli_out_npu(Tensor& result, const Tensor& self, double p) {
-  OpCommand cmd;
-  cmd.Name("Bernoulli")
-      .Input(self)
-      .Input(p, ScalarType::Float)
-      .Output(result)
-      .Run();
-
-  return result;
-}
-
-Tensor& bernoulli_out_npu(Tensor& result, const Tensor& self, const Tensor& p) {
-  OpCommand cmd;
-  cmd.Name("Bernoulli")
-      .Input(self)
-      .Input(p)
-      .Output(result)
-      .Run();
-
-  return result;
-}
-
-Tensor& bernoulli_npu_(Tensor& self, double p, Generator* gen) {
-  OpPreparation::CheckMemory({self}, {self});
-  ScalarType selfType = self.scalar_type();
-  Tensor selfFp32 = self;
-  if (self.scalar_type() == ScalarType::Half) {
-    selfFp32 = self.to(ScalarType::Float);
-  }
-
-  if (!NpuUtils::check_match(&self)) {
-    Tensor contiguousSelf = NpuUtils::format_contiguous(selfFp32);
-    Tensor result = bernoulli_out_npu(contiguousSelf, contiguousSelf, p);
-    NpuUtils::format_fresh_view(self, result);
-  } else {
-    bernoulli_out_npu(selfFp32, selfFp32, p);
-    self.copy_(selfFp32);
-  }
-
-  if(self.scalar_type() != selfType){
-    self = self.to(ScalarType::Half);
-  }
-  return self;
-}
-
-Tensor& bernoulli_npu_(Tensor& self, const Tensor& p, Generator* gen) {
-  OpPreparation::CheckMemory({self}, {self});
-  ScalarType selfType = self.scalar_type();
-  Tensor selfFp32 = self;
-  Tensor pFp32 = OpPreparation::CastBackToOriFormat(p);;
-  if (self.scalar_type() == ScalarType::Half) {
-    selfFp32 = self.to(ScalarType::Float);
-    pFp32 = p.to(ScalarType::Float);
-  }
-
-  if (!NpuUtils::check_match(&self)) {
-    Tensor contiguousSelf = NpuUtils::format_contiguous(selfFp32);
-    Tensor result = bernoulli_out_npu(contiguousSelf, contiguousSelf, pFp32);
-    NpuUtils::format_fresh_view(self, result);
-  } else {
-    bernoulli_out_npu(selfFp32, selfFp32, pFp32);
-    self.copy_(selfFp32);
-  }
-
-  if(self.scalar_type() != selfType){
-    self = self.to(ScalarType::Half);
-  }
-  return self;
-}
-
-Tensor bernoulli_npu(const Tensor& self, Generator* gen) {
-  const Tensor p = self;
-  Tensor selfCopy = at::empty_with_format(
-      self.sizes(), self.options(), ACL_FORMAT_ND);
-  selfCopy.copy_(self);
-  return bernoulli_npu_(selfCopy, p, gen);
-}
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& bernoulli_out_npu(Tensor& result, const Tensor& self, double p) {
+  OpCommand cmd;
+  cmd.Name("Bernoulli")
+      .Input(self)
+      .Input(p, ScalarType::Float)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+Tensor& bernoulli_out_npu(Tensor& result, const Tensor& self, const Tensor& p) {
+  OpCommand cmd;
+  cmd.Name("Bernoulli")
+      .Input(self)
+      .Input(p)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+Tensor& bernoulli_npu_(Tensor& self, double p, Generator* gen) {
+  OpPreparation::CheckMemory({self}, {self});
+  ScalarType selfType = self.scalar_type();
+  Tensor selfFp32 = self;
+  if (self.scalar_type() == ScalarType::Half) {
+    selfFp32 = self.to(ScalarType::Float);
+  }
+
+  if (!NpuUtils::check_match(&self)) {
+    Tensor contiguousSelf = NpuUtils::format_contiguous(selfFp32);
+    Tensor result = bernoulli_out_npu(contiguousSelf, contiguousSelf, p);
+    NpuUtils::format_fresh_view(self, result);
+  } else {
+    bernoulli_out_npu(selfFp32, selfFp32, p);
+    self.copy_(selfFp32);
+  }
+
+  if(self.scalar_type() != selfType){
+    self = self.to(ScalarType::Half);
+  }
+  return self;
+}
+
+Tensor& bernoulli_npu_(Tensor& self, const Tensor& p, Generator* gen) {
+  OpPreparation::CheckMemory({self}, {self});
+  ScalarType selfType = self.scalar_type();
+  Tensor selfFp32 = self;
+  Tensor pFp32 = OpPreparation::CastBackToOriFormat(p);;
+  if (self.scalar_type() == ScalarType::Half) {
+    selfFp32 = self.to(ScalarType::Float);
+    pFp32 = p.to(ScalarType::Float);
+  }
+
+  if (!NpuUtils::check_match(&self)) {
+    Tensor contiguousSelf = NpuUtils::format_contiguous(selfFp32);
+    Tensor result = bernoulli_out_npu(contiguousSelf, contiguousSelf, pFp32);
+    NpuUtils::format_fresh_view(self, result);
+  } else {
+    bernoulli_out_npu(selfFp32, selfFp32, pFp32);
+    self.copy_(selfFp32);
+  }
+
+  if(self.scalar_type() != selfType){
+    self = self.to(ScalarType::Half);
+  }
+  return self;
+}
+
+Tensor bernoulli_npu(const Tensor& self, Generator* gen) {
+  const Tensor p = self;
+  Tensor selfCopy = at::empty_with_format(
+      self.sizes(), self.options(), ACL_FORMAT_ND);
+  selfCopy.copy_(self);
+  return bernoulli_npu_(selfCopy, p, gen);
+}
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/BertApplyAdamKernelNpu.cpp b/src/aten/src/ATen/native/npu/BertApplyAdamKernelNpu.cpp
index 380f65f1aac08321d6a9b72e091454e8be042f0d..20a17421f33e4731220eee7dbbaa92ea178eb48b 100644
--- a/src/aten/src/ATen/native/npu/BertApplyAdamKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BertApplyAdamKernelNpu.cpp
@@ -1,108 +1,108 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-tuple<Tensor, Tensor, Tensor> bert_apply_adam_out_npu_nocheck(
-    Tensor& var_out,
-    Tensor& m_out,
-    Tensor& v_out,
-    const Tensor& var,
-    const Tensor& m,
-    const Tensor& v,
-    Scalar lr,
-    Scalar beta1,
-    Scalar beta2,
-    Scalar epsilon,
-    const Tensor& grad,
-    Scalar max_grad_norm,
-    Scalar global_grad_norm,
-    Scalar weight_decay) {
-  OpCommand cmd;
-  cmd.Name("ApplyAdamV2")
-      .Input(var)
-      .Input(m)
-      .Input(v)
-      .Input(lr, var.scalar_type())
-      .Input(beta1, var.scalar_type())
-      .Input(beta2, var.scalar_type())
-      .Input(epsilon, var.scalar_type())
-      .Input(grad)
-      .Input(max_grad_norm, var.scalar_type())
-      .Input(global_grad_norm, var.scalar_type())
-      .Input(weight_decay, var.scalar_type())
-      .Output(var_out)
-      .Output(m_out)
-      .Output(v_out)
-      .Run();
-  return std::tie(var_out, m_out, v_out);
-}
-
-tuple<Tensor, Tensor, Tensor> bert_apply_adam_out_npu(
-    Tensor& var_out,
-    Tensor& m_out,
-    Tensor& v_out,
-    const Tensor& var,
-    const Tensor& m,
-    const Tensor& v,
-    Scalar lr,
-    Scalar beta1,
-    Scalar beta2,
-    Scalar epsilon,
-    const Tensor& grad,
-    Scalar max_grad_norm,
-    Scalar global_grad_norm,
-    Scalar weight_decay) {
-  OpPipeWithDefinedOut check;
-  check.CheckMemory({var, m, v, grad}, {var_out, m_out, v_out});
-
-  auto func = [&var, &m, &v, &lr, &beta1, &beta2, &epsilon, &grad, &max_grad_norm, &global_grad_norm, &weight_decay] (
-      Tensor& var_out,
-      Tensor& m_out,
-      Tensor& v_out) {
-        bert_apply_adam_out_npu_nocheck(var_out, m_out, v_out, var, m, v, 
-            lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay);
-      };
-  
-  OpPipeWithMultiOut<Tensor&, Tensor&, Tensor&> pipe(var_out, m_out, v_out);
-  return pipe.Call(func)
-              .ReturnRef<Tensor&, Tensor&, Tensor&>();
-}
-
-tuple<Tensor, Tensor, Tensor> bert_apply_adam_npu(
-    Tensor& var,
-    Tensor& m,
-    Tensor& v,
-    Scalar lr,
-    Scalar beta1,
-    Scalar beta2,
-    Scalar epsilon,
-    const Tensor& grad,
-    Scalar max_grad_norm,
-    Scalar global_grad_norm,
-    Scalar weight_decay) {
-  bert_apply_adam_out_npu(
-      var, m, v, var, m, v,
-      lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay);
-  return std::tie(var, m, v);
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+tuple<Tensor, Tensor, Tensor> bert_apply_adam_out_npu_nocheck(
+    Tensor& var_out,
+    Tensor& m_out,
+    Tensor& v_out,
+    const Tensor& var,
+    const Tensor& m,
+    const Tensor& v,
+    Scalar lr,
+    Scalar beta1,
+    Scalar beta2,
+    Scalar epsilon,
+    const Tensor& grad,
+    Scalar max_grad_norm,
+    Scalar global_grad_norm,
+    Scalar weight_decay) {
+  OpCommand cmd;
+  cmd.Name("ApplyAdamV2")
+      .Input(var)
+      .Input(m)
+      .Input(v)
+      .Input(lr, var.scalar_type())
+      .Input(beta1, var.scalar_type())
+      .Input(beta2, var.scalar_type())
+      .Input(epsilon, var.scalar_type())
+      .Input(grad)
+      .Input(max_grad_norm, var.scalar_type())
+      .Input(global_grad_norm, var.scalar_type())
+      .Input(weight_decay, var.scalar_type())
+      .Output(var_out)
+      .Output(m_out)
+      .Output(v_out)
+      .Run();
+  return std::tie(var_out, m_out, v_out);
+}
+
+tuple<Tensor, Tensor, Tensor> bert_apply_adam_out_npu(
+    Tensor& var_out,
+    Tensor& m_out,
+    Tensor& v_out,
+    const Tensor& var,
+    const Tensor& m,
+    const Tensor& v,
+    Scalar lr,
+    Scalar beta1,
+    Scalar beta2,
+    Scalar epsilon,
+    const Tensor& grad,
+    Scalar max_grad_norm,
+    Scalar global_grad_norm,
+    Scalar weight_decay) {
+  OpPipeWithDefinedOut check;
+  check.CheckMemory({var, m, v, grad}, {var_out, m_out, v_out});
+
+  auto func = [&var, &m, &v, &lr, &beta1, &beta2, &epsilon, &grad, &max_grad_norm, &global_grad_norm, &weight_decay] (
+      Tensor& var_out,
+      Tensor& m_out,
+      Tensor& v_out) {
+        bert_apply_adam_out_npu_nocheck(var_out, m_out, v_out, var, m, v, 
+            lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay);
+      };
+  
+  OpPipeWithMultiOut<Tensor&, Tensor&, Tensor&> pipe(var_out, m_out, v_out);
+  return pipe.Call(func)
+              .ReturnRef<Tensor&, Tensor&, Tensor&>();
+}
+
+tuple<Tensor, Tensor, Tensor> bert_apply_adam_npu(
+    Tensor& var,
+    Tensor& m,
+    Tensor& v,
+    Scalar lr,
+    Scalar beta1,
+    Scalar beta2,
+    Scalar epsilon,
+    const Tensor& grad,
+    Scalar max_grad_norm,
+    Scalar global_grad_norm,
+    Scalar weight_decay) {
+  bert_apply_adam_out_npu(
+      var, m, v, var, m, v,
+      lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay);
+  return std::tie(var, m, v);
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp b/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp
index 818a660681db5b3a9053bb8a820d9977077342cb..0d2bd6fd64feb9383058280a67a44881a9bcec7d 100644
--- a/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BitwiseXorKernelNpu.cpp
@@ -1,164 +1,164 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& bitwise_xor_out_npu_nocheck(
-    Tensor& result,
-    const Tensor& self,
-    const Scalar other) {
-  // executing the NPU operator
-  Tensor selfInput = (self.dtype() == at::ScalarType::Bool) ? self.to(at::ScalarType::Int) : self;
-  result = (result.dtype() == at::ScalarType::Bool) ? result.to(at::ScalarType::Int) : result;
-
-  OpCommand cmd;
-  cmd.Name("BitwiseXor")
-      .Input(selfInput)
-      .Input(other, selfInput.scalar_type())
-      .Output(result)
-      .Run();
-
-  return (result = (self.dtype() == at::ScalarType::Bool) ? result.to(at::ScalarType::Bool) : result);
-}
-
-Tensor& bitwise_xor_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    const Scalar other) {
-  OpPreparation::CheckOut(
-      {self},
-      result,
-      self);
-
-  bitwise_xor_out_npu_nocheck(result, self, other);
-
-  return result;
-}
-
-Tensor& bitwise_xor_out_npu_nocheck(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& other) {
-  auto unified_result = OpPreparation::binary_op_check(result, self, other, true);
-
-  Tensor selfInput  = (self.dtype() == at::ScalarType::Bool) ? self.to(at::ScalarType::Int) : self;
-  Tensor otherInput = (other.dtype() == at::ScalarType::Bool) ? other.to(at::ScalarType::Int) : other;
-  result = (result.dtype() == at::ScalarType::Bool) ? result.to(at::ScalarType::Int) : result;
-
-  if (otherInput.dim() == 0 && !otherInput.is_npu()) {
-    bitwise_xor_out_npu(result, selfInput, otherInput.item());
-  } else if (selfInput.dim() == 0 && !selfInput.is_npu()) {
-    bitwise_xor_out_npu(result, otherInput, selfInput.item());
-  } else {
-    // executing the NPU operator
-    OpCommand cmd;
-    cmd.Name("BitwiseXor")
-        .Expect(unified_result)
-        .Input(selfInput)
-        .Input(otherInput)
-        .Output(result)
-        .Run();
-  }
-
-  return (result = (self.dtype() == at::ScalarType::Bool) ? result.to(at::ScalarType::Bool) : result);
-}
-
-Tensor& bitwise_xor_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& other) {
-  bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self);
-
-  Tensor outputTensor;
-  if (not isSelfWrapped) {
-    outputTensor = self;
-  } else {
-    outputTensor = other;
-  }
-
-  auto outputSize = broadcast_ops_npu_output_size(self, other);
-
-  OpPreparation::CheckOut(
-      {self},
-      result,
-      CalcuOpUtil::get_tensor_npu_format(outputTensor),
-      outputTensor.scalar_type(),
-      outputSize);
-
-  bitwise_xor_out_npu_nocheck(result, self, other);
-
-  return result;
-}
-
-Tensor bitwise_xor_npu(const Tensor& self, const Tensor& other) {
-  // calculate the output size
-  bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self);
-
-  Tensor outputTensor;
-  if (not isSelfWrapped) {
-    outputTensor = self;
-  } else {
-    outputTensor = other;
-  }
-
-  auto outputSize = broadcast_ops_npu_output_size(self, other);
-
-  // construct the output tensor of the NPU
-  Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize);
-  // calculate the output result of the NPU
-  bitwise_xor_out_npu_nocheck(result, self, other);
-
-  return result;
-}
-
-Tensor bitwise_xor_npu(const Tensor& self, Scalar other) {
-  Tensor result = OpPreparation::ApplyTensor(self);
-  // calculate the output result of the NPU
-  bitwise_xor_out_npu_nocheck(result, self, other);
-
-  return result;
-}
-
-Tensor& bitwise_xor_npu_(Tensor& self, const Tensor& other) {
-  OpPreparation::CheckMemory({self, other}, {self});
-  if (!NpuUtils::check_match(&self)) {
-    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
-    Tensor result = bitwise_xor_out_npu_nocheck(contiguousSelf, contiguousSelf, other);
-    NpuUtils::format_fresh_view(self, result);
-  } else {
-    bitwise_xor_out_npu_nocheck(self, self, other);
-  }
-
-  return self;
-}
-
-Tensor& bitwise_xor_npu_(Tensor& self, Scalar other) {
-  if (!NpuUtils::check_match(&self)) {
-    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
-    Tensor result = bitwise_xor_out_npu_nocheck(contiguousSelf, contiguousSelf, other);
-    NpuUtils::format_fresh_view(self, result);
-  } else {
-    bitwise_xor_out_npu_nocheck(self, self, other);
-  }
-
-  return self;
-}
-
-} // namespace native
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& bitwise_xor_out_npu_nocheck(
+    Tensor& result,
+    const Tensor& self,
+    const Scalar other) {
+  // executing the NPU operator
+  Tensor selfInput = (self.dtype() == at::ScalarType::Bool) ? self.to(at::ScalarType::Int) : self;
+  result = (result.dtype() == at::ScalarType::Bool) ? result.to(at::ScalarType::Int) : result;
+
+  OpCommand cmd;
+  cmd.Name("BitwiseXor")
+      .Input(selfInput)
+      .Input(other, selfInput.scalar_type())
+      .Output(result)
+      .Run();
+
+  return (result = (self.dtype() == at::ScalarType::Bool) ? result.to(at::ScalarType::Bool) : result);
+}
+
+Tensor& bitwise_xor_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    const Scalar other) {
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      self);
+
+  bitwise_xor_out_npu_nocheck(result, self, other);
+
+  return result;
+}
+
+Tensor& bitwise_xor_out_npu_nocheck(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& other) {
+  auto unified_result = OpPreparation::binary_op_check(result, self, other, true);
+
+  Tensor selfInput  = (self.dtype() == at::ScalarType::Bool) ? self.to(at::ScalarType::Int) : self;
+  Tensor otherInput = (other.dtype() == at::ScalarType::Bool) ? other.to(at::ScalarType::Int) : other;
+  result = (result.dtype() == at::ScalarType::Bool) ? result.to(at::ScalarType::Int) : result;
+
+  if (otherInput.dim() == 0 && !otherInput.is_npu()) {
+    bitwise_xor_out_npu(result, selfInput, otherInput.item());
+  } else if (selfInput.dim() == 0 && !selfInput.is_npu()) {
+    bitwise_xor_out_npu(result, otherInput, selfInput.item());
+  } else {
+    // executing the NPU operator
+    OpCommand cmd;
+    cmd.Name("BitwiseXor")
+        .Expect(unified_result)
+        .Input(selfInput)
+        .Input(otherInput)
+        .Output(result)
+        .Run();
+  }
+
+  return (result = (self.dtype() == at::ScalarType::Bool) ? result.to(at::ScalarType::Bool) : result);
+}
+
+Tensor& bitwise_xor_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& other) {
+  bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self);
+
+  Tensor outputTensor;
+  if (not isSelfWrapped) {
+    outputTensor = self;
+  } else {
+    outputTensor = other;
+  }
+
+  auto outputSize = broadcast_ops_npu_output_size(self, other);
+
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      CalcuOpUtil::get_tensor_npu_format(outputTensor),
+      outputTensor.scalar_type(),
+      outputSize);
+
+  bitwise_xor_out_npu_nocheck(result, self, other);
+
+  return result;
+}
+
+Tensor bitwise_xor_npu(const Tensor& self, const Tensor& other) {
+  // calculate the output size
+  bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self);
+
+  Tensor outputTensor;
+  if (not isSelfWrapped) {
+    outputTensor = self;
+  } else {
+    outputTensor = other;
+  }
+
+  auto outputSize = broadcast_ops_npu_output_size(self, other);
+
+  // construct the output tensor of the NPU
+  Tensor result = OpPreparation::ApplyTensor(outputTensor, outputSize);
+  // calculate the output result of the NPU
+  bitwise_xor_out_npu_nocheck(result, self, other);
+
+  return result;
+}
+
+Tensor bitwise_xor_npu(const Tensor& self, Scalar other) {
+  Tensor result = OpPreparation::ApplyTensor(self);
+  // calculate the output result of the NPU
+  bitwise_xor_out_npu_nocheck(result, self, other);
+
+  return result;
+}
+
+Tensor& bitwise_xor_npu_(Tensor& self, const Tensor& other) {
+  OpPreparation::CheckMemory({self, other}, {self});
+  if (!NpuUtils::check_match(&self)) {
+    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    Tensor result = bitwise_xor_out_npu_nocheck(contiguousSelf, contiguousSelf, other);
+    NpuUtils::format_fresh_view(self, result);
+  } else {
+    bitwise_xor_out_npu_nocheck(self, self, other);
+  }
+
+  return self;
+}
+
+Tensor& bitwise_xor_npu_(Tensor& self, Scalar other) {
+  if (!NpuUtils::check_match(&self)) {
+    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    Tensor result = bitwise_xor_out_npu_nocheck(contiguousSelf, contiguousSelf, other);
+    NpuUtils::format_fresh_view(self, result);
+  } else {
+    bitwise_xor_out_npu_nocheck(self, self, other);
+  }
+
+  return self;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp b/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp
index fa2a86b0871c7d06d9199d4663ff3986736141b3..fb3745b091294a3f9d87906f7d775a18d7d0b3fc 100644
--- a/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BoundingBoxDecodeKernelNpu.cpp
@@ -1,81 +1,81 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& bounding_box_decode_out_npu(
-    Tensor& result,
-    const Tensor& rois,
-    const Tensor& deltas,
-    SmallVector<float, SIZE> means,
-    SmallVector<float, SIZE> stds,
-    IntArrayRef max_shape,
-    double wh_ratio_clip) {
-  OpCommand cmd;
-  cmd.Name("BoundingBoxDecode")
-       .Input(rois)
-       .Input(deltas)
-       .Output(result)
-       .Attr("means", means)
-       .Attr("stds", stds)
-       .Attr("max_shape", max_shape)
-       .Attr("wh_ratio_clip", static_cast<float>(wh_ratio_clip))
-       .Run();
-
-  return result;
-}
-
-Tensor bounding_box_decode_npu(
-    const Tensor& rois,
-    const Tensor& deltas,
-    double means0,
-    double means1,
-    double means2,
-    double means3,
-    double stds0,
-    double stds1,
-    double stds2,
-    double stds3,
-    IntArrayRef max_shape,
-    double wh_ratio_clip) {
-  SmallVector<int64_t, SIZE> outputSize = {rois.size(0), 4};
-  // construct the output tensor of the NPU
-  Tensor result = OpPreparation::ApplyTensor(rois, outputSize);
-  SmallVector<float, SIZE> means = {
-      static_cast<float>(means0),
-      static_cast<float>(means1),
-      static_cast<float>(means2),
-      static_cast<float>(means3)};
-  SmallVector<float, SIZE> stds = {
-      static_cast<float>(stds0),
-      static_cast<float>(stds1),
-      static_cast<float>(stds2),
-      static_cast<float>(stds3)};
-
-  // calculate the output result of the NPU
-  bounding_box_decode_out_npu(
-      result, rois, deltas, means, stds, max_shape, wh_ratio_clip);
-
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& bounding_box_decode_out_npu(
+    Tensor& result,
+    const Tensor& rois,
+    const Tensor& deltas,
+    SmallVector<float, SIZE> means,
+    SmallVector<float, SIZE> stds,
+    IntArrayRef max_shape,
+    double wh_ratio_clip) {
+  OpCommand cmd;
+  cmd.Name("BoundingBoxDecode")
+       .Input(rois)
+       .Input(deltas)
+       .Output(result)
+       .Attr("means", means)
+       .Attr("stds", stds)
+       .Attr("max_shape", max_shape)
+       .Attr("wh_ratio_clip", static_cast<float>(wh_ratio_clip))
+       .Run();
+
+  return result;
+}
+
+Tensor bounding_box_decode_npu(
+    const Tensor& rois,
+    const Tensor& deltas,
+    double means0,
+    double means1,
+    double means2,
+    double means3,
+    double stds0,
+    double stds1,
+    double stds2,
+    double stds3,
+    IntArrayRef max_shape,
+    double wh_ratio_clip) {
+  SmallVector<int64_t, SIZE> outputSize = {rois.size(0), 4};
+  // construct the output tensor of the NPU
+  Tensor result = OpPreparation::ApplyTensor(rois, outputSize);
+  SmallVector<float, SIZE> means = {
+      static_cast<float>(means0),
+      static_cast<float>(means1),
+      static_cast<float>(means2),
+      static_cast<float>(means3)};
+  SmallVector<float, SIZE> stds = {
+      static_cast<float>(stds0),
+      static_cast<float>(stds1),
+      static_cast<float>(stds2),
+      static_cast<float>(stds3)};
+
+  // calculate the output result of the NPU
+  bounding_box_decode_out_npu(
+      result, rois, deltas, means, stds, max_shape, wh_ratio_clip);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp b/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp
index 3e02aad811f780301ab953efbb7f330a3d6ecfae..e4ea87d8d49337740c771c0566eddc4940afb297 100644
--- a/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/BoundingBoxEncodeKernelNpu.cpp
@@ -1,73 +1,73 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& bounding_box_encode_out_npu(
-    Tensor& delats,
-    const Tensor& anchor_box,
-    const Tensor& ground_truth_box,
-    SmallVector<float, SIZE> means,
-    SmallVector<float, SIZE> stds) {
-  OpCommand cmd;
-  cmd.Name("BoundingBoxEncode")
-       .Input(anchor_box)
-       .Input(ground_truth_box)
-       .Output(delats)
-       .Attr("means", means)
-       .Attr("stds", stds)
-       .Run();
-
-  return delats;
-}
-
-Tensor bounding_box_encode_npu(
-    const Tensor& anchor_box,
-    const Tensor& ground_truth_box,
-    double means0,
-    double means1,
-    double means2,
-    double means3,
-    double stds0,
-    double stds1,
-    double stds2,
-    double stds3) {
-  // construct the output tensor of the NPU
-  Tensor delats = OpPreparation::ApplyTensor(anchor_box, {anchor_box.size(0), 4});
-  SmallVector<float, SIZE> means = {
-      static_cast<float>(means0),
-      static_cast<float>(means1),
-      static_cast<float>(means2),
-      static_cast<float>(means3)};
-  SmallVector<float, SIZE> stds = {
-      static_cast<float>(stds0),
-      static_cast<float>(stds1),
-      static_cast<float>(stds2),
-      static_cast<float>(stds3)};
-
-  bounding_box_encode_out_npu(
-      delats, anchor_box, ground_truth_box, means, stds);
-
-  return delats;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& bounding_box_encode_out_npu(
+    Tensor& delats,
+    const Tensor& anchor_box,
+    const Tensor& ground_truth_box,
+    SmallVector<float, SIZE> means,
+    SmallVector<float, SIZE> stds) {
+  OpCommand cmd;
+  cmd.Name("BoundingBoxEncode")
+       .Input(anchor_box)
+       .Input(ground_truth_box)
+       .Output(delats)
+       .Attr("means", means)
+       .Attr("stds", stds)
+       .Run();
+
+  return delats;
+}
+
+Tensor bounding_box_encode_npu(
+    const Tensor& anchor_box,
+    const Tensor& ground_truth_box,
+    double means0,
+    double means1,
+    double means2,
+    double means3,
+    double stds0,
+    double stds1,
+    double stds2,
+    double stds3) {
+  // construct the output tensor of the NPU
+  Tensor delats = OpPreparation::ApplyTensor(anchor_box, {anchor_box.size(0), 4});
+  SmallVector<float, SIZE> means = {
+      static_cast<float>(means0),
+      static_cast<float>(means1),
+      static_cast<float>(means2),
+      static_cast<float>(means3)};
+  SmallVector<float, SIZE> stds = {
+      static_cast<float>(stds0),
+      static_cast<float>(stds1),
+      static_cast<float>(stds2),
+      static_cast<float>(stds3)};
+
+  bounding_box_encode_out_npu(
+      delats, anchor_box, ground_truth_box, means, stds);
+
+  return delats;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/CatKernelNpu.cpp b/src/aten/src/ATen/native/npu/CatKernelNpu.cpp
index 8c3ac876475bf2928430246cb85f1ab59a121e4e..4bc949120d755585d1fbd1ebd9f122f5a5ec7042 100644
--- a/src/aten/src/ATen/native/npu/CatKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CatKernelNpu.cpp
@@ -154,6 +154,20 @@ Tensor& _cat_out_npu(Tensor& result, TensorList tensors, int64_t dim) {
 }
 
 Tensor& cat_out_npu(Tensor& result, TensorList tensors, int64_t dim) {
+  SmallVector<Tensor, N> inputTensors = cat_dest_tensor_list(tensors);
+
+  int64_t dim_post_expr = 0;
+  if (inputTensors.size() > 0) {
+    dim_post_expr = inputTensors[0].dim();
+  }
+  dim = CalcuOpUtil::make_wrap_dim(dim, dim_post_expr);
+  auto outputSize = cat_npu_output_size(inputTensors, dim);
+  OpPreparation::CheckOut(
+    {tensors[0]}, 
+    result, 
+    ACL_FORMAT_ND, 
+    tensors[0].scalar_type(), 
+    outputSize); 
   return at::_cat_out(result, tensors, dim);
 }
 
diff --git a/src/aten/src/ATen/native/npu/CeluKernelNpu.cpp b/src/aten/src/ATen/native/npu/CeluKernelNpu.cpp
index fc4602ea8554f5cd851623c72283df02dddf158d..c6bf9db470e132e51a8c263edab0209e3ddc03f2 100644
--- a/src/aten/src/ATen/native/npu/CeluKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CeluKernelNpu.cpp
@@ -1,59 +1,59 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor celu_out_npu_nocheck(Tensor& result, const Tensor& self, Scalar alpha) {
-  float alpha3 = 1.0;
-  OpCommand cmd;
-  cmd.Name("Celu")
-        .Input(self)
-        .Output(result)
-        .Attr("alpha1", alpha)
-        .Attr("alpha2", alpha)
-        .Attr("alpha3", alpha3)
-        .Run();
-  return result;
-}
-
-Tensor celu_out_npu(Tensor& result, const Tensor& self, Scalar alpha) {
-  OpPipeWithDefinedOut pipe;
-  return pipe.CheckMemory({self}, {result})
-   .Func([&self, &alpha](Tensor& result){celu_out_npu_nocheck(result, self, alpha);})
-   .Call(result);
-}
-
-Tensor celu_npu(const Tensor& self, Scalar alpha) {
-  // construct the output tensor of the NPU
-  Tensor result = OpPreparation::ApplyTensor(self);
-
-  // calculate the output result of the NPU
-  celu_out_npu(result, self, alpha);
-
-  return result;
-}
-
-Tensor& celu_npu_(Tensor& self, Scalar alpha) {
-  celu_out_npu(self, self, alpha);
-  return self;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor celu_out_npu_nocheck(Tensor& result, const Tensor& self, Scalar alpha) {
+  float alpha3 = 1.0;
+  OpCommand cmd;
+  cmd.Name("Celu")
+        .Input(self)
+        .Output(result)
+        .Attr("alpha1", alpha)
+        .Attr("alpha2", alpha)
+        .Attr("alpha3", alpha3)
+        .Run();
+  return result;
+}
+
+Tensor celu_out_npu(Tensor& result, const Tensor& self, Scalar alpha) {
+  OpPipeWithDefinedOut pipe;
+  return pipe.CheckMemory({self}, {result})
+   .Func([&self, &alpha](Tensor& result){celu_out_npu_nocheck(result, self, alpha);})
+   .Call(result);
+}
+
+Tensor celu_npu(const Tensor& self, Scalar alpha) {
+  // construct the output tensor of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self);
+
+  // calculate the output result of the NPU
+  celu_out_npu(result, self, alpha);
+
+  return result;
+}
+
+Tensor& celu_npu_(Tensor& self, Scalar alpha) {
+  celu_out_npu(self, self, alpha);
+  return self;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/ConfusionTransposeBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/ConfusionTransposeBackwardKernelNpu.cpp
index 0bbe872275761d0b02f68a61e790797fbd244100..776a0a76a18f1fca1fb52569e24cbd2aa915e412 100644
--- a/src/aten/src/ATen/native/npu/ConfusionTransposeBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ConfusionTransposeBackwardKernelNpu.cpp
@@ -1,60 +1,60 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor confusion_transpose_backward_npu(
-    const Tensor& grad,
-    IntArrayRef perm,
-    IntArrayRef shape,
-    bool transpose_first) {
-  SmallVector<int64_t, SIZE> svec_shape;
-  if (transpose_first){
-    svec_shape = array_to_small_vector(shape);
-  } else {
-    for (int i = 0; i < perm.size(); i++){
-      svec_shape.emplace_back(shape[perm[i]]);
-    }
-  }
-  std::vector<int64_t> vec_perm;
-  int64_t perm_len =  perm.size();
-  int64_t temp_perm[perm_len] = {0};
-  for (int64_t i = 0; i < perm_len; i++){
-    temp_perm[perm[i]] = i;
-  }
-  vec_perm = std::vector<int64_t>(temp_perm, temp_perm+perm_len);
-  perm = IntArrayRef(vec_perm);
-
-  Tensor result = OpPreparation::ApplyTensor(grad, shape);
-
-  OpCommand cmd;
-  cmd.Name("ConfusionTransposeD")
-      .Input(grad)
-      .Output(result)
-      .Attr("perm", perm)
-      .Attr("shape", svec_shape)
-      .Attr("transpose_first", transpose_first)
-      .Run();
-
-  return result;
-}
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor confusion_transpose_backward_npu(
+    const Tensor& grad,
+    IntArrayRef perm,
+    IntArrayRef shape,
+    bool transpose_first) {
+  SmallVector<int64_t, SIZE> svec_shape;
+  if (transpose_first){
+    svec_shape = array_to_small_vector(shape);
+  } else {
+    for (int i = 0; i < perm.size(); i++){
+      svec_shape.emplace_back(shape[perm[i]]);
+    }
+  }
+  std::vector<int64_t> vec_perm;
+  int64_t perm_len =  perm.size();
+  int64_t temp_perm[perm_len] = {0};
+  for (int64_t i = 0; i < perm_len; i++){
+    temp_perm[perm[i]] = i;
+  }
+  vec_perm = std::vector<int64_t>(temp_perm, temp_perm+perm_len);
+  perm = IntArrayRef(vec_perm);
+
+  Tensor result = OpPreparation::ApplyTensor(grad, shape);
+
+  OpCommand cmd;
+  cmd.Name("ConfusionTransposeD")
+      .Input(grad)
+      .Output(result)
+      .Attr("perm", perm)
+      .Attr("shape", svec_shape)
+      .Attr("transpose_first", transpose_first)
+      .Run();
+
+  return result;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp b/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp
index 12b36826dedf724a54e7c63b83900ad218cf1aba..fc6b602d1f531d1f3059279f9ef4c3af0845b50c 100644
--- a/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ConfusionTransposeKernelNpu.cpp
@@ -1,52 +1,52 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor confusion_transpose_npu(
-    const Tensor& self,
-    IntArrayRef perm,
-    IntArrayRef shape,
-    bool transpose_first) {
-  SmallVector<int64_t, SIZE> output_size;
-  if (transpose_first){
-    output_size = array_to_small_vector(shape);
-  } else {
-    for (int i = 0; i < perm.size(); i++){
-      output_size.emplace_back(shape[perm[i]]);
-    }
-  }
-
-  // construct the output tensor of the NPU
-  Tensor result = OpPreparation::ApplyTensor(self, output_size);
-  OpCommand cmd;
-  cmd.Name("ConfusionTransposeD")
-      .Input(self)
-      .Output(result)
-      .Attr("perm", perm)
-      .Attr("shape", shape)
-      .Attr("transpose_first", transpose_first)
-      .Run();
-
-  return result;
-}
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor confusion_transpose_npu(
+    const Tensor& self,
+    IntArrayRef perm,
+    IntArrayRef shape,
+    bool transpose_first) {
+  SmallVector<int64_t, SIZE> output_size;
+  if (transpose_first){
+    output_size = array_to_small_vector(shape);
+  } else {
+    for (int i = 0; i < perm.size(); i++){
+      output_size.emplace_back(shape[perm[i]]);
+    }
+  }
+
+  // construct the output tensor of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, output_size);
+  OpCommand cmd;
+  cmd.Name("ConfusionTransposeD")
+      .Input(self)
+      .Output(result)
+      .Attr("perm", perm)
+      .Attr("shape", shape)
+      .Attr("transpose_first", transpose_first)
+      .Run();
+
+  return result;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp b/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp
index 64ddb33c2eef56fe67d91c0e66759d34b34349c0..c7322bfec336b4ed6bb9324d760841949ebd230c 100644
--- a/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/CrossKernelNpu.cpp
@@ -1,63 +1,63 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor cross_dest_output(const Tensor& self, const Tensor& other) {
-  bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self);
-  return isSelfWrapped ? other : self;
-}
-
-int64_t cross_real_dim(optional<int64_t> dim) {
-  // -65530 is the default value of dim
-  return dim.has_value() ? dim.value() : -65530;
-}
-
-Tensor& cross_out_npu(
-  Tensor& result, 
-  const Tensor& self,
-  const Tensor& other,
-  optional<int64_t> dim) {
-  int64_t realDim = cross_real_dim(dim);
-  OpCommand cmd;
-  cmd.Name("Cross")
-    .Input(self)
-    .Input(other)
-    .Output(result)
-    .Attr("dim", realDim)
-    .Run();
-  return result;
-}
-
-Tensor cross_npu(
-  const Tensor& self, 
-  const Tensor& other,
-  optional<int64_t> dim) {
-  auto outputSize = broadcast_ops_npu_output_size(self, other);
-  Tensor outputTensor = cross_dest_output(self, other);
-  Tensor result = at::empty_with_format(
-    outputSize, 
-    self.options(),
-    CalcuOpUtil::get_tensor_npu_format(outputTensor));
-  cross_out_npu(result, self, other, dim);
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor cross_dest_output(const Tensor& self, const Tensor& other) {
+  bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self);
+  return isSelfWrapped ? other : self;
+}
+
+int64_t cross_real_dim(optional<int64_t> dim) {
+  // -65530 is the default value of dim
+  return dim.has_value() ? dim.value() : -65530;
+}
+
+Tensor& cross_out_npu(
+  Tensor& result, 
+  const Tensor& self,
+  const Tensor& other,
+  optional<int64_t> dim) {
+  int64_t realDim = cross_real_dim(dim);
+  OpCommand cmd;
+  cmd.Name("Cross")
+    .Input(self)
+    .Input(other)
+    .Output(result)
+    .Attr("dim", realDim)
+    .Run();
+  return result;
+}
+
+Tensor cross_npu(
+  const Tensor& self, 
+  const Tensor& other,
+  optional<int64_t> dim) {
+  auto outputSize = broadcast_ops_npu_output_size(self, other);
+  Tensor outputTensor = cross_dest_output(self, other);
+  Tensor result = at::empty_with_format(
+    outputSize, 
+    self.options(),
+    CalcuOpUtil::get_tensor_npu_format(outputTensor));
+  cross_out_npu(result, self, other, dim);
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp
index 34c41be07f188d2ae1ed3400ebb97b6e1aa6d926..89b720b61bf338dc994cf0a4558a6916180e42e8 100644
--- a/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/DropoutV2BackwardKernelNpu.cpp
@@ -1,64 +1,64 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-
-Tensor& dropout_v2_backward_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& mask,
-    double p) {
-  
-  OpCommand cmd;
-  cmd.Name("MaskedScale")
-      .Input(self)
-      .Input(mask)
-      .Output(result)
-      .Attr("value",static_cast<float>(1./(1-p)))
-      .Run();
-  return result;
-}
-
-Tensor dropout_v2_backward_npu(const Tensor& grad_output, const Tensor& mask, double p){
-  TORCH_CHECK(grad_output.scalar_type() == ScalarType::Half ||
-              grad_output.scalar_type() == ScalarType::Float,
-              "grad_output's dtype only support fp16 or fp32 current");
-  TORCH_CHECK(mask.scalar_type() == ScalarType::Half ||
-              mask.scalar_type() == ScalarType::Float ||
-              mask.scalar_type() == ScalarType::Char || 
-              mask.scalar_type() == ScalarType::Byte,
-             "mask's dtype should be float32, float16, or int8 and uint8" );
-  TORCH_CHECK(grad_output.sizes() == mask.sizes(),
-              "grad_output must be the same shape with mask");
-
-  Tensor maskCopy = mask;
-  if (maskCopy.scalar_type() == ScalarType::Byte){
-    maskCopy = maskCopy.to(ScalarType::Half);
-  }
-  auto result = OpPreparation::ApplyTensor(grad_output);
-  dropout_v2_backward_out_npu(result, grad_output, maskCopy, p);
-
-  return result;
-
-}
-
-} // namespace native
-} // namespace at
-
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+
+Tensor& dropout_v2_backward_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& mask,
+    double p) {
+  
+  OpCommand cmd;
+  cmd.Name("MaskedScale")
+      .Input(self)
+      .Input(mask)
+      .Output(result)
+      .Attr("value",static_cast<float>(1./(1-p)))
+      .Run();
+  return result;
+}
+
+Tensor dropout_v2_backward_npu(const Tensor& grad_output, const Tensor& mask, double p){
+  TORCH_CHECK(grad_output.scalar_type() == ScalarType::Half ||
+              grad_output.scalar_type() == ScalarType::Float,
+              "grad_output's dtype only support fp16 or fp32 current");
+  TORCH_CHECK(mask.scalar_type() == ScalarType::Half ||
+              mask.scalar_type() == ScalarType::Float ||
+              mask.scalar_type() == ScalarType::Char || 
+              mask.scalar_type() == ScalarType::Byte,
+             "mask's dtype should be float32, float16, or int8 and uint8" );
+  TORCH_CHECK(grad_output.sizes() == mask.sizes(),
+              "grad_output must be the same shape with mask");
+
+  Tensor maskCopy = mask;
+  if (maskCopy.scalar_type() == ScalarType::Byte){
+    maskCopy = maskCopy.to(ScalarType::Half);
+  }
+  auto result = OpPreparation::ApplyTensor(grad_output);
+  dropout_v2_backward_out_npu(result, grad_output, maskCopy, p);
+
+  return result;
+
+}
+
+} // namespace native
+} // namespace at
+
diff --git a/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp b/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp
index e4a435e77a4c8f9d479ae5a57b6ff43f2fa01eea..f03f5d88a6948908dde28df6f78ae4db39c498d7 100644
--- a/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/DropoutV2KernelNpu.cpp
@@ -1,58 +1,58 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-tuple<Tensor&, Tensor&, Tensor&> dropout_v2_out_npu(
-    Tensor& result,
-    Tensor& mask,
-    Tensor& new_seed,
-    const Tensor& self,
-    Tensor& seed,
-    double p) {
-
-  OpCommand cmd;
-  cmd.Name("DropoutV2")
-      .Input(self)
-      .Input(seed)
-      .Output(result)
-      .Output(mask)
-      .Output(new_seed)
-      .Attr("p", static_cast<float>(p))
-      .Run();
-  
-  return tuple<Tensor&, Tensor&, Tensor&>(result, mask, new_seed);
-}
-
-tuple <Tensor, Tensor, Tensor> dropout_v2_npu(const Tensor& self, Tensor& seed, double p) {
-  Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
-  Tensor formatCastOfSeed = OpPreparation::CastBackToOriFormat(seed);
-  
-  Tensor result = OpPreparation::ApplyTensor(formatCastOfSelf);
-  Tensor mask = OpPreparation::ApplyTensor(formatCastOfSelf, formatCastOfSeed.options());
-  dropout_v2_out_npu(result, mask, formatCastOfSeed, formatCastOfSelf, formatCastOfSeed, p);
-  NpuUtils::format_fresh_view(seed, formatCastOfSeed);
-  return std::tuple<Tensor, Tensor, Tensor>(result, mask, seed);
-}
-
-
-} // namespace native
-} // namespace at
-
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+tuple<Tensor&, Tensor&, Tensor&> dropout_v2_out_npu(
+    Tensor& result,
+    Tensor& mask,
+    Tensor& new_seed,
+    const Tensor& self,
+    Tensor& seed,
+    double p) {
+
+  OpCommand cmd;
+  cmd.Name("DropoutV2")
+      .Input(self)
+      .Input(seed)
+      .Output(result)
+      .Output(mask)
+      .Output(new_seed)
+      .Attr("p", static_cast<float>(p))
+      .Run();
+  
+  return tuple<Tensor&, Tensor&, Tensor&>(result, mask, new_seed);
+}
+
+tuple <Tensor, Tensor, Tensor> dropout_v2_npu(const Tensor& self, Tensor& seed, double p) {
+  Tensor formatCastOfSelf = OpPreparation::CastBackToOriFormat(self);
+  Tensor formatCastOfSeed = OpPreparation::CastBackToOriFormat(seed);
+  
+  Tensor result = OpPreparation::ApplyTensor(formatCastOfSelf);
+  Tensor mask = OpPreparation::ApplyTensor(formatCastOfSelf, formatCastOfSeed.options());
+  dropout_v2_out_npu(result, mask, formatCastOfSeed, formatCastOfSelf, formatCastOfSeed, p);
+  NpuUtils::format_fresh_view(seed, formatCastOfSeed);
+  return std::tuple<Tensor, Tensor, Tensor>(result, mask, seed);
+}
+
+
+} // namespace native
+} // namespace at
+
diff --git a/src/aten/src/ATen/native/npu/EmbeddingBagKernelNpu.cpp b/src/aten/src/ATen/native/npu/EmbeddingBagKernelNpu.cpp
index 67dfb8a3f9646902118b67184afdf15bfd6c1d60..c6c6b0517c8e069f705fe049eafe4ed18c002fed 100644
--- a/src/aten/src/ATen/native/npu/EmbeddingBagKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/EmbeddingBagKernelNpu.cpp
@@ -1,96 +1,96 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-namespace {
-SmallVector<int64_t, SIZE> _embedding_bag_npu_output_size(
-    const Tensor& weight,
-    const Tensor& indices,
-    const Tensor& offsets) {
-  SmallVector<int64_t, SIZE> outputSize = {};
-  if (indices.dim() == 1) {
-    outputSize = {offsets.size(0), weight.size(1)};
-  } else {
-    outputSize = {indices.size(0), weight.size(1)};
-  }
-  return outputSize;
-} // _embedding_bag_npu_output_size
-
-string get_mode_str(bool mode) {
-  string modeStr = "mean";
-  if (mode == 0) {
-    modeStr = "sum";
-  } else if (mode == 1) {
-    modeStr = "mean";
-  } else {
-    modeStr = "max";
-  }
-  return modeStr;
-} // get_mode_str
-
-} // namespace
-
-tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_npu(
-    const Tensor& weight,
-    const Tensor& indices,
-    const Tensor& offsets,
-    bool scale_grad_by_freq,
-    int64_t mode,
-    bool sparse,
-    const Tensor& per_sample_weights,
-    bool include_last_offset) {
-  auto outputSize = _embedding_bag_npu_output_size(weight, indices, offsets);
-
-  Tensor output = OpPreparation::ApplyTensorWithFormat(outputSize, weight.options(), ACL_FORMAT_ND);
-
-  Tensor indicesCopy = indices;
-  if (!(indices.dtype() == at::kInt)) {
-    indicesCopy = indicesCopy.to(at::kInt);
-  }
-
-  string modeStr = get_mode_str(mode);
-
-  OpCommand cmd;
-  cmd.Name("EmbeddingBag")
-      .Input(weight)
-      .Input(indicesCopy);
-  if (offsets.defined()) {
-    Tensor offsetsCopy = offsets;
-    if (!(offsets.dtype() == at::kInt)) {
-      offsetsCopy = offsetsCopy.to(at::kInt);
-    }
-    cmd.Input(offsetsCopy);
-  }
-  if (per_sample_weights.defined()) {
-    cmd.Input(per_sample_weights);
-  }
-  cmd.Output(output)
-      .Attr("mode", modeStr)
-      .Attr("scale_grad_by_freq", scale_grad_by_freq)
-      .Attr("sparse", sparse)
-      .Attr("include_last_offset", include_last_offset)
-      .Run();
-  
-  return std::tie(output, output, output, output);
-}
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+namespace {
+SmallVector<int64_t, SIZE> _embedding_bag_npu_output_size(
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets) {
+  SmallVector<int64_t, SIZE> outputSize = {};
+  if (indices.dim() == 1) {
+    outputSize = {offsets.size(0), weight.size(1)};
+  } else {
+    outputSize = {indices.size(0), weight.size(1)};
+  }
+  return outputSize;
+} // _embedding_bag_npu_output_size
+
+string get_mode_str(bool mode) {
+  string modeStr = "mean";
+  if (mode == 0) {
+    modeStr = "sum";
+  } else if (mode == 1) {
+    modeStr = "mean";
+  } else {
+    modeStr = "max";
+  }
+  return modeStr;
+} // get_mode_str
+
+} // namespace
+
+tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_npu(
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    bool scale_grad_by_freq,
+    int64_t mode,
+    bool sparse,
+    const Tensor& per_sample_weights,
+    bool include_last_offset) {
+  auto outputSize = _embedding_bag_npu_output_size(weight, indices, offsets);
+
+  Tensor output = OpPreparation::ApplyTensorWithFormat(outputSize, weight.options(), ACL_FORMAT_ND);
+
+  Tensor indicesCopy = indices;
+  if (!(indices.dtype() == at::kInt)) {
+    indicesCopy = indicesCopy.to(at::kInt);
+  }
+
+  string modeStr = get_mode_str(mode);
+
+  OpCommand cmd;
+  cmd.Name("EmbeddingBag")
+      .Input(weight)
+      .Input(indicesCopy);
+  if (offsets.defined()) {
+    Tensor offsetsCopy = offsets;
+    if (!(offsets.dtype() == at::kInt)) {
+      offsetsCopy = offsetsCopy.to(at::kInt);
+    }
+    cmd.Input(offsetsCopy);
+  }
+  if (per_sample_weights.defined()) {
+    cmd.Input(per_sample_weights);
+  }
+  cmd.Output(output)
+      .Attr("mode", modeStr)
+      .Attr("scale_grad_by_freq", scale_grad_by_freq)
+      .Attr("sparse", sparse)
+      .Attr("include_last_offset", include_last_offset)
+      .Run();
+  
+  return std::tie(output, output, output, output);
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/EmbeddingRenormKernelNpu.cpp b/src/aten/src/ATen/native/npu/EmbeddingRenormKernelNpu.cpp
index 3a22c9157d07cec008c3c297ab515815d5349511..d10eb8bf02b61b9341ebe0cf812494a2124a1b61 100644
--- a/src/aten/src/ATen/native/npu/EmbeddingRenormKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/EmbeddingRenormKernelNpu.cpp
@@ -1,168 +1,138 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<NPUTensorDesc, N> embedding_renorm_npu_input(
-    const SmallVector<Tensor, N>& inputTensor) {
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
-}
-
-SmallVector<NPUTensorDesc, N> embedding_renorm_npu_output(
-    const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N> embedding_renorm_npu_attr(
-      double max_norm, 
-      double norm_type){
-  int64_t dim = 0;
-  float max_norm_float = (float) max_norm;
-  float norm_type_float = (float) norm_type;
-  NPUAttrDesc npuAttrScalarP = NPUAttrDesc("p", norm_type_float);
-  NPUAttrDesc npuAttrScalarMaxnorm = NPUAttrDesc("maxnorm", max_norm_float);
-  NPUAttrDesc npuAttrDim = NPUAttrDesc("dim", dim);
-  SmallVector<NPUAttrDesc, N> attrs = {npuAttrScalarP, npuAttrDim, npuAttrScalarMaxnorm};
-  return attrs;
-}
-SmallVector<NPUAttrDesc, N> embedding_gather2d_npu_attr() {
-  NPUAttrDesc npuAttrAxis = NPUAttrDesc("axis", (int64_t)0);
-  SmallVector<NPUAttrDesc, N> attrs = {npuAttrAxis};
-  return attrs;
-}
-
-SmallVector<NPUAttrDesc, N> embedding_renorm_scatter_update_npu_attr(){
-  NPUAttrDesc npuAttrAxis = NPUAttrDesc("use_locking", false);
-  SmallVector<NPUAttrDesc, N> attrs = {npuAttrAxis};
-  return attrs;
-}
-
-Tensor& embedding_renorm_gather2d_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& indices
-    ){
-// execute the NPU operate  GatherV2D
-  auto inputs = embedding_renorm_npu_input({self, indices});
-  auto outputs = embedding_renorm_npu_output({result});
-  auto attrs = embedding_gather2d_npu_attr();
-  CalcuOpUtil::execute_npu_operate("GatherV2D", inputs, outputs, attrs);
-  return result;
-}
-
-Tensor& embedding_renorm_execute_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    double max_norm, 
-    double norm_type){
-//execute the NPU operate  Renorm
-  auto inputs = embedding_renorm_npu_input({self});
-  auto outputs = embedding_renorm_npu_output({result});
-  auto attrs = embedding_renorm_npu_attr(max_norm, norm_type);
-  CalcuOpUtil::execute_npu_operate("Renorm", inputs, outputs, attrs);
-  return result;
-}
-
-
-Tensor& embedding_renorm_scatter_update_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& indices,
-    const Tensor& update){
-  auto inputs = embedding_renorm_npu_input({self, indices, update});
-  auto outputs = embedding_renorm_npu_output({result});
-  auto attrs = embedding_renorm_scatter_update_npu_attr();
-  CalcuOpUtil::execute_npu_operate("ScatterUpdate", inputs, outputs, attrs);
-  return result;
-}
-
-
-Tensor& embedding_renorm_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& indices,
-    Tensor& mid_input,
-    Tensor& mid_output,
-    double max_norm, 
-    double norm_type){
-// execute the NPU operate  GatherV2D,generate  new tensor by indices 
-  embedding_renorm_gather2d_out_npu(
-        mid_input,
-        self,
-        indices);
-//execute the NPU operate  Renorm
-  embedding_renorm_execute_out_npu(
-        mid_output,
-        mid_input,
-        max_norm, 
-        norm_type);
-// executing the NPU operator ScatterUpdate
-  embedding_renorm_scatter_update_out_npu(
-        result,
-        self,
-        indices,
-        mid_output); 
-  return result;
-}
-
-Tensor& embedding_renorm_npu_(
-    Tensor& self,
-    const Tensor& indices,
-    double max_norm, 
-    double norm_type) {
-
-//check dim and type
-  auto self_arg = TensorArg(self, "self", 1);
-  auto indices_arg = TensorArg(indices, "indices", 2);
-  checkDim("embedding_renorm_", self_arg, 2);
-  checkScalarType("embedding_renorm_", indices_arg, kLong);
-
-// indices must be int64 in pytorch, but npu can only support int32
-  auto indices_int32 = indices.to("cpu");
-  indices_int32 = indices_int32.to(at::kInt);
-  indices_int32 = indices_int32.to("npu");   
-
-//resize indices to 1D
-  Tensor indices_copy = indices.clone();
-  auto num_indices = indices.numel();
-  resize_npu_(indices_copy, num_indices);
-    
-  SmallVector<Tensor, N> inputs = {self};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
-//get the  outSize of  GatherV2 , the middle tensor
-  auto midSize = embedding_renorm_mid_npu_output_size(self, indices_copy);
-  Tensor mid = at::empty_with_format(midSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-  Tensor mid1 = at::empty_with_format(midSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-    
-//inplace operate
-  if (!NpuUtils::check_match(&self)) {
-    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
-    Tensor result = embedding_renorm_out_npu(contiguousSelf, contiguousSelf, indices_copy, mid, mid1, max_norm, norm_type);
-  NpuUtils::format_fresh_view(self, result);
-  } else {
-    embedding_renorm_out_npu(self, self, indices_copy, mid, mid1, max_norm, norm_type);
-  }
-  return self;
-}
-
-}
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& embedding_renorm_gather2d_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& indices) {
+  OpCommand cmd;
+  cmd.Name("GatherV2D")
+    .Input(self)
+    .Input(indices)
+    .Output(result)
+    .Attr("axis", (int64_t)0)
+    .Run();
+  return result;
+}
+
+Tensor& embedding_renorm_execute_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    double max_norm,
+    double norm_type) {
+  OpCommand cmd;
+  cmd.Name("Renorm")
+    .Input(self)
+    .Output(result)
+    .Attr("p", (float)norm_type)
+    .Attr("dim", (int64_t)0)
+    .Attr("maxnorm", (float)max_norm)
+    .Run();
+  return result;
+}
+
+
+Tensor& embedding_renorm_scatter_update_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& indices,
+    const Tensor& update) {
+  OpCommand cmd;
+  cmd.Name("ScatterUpdate")
+    .Input(self)
+    .Input(indices)
+    .Input(update)
+    .Output(result)
+    .Attr("use_locking", false)
+    .Run();
+  return result;
+}
+
+Tensor& embedding_renorm_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& indices,
+    double max_norm,
+    double norm_type){
+
+  //get the  outSize of  GatherV2 , the middle tensor
+  SmallVector<int64_t, SIZE> midSize = {indices.size(0), self.size(1)};
+  Tensor mid_input = OpPreparation::ApplyTensor(self, midSize);
+  Tensor mid_output = OpPreparation::ApplyTensor(self, midSize);
+
+  // execute the NPU operate  GatherV2D, generate  new tensor by indices
+  embedding_renorm_gather2d_out_npu(mid_input,self,indices);
+
+  //execute the NPU operate  Renorm
+  embedding_renorm_execute_out_npu(mid_output, mid_input, max_norm, norm_type);
+
+  //execute the NPU operate  ZerosLike or RangeD, generate new tensor by indices.numel()
+  Tensor mid_output_copy = mid_output.clone();
+  auto num_indices = indices.numel();
+  Tensor input_indices;
+  
+  // RangeD not support range(0,0)
+  if (num_indices - 1 == 0) {
+    input_indices = at::zeros({1}, self.options()).to(at::kLong);
+  } else {
+    input_indices = at::range(0, num_indices-1, self.options()).to(at::kLong);
+  }
+
+  //execute the NPU operate  MUL, generate change result
+  auto num_mid_output = mid_output.numel();
+  resize_npu_(mid_output_copy, num_mid_output);
+  Tensor scalar_out = OpPreparation::ApplyTensor(self, {num_indices, 1});
+  embedding_renorm_gather2d_out_npu(scalar_out, mid_output_copy, input_indices);
+  Tensor out_res = mid_input * scalar_out;
+
+  // executing the NPU operator ScatterUpdate
+  embedding_renorm_scatter_update_out_npu(result, self, indices, out_res);
+
+  return result;
+}
+
+Tensor& embedding_renorm_npu_(
+    Tensor& self,
+    const Tensor& indices,
+    double max_norm,
+    double norm_type) {
+
+  //check dim and type
+  auto self_arg = TensorArg(self, "self", 1);
+  auto indices_arg = TensorArg(indices, "indices", 2);
+  checkDim("embedding_renorm_", self_arg, 2);
+  checkScalarType("embedding_renorm_", indices_arg, kLong);
+
+  //resize indices to 1D
+  Tensor indices_copy = indices.clone();
+  auto num_indices = indices.numel();
+  resize_npu_(indices_copy, num_indices);
+
+  OpPipeWithDefinedOut pipe;
+  pipe.CheckMemory({self, indices_copy}, {self})
+   .Func([&self, &indices_copy, max_norm, norm_type](Tensor& result){
+        embedding_renorm_out_npu(self, self, indices_copy, max_norm, norm_type);})
+   .Call(self);
+
+  return self;
+}
+
+}
 }
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/EqualKernelNpu.cpp b/src/aten/src/ATen/native/npu/EqualKernelNpu.cpp
index 0d21a2f24546d62519b09ca46680a56a1c3d1ab3..b3dab641a20295f55483a0cd0af891504ea97dff 100644
--- a/src/aten/src/ATen/native/npu/EqualKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/EqualKernelNpu.cpp
@@ -1,55 +1,55 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-bool equal_npu(const Tensor& self, const Tensor& other) {
-  //check the shape of self and other
-  if(self.sizes() != other.sizes()) {
-    return false;
-  }
-
-  TORCH_CHECK(
-      self.scalar_type() == other.scalar_type(),
-      "Expected object of scalar type ",
-      self.scalar_type(),
-      ", but got ",
-      other.scalar_type(),
-      " for argument #2 'other' in call to equal_npu");
-  
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      {1},
-      self.options().dtype(kBool), 
-      ACL_FORMAT_ND);
-
-  // calculate the output result of the NPU
-  OpCommand cmd;
-  cmd.Name("TensorEqual")
-      .Input(self)
-      .Input(other)
-      .Output(result)
-      .Run();
-
-  return result.item().to<bool>();
-}
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+bool equal_npu(const Tensor& self, const Tensor& other) {
+  //check the shape of self and other
+  if(self.sizes() != other.sizes()) {
+    return false;
+  }
+
+  TORCH_CHECK(
+      self.scalar_type() == other.scalar_type(),
+      "Expected object of scalar type ",
+      self.scalar_type(),
+      ", but got ",
+      other.scalar_type(),
+      " for argument #2 'other' in call to equal_npu");
+  
+  // construct the output tensor of the NPU
+  Tensor result = at::empty_with_format(
+      {1},
+      self.options().dtype(kBool), 
+      ACL_FORMAT_ND);
+
+  // calculate the output result of the NPU
+  OpCommand cmd;
+  cmd.Name("TensorEqual")
+      .Input(self)
+      .Input(other)
+      .Output(result)
+      .Run();
+
+  return result.item().to<bool>();
+}
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/FastGeluBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/FastGeluBackwardKernelNpu.cpp
index 9a4cc7bdfbd5c1f0bd33bd9004c4f208f8bb7b08..d0a1b9968391ac26f090476339ffa913862603ee 100644
--- a/src/aten/src/ATen/native/npu/FastGeluBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/FastGeluBackwardKernelNpu.cpp
@@ -1,60 +1,60 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-
-namespace {
-Tensor& fast_gelu_backward_npu_nocheck(
-    Tensor& grad_input,
-    const Tensor& grad,
-    const Tensor& self) {
-  // constructs the input and output NPUTensorDesc
-  OpCommand cmd;
-  cmd.Name("FastGeluGrad")
-    .Input(grad)
-    .Input(self)
-    .Output(grad_input)
-    .Run();
-
-  return grad_input;
-}
-
-}
-
-Tensor fast_gelu_backward_npu(
-    const Tensor& grad, 
-    const Tensor& self) {
-  // calculate the output size
-  //Tensor outputTensor = self;
-  auto outputSize = input_same_output_size(self);
-
-  // construct the output tensor of the NPU
-  Tensor grad_input = at::empty_with_format(
-        outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-  
-  // calculate the output result of the NPU
-  fast_gelu_backward_npu_nocheck(grad_input, grad, self);
-  
-  return grad_input;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+
+namespace {
+Tensor& fast_gelu_backward_npu_nocheck(
+    Tensor& grad_input,
+    const Tensor& grad,
+    const Tensor& self) {
+  // constructs the input and output NPUTensorDesc
+  OpCommand cmd;
+  cmd.Name("FastGeluGrad")
+    .Input(grad)
+    .Input(self)
+    .Output(grad_input)
+    .Run();
+
+  return grad_input;
+}
+
+}
+
+Tensor fast_gelu_backward_npu(
+    const Tensor& grad, 
+    const Tensor& self) {
+  // calculate the output size
+  //Tensor outputTensor = self;
+  auto outputSize = input_same_output_size(self);
+
+  // construct the output tensor of the NPU
+  Tensor grad_input = at::empty_with_format(
+        outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  
+  // calculate the output result of the NPU
+  fast_gelu_backward_npu_nocheck(grad_input, grad, self);
+  
+  return grad_input;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/FastGeluKernelNpu.cpp b/src/aten/src/ATen/native/npu/FastGeluKernelNpu.cpp
index ff2f136fc06ce4d7833f431828efaa953740ff73..e90d3a5af0453e7ecbfeb9a21a57762d6ee8dae3 100644
--- a/src/aten/src/ATen/native/npu/FastGeluKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/FastGeluKernelNpu.cpp
@@ -1,49 +1,49 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-namespace {
-Tensor fast_gelu_npu_nocheck(Tensor& result, const Tensor& self) {
-
-    OpCommand cmd;
-    cmd.Name("FastGelu")
-        .Input(self)
-        .Output(result)
-        .Run();
-
-    return result;
-}
-
-} //namespace
-
-Tensor fast_gelu_npu(const Tensor& self) {
-  // calculate the output size
-    auto outputSize = input_same_output_size(self);
-  // construct the output tensor of the NPU
-    Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-  // calculate the output result of the NPU
-    fast_gelu_npu_nocheck(result, self);
-
-    return result;
-}
-
-} // namespace native
-} // namespace at
-
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+namespace {
+Tensor fast_gelu_npu_nocheck(Tensor& result, const Tensor& self) {
+
+    OpCommand cmd;
+    cmd.Name("FastGelu")
+        .Input(self)
+        .Output(result)
+        .Run();
+
+    return result;
+}
+
+} //namespace
+
+Tensor fast_gelu_npu(const Tensor& self) {
+  // calculate the output size
+    auto outputSize = input_same_output_size(self);
+  // construct the output tensor of the NPU
+    Tensor result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+  // calculate the output result of the NPU
+    fast_gelu_npu_nocheck(result, self);
+
+    return result;
+}
+
+} // namespace native
+} // namespace at
+
diff --git a/src/aten/src/ATen/native/npu/GerKernelNpu.cpp b/src/aten/src/ATen/native/npu/GerKernelNpu.cpp
index 8651b3e70493d9ddc86a869f2deb45154dde1f05..baac62e61a5292734160907efca70501fec08dc0 100644
--- a/src/aten/src/ATen/native/npu/GerKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GerKernelNpu.cpp
@@ -1,86 +1,86 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<int64_t, SIZE> ger_npu_output_size(
-    const Tensor& self,
-    const Tensor& vec2) {
-  int64_t outputsize_0 = self.size(0);
-  int64_t outputsize_1 = vec2.size(0);
-  SmallVector<int64_t, SIZE> outputsize = {outputsize_0, outputsize_1};
-
-  return outputsize;
-}
-
-Tensor& ger_out_npu_nocheck(Tensor& result, const Tensor& self , const Tensor& vec2) {
-  OpCommand cmd;
-  cmd.Name("Ger")
-      .Input(self)
-      .Input(vec2)
-      .Output(result)
-      .Run();
-
-  return result;
-}
-
-Tensor& ger_out_npu(Tensor& result, const Tensor& self , const Tensor& vec2) {
-  // check shape
-  TORCH_CHECK(
-      self.dim() == 1, "Input1 must have only1 dims."); 
-  TORCH_CHECK(
-      vec2.dim() == 1, "Input2 must have only1 dims.");
-
-  // calculate the output size
-  auto outputSize = ger_npu_output_size(self, vec2);
-
-  OpPreparation::CheckOut(
-      {self},
-      result,
-      self,
-      outputSize);
-
-  OpPipeWithDefinedOut pipe;
-  return pipe.CheckMemory({self, vec2}, {result})
-      .Func([&self, &vec2](Tensor& result){ger_out_npu_nocheck(result, self, vec2);})
-      .Call(result);
-}
-
-Tensor ger_npu(const Tensor& self, const Tensor& vec2) {
-  // check shape
-  TORCH_CHECK(
-      self.dim() == 1, "Input1 must have only1 dims."); 
-  TORCH_CHECK(
-      vec2.dim() == 1, "Input2 must have only1 dims.");
-
-  // calculate the output size
-  auto outputSize = ger_npu_output_size(self, vec2);
-
-  // construct the output tensor of the NPU 
-  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
-
-  // calculate the output result of the NPU
-  ger_out_npu_nocheck(result, self, vec2);
-
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<int64_t, SIZE> ger_npu_output_size(
+    const Tensor& self,
+    const Tensor& vec2) {
+  int64_t outputsize_0 = self.size(0);
+  int64_t outputsize_1 = vec2.size(0);
+  SmallVector<int64_t, SIZE> outputsize = {outputsize_0, outputsize_1};
+
+  return outputsize;
+}
+
+Tensor& ger_out_npu_nocheck(Tensor& result, const Tensor& self , const Tensor& vec2) {
+  OpCommand cmd;
+  cmd.Name("Ger")
+      .Input(self)
+      .Input(vec2)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+Tensor& ger_out_npu(Tensor& result, const Tensor& self , const Tensor& vec2) {
+  // check shape
+  TORCH_CHECK(
+      self.dim() == 1, "Input1 must have only1 dims."); 
+  TORCH_CHECK(
+      vec2.dim() == 1, "Input2 must have only1 dims.");
+
+  // calculate the output size
+  auto outputSize = ger_npu_output_size(self, vec2);
+
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      self,
+      outputSize);
+
+  OpPipeWithDefinedOut pipe;
+  return pipe.CheckMemory({self, vec2}, {result})
+      .Func([&self, &vec2](Tensor& result){ger_out_npu_nocheck(result, self, vec2);})
+      .Call(result);
+}
+
+Tensor ger_npu(const Tensor& self, const Tensor& vec2) {
+  // check shape
+  TORCH_CHECK(
+      self.dim() == 1, "Input1 must have only1 dims."); 
+  TORCH_CHECK(
+      vec2.dim() == 1, "Input2 must have only1 dims.");
+
+  // calculate the output size
+  auto outputSize = ger_npu_output_size(self, vec2);
+
+  // construct the output tensor of the NPU 
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
+
+  // calculate the output result of the NPU
+  ger_out_npu_nocheck(result, self, vec2);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/GiouBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/GiouBackwardKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5672a390dfea63cd1e0842ab980b7b354e2519fb
--- /dev/null
+++ b/src/aten/src/ATen/native/npu/GiouBackwardKernelNpu.cpp
@@ -0,0 +1,73 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+std::tuple<Tensor&, Tensor&>
+giou_backward_inner_out_npu(
+    Tensor& dbboxes,
+    Tensor& dgtboxes,
+    const Tensor& grad,
+    const Tensor& bboxes,
+    const Tensor& gtboxes,
+    bool trans,
+    bool is_cross,
+    int64_t mode){
+  string mode_str = mode == 1 ? "iof" : "iou";
+
+  OpCommand cmd;
+  cmd.Name("GIoUGrad")
+      .Input(grad)
+      .Input(bboxes)
+      .Input(gtboxes)
+      .Output(dbboxes)
+      .Output(dgtboxes)
+      .Attr("trans", trans)
+      .Attr("is_cross", is_cross)
+      .Attr("mode", mode_str)
+      .Run();
+  return std::tie(dbboxes, dgtboxes);
+}
+
+std::tuple<Tensor, Tensor>
+giou_backward_npu(
+    const Tensor& grad,
+    const Tensor& bboxes,
+    const Tensor& gtboxes,
+    bool trans,
+    bool is_cross,
+    int64_t mode){
+  TORCH_CHECK(!trans && !is_cross &&  mode == 0,
+            "giou backward only support trans==False, ",
+            "is_cross==False, ",
+            "mode==0('iou') current version ",
+            "if you need to back propagation, ",
+            "please ensure your parameter is correct!");
+  // Op need form of [n] grad
+  Tensor gradCp = at::squeeze(grad, 0);
+  Tensor dbboxes = OpPreparation::ApplyTensor(bboxes);
+  Tensor dgtboxes = OpPreparation::ApplyTensor(gtboxes);
+
+  giou_backward_inner_out_npu(dbboxes, dgtboxes, gradCp, bboxes, gtboxes, trans, is_cross, mode);
+  return std::tie(dbboxes, dgtboxes);
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/GiouKernelNpu.cpp b/src/aten/src/ATen/native/npu/GiouKernelNpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5360ee39c8eff25fba13920109c001b77921ece6
--- /dev/null
+++ b/src/aten/src/ATen/native/npu/GiouKernelNpu.cpp
@@ -0,0 +1,87 @@
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<int64_t, N> giou_output_size(
+    const Tensor& self,
+    const Tensor& gtboxes,
+    bool is_cross){
+  SmallVector<int64_t, N> output_size;
+  if(is_cross){
+      output_size = {gtboxes.size(0), self.size(0)};
+  } else {
+      output_size = {1, self.size(0)};
+  }
+  return output_size;
+}
+
+Tensor& giou_inner_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& gtboxes,
+    bool trans,
+    bool is_cross,
+    int64_t mode){
+  auto output_size = giou_output_size(self, gtboxes, is_cross);
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      self,
+      output_size);
+  string mode_str = mode == 1 ? "iof" : "iou";
+
+  OpCommand cmd;
+  cmd.Name("GIoU")
+      .Input(self)
+      .Input(gtboxes)
+      .Output(result)
+      .Attr("trans", trans)
+      .Attr("is_cross", is_cross)
+      .Attr("mode", mode_str)
+      .Run();
+  return result;
+}
+
+Tensor giou_npu(
+    const Tensor& self,
+    const Tensor& gtboxes,
+    bool trans,
+    bool is_cross,
+    int64_t mode){
+  TORCH_CHECK(!trans && !is_cross &&  mode == 0,
+            "giou backward only support trans==False, ",
+            "is_cross==False, ",
+            "mode==0('iou') current version ",
+            "if you need to back propagation, ",
+            "please ensure your parameter is correct!");
+  // Op need form of [n, 4], but pass should be [4, n];
+  Tensor selfCp = self.permute({1, 0});
+  Tensor gtboxesCp = gtboxes.permute({1, 0});
+  auto output_size = giou_output_size(selfCp, gtboxesCp, is_cross);
+  Tensor result = OpPreparation::ApplyTensor(selfCp, output_size);
+
+  giou_inner_out_npu(result, selfCp, gtboxesCp, trans, is_cross, mode);
+  result = result.permute({1, 0});
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/GridSamplerKernelNpu.cpp b/src/aten/src/ATen/native/npu/GridSamplerKernelNpu.cpp
index bd98c6d8f10c03224ca0ce36ec9d8507326ecd6f..57bfa62db880278ba44b2de265bbc70ca4ea1f85 100644
--- a/src/aten/src/ATen/native/npu/GridSamplerKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GridSamplerKernelNpu.cpp
@@ -1,47 +1,47 @@
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor grid_sampler_npu(const Tensor& self, const Tensor& grid,
-int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
-  Tensor formatCastOfSelf = self.npu_format_cast(ACL_FORMAT_ND);
-  Tensor formatCastOfGrid = grid.npu_format_cast(ACL_FORMAT_ND);
-  if (formatCastOfSelf.scalar_type() == ScalarType::Half) {
-    formatCastOfSelf = formatCastOfSelf.npu_dtype_cast(ScalarType::Float);
-  }
-  if (formatCastOfGrid.scalar_type() == ScalarType::Half) {
-    formatCastOfGrid = formatCastOfGrid.npu_dtype_cast(ScalarType::Float);
-  }
-
-  // calculate the output size
-  SmallVector<int64_t, SIZE> outputSize = {formatCastOfSelf.size(0),
-                                           formatCastOfSelf.size(1),
-                                           formatCastOfGrid.size(1),
-                                           formatCastOfGrid.size(2)};
-
-  // construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-      outputSize, formatCastOfSelf.options(), ACL_FORMAT_ND);
-
-  // calculate the output result of the NPU
-  OpCommand cmd;
-  cmd.Name("GridSampler2D")
-      .Input(formatCastOfSelf)
-      .Input(formatCastOfGrid)
-      .Output(result)
-      .Attr("interpolation_mode", interpolation_mode)
-      .Attr("padding_mode", padding_mode)
-      .Attr("align_corners", align_corners)
-      .Run();
-      
-  if (result.scalar_type() != self.scalar_type()) {
-    result = result.npu_dtype_cast(ScalarType::Half);
-  }
-
-  return result;
-}
-} // namespace native
-} // namespace at
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor grid_sampler_npu(const Tensor& self, const Tensor& grid,
+int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
+  Tensor formatCastOfSelf = self.npu_format_cast(ACL_FORMAT_ND);
+  Tensor formatCastOfGrid = grid.npu_format_cast(ACL_FORMAT_ND);
+  if (formatCastOfSelf.scalar_type() == ScalarType::Half) {
+    formatCastOfSelf = formatCastOfSelf.npu_dtype_cast(ScalarType::Float);
+  }
+  if (formatCastOfGrid.scalar_type() == ScalarType::Half) {
+    formatCastOfGrid = formatCastOfGrid.npu_dtype_cast(ScalarType::Float);
+  }
+
+  // calculate the output size
+  SmallVector<int64_t, SIZE> outputSize = {formatCastOfSelf.size(0),
+                                           formatCastOfSelf.size(1),
+                                           formatCastOfGrid.size(1),
+                                           formatCastOfGrid.size(2)};
+
+  // construct the output tensor of the NPU
+  Tensor result = at::empty_with_format(
+      outputSize, formatCastOfSelf.options(), ACL_FORMAT_ND);
+
+  // calculate the output result of the NPU
+  OpCommand cmd;
+  cmd.Name("GridSampler2D")
+      .Input(formatCastOfSelf)
+      .Input(formatCastOfGrid)
+      .Output(result)
+      .Attr("interpolation_mode", interpolation_mode)
+      .Attr("padding_mode", padding_mode)
+      .Attr("align_corners", align_corners)
+      .Run();
+      
+  if (result.scalar_type() != self.scalar_type()) {
+    result = result.npu_dtype_cast(ScalarType::Half);
+  }
+
+  return result;
+}
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/GruBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/GruBackwardKernelNpu.cpp
index 6084597d4920e8c6a9946482323daa59c7ff9ba4..d6ab34e8b04744dfc128c02e889bfb505a37bd00 100644
--- a/src/aten/src/ATen/native/npu/GruBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GruBackwardKernelNpu.cpp
@@ -1,96 +1,96 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> gru_backward_npu(
-    const Tensor& grady,
-    const Tensor& gradh,
-    const Tensor& input,
-    const Tensor& weight_input,
-    const Tensor& weight_hidden,
-    const Tensor& bias_input,
-    const Tensor& bias_hidden,
-    const Tensor& seq_length,
-    const Tensor& init_h,
-    const Tensor& output_y,
-    const Tensor& output_h,
-    const Tensor& output_updata,
-    const Tensor& output_reset,
-    const Tensor& output_new,
-    const Tensor& hidden_new) {
- 
-  Tensor inh = at::squeeze(init_h, 0);
-  auto grad_y =
-      grady.defined() ? grady : OpPreparation::ApplyTensorWithFormat(output_y.sizes(), output_y.options(), ACL_FORMAT_FRACTAL_NZ).mul(0);
-  auto grad_h =
-      gradh.defined() ? gradh[input.size(0)-1] : OpPreparation::ApplyTensorWithFormat(inh.sizes(), output_h.options(), ACL_FORMAT_FRACTAL_NZ).mul(0);
-
-  Tensor mask = at::zeros({}, input.options().dtype(kByte)); // uint8
-  Tensor seq_lengths = at::zeros({}, input.options());
-
-  int64_t npu_format = ACL_FORMAT_ND;
-
-  Tensor grad_w_input = OpPreparation::ApplyTensorWithFormat(weight_input.sizes(), input.options(), npu_format);
-  Tensor grad_w_hidden = OpPreparation::ApplyTensorWithFormat(weight_hidden.sizes(), input.options(), npu_format);
-  Tensor grad_x = OpPreparation::ApplyTensorWithFormat(input.sizes(), input.options(), npu_format);
-  Tensor grad_b_input = OpPreparation::ApplyTensorWithFormat(bias_input.sizes(), input.options(), npu_format);
-  Tensor grad_b_hidden = OpPreparation::ApplyTensorWithFormat(bias_hidden.sizes(), input.options(), npu_format);
-  Tensor grad_h_prev = OpPreparation::ApplyTensorWithFormat(init_h.sizes(), input.options(), npu_format);
-
-  OpCommand cmd;
-  cmd.Name("DynamicGRUV2Grad")
-      .Input(input)
-      .Input(weight_input)
-      .Input(weight_hidden)
-      .Input(output_y)
-      .Input(inh)
-      .Input(output_h)
-      .Input(grad_y)
-      .Input(grad_h)
-      .Input(output_updata)
-      .Input(output_reset)
-      .Input(output_new)
-      .Input(hidden_new)
-      .Input(seq_lengths)
-      .Input(mask)
-      .Output(grad_w_input)
-      .Output(grad_w_hidden)
-      .Output(grad_b_input)
-      .Output(grad_b_hidden)
-      .Output(grad_x)
-      .Output(grad_h_prev)
-      .Attr("direction", (string) "UNIDIRECTIONAL")
-      .Attr("cell_depth", (int64_t)1)
-      .Attr("keep_prob", (float)1.0)
-      .Attr("cell_clip", (float)-1.0)
-      .Attr("num_proj", (int64_t)0)
-      .Attr("time_major", (bool)true)
-      .Attr("bias_type", (string) "no_bias")
-      .Attr("gate_order", (string) "rzh")
-      .Attr("reset_after", (bool)true)
-      .Run();
-
-  return std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> {
-      grad_w_input, grad_w_hidden, grad_x, grad_b_input, grad_b_hidden, grad_h_prev
-  };
-}
-
-} // namespace native
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> gru_backward_npu(
+    const Tensor& grady,
+    const Tensor& gradh,
+    const Tensor& input,
+    const Tensor& weight_input,
+    const Tensor& weight_hidden,
+    const Tensor& bias_input,
+    const Tensor& bias_hidden,
+    const Tensor& seq_length,
+    const Tensor& init_h,
+    const Tensor& output_y,
+    const Tensor& output_h,
+    const Tensor& output_updata,
+    const Tensor& output_reset,
+    const Tensor& output_new,
+    const Tensor& hidden_new) {
+ 
+  Tensor inh = at::squeeze(init_h, 0);
+  auto grad_y =
+      grady.defined() ? grady : OpPreparation::ApplyTensorWithFormat(output_y.sizes(), output_y.options(), ACL_FORMAT_FRACTAL_NZ).mul(0);
+  auto grad_h =
+      gradh.defined() ? gradh[input.size(0)-1] : OpPreparation::ApplyTensorWithFormat(inh.sizes(), output_h.options(), ACL_FORMAT_FRACTAL_NZ).mul(0);
+
+  Tensor mask = at::zeros({}, input.options().dtype(kByte)); // uint8
+  Tensor seq_lengths = at::zeros({}, input.options());
+
+  int64_t npu_format = ACL_FORMAT_ND;
+
+  Tensor grad_w_input = OpPreparation::ApplyTensorWithFormat(weight_input.sizes(), input.options(), npu_format);
+  Tensor grad_w_hidden = OpPreparation::ApplyTensorWithFormat(weight_hidden.sizes(), input.options(), npu_format);
+  Tensor grad_x = OpPreparation::ApplyTensorWithFormat(input.sizes(), input.options(), npu_format);
+  Tensor grad_b_input = OpPreparation::ApplyTensorWithFormat(bias_input.sizes(), input.options(), npu_format);
+  Tensor grad_b_hidden = OpPreparation::ApplyTensorWithFormat(bias_hidden.sizes(), input.options(), npu_format);
+  Tensor grad_h_prev = OpPreparation::ApplyTensorWithFormat(init_h.sizes(), input.options(), npu_format);
+
+  OpCommand cmd;
+  cmd.Name("DynamicGRUV2Grad")
+      .Input(input)
+      .Input(weight_input)
+      .Input(weight_hidden)
+      .Input(output_y)
+      .Input(inh)
+      .Input(output_h)
+      .Input(grad_y)
+      .Input(grad_h)
+      .Input(output_updata)
+      .Input(output_reset)
+      .Input(output_new)
+      .Input(hidden_new)
+      .Input(seq_lengths)
+      .Input(mask)
+      .Output(grad_w_input)
+      .Output(grad_w_hidden)
+      .Output(grad_b_input)
+      .Output(grad_b_hidden)
+      .Output(grad_x)
+      .Output(grad_h_prev)
+      .Attr("direction", (string) "UNIDIRECTIONAL")
+      .Attr("cell_depth", (int64_t)1)
+      .Attr("keep_prob", (float)1.0)
+      .Attr("cell_clip", (float)-1.0)
+      .Attr("num_proj", (int64_t)0)
+      .Attr("time_major", (bool)true)
+      .Attr("bias_type", (string) "no_bias")
+      .Attr("gate_order", (string) "rzh")
+      .Attr("reset_after", (bool)true)
+      .Run();
+
+  return std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> {
+      grad_w_input, grad_w_hidden, grad_x, grad_b_input, grad_b_hidden, grad_h_prev
+  };
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/GruKernelNpu.cpp b/src/aten/src/ATen/native/npu/GruKernelNpu.cpp
index 7a67a4aaec1558bc80d13790f61ac4f32f031ae7..ebbdd5474f6a44e80195b241e9ee61c5484a2c73 100644
--- a/src/aten/src/ATen/native/npu/GruKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/GruKernelNpu.cpp
@@ -1,147 +1,147 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> gru_npu(
-    const Tensor& input,
-    const Tensor& hx,
-    const Tensor& weight_input,
-    const Tensor& weight_hidden,
-    const Tensor& bias_input,
-    const Tensor& bias_hidden,
-    const Tensor& seq_length,
-    bool has_biases,
-    int64_t num_layers,
-    double dropout,
-    bool train,
-    bool bidirectional,
-    bool batch_first) {
-  int64_t numStep = input.size(0);
-  int64_t batchSize = input.size(1);
-  int64_t hiddenSize = bias_input.size(0) / 3;
-  SmallVector<int64_t, SIZE> outputSize = {numStep, batchSize, hiddenSize};
-  int64_t npu_format = ACL_FORMAT_FRACTAL_NZ;
-
-  Tensor output_y = OpPreparation::ApplyTensorWithFormat(
-      outputSize,
-      bias_input.options(),
-      npu_format);
-  Tensor output_h = OpPreparation::ApplyTensorWithFormat(
-      outputSize,
-      bias_input.options(),
-      ACL_FORMAT_ND); // 后续需要做slice和unsqueeze，BaseFormat方便
-  Tensor output_updata = OpPreparation::ApplyTensorWithFormat(
-      outputSize,
-      bias_input.options(),
-      npu_format);
-  Tensor output_reset = OpPreparation::ApplyTensorWithFormat(
-      outputSize,
-      bias_input.options(),
-      npu_format);
-  Tensor output_new = OpPreparation::ApplyTensorWithFormat(
-      outputSize,
-      bias_input.options(),
-      npu_format);
-  Tensor hidden_new = OpPreparation::ApplyTensorWithFormat(
-      outputSize,
-      bias_input.options(),
-      npu_format);
-  
-
-  OpCommand cmd;
-  cmd.Name("DynamicGRUV2")
-      .Input(input)
-      .Input(weight_input)
-      .Input(weight_hidden)
-      .Input(bias_input)
-      .Input(bias_hidden)
-      .Input()
-      .Input(hx)
-      .Output(output_y)
-      .Output(output_h)
-      .Output(output_updata)
-      .Output(output_reset)
-      .Output(output_new)
-      .Output(hidden_new)
-      .Attr("direction", (string)"UNIDIRECTIONAL")
-      .Attr("cell_depth", (int64_t)1)
-      .Attr("keep_prob", (float)1.0)
-      .Attr("cell_clip", (float)-1.0)
-      .Attr("num_proj", (int64_t)0)
-      .Attr("time_major", true)
-      .Attr("activation", (string)"tanh")
-      .Attr("gate_order", (string)"rzh")
-      .Attr("reset_after", true)
-      .Attr("is_training", true)
-      .Run();
-
-  return std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor>(
-      output_y, output_h, output_updata, output_reset, output_new, hidden_new);
-}
-
-tuple<Tensor, Tensor> gru_npu_(
-    const Tensor& input,
-    const Tensor& hx,
-    TensorList params,
-    bool has_biases,
-    int64_t num_layers,
-    double dropout,
-    bool train,
-    bool bidirectional,
-    bool batch_first) {
-  // get weight  fp16
-  Tensor weight_input = params[0].t();
-  Tensor weight_hidden = params[1].t();
-
-  // get bias  fp16 / fp32
-  Tensor bias_input;
-  Tensor bias_hidden;
-  if (has_biases) {
-    bias_input = params[2].to(input.dtype());
-    bias_hidden = params[3].to(input.dtype());
-  } else {
-    bias_input = OpPreparation::ApplyTensorWithFormat(weight_input.size(0), input.options(), ACL_FORMAT_FRACTAL_NZ).mul(0);
-    bias_hidden = OpPreparation::ApplyTensorWithFormat(weight_hidden.size(0), input.options(), ACL_FORMAT_FRACTAL_NZ).mul(0);
-  }
-
-  Tensor seq_length = OpPreparation::ApplyTensorWithFormat({}, input.options(), ACL_FORMAT_ND);
-
-  auto results = at::npu_gru(
-      input,
-      hx,
-      weight_input,
-      weight_hidden,
-      bias_input,
-      bias_hidden,
-      seq_length,
-      has_biases,
-      num_layers,
-      dropout,
-      train,
-      bidirectional,
-      batch_first);
-  int64_t numStep = input.size(0);
-  Tensor output_hy = at::unsqueeze(std::get<1>(results)[numStep - 1], 0);
-
-  return std::tuple<Tensor, Tensor>(std::get<0>(results), output_hy);
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> gru_npu(
+    const Tensor& input,
+    const Tensor& hx,
+    const Tensor& weight_input,
+    const Tensor& weight_hidden,
+    const Tensor& bias_input,
+    const Tensor& bias_hidden,
+    const Tensor& seq_length,
+    bool has_biases,
+    int64_t num_layers,
+    double dropout,
+    bool train,
+    bool bidirectional,
+    bool batch_first) {
+  int64_t numStep = input.size(0);
+  int64_t batchSize = input.size(1);
+  int64_t hiddenSize = bias_input.size(0) / 3;
+  SmallVector<int64_t, SIZE> outputSize = {numStep, batchSize, hiddenSize};
+  int64_t npu_format = ACL_FORMAT_FRACTAL_NZ;
+
+  Tensor output_y = OpPreparation::ApplyTensorWithFormat(
+      outputSize,
+      bias_input.options(),
+      npu_format);
+  Tensor output_h = OpPreparation::ApplyTensorWithFormat(
+      outputSize,
+      bias_input.options(),
+      ACL_FORMAT_ND); // 后续需要做slice和unsqueeze，BaseFormat方便
+  Tensor output_updata = OpPreparation::ApplyTensorWithFormat(
+      outputSize,
+      bias_input.options(),
+      npu_format);
+  Tensor output_reset = OpPreparation::ApplyTensorWithFormat(
+      outputSize,
+      bias_input.options(),
+      npu_format);
+  Tensor output_new = OpPreparation::ApplyTensorWithFormat(
+      outputSize,
+      bias_input.options(),
+      npu_format);
+  Tensor hidden_new = OpPreparation::ApplyTensorWithFormat(
+      outputSize,
+      bias_input.options(),
+      npu_format);
+  
+
+  OpCommand cmd;
+  cmd.Name("DynamicGRUV2")
+      .Input(input)
+      .Input(weight_input)
+      .Input(weight_hidden)
+      .Input(bias_input)
+      .Input(bias_hidden)
+      .Input()
+      .Input(hx)
+      .Output(output_y)
+      .Output(output_h)
+      .Output(output_updata)
+      .Output(output_reset)
+      .Output(output_new)
+      .Output(hidden_new)
+      .Attr("direction", (string)"UNIDIRECTIONAL")
+      .Attr("cell_depth", (int64_t)1)
+      .Attr("keep_prob", (float)1.0)
+      .Attr("cell_clip", (float)-1.0)
+      .Attr("num_proj", (int64_t)0)
+      .Attr("time_major", true)
+      .Attr("activation", (string)"tanh")
+      .Attr("gate_order", (string)"rzh")
+      .Attr("reset_after", true)
+      .Attr("is_training", true)
+      .Run();
+
+  return std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor>(
+      output_y, output_h, output_updata, output_reset, output_new, hidden_new);
+}
+
+tuple<Tensor, Tensor> gru_npu_(
+    const Tensor& input,
+    const Tensor& hx,
+    TensorList params,
+    bool has_biases,
+    int64_t num_layers,
+    double dropout,
+    bool train,
+    bool bidirectional,
+    bool batch_first) {
+  // get weight  fp16
+  Tensor weight_input = params[0].t();
+  Tensor weight_hidden = params[1].t();
+
+  // get bias  fp16 / fp32
+  Tensor bias_input;
+  Tensor bias_hidden;
+  if (has_biases) {
+    bias_input = params[2].to(input.dtype());
+    bias_hidden = params[3].to(input.dtype());
+  } else {
+    bias_input = OpPreparation::ApplyTensorWithFormat(weight_input.size(0), input.options(), ACL_FORMAT_FRACTAL_NZ).mul(0);
+    bias_hidden = OpPreparation::ApplyTensorWithFormat(weight_hidden.size(0), input.options(), ACL_FORMAT_FRACTAL_NZ).mul(0);
+  }
+
+  Tensor seq_length = OpPreparation::ApplyTensorWithFormat({}, input.options(), ACL_FORMAT_ND);
+
+  auto results = at::npu_gru(
+      input,
+      hx,
+      weight_input,
+      weight_hidden,
+      bias_input,
+      bias_hidden,
+      seq_length,
+      has_biases,
+      num_layers,
+      dropout,
+      train,
+      bidirectional,
+      batch_first);
+  int64_t numStep = input.size(0);
+  Tensor output_hy = at::unsqueeze(std::get<1>(results)[numStep - 1], 0);
+
+  return std::tuple<Tensor, Tensor>(std::get<0>(results), output_hy);
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/HardShrinkBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/HardShrinkBackwardKernelNpu.cpp
index 5a2c8646e3a46e71c464b3b44826d03a9a66b998..74ea38b0f3584f17eb9b66c336273a4945589d27 100644
--- a/src/aten/src/ATen/native/npu/HardShrinkBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/HardShrinkBackwardKernelNpu.cpp
@@ -1,52 +1,52 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-namespace {
-
-Tensor& hardshrink_backward_nocheck(
-    Tensor& grad_input,
-    const Tensor& grad_output,
-    const Tensor& self,
-    Scalar lambd) {
-  OpCommand cmd;
-  cmd.Name("HardShrinkGrad")
-      .Input(grad_output)
-      .Input(self)
-      .Attr("lambd", lambd)
-      .Output(grad_input)
-      .Run();
-
-  return grad_input;
-}
-} // namespace
-
-Tensor hardshrink_backward_npu(
-    const Tensor& grad_output,
-    const Tensor& self,
-    Scalar lambd) {
-  Tensor grad_input = OpPreparation::ApplyTensor(self);
-  // calculate the output result of the NPU
-  hardshrink_backward_nocheck(grad_input, grad_output, self, lambd);
-
-  return grad_input;
-}
-
-}
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+namespace {
+
+Tensor& hardshrink_backward_nocheck(
+    Tensor& grad_input,
+    const Tensor& grad_output,
+    const Tensor& self,
+    Scalar lambd) {
+  OpCommand cmd;
+  cmd.Name("HardShrinkGrad")
+      .Input(grad_output)
+      .Input(self)
+      .Attr("lambd", lambd)
+      .Output(grad_input)
+      .Run();
+
+  return grad_input;
+}
+} // namespace
+
+Tensor hardshrink_backward_npu(
+    const Tensor& grad_output,
+    const Tensor& self,
+    Scalar lambd) {
+  Tensor grad_input = OpPreparation::ApplyTensor(self);
+  // calculate the output result of the NPU
+  hardshrink_backward_nocheck(grad_input, grad_output, self, lambd);
+
+  return grad_input;
+}
+
+}
 }
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/HardShrinkKernelNpu.cpp b/src/aten/src/ATen/native/npu/HardShrinkKernelNpu.cpp
index cfd43a8c8618d087d57c7c748797d780a7e58b45..11204519d0819ddbf1b8e6771d32d7293d6b75a5 100644
--- a/src/aten/src/ATen/native/npu/HardShrinkKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/HardShrinkKernelNpu.cpp
@@ -1,43 +1,43 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-namespace {
-
-Tensor& hardshrink_nocheck(Tensor& result, const Tensor& self, Scalar lambd) {
-  OpCommand cmd;
-  cmd.Name("HardShrink")
-    .Input(self)
-    .Attr("lambd", lambd)
-    .Output(result).Run();
-    
-    return result;
-}
-} // namespace
-
-Tensor hardshrink_npu(const Tensor& self, Scalar lambd) {
-  // Tensor outputTensor = logical_or_dest_output(self, other);
-  Tensor result = OpPreparation::ApplyTensor(self);
-  hardshrink_nocheck(result, self, lambd);
-
-  return result;
-}
-
-} // namespace native
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+namespace {
+
+Tensor& hardshrink_nocheck(Tensor& result, const Tensor& self, Scalar lambd) {
+  OpCommand cmd;
+  cmd.Name("HardShrink")
+    .Input(self)
+    .Attr("lambd", lambd)
+    .Output(result).Run();
+    
+    return result;
+}
+} // namespace
+
+Tensor hardshrink_npu(const Tensor& self, Scalar lambd) {
+  // Tensor outputTensor = logical_or_dest_output(self, other);
+  Tensor result = OpPreparation::ApplyTensor(self);
+  hardshrink_nocheck(result, self, lambd);
+
+  return result;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp
index c0a96311251e930e14be19ea74610e6a8f22671a..036d3c3e748ae38000eae195605b9df4b973d3b8 100644
--- a/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/Im2colBackwardKernelNpu.cpp
@@ -1,121 +1,121 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& im2col_backward_out_npu_nocheck(
-    Tensor& grad_input,
-    const Tensor& grad_output,
-    IntArrayRef input_size,
-    IntArrayRef kernel_size,
-    IntArrayRef dilation,
-    IntArrayRef padding,
-    IntArrayRef stride) {
-  Tensor gradOutput = grad_output;
-  gradOutput = gradOutput.view({
-    grad_output.size(0),
-    grad_output.size(1) / (kernel_size[0] * kernel_size[1]),
-    kernel_size[0] * kernel_size[1],
-    grad_output.size(2)});
-
-  SmallVector<int64_t, N> inputSize = {input_size[0], input_size[1]};  
-
-  SmallVector<int64_t, N> kernelSize = {kernel_size[0], kernel_size[1]};
-  SmallVector<int64_t, N> dilations = {dilation[0], dilation[1]};
-  SmallVector<int64_t, N> paddings = {padding[0], padding[1]};
-  SmallVector<int64_t, N> stridesSize = {stride[0], stride[1]};
-  
-  OpCommand cmd;
-  cmd.Name("Col2im")
-      .Input(gradOutput)
-      .Input(inputSize, at::kInt)
-      .Output(grad_input)
-      .Attr("kernel_size", kernelSize)
-      .Attr("dilation", dilations)
-      .Attr("padding", paddings)
-      .Attr("stride", stridesSize)
-      .Run();
-
-  return grad_input;
-}
-
-Tensor& im2col_backward_out_npu(
-    Tensor& grad_input,
-    const Tensor& grad_output,
-    IntArrayRef input_size,
-    IntArrayRef kernel_size,
-    IntArrayRef dilation,
-    IntArrayRef padding,
-    IntArrayRef stride) {
-  SmallVector<int64_t, SIZE> outputSize = {
-    grad_output.size(0),
-    grad_output.size(1) / (kernel_size[0] * kernel_size[1]),
-    input_size[0],
-    input_size[1]};
-
-  OpPreparation::CheckOut(
-      {grad_output},
-      grad_input,
-      grad_output,
-      outputSize);
-
-  OpPipeWithDefinedOut pipe;
-  return pipe.CheckMemory({grad_output}, {grad_input})
-    .Func([&grad_output, &input_size, &kernel_size, &dilation, &padding, &stride]
-    (Tensor& grad_input)
-    {im2col_backward_out_npu_nocheck(
-      grad_input,
-      grad_output,
-      input_size,
-      kernel_size,
-      dilation,
-      padding,
-      stride);})
-    .Call(grad_input);
-}
-
-Tensor im2col_backward_npu(
-    const Tensor& grad_output,
-    IntArrayRef input_size,
-    IntArrayRef kernel_size,
-    IntArrayRef dilation,
-    IntArrayRef padding,
-    IntArrayRef stride) {
-  // calculate the output size
-  SmallVector<int64_t, SIZE> outputSize = {
-    grad_output.size(0),
-    grad_output.size(1) / (kernel_size[0] * kernel_size[1]),
-    input_size[0],
-    input_size[1]};
-
-  // construct the input tensor of the NPU
-  Tensor grad_input = OpPreparation::ApplyTensor(grad_output, outputSize);
-
-  im2col_backward_out_npu_nocheck(
-      grad_input,
-      grad_output,
-      input_size,
-      kernel_size,
-      dilation,
-      padding,
-      stride);
-
-  return grad_input;
-}
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& im2col_backward_out_npu_nocheck(
+    Tensor& grad_input,
+    const Tensor& grad_output,
+    IntArrayRef input_size,
+    IntArrayRef kernel_size,
+    IntArrayRef dilation,
+    IntArrayRef padding,
+    IntArrayRef stride) {
+  Tensor gradOutput = grad_output;
+  gradOutput = gradOutput.view({
+    grad_output.size(0),
+    grad_output.size(1) / (kernel_size[0] * kernel_size[1]),
+    kernel_size[0] * kernel_size[1],
+    grad_output.size(2)});
+
+  SmallVector<int64_t, N> inputSize = {input_size[0], input_size[1]};  
+
+  SmallVector<int64_t, N> kernelSize = {kernel_size[0], kernel_size[1]};
+  SmallVector<int64_t, N> dilations = {dilation[0], dilation[1]};
+  SmallVector<int64_t, N> paddings = {padding[0], padding[1]};
+  SmallVector<int64_t, N> stridesSize = {stride[0], stride[1]};
+  
+  OpCommand cmd;
+  cmd.Name("Col2im")
+      .Input(gradOutput)
+      .Input(inputSize, at::kInt)
+      .Output(grad_input)
+      .Attr("kernel_size", kernelSize)
+      .Attr("dilation", dilations)
+      .Attr("padding", paddings)
+      .Attr("stride", stridesSize)
+      .Run();
+
+  return grad_input;
+}
+
+Tensor& im2col_backward_out_npu(
+    Tensor& grad_input,
+    const Tensor& grad_output,
+    IntArrayRef input_size,
+    IntArrayRef kernel_size,
+    IntArrayRef dilation,
+    IntArrayRef padding,
+    IntArrayRef stride) {
+  SmallVector<int64_t, SIZE> outputSize = {
+    grad_output.size(0),
+    grad_output.size(1) / (kernel_size[0] * kernel_size[1]),
+    input_size[0],
+    input_size[1]};
+
+  OpPreparation::CheckOut(
+      {grad_output},
+      grad_input,
+      grad_output,
+      outputSize);
+
+  OpPipeWithDefinedOut pipe;
+  return pipe.CheckMemory({grad_output}, {grad_input})
+    .Func([&grad_output, &input_size, &kernel_size, &dilation, &padding, &stride]
+    (Tensor& grad_input)
+    {im2col_backward_out_npu_nocheck(
+      grad_input,
+      grad_output,
+      input_size,
+      kernel_size,
+      dilation,
+      padding,
+      stride);})
+    .Call(grad_input);
+}
+
+Tensor im2col_backward_npu(
+    const Tensor& grad_output,
+    IntArrayRef input_size,
+    IntArrayRef kernel_size,
+    IntArrayRef dilation,
+    IntArrayRef padding,
+    IntArrayRef stride) {
+  // calculate the output size
+  SmallVector<int64_t, SIZE> outputSize = {
+    grad_output.size(0),
+    grad_output.size(1) / (kernel_size[0] * kernel_size[1]),
+    input_size[0],
+    input_size[1]};
+
+  // construct the input tensor of the NPU
+  Tensor grad_input = OpPreparation::ApplyTensor(grad_output, outputSize);
+
+  im2col_backward_out_npu_nocheck(
+      grad_input,
+      grad_output,
+      input_size,
+      kernel_size,
+      dilation,
+      padding,
+      stride);
+
+  return grad_input;
+}
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp b/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp
index 9cbbf8f8416d6117e5b701cab3bb973e1c8b3fc8..6814d60261599c033c0b6f16b62965e660799d96 100644
--- a/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/IndexPutKernelNpu.cpp
@@ -26,6 +26,9 @@ Tensor& index_put_nocheck(
     const TensorList& indices,
     const Tensor& value,
     bool accumulate) {
+  if (value.numel() == 0) {
+    return result;
+  }
   // masks corresponds to indices. 0 indicates undefined tensor.
   SmallVector<int64_t, N> masks;
   std::vector<Tensor> allDefinedIndices;
diff --git a/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp b/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp
index c25c31599b68bcf1ea88fc5227561448cf10f19f..61e7cada9aa0f6b2d207ec621193f6472530a97c 100644
--- a/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/InverseKernelNpu.cpp
@@ -1,48 +1,48 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& inverse_out_npu(
-    Tensor& result,
-    const Tensor& self) {
-
-  OpCommand cmd;
-  cmd.Name("MatrixInverse")
-      .Input(self)
-      .Output(result)
-      .Attr("adjoint", false)
-      .Run();
-  
-  return result;
-}
-
-Tensor inverse_npu(const Tensor& self) {
-  Tensor result = OpPreparation::ApplyTensor(self);
-
-  inverse_out_npu(result, self);
-
-  return result;
-}
-
-
-} // namespace native
-} // namespace at
-
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& inverse_out_npu(
+    Tensor& result,
+    const Tensor& self) {
+
+  OpCommand cmd;
+  cmd.Name("MatrixInverse")
+      .Input(self)
+      .Output(result)
+      .Attr("adjoint", false)
+      .Run();
+  
+  return result;
+}
+
+Tensor inverse_npu(const Tensor& self) {
+  Tensor result = OpPreparation::ApplyTensor(self);
+
+  inverse_out_npu(result, self);
+
+  return result;
+}
+
+
+} // namespace native
+} // namespace at
+
diff --git a/src/aten/src/ATen/native/npu/IouKernelNpu.cpp b/src/aten/src/ATen/native/npu/IouKernelNpu.cpp
index 306022e5193be6acc203e0ce1e4c82c3846f2379..dd2ff4ac09ab63621b16d1b0b7a59ed87317a60a 100644
--- a/src/aten/src/ATen/native/npu/IouKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/IouKernelNpu.cpp
@@ -1,93 +1,93 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<NPUTensorDesc, N> iou_npu_input(
-    const SmallVector<Tensor, N>& inputTensor) {
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
-}
-
-SmallVector<NPUTensorDesc, N> iou_npu_output(
-    const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N> iou_npu_attr(int64_t mode) {
-  string modeStr = "iou";
-  if (mode == 1) {
-    modeStr = "iof";
-  }
-  NPUAttrDesc npuAttrIou = NPUAttrDesc("mode", modeStr);
-  SmallVector<NPUAttrDesc, N> attrs = {npuAttrIou};
-  return attrs;
-}
-
-Tensor& iou_out_npu(
-    Tensor& overlap,
-    const Tensor& bboxes,
-    const Tensor& gtboxes,
-    int64_t mode) {
-  // constructs the input and output NPUTensorDesc
-  auto inputs = iou_npu_input({bboxes, gtboxes});
-  auto outputs = iou_npu_output({overlap});
-
-  // constructs the attr of the NPUAttrDesc
-  auto attrs = iou_npu_attr(mode);
-
-  // executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("Iou", inputs, outputs, attrs);
-
-  // return std::make_tuple(boxes, idx, mask);
-  return overlap;
-}
-
-Tensor iou_npu(
-    const Tensor& bboxes,
-    const Tensor& gtboxes,
-    int64_t mode) {
-  // calculate the output size
-  auto outputSize = iou_npu_output_size(bboxes, gtboxes);
-
-  Tensor bboxesFP16 = bboxes;
-  if (bboxes.scalar_type() != at::ScalarType::Half) {
-    bboxesFP16 = bboxes.to(at::kHalf);
-  }
-  Tensor gtboxesFP16 = gtboxes;
-  if (gtboxes.scalar_type() != at::ScalarType::Half) {
-    gtboxesFP16 = gtboxes.to(at::kHalf);
-  }
-
-  // construct the output tensor of the NPU
-  Tensor overlap = at::empty_with_format(outputSize, bboxesFP16.options(), CalcuOpUtil::get_tensor_npu_format(bboxes));
-
-  iou_out_npu(overlap, bboxesFP16, gtboxesFP16, mode);
-
-  if (overlap.scalar_type() != bboxes.scalar_type()) {
-    overlap = overlap.to(bboxes.scalar_type());
-  }
-
-  return overlap;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/NpuUtils.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<NPUTensorDesc, N> iou_npu_input(
+    const SmallVector<Tensor, N>& inputTensor) {
+  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
+}
+
+SmallVector<NPUTensorDesc, N> iou_npu_output(
+    const SmallVector<Tensor, N>& outputTensor) {
+  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
+}
+
+SmallVector<NPUAttrDesc, N> iou_npu_attr(int64_t mode) {
+  string modeStr = "iou";
+  if (mode == 1) {
+    modeStr = "iof";
+  }
+  NPUAttrDesc npuAttrIou = NPUAttrDesc("mode", modeStr);
+  SmallVector<NPUAttrDesc, N> attrs = {npuAttrIou};
+  return attrs;
+}
+
+Tensor& iou_out_npu(
+    Tensor& overlap,
+    const Tensor& bboxes,
+    const Tensor& gtboxes,
+    int64_t mode) {
+  // constructs the input and output NPUTensorDesc
+  auto inputs = iou_npu_input({bboxes, gtboxes});
+  auto outputs = iou_npu_output({overlap});
+
+  // constructs the attr of the NPUAttrDesc
+  auto attrs = iou_npu_attr(mode);
+
+  // executing the NPU operator
+  CalcuOpUtil::execute_npu_operate("Iou", inputs, outputs, attrs);
+
+  // return std::make_tuple(boxes, idx, mask);
+  return overlap;
+}
+
+Tensor iou_npu(
+    const Tensor& bboxes,
+    const Tensor& gtboxes,
+    int64_t mode) {
+  // calculate the output size
+  auto outputSize = iou_npu_output_size(bboxes, gtboxes);
+
+  Tensor bboxesFP16 = bboxes;
+  if (bboxes.scalar_type() != at::ScalarType::Half) {
+    bboxesFP16 = bboxes.to(at::kHalf);
+  }
+  Tensor gtboxesFP16 = gtboxes;
+  if (gtboxes.scalar_type() != at::ScalarType::Half) {
+    gtboxesFP16 = gtboxes.to(at::kHalf);
+  }
+
+  // construct the output tensor of the NPU
+  Tensor overlap = at::empty_with_format(outputSize, bboxesFP16.options(), CalcuOpUtil::get_tensor_npu_format(bboxes));
+
+  iou_out_npu(overlap, bboxesFP16, gtboxesFP16, mode);
+
+  if (overlap.scalar_type() != bboxes.scalar_type()) {
+    overlap = overlap.to(bboxes.scalar_type());
+  }
+
+  return overlap;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/IscloseKernelNpu.cpp b/src/aten/src/ATen/native/npu/IscloseKernelNpu.cpp
index 5a5ebb2d7ba71a504041c40cc78be875b3bab23a..6c783adfa79b8df294edbcaaac98137b0725bb6f 100644
--- a/src/aten/src/ATen/native/npu/IscloseKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/IscloseKernelNpu.cpp
@@ -1,69 +1,69 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at { 
-namespace native {
-using namespace at::native::npu;
-
-namespace {
-
-Tensor& isclose_nocheck(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& other,
-    double rtol,
-    double atol,
-    bool equal_nan) {
-  auto rtol1 = static_cast<float>(rtol);
-  auto atol1 = static_cast<float>(atol);
-
-  OpCommand cmd;
-  cmd.Name("IsClose")
-      .Input(self)
-      .Input(other)
-      .Attr("rtol", rtol1)
-      .Attr("atol", atol1)
-      .Attr("equal_nan", equal_nan)
-      .Output(result)
-      .Run();
-
-  return result;
-}
-} // namespace
-
-Tensor isclose_npu(
-    const Tensor& self,
-    const Tensor& other,
-    double rtol, 
-    double atol, 
-    bool equal_nan) {
-
-  TORCH_CHECK(self.scalar_type() == other.scalar_type(), self.scalar_type(), " did not match ", other.scalar_type());
-    
-  //calculate the output size
-  auto outputSize = input_same_output_size(self);
-        
-  //construct the output tensor of the NPU
-  Tensor result = OpPreparation::ApplyTensor(outputSize, self.options().dtype(kBool), self);
-  // constructs the attr of the NPUAttrDesc
-  result = isclose_nocheck(result, self, other, rtol, atol, equal_nan);
-    
-  return result;  
-}
-
-}}  // namespace at::native
-
-
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at { 
+namespace native {
+using namespace at::native::npu;
+
+namespace {
+
+Tensor& isclose_nocheck(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& other,
+    double rtol,
+    double atol,
+    bool equal_nan) {
+  auto rtol1 = static_cast<float>(rtol);
+  auto atol1 = static_cast<float>(atol);
+
+  OpCommand cmd;
+  cmd.Name("IsClose")
+      .Input(self)
+      .Input(other)
+      .Attr("rtol", rtol1)
+      .Attr("atol", atol1)
+      .Attr("equal_nan", equal_nan)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+} // namespace
+
+Tensor isclose_npu(
+    const Tensor& self,
+    const Tensor& other,
+    double rtol, 
+    double atol, 
+    bool equal_nan) {
+
+  TORCH_CHECK(self.scalar_type() == other.scalar_type(), self.scalar_type(), " did not match ", other.scalar_type());
+    
+  //calculate the output size
+  auto outputSize = input_same_output_size(self);
+        
+  //construct the output tensor of the NPU
+  Tensor result = OpPreparation::ApplyTensor(outputSize, self.options().dtype(kBool), self);
+  // constructs the attr of the NPUAttrDesc
+  result = isclose_nocheck(result, self, other, rtol, atol, equal_nan);
+    
+  return result;  
+}
+
+}}  // namespace at::native
+
+
diff --git a/src/aten/src/ATen/native/npu/KlDivKernelNpu.cpp b/src/aten/src/ATen/native/npu/KlDivKernelNpu.cpp
index 6daf5011c95e661a56110eb93665c38d4269ba76..1799ba71e83fc841d4a0ed62eadd0e001e966487 100644
--- a/src/aten/src/ATen/native/npu/KlDivKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/KlDivKernelNpu.cpp
@@ -1,57 +1,57 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor kl_div_npu(
-    const Tensor& self, 
-    const Tensor& target, 
-    int64_t reduction) {
-  TORCH_CHECK(reduction != Reduction::None,
-    "Reduction of None has not been supported at present.");
-
-  Tensor result = at::empty_with_format(
-      {}, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  string reductionStr;
-  if (reduction == Reduction::Mean) {
-    reductionStr = "batchmean";
-  } else if (reduction == Reduction::Sum) {
-    reductionStr = "sum";
-  }
-
-  OpCommand cmd;
-  cmd.Name("KLDiv")
-      .Input(self)
-      .Input(target)
-      .Output(result)
-      .Attr("reduction", reductionStr)
-      .Run();
-
-  if (reduction == Reduction::Mean) {
-    auto inputShape = self.sizes();
-    int batchSquareSize = prod_intlist(inputShape) / inputShape[0];
-    result.div_(batchSquareSize);
-  }
-
-  return result;
-}
-
-} // namespace native
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor kl_div_npu(
+    const Tensor& self, 
+    const Tensor& target, 
+    int64_t reduction) {
+  TORCH_CHECK(reduction != Reduction::None,
+    "Reduction of None has not been supported at present.");
+
+  Tensor result = at::empty_with_format(
+      {}, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+
+  string reductionStr;
+  if (reduction == Reduction::Mean) {
+    reductionStr = "batchmean";
+  } else if (reduction == Reduction::Sum) {
+    reductionStr = "sum";
+  }
+
+  OpCommand cmd;
+  cmd.Name("KLDiv")
+      .Input(self)
+      .Input(target)
+      .Output(result)
+      .Attr("reduction", reductionStr)
+      .Run();
+
+  if (reduction == Reduction::Mean) {
+    auto inputShape = self.sizes();
+    int batchSquareSize = prod_intlist(inputShape) / inputShape[0];
+    result.div_(batchSquareSize);
+  }
+
+  return result;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp
index 492007773216401faeafdadc57ea23b482ea4e59..6e5779920c17c23913f25a2f79389b30c3dc5ba5 100644
--- a/src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LinearBackwardKernelNpu.cpp
@@ -1,62 +1,62 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor linear_backward_out_npu(
-    Tensor& result,
-    const Tensor& input,
-    const Tensor& weight,
-    bool transpose_x1,
-    bool transpose_x2) {
-  int64_t offset_x = 0;
-  OpCommand cmd;
-  cmd.Name("MatMulV2")
-      .Input(input)
-      .Input(weight)
-      .Output(result)
-      .Attr("transpose_x1", transpose_x1)
-      .Attr("transpose_x2", transpose_x2)
-      .Attr("offset_x", offset_x)
-      .Run();
-  return result;
-}
-
-tuple<Tensor, Tensor> linear_backward_npu(
-    const Tensor& grad,
-    const Tensor& input,
-    const Tensor& weight) {
-  SmallVector<int64_t, SIZE> inputGradOutputSize = {
-      grad.size(0), 
-      weight.size(1)};
-  SmallVector<int64_t, SIZE> weightGradOutputSize = {
-      grad.size(1), 
-      input.size(1)};
-  Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize);
-  Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize);
-
-  linear_backward_out_npu(inputGrad, grad, weight, false, false);
-  linear_backward_out_npu(weightGrad, grad, input, true, false);
-  
-  return std::tie(inputGrad, weightGrad);
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor linear_backward_out_npu(
+    Tensor& result,
+    const Tensor& input,
+    const Tensor& weight,
+    bool transpose_x1,
+    bool transpose_x2) {
+  int64_t offset_x = 0;
+  OpCommand cmd;
+  cmd.Name("MatMulV2")
+      .Input(input)
+      .Input(weight)
+      .Output(result)
+      .Attr("transpose_x1", transpose_x1)
+      .Attr("transpose_x2", transpose_x2)
+      .Attr("offset_x", offset_x)
+      .Run();
+  return result;
+}
+
+tuple<Tensor, Tensor> linear_backward_npu(
+    const Tensor& grad,
+    const Tensor& input,
+    const Tensor& weight) {
+  SmallVector<int64_t, SIZE> inputGradOutputSize = {
+      grad.size(0), 
+      weight.size(1)};
+  SmallVector<int64_t, SIZE> weightGradOutputSize = {
+      grad.size(1), 
+      input.size(1)};
+  Tensor inputGrad = OpPreparation::ApplyTensor(input, inputGradOutputSize);
+  Tensor weightGrad = OpPreparation::ApplyTensor(weight, weightGradOutputSize);
+
+  linear_backward_out_npu(inputGrad, grad, weight, false, false);
+  linear_backward_out_npu(weightGrad, grad, input, true, false);
+  
+  return std::tie(inputGrad, weightGrad);
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp b/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp
index f35c5de9fa2450586a10e89843f7d653b99207ba..fb64c66874e83cacf8bd5230b7327c55dc8de376 100644
--- a/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LinearKernelNpu.cpp
@@ -1,48 +1,48 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor linear_npu(
-    const Tensor& input,
-    const Tensor& weight,
-    const Tensor& bias) {
-  SmallVector<int64_t, SIZE> outputSize = {input.size(0), weight.size(0)};
-  Tensor output = OpPreparation::ApplyTensor(input, outputSize);
-
-  int64_t offset_x = 0;
-  OpCommand cmd;
-  cmd.Name("MatMulV2")
-      .Input(input)
-      .Input(weight);
-  if (bias.defined()) {
-    cmd.Input(bias);
-  }
-  cmd.Output(output)
-      .Attr("transpose_x1", false)
-      .Attr("transpose_x2", true)
-      .Attr("offset_x", offset_x)
-      .Run();
-  
-  return output;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor linear_npu(
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias) {
+  SmallVector<int64_t, SIZE> outputSize = {input.size(0), weight.size(0)};
+  Tensor output = OpPreparation::ApplyTensor(input, outputSize);
+
+  int64_t offset_x = 0;
+  OpCommand cmd;
+  cmd.Name("MatMulV2")
+      .Input(input)
+      .Input(weight);
+  if (bias.defined()) {
+    cmd.Input(bias);
+  }
+  cmd.Output(output)
+      .Attr("transpose_x1", false)
+      .Attr("transpose_x2", true)
+      .Attr("offset_x", offset_x)
+      .Run();
+  
+  return output;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/LogSigmoidBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogSigmoidBackwardKernelNpu.cpp
index cafa78fdf39463a96ad2ec355e28ca3f1df2d1db..a31b44bbb00d9081b31002cb86997eef8b3f62fc 100644
--- a/src/aten/src/ATen/native/npu/LogSigmoidBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LogSigmoidBackwardKernelNpu.cpp
@@ -1,80 +1,80 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-IntArrayRef log_sigmoid_backward_npu_output_size(const Tensor& grad_output) {
-  return input_same_output_size(grad_output);
-}
-
-SmallVector<NPUTensorDesc, N> log_sigmoid_backward_npu_input(
-    const SmallVector<Tensor, N>& inputTensor) {
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
-}
-
-SmallVector<NPUTensorDesc, N> log_sigmoid_backward_npu_output(
-    const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N> log_sigmoid_backward_npu_attr(const Tensor& self) {
-  SmallVector<NPUAttrDesc, N> attrs = {};
-  return attrs;
-}
-
-Tensor& log_sigmoid_backward_out_npu(
-    Tensor& grad_input,
-    const Tensor& grad_output,
-    const Tensor& self,
-    const Tensor& buffer) {
-  // constructs the input and output NPUTensorDesc
-  auto inputs = log_sigmoid_backward_npu_input({grad_output, self});
-  auto outputs = log_sigmoid_backward_npu_output({grad_input});
-
-  // constructs the attr of the NPUAttrDesc
-  auto attrs = log_sigmoid_backward_npu_attr(self);
-
-  // executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("LogSigmoidGrad", inputs, outputs, attrs);
-
-  return grad_input;
-}
-
-Tensor log_sigmoid_backward_npu(
-    const Tensor& grad_output,
-    const Tensor& self,
-    const Tensor& buffer) {
-  // calculate the output size
-  auto outputSize = log_sigmoid_backward_npu_output_size(grad_output);
-
-  // construct the output tensor of the NPU
-  Tensor grad_input = at::empty_with_format(
-      outputSize,
-      grad_output.options(),
-      CalcuOpUtil::get_tensor_npu_format(grad_output));
-
-  // calculate the output result of the NPU
-  log_sigmoid_backward_out_npu(grad_input, grad_output, self, buffer);
-
-  return grad_input;
-}
-
-} // namespace native
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/NpuUtils.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+IntArrayRef log_sigmoid_backward_npu_output_size(const Tensor& grad_output) {
+  return input_same_output_size(grad_output);
+}
+
+SmallVector<NPUTensorDesc, N> log_sigmoid_backward_npu_input(
+    const SmallVector<Tensor, N>& inputTensor) {
+  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
+}
+
+SmallVector<NPUTensorDesc, N> log_sigmoid_backward_npu_output(
+    const SmallVector<Tensor, N>& outputTensor) {
+  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
+}
+
+SmallVector<NPUAttrDesc, N> log_sigmoid_backward_npu_attr(const Tensor& self) {
+  SmallVector<NPUAttrDesc, N> attrs = {};
+  return attrs;
+}
+
+Tensor& log_sigmoid_backward_out_npu(
+    Tensor& grad_input,
+    const Tensor& grad_output,
+    const Tensor& self,
+    const Tensor& buffer) {
+  // constructs the input and output NPUTensorDesc
+  auto inputs = log_sigmoid_backward_npu_input({grad_output, self});
+  auto outputs = log_sigmoid_backward_npu_output({grad_input});
+
+  // constructs the attr of the NPUAttrDesc
+  auto attrs = log_sigmoid_backward_npu_attr(self);
+
+  // executing the NPU operator
+  CalcuOpUtil::execute_npu_operate("LogSigmoidGrad", inputs, outputs, attrs);
+
+  return grad_input;
+}
+
+Tensor log_sigmoid_backward_npu(
+    const Tensor& grad_output,
+    const Tensor& self,
+    const Tensor& buffer) {
+  // calculate the output size
+  auto outputSize = log_sigmoid_backward_npu_output_size(grad_output);
+
+  // construct the output tensor of the NPU
+  Tensor grad_input = at::empty_with_format(
+      outputSize,
+      grad_output.options(),
+      CalcuOpUtil::get_tensor_npu_format(grad_output));
+
+  // calculate the output result of the NPU
+  log_sigmoid_backward_out_npu(grad_input, grad_output, self, buffer);
+
+  return grad_input;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/LogicalAndKernelNpu.cpp b/src/aten/src/ATen/native/npu/LogicalAndKernelNpu.cpp
index a967b73e29a350be8588f4df3a8acf982666d72c..3c4d9209ad404fc83c20117b2580e5735cc5985f 100644
--- a/src/aten/src/ATen/native/npu/LogicalAndKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/LogicalAndKernelNpu.cpp
@@ -1,113 +1,113 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<NPUTensorDesc, N> logical_and_npu_input(
-    const Tensor& self,
-    const Tensor& other) {
-  bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self);
-  bool isOtherWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(other);
-  auto inputs = CalcuOpUtil::create_npu_input_tensor_desc({self, other});
-
-  // 't + 2' to work with any type of tensor, not just LongTensor (which is what
-  // integersin Python represent).
-  if (isSelfWrapped && (!isOtherWrapped)) {
-    inputs[0].scalarType = other.scalar_type();
-  } else if (isOtherWrapped && (!isSelfWrapped)) {
-    inputs[1].scalarType = self.scalar_type();
-  }
-
-  return inputs;
-}
-
-SmallVector<NPUTensorDesc, N> logical_and_npu_output(const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N> logical_and_npu_attr(const Tensor& self) {
-  SmallVector<NPUAttrDesc, N> attrs = {};
-  return attrs;
-}
-
-Tensor& logical_and_out_npu_nocheck(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& other) {
-
-  //constructs the input and output NPUTensorDesc
-  auto inputs = logical_and_npu_input(self, other);
-  auto outputs = logical_and_npu_output({result});
-  //constructs the attr of the NPUAttrDesc
-  auto attrs = logical_and_npu_attr(self);
-
-  //executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("LogicalAnd", inputs, outputs, attrs);
-
-  return result;
-}
-
-Tensor& logical_and_out_npu(Tensor& result, const Tensor& self, const Tensor& other) {
-  auto outputSize = broadcast_ops_npu_output_size(self, other);
-  OpPreparation::CheckOut(
-      {self},
-      result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      result.scalar_type(),
-      outputSize);
-
-  logical_and_out_npu_nocheck(result, self, other);
-
-  return result;
-}
-
-Tensor logical_and_npu(const Tensor& self, const Tensor& other) {
-  auto outputSize = broadcast_ops_npu_output_size(self, other);
-
-  Tensor result = at::empty_with_format(
-      outputSize,
-      self.options(), 
-      CalcuOpUtil::get_tensor_npu_format(self));
-
-  logical_and_out_npu_nocheck(result, self, other);
-
-  return result.toType(kBool);
-  //return result;
-}
-
-Tensor& logical_and_npu_(Tensor& self, const Tensor& other) {
-  SmallVector<Tensor, N> inputs = {self, other};
-  SmallVector<Tensor, N> outputs = {self};
-  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
-
-  if (!NpuUtils::check_match(&self)) {
-    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
-    Tensor result = logical_and_out_npu_nocheck(contiguousSelf, contiguousSelf, other);
-    NpuUtils::format_fresh_view(self, result);
-  } else {
-    logical_and_out_npu_nocheck(self, self, other);
-  }
-
-  return self;
-}
-
-} // namespace native
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/NpuUtils.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<NPUTensorDesc, N> logical_and_npu_input(
+    const Tensor& self,
+    const Tensor& other) {
+  bool isSelfWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(self);
+  bool isOtherWrapped = CalcuOpUtil::is_scalar_wrapped_to_tensor(other);
+  auto inputs = CalcuOpUtil::create_npu_input_tensor_desc({self, other});
+
+  // 't + 2' to work with any type of tensor, not just LongTensor (which is what
+  // integersin Python represent).
+  if (isSelfWrapped && (!isOtherWrapped)) {
+    inputs[0].scalarType = other.scalar_type();
+  } else if (isOtherWrapped && (!isSelfWrapped)) {
+    inputs[1].scalarType = self.scalar_type();
+  }
+
+  return inputs;
+}
+
+SmallVector<NPUTensorDesc, N> logical_and_npu_output(const SmallVector<Tensor, N>& outputTensor) {
+  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
+}
+
+SmallVector<NPUAttrDesc, N> logical_and_npu_attr(const Tensor& self) {
+  SmallVector<NPUAttrDesc, N> attrs = {};
+  return attrs;
+}
+
+Tensor& logical_and_out_npu_nocheck(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& other) {
+
+  //constructs the input and output NPUTensorDesc
+  auto inputs = logical_and_npu_input(self, other);
+  auto outputs = logical_and_npu_output({result});
+  //constructs the attr of the NPUAttrDesc
+  auto attrs = logical_and_npu_attr(self);
+
+  //executing the NPU operator
+  CalcuOpUtil::execute_npu_operate("LogicalAnd", inputs, outputs, attrs);
+
+  return result;
+}
+
+Tensor& logical_and_out_npu(Tensor& result, const Tensor& self, const Tensor& other) {
+  auto outputSize = broadcast_ops_npu_output_size(self, other);
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      CalcuOpUtil::get_tensor_npu_format(self),
+      result.scalar_type(),
+      outputSize);
+
+  logical_and_out_npu_nocheck(result, self, other);
+
+  return result;
+}
+
+Tensor logical_and_npu(const Tensor& self, const Tensor& other) {
+  auto outputSize = broadcast_ops_npu_output_size(self, other);
+
+  Tensor result = at::empty_with_format(
+      outputSize,
+      self.options(), 
+      CalcuOpUtil::get_tensor_npu_format(self));
+
+  logical_and_out_npu_nocheck(result, self, other);
+
+  return result.toType(kBool);
+  //return result;
+}
+
+Tensor& logical_and_npu_(Tensor& self, const Tensor& other) {
+  SmallVector<Tensor, N> inputs = {self, other};
+  SmallVector<Tensor, N> outputs = {self};
+  CalcuOpUtil::check_memory_over_laps(inputs, outputs);
+
+  if (!NpuUtils::check_match(&self)) {
+    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    Tensor result = logical_and_out_npu_nocheck(contiguousSelf, contiguousSelf, other);
+    NpuUtils::format_fresh_view(self, result);
+  } else {
+    logical_and_out_npu_nocheck(self, self, other);
+  }
+
+  return self;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/MaxV1BackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/MaxV1BackwardKernelNpu.cpp
index 5cc9e6445f343b50cadd26cbb620f1b4752f6d67..b73257b1a34f289a7352d07c2657e8c877ac8907 100644
--- a/src/aten/src/ATen/native/npu/MaxV1BackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MaxV1BackwardKernelNpu.cpp
@@ -1,36 +1,36 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor max_backward_npu(const Tensor& grad, int64_t dim, const Tensor& indices, IntArrayRef sizes, bool keepdim) {
-  Tensor new_grad = grad;
-  Tensor new_indices = indices;
-  if (keepdim && sizes.size() > 0) {
-    new_grad = grad.squeeze(dim);
-    new_indices = indices.squeeze(dim);
-  }
-  auto grad_input = at::zeros(sizes, new_grad.options()).npu_scatter(new_indices, new_grad, dim);
-  return grad_input;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor max_backward_npu(const Tensor& grad, int64_t dim, const Tensor& indices, IntArrayRef sizes, bool keepdim) {
+  Tensor new_grad = grad;
+  Tensor new_indices = indices;
+  if (keepdim && sizes.size() > 0) {
+    new_grad = grad.squeeze(dim);
+    new_indices = indices.squeeze(dim);
+  }
+  auto grad_input = at::zeros(sizes, new_grad.options()).npu_scatter(new_indices, new_grad, dim);
+  return grad_input;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp b/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp
index d05fc30726260ed549e35d9274ff5913593be062..7c54e4a2787ca7cdbf880db4280f60f9b8f7ef47 100644
--- a/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MaxV1KernelNpu.cpp
@@ -1,69 +1,69 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-tuple<Tensor&, Tensor&> max_v1_out_npu(
-    Tensor& output,
-    Tensor& indices,
-    const Tensor& self,
-    int64_t dim,
-    bool keepdim) {
-  OpCommand cmd;
-  cmd.Name("ArgMaxWithValue")
-      .Input(self)
-      .Output(indices)      
-      .Output(output)
-      .Attr("dimension", dim)
-      .Attr("keep_dims", keepdim)
-      .Run();
-  
-  return std::tie(output, indices);
-}
-
-tuple<Tensor, Tensor> max_v1_npu(const Tensor& self, int64_t dim, bool keepdim) {
-  SmallVector<int64_t, SIZE> dims = {dim};
-  SmallVector<int64_t, SIZE> outputSize =
-      reduce_ops_npu_output_size(self, dims, keepdim);
-  SmallVector<int64_t, SIZE> indicesSize =
-      reduce_ops_npu_output_size(self, dims, keepdim);
-  
-  int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(self);
-  if (outputSize.empty()) {
-    npu_format = ACL_FORMAT_NCHW;
-  }
-
-  Tensor outputs = at::empty_with_format(
-      outputSize, self.options(), npu_format);
-  Tensor indices = at::empty_with_format(
-      indicesSize, self.options().dtype(kInt), ACL_FORMAT_NCHW);
-  max_v1_out_npu(outputs, indices, self, dim, keepdim);
-
-  return std::tie(outputs, indices);
-}
-
-tuple<Tensor, Tensor> max_v1_npu(const Tensor& self, Dimname dim, bool keepdim) {
-  return max_v1_npu(self, dimname_to_position(self, dim), keepdim);
-}
-
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+tuple<Tensor&, Tensor&> max_v1_out_npu(
+    Tensor& output,
+    Tensor& indices,
+    const Tensor& self,
+    int64_t dim,
+    bool keepdim) {
+  OpCommand cmd;
+  cmd.Name("ArgMaxWithValue")
+      .Input(self)
+      .Output(indices)      
+      .Output(output)
+      .Attr("dimension", dim)
+      .Attr("keep_dims", keepdim)
+      .Run();
+  
+  return std::tie(output, indices);
+}
+
+tuple<Tensor, Tensor> max_v1_npu(const Tensor& self, int64_t dim, bool keepdim) {
+  SmallVector<int64_t, SIZE> dims = {dim};
+  SmallVector<int64_t, SIZE> outputSize =
+      reduce_ops_npu_output_size(self, dims, keepdim);
+  SmallVector<int64_t, SIZE> indicesSize =
+      reduce_ops_npu_output_size(self, dims, keepdim);
+  
+  int64_t npu_format = CalcuOpUtil::get_tensor_npu_format(self);
+  if (outputSize.empty()) {
+    npu_format = ACL_FORMAT_NCHW;
+  }
+
+  Tensor outputs = at::empty_with_format(
+      outputSize, self.options(), npu_format);
+  Tensor indices = at::empty_with_format(
+      indicesSize, self.options().dtype(kInt), ACL_FORMAT_NCHW);
+  max_v1_out_npu(outputs, indices, self, dim, keepdim);
+
+  return std::tie(outputs, indices);
+}
+
+tuple<Tensor, Tensor> max_v1_npu(const Tensor& self, Dimname dim, bool keepdim) {
+  return max_v1_npu(self, dimname_to_position(self, dim), keepdim);
+}
+
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp
index f45ae27e9b5f9eedb40a54b483f692d9cd6f6129..680ec911791c8185a888ba54864166592954fb9d 100644
--- a/src/aten/src/ATen/native/npu/MinKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MinKernelNpu.cpp
@@ -67,21 +67,32 @@ tuple<Tensor&, Tensor&> min_out_npu(
 }
 
 tuple<Tensor, Tensor> min_npu(const Tensor& self, int64_t dim, bool keepdim) {
+  Tensor selfCast = self;
+  if(self.dtype() == ScalarType::Bool){
+    selfCast = self.to(ScalarType::Float);
+  }
+
   SmallVector<int64_t, SIZE> dims = {dim};
-  auto outputSize = reduce_ops_npu_output_size(self, dims, keepdim);
+  auto outputSize = reduce_ops_npu_output_size(selfCast, dims, keepdim);
   SmallVector<int64_t, SIZE> indicesSize = outputSize;
 
-  auto func = [&self, dim, keepdim](Tensor outputs, Tensor indices) {
-    min_out_npu_nocheck(outputs, indices, self, dim, keepdim);
+  auto func = [&selfCast, dim, keepdim](Tensor outputs, Tensor indices) {
+    min_out_npu_nocheck(outputs, indices, selfCast, dim, keepdim);
   };
 
   Tensor outputs, indices;
   OpPipeWithDefinedMultiOut<Tensor, Tensor> pipe(outputs, indices);
-  return pipe.ApplyOutputWithSpecailParams<0>(outputSize, self.options(), ACL_FORMAT_ND)
-            .ApplyOutputWithSpecailParams<1>(indicesSize, self.options().dtype(ScalarType::Int), ACL_FORMAT_NCHW)
-            .Call(func)
-            .ReflushOutputDtype<1>(ScalarType::Long)
-            .Return<Tensor, Tensor>();
+  std::tie(outputs, indices) = pipe.ApplyOutputWithSpecailParams<0>(outputSize, selfCast.options(), ACL_FORMAT_ND)
+      .ApplyOutputWithSpecailParams<1>(indicesSize, selfCast.options().dtype(ScalarType::Int), ACL_FORMAT_NCHW)
+      .Call(func)
+      .ReflushOutputDtype<1>(ScalarType::Long)
+      .Return<Tensor, Tensor>();
+
+  if(self.dtype() == ScalarType::Bool){
+    outputs = outputs.to(ScalarType::Bool);
+  }
+
+  return std::tie(outputs, indices);
 }
 
 tuple<Tensor&, Tensor&> min_out_npu(
diff --git a/src/aten/src/ATen/native/npu/MishBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/MishBackwardKernelNpu.cpp
index ff5f154f36a364fa0e1bdb1879bbf7f598fde99a..39373555d1e568daa0f4d4ebd5f5ec354c4ee355 100644
--- a/src/aten/src/ATen/native/npu/MishBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MishBackwardKernelNpu.cpp
@@ -1,37 +1,37 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor mish_backward_npu(const Tensor& grad, const Tensor& input) {
-  Tensor result =  OpPreparation::ApplyTensor(input);
-
-  OpCommand cmd;
-  cmd.Name("MishGrad")
-      .Input(grad)
-      .Input(input)
-      .Output(result)
-      .Run();
-
-  return result;
-}
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor mish_backward_npu(const Tensor& grad, const Tensor& input) {
+  Tensor result =  OpPreparation::ApplyTensor(input);
+
+  OpCommand cmd;
+  cmd.Name("MishGrad")
+      .Input(grad)
+      .Input(input)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/MishKernelNpu.cpp b/src/aten/src/ATen/native/npu/MishKernelNpu.cpp
index 41cce69c017a0dc5196c7db38f99bc04f14ad42a..e623547ec941ebadabaf283e9b17b35061d0544b 100644
--- a/src/aten/src/ATen/native/npu/MishKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MishKernelNpu.cpp
@@ -1,36 +1,36 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor mish_npu(const Tensor& self) {
-  Tensor result =  OpPreparation::ApplyTensor(self);
-
-  OpCommand cmd;
-  cmd.Name("Mish")
-      .Input(self)
-      .Output(result)
-      .Run();
-
-  return result;
-}
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor mish_npu(const Tensor& self) {
+  Tensor result =  OpPreparation::ApplyTensor(self);
+
+  OpCommand cmd;
+  cmd.Name("Mish")
+      .Input(self)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/MmKernelNpu.cpp b/src/aten/src/ATen/native/npu/MmKernelNpu.cpp
index 91af42d2af1248cc22cdd88ce4f58dc578fe4e6a..28ab0aa98118511eec34cdd63b981685a87124e2 100644
--- a/src/aten/src/ATen/native/npu/MmKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MmKernelNpu.cpp
@@ -18,8 +18,6 @@
 #include "ATen/native/npu/utils/KernelNpuOutputSize.h"
 #include "ATen/native/npu/utils/NpuUtils.h"
 #include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/common/InnerNpuNativeFunction.h"
-#include "ATen/native/npu/frame/StorageDescHelper.h"
 
 namespace at {
 namespace native {
@@ -28,7 +26,7 @@ using namespace at::native::npu;
 // Flexible transpose judgement for view+transpose+Matmul, 
 // i.e., tensors with dim=2 and base_size_.size=3 can also be Matmul directly!
 bool is_transpose_last_two_dims_flex(const Tensor& tensor) {
-  if (tensor.dim() != 2) {
+  if (tensor.dim() < 2 || tensor.dim() > 3) {
     return false;
   }
   int64_t numel = 1;
@@ -115,17 +113,10 @@ Tensor mm_npu(const Tensor& self, const Tensor& mat2) {
   // Matmul cannot directly deal with view+transposed tensor with NZ format, so Transdata is necessary
   if (self.sizes().size() != self_desc.base_sizes_.size()) {
     selfFormatCast = OpPreparation::CastBackToOriFormat(self);
-    // refresh storage desc info [origin shape and storage shape] of reshaped Tensor
-    if (is_transpose_last_two_dims_flex(selfFormatCast)) {
-      StorageDescHelper::ReflushDescBySelf(selfFormatCast.transpose(-2, -1));
-    }
   }
   
   if (mat2.sizes().size() != mat2_desc.base_sizes_.size()) {
     mat2FormatCast = OpPreparation::CastBackToOriFormat(mat2);
-    if (is_transpose_last_two_dims_flex(mat2FormatCast)) {
-      StorageDescHelper::ReflushDescBySelf(mat2FormatCast.transpose(-2, -1));
-    }
   }
   
   // construct the output tensor of the NPU
diff --git a/src/aten/src/ATen/native/npu/MvKernelNpu.cpp b/src/aten/src/ATen/native/npu/MvKernelNpu.cpp
index 3a09b43910915830761955d4c3074d15b1965120..a2ded7dbdf7d2a427f9754768a35e51be7347005 100644
--- a/src/aten/src/ATen/native/npu/MvKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/MvKernelNpu.cpp
@@ -1,72 +1,72 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/common/InnerNpuNativeFunction.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& mv_out_npu_nocheck(Tensor& result, const Tensor& self, const Tensor& vec) {
-  bool isSelfT = CalcuOpUtil::is_transpose_last_two_dims(self);
-  Tensor contiguousSelf;
-  contiguousSelf = isSelfT ? self : NpuUtils::format_contiguous(self);
-  Tensor vecT = at::unsqueeze(vec, 1);
-
-  OpCommand cmd;
-  cmd.Name("MatMul")
-      .Input(contiguousSelf)
-      .Input(vecT)
-      .Attr("transpose_x1", isSelfT)
-      .Attr("transpose_x2", false)
-      .Output(result)
-      .Run();
-
-  result = at::squeeze(result, 1);
-  npu_fast_reshape_(result);
-  return result;
-}
-
-Tensor& mv_out_npu(Tensor& result, const Tensor& self, const Tensor& vec) {
-
-  OpPreparation::CheckOut(
-      {self},
-      result,
-      CalcuOpUtil::get_tensor_npu_format(self),
-      self.scalar_type(),
-      {self.size(0)});
-
-  result = at::unsqueeze(result, 1);
-  OpPipeWithDefinedOut pipe;
-  return pipe.CheckMemory({self, vec}, {result})
-      .Func([&self, &vec](Tensor& result){mv_out_npu_nocheck(result, self, vec);})
-      .Call(result);
-}
-
-Tensor mv_npu(const Tensor& self, const Tensor& vec) {
-
-  Tensor result = OpPreparation::ApplyTensor(self, {self.size(0), 1});
-
-  // calculate the output result of the NPU
-  mv_out_npu_nocheck(result, self, vec);
-
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/common/InnerNpuNativeFunction.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& mv_out_npu_nocheck(Tensor& result, const Tensor& self, const Tensor& vec) {
+  bool isSelfT = CalcuOpUtil::is_transpose_last_two_dims(self);
+  Tensor contiguousSelf;
+  contiguousSelf = isSelfT ? self : NpuUtils::format_contiguous(self);
+  Tensor vecT = at::unsqueeze(vec, 1);
+
+  OpCommand cmd;
+  cmd.Name("MatMul")
+      .Input(contiguousSelf)
+      .Input(vecT)
+      .Attr("transpose_x1", isSelfT)
+      .Attr("transpose_x2", false)
+      .Output(result)
+      .Run();
+
+  result = at::squeeze(result, 1);
+  npu_fast_reshape_(result);
+  return result;
+}
+
+Tensor& mv_out_npu(Tensor& result, const Tensor& self, const Tensor& vec) {
+
+  OpPreparation::CheckOut(
+      {self},
+      result,
+      CalcuOpUtil::get_tensor_npu_format(self),
+      self.scalar_type(),
+      {self.size(0)});
+
+  result = at::unsqueeze(result, 1);
+  OpPipeWithDefinedOut pipe;
+  return pipe.CheckMemory({self, vec}, {result})
+      .Func([&self, &vec](Tensor& result){mv_out_npu_nocheck(result, self, vec);})
+      .Call(result);
+}
+
+Tensor mv_npu(const Tensor& self, const Tensor& vec) {
+
+  Tensor result = OpPreparation::ApplyTensor(self, {self.size(0), 1});
+
+  // calculate the output result of the NPU
+  mv_out_npu_nocheck(result, self, vec);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/NmsV4KernelNpu.cpp b/src/aten/src/ATen/native/npu/NmsV4KernelNpu.cpp
index d3dea1461ba00e05b227f4a4ca071eb7743a5203..0ef5e8fd0d4bf9084c0bce904b41c47019fd6a74 100644
--- a/src/aten/src/ATen/native/npu/NmsV4KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/NmsV4KernelNpu.cpp
@@ -1,108 +1,108 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<NPUTensorDesc, N> nms_v4_npu_input(
-    const Tensor& self,
-    const Tensor& scores,
-    Scalar max_output_size,
-    const Tensor& iou_threshold,
-    const Tensor& scores_threshold) {
-  SmallVector<NPUTensorDesc, N> inputs;
-
-  Tensor max_output_size_tensor = at::empty_with_format(
-          {}, self.options().dtype(at::kInt), CalcuOpUtil::get_tensor_npu_format(self))
-          .fill_(max_output_size);
-  return CalcuOpUtil::create_npu_input_tensor_desc({self, scores, max_output_size_tensor, iou_threshold, scores_threshold});
-}
-
-SmallVector<NPUTensorDesc, N> nms_v4_npu_output(
-    const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N> nms_v4_npu_attr(bool pad_to_max_output_size) {
-  NPUAttrDesc npuAttrPadToMaxOutputSize =
-      NPUAttrDesc("pad_to_max_output_size", pad_to_max_output_size);
-
-  SmallVector<NPUAttrDesc, N> attrs = {npuAttrPadToMaxOutputSize};
-  return attrs;
-}
-
-tuple<Tensor, Tensor> nms_v4_out_npu(
-    Tensor& selected_indices,
-    Tensor& valid_outputs,
-    const Tensor& self,
-    const Tensor& scores,
-    Scalar max_output_size,
-    const Tensor& iou_threshold,
-    const Tensor& scores_threshold,
-    bool pad_to_max_output_size) {
-  // constructs the input and output NPUTensorDesc
-  auto inputs = nms_v4_npu_input(self, scores, max_output_size, iou_threshold, scores_threshold);
-  auto outputs = nms_v4_npu_output({selected_indices, valid_outputs});
-
-  // constructs the attr of the NPUAttrDesc
-  auto attrs = nms_v4_npu_attr(pad_to_max_output_size);
-
-  // executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("NonMaxSuppressionV4", inputs, outputs, attrs);
-
-  // return std::make_tuple(selected_indices, valid_outputs)
-  return std::tuple<Tensor, Tensor>(selected_indices, valid_outputs);
-}
-
-tuple<Tensor, Tensor> nms_v4_npu(
-    const Tensor& self,
-    const Tensor& scores,
-    Scalar max_output_size,
-    const Tensor& iou_threshold,
-    const Tensor& scores_threshold,
-    bool pad_to_max_output_size) {
-  // calculate the output size
-  auto outputSizes = nms_v4_npu_output_size(max_output_size);
-
-  // construct the output tensor of the NPU
-  Tensor selected_indices = at::empty_with_format(
-      std::get<0>(outputSizes),
-      self.options().dtype(at::kInt),
-      CalcuOpUtil::get_tensor_npu_format(self));
-
-  Tensor valid_outputs = at::empty_with_format(
-      std::get<1>(outputSizes),
-      self.options().dtype(at::kInt),
-      CalcuOpUtil::get_tensor_npu_format(self));
-
-  nms_v4_out_npu(
-    selected_indices,
-    valid_outputs,
-    self,
-    scores,
-    max_output_size,
-    iou_threshold,
-    scores_threshold,
-    pad_to_max_output_size);
-
-  return std::tuple<Tensor, Tensor>(selected_indices, valid_outputs);
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/NpuUtils.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<NPUTensorDesc, N> nms_v4_npu_input(
+    const Tensor& self,
+    const Tensor& scores,
+    Scalar max_output_size,
+    const Tensor& iou_threshold,
+    const Tensor& scores_threshold) {
+  SmallVector<NPUTensorDesc, N> inputs;
+
+  Tensor max_output_size_tensor = at::empty_with_format(
+          {}, self.options().dtype(at::kInt), CalcuOpUtil::get_tensor_npu_format(self))
+          .fill_(max_output_size);
+  return CalcuOpUtil::create_npu_input_tensor_desc({self, scores, max_output_size_tensor, iou_threshold, scores_threshold});
+}
+
+SmallVector<NPUTensorDesc, N> nms_v4_npu_output(
+    const SmallVector<Tensor, N>& outputTensor) {
+  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
+}
+
+SmallVector<NPUAttrDesc, N> nms_v4_npu_attr(bool pad_to_max_output_size) {
+  NPUAttrDesc npuAttrPadToMaxOutputSize =
+      NPUAttrDesc("pad_to_max_output_size", pad_to_max_output_size);
+
+  SmallVector<NPUAttrDesc, N> attrs = {npuAttrPadToMaxOutputSize};
+  return attrs;
+}
+
+tuple<Tensor, Tensor> nms_v4_out_npu(
+    Tensor& selected_indices,
+    Tensor& valid_outputs,
+    const Tensor& self,
+    const Tensor& scores,
+    Scalar max_output_size,
+    const Tensor& iou_threshold,
+    const Tensor& scores_threshold,
+    bool pad_to_max_output_size) {
+  // constructs the input and output NPUTensorDesc
+  auto inputs = nms_v4_npu_input(self, scores, max_output_size, iou_threshold, scores_threshold);
+  auto outputs = nms_v4_npu_output({selected_indices, valid_outputs});
+
+  // constructs the attr of the NPUAttrDesc
+  auto attrs = nms_v4_npu_attr(pad_to_max_output_size);
+
+  // executing the NPU operator
+  CalcuOpUtil::execute_npu_operate("NonMaxSuppressionV4", inputs, outputs, attrs);
+
+  // return std::make_tuple(selected_indices, valid_outputs)
+  return std::tuple<Tensor, Tensor>(selected_indices, valid_outputs);
+}
+
+tuple<Tensor, Tensor> nms_v4_npu(
+    const Tensor& self,
+    const Tensor& scores,
+    Scalar max_output_size,
+    const Tensor& iou_threshold,
+    const Tensor& scores_threshold,
+    bool pad_to_max_output_size) {
+  // calculate the output size
+  auto outputSizes = nms_v4_npu_output_size(max_output_size);
+
+  // construct the output tensor of the NPU
+  Tensor selected_indices = at::empty_with_format(
+      std::get<0>(outputSizes),
+      self.options().dtype(at::kInt),
+      CalcuOpUtil::get_tensor_npu_format(self));
+
+  Tensor valid_outputs = at::empty_with_format(
+      std::get<1>(outputSizes),
+      self.options().dtype(at::kInt),
+      CalcuOpUtil::get_tensor_npu_format(self));
+
+  nms_v4_out_npu(
+    selected_indices,
+    valid_outputs,
+    self,
+    scores,
+    max_output_size,
+    iou_threshold,
+    scores_threshold,
+    pad_to_max_output_size);
+
+  return std::tuple<Tensor, Tensor>(selected_indices, valid_outputs);
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/NormKernelNpu.cpp b/src/aten/src/ATen/native/npu/NormKernelNpu.cpp
index 8308e4763aa4923b358d9ac4f23594b85cc3a159..2f25260240785bb679773e9bfd9cb76b04d38d4f 100644
--- a/src/aten/src/ATen/native/npu/NormKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/NormKernelNpu.cpp
@@ -38,7 +38,7 @@ int64_t calculate_p(optional<Scalar> p) {
 
 
 // norm.dtype_out
-Tensor& norm_out_npu(
+Tensor& norm_out_npu_nocheck(
     Tensor& out,
     const Tensor& self,
     optional<Scalar> p,
@@ -80,11 +80,36 @@ Tensor& norm_out_npu(
     optional<Scalar> p,
     IntArrayRef dim,
     bool keepdim) {
-  norm_out_npu(out, self, p, dim, keepdim, self.scalar_type());
+  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
+  OpPreparation::CheckOut(
+    {self}, 
+    out, 
+    ACL_FORMAT_ND, 
+    self.scalar_type(), 
+    outputSize); 
+  norm_out_npu_nocheck(out, self, p, dim, keepdim, self.scalar_type());
 
   return out;
 }
 
+Tensor& norm_out_npu(
+    Tensor& out,
+    const Tensor& self,
+    optional<Scalar> p,
+    IntArrayRef dim,
+    bool keepdim,
+    ScalarType dtype) {
+  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
+  OpPreparation::CheckOut(
+    {self}, 
+    out, 
+    ACL_FORMAT_ND, 
+    self.scalar_type(), 
+    outputSize); 
+  norm_out_npu_nocheck(out, self, p, dim, keepdim, dtype);
+
+  return out;
+}
 // norm.ScalarOpt_dim_dtype
 Tensor norm_npu(
     const Tensor& self,
@@ -99,7 +124,7 @@ Tensor norm_npu(
   Tensor out = OpPreparation::ApplyTensorWithSizes(outputSize, self.options().dtype(dtype));
 
   // calculate the output result of the NPU
-  norm_out_npu(out, self, p, dim, keepdim, dtype);
+  norm_out_npu_nocheck(out, self, p, dim, keepdim, dtype);
   
   return out;
 }
diff --git a/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp b/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp
index d92440dec002580786506999774f36fdfb2464aa..8030418ef0d429581493ba48d7ffa27043c00b39 100644
--- a/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/OneHotKernelNpu.cpp
@@ -1,80 +1,80 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor one_hot_npu1(const Tensor& self, int64_t num_classes) {
-  Scalar on_value = 1;
-  Scalar off_value = 0;
-  int64_t axis = -1;
-  int64_t depth;
-
-  auto self_temp = self.to(at::kFloat);
-
-  //When run in NPU,the input tensor's dim must be smaller than 8.
-  TORCH_CHECK(
-      self_temp.dim() < 8, "NPU error,can not support the input tensor's dim bigger than 7.");
-
-  // empty tensor could be converted to one hot representation,
-  // but shape inference is not possible.
-  if (self.numel() == 0) {
-    if (num_classes <= 0) {
-      AT_ERROR("Can not infer total number of classes from empty tensor.");
-    } else {
-      depth = num_classes;
-    }
-  }
-
-  // non-empty tensor
-  TORCH_CHECK(
-      self_temp.min().item().toLong() >= 0, "Class values must be non-negative.");
-  if (num_classes == -1) {
-    depth = self_temp.max().item().toLong() + 1;
-  } else {
-    TORCH_CHECK(
-        num_classes > self_temp.max().item().toLong(),
-        "Class values must be smaller than num_classes.");
-    depth = num_classes;
-  }
-
-  // calculate output size
-  auto outputSize = array_to_small_vector(self.sizes());
-  outputSize.emplace_back(depth);
-
-  Tensor result = OpPreparation::ApplyTensor(
-      outputSize,
-      self.options().dtype(ScalarType::Int),
-      self);
-
-  SmallVector<int64_t, N> depthList = {depth};
-  
-  OpCommand cmd;
-  cmd.Name("OneHot")
-      .Input(self)
-      .Input(depthList, at::kInt)
-      .Input(on_value, ScalarType::Int)
-      .Input(off_value, ScalarType::Int)
-      .Output(result)
-      .Attr("axis", axis)
-      .Run();
-  
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor one_hot_npu1(const Tensor& self, int64_t num_classes) {
+  Scalar on_value = 1;
+  Scalar off_value = 0;
+  int64_t axis = -1;
+  int64_t depth;
+
+  auto self_temp = self.to(at::kFloat);
+
+  //When run in NPU,the input tensor's dim must be smaller than 8.
+  TORCH_CHECK(
+      self_temp.dim() < 8, "NPU error,can not support the input tensor's dim bigger than 7.");
+
+  // empty tensor could be converted to one hot representation,
+  // but shape inference is not possible.
+  if (self.numel() == 0) {
+    if (num_classes <= 0) {
+      AT_ERROR("Can not infer total number of classes from empty tensor.");
+    } else {
+      depth = num_classes;
+    }
+  }
+
+  // non-empty tensor
+  TORCH_CHECK(
+      self_temp.min().item().toLong() >= 0, "Class values must be non-negative.");
+  if (num_classes == -1) {
+    depth = self_temp.max().item().toLong() + 1;
+  } else {
+    TORCH_CHECK(
+        num_classes > self_temp.max().item().toLong(),
+        "Class values must be smaller than num_classes.");
+    depth = num_classes;
+  }
+
+  // calculate output size
+  auto outputSize = array_to_small_vector(self.sizes());
+  outputSize.emplace_back(depth);
+
+  Tensor result = OpPreparation::ApplyTensor(
+      outputSize,
+      self.options().dtype(ScalarType::Int),
+      self);
+
+  SmallVector<int64_t, N> depthList = {depth};
+  
+  OpCommand cmd;
+  cmd.Name("OneHot")
+      .Input(self)
+      .Input(depthList, at::kInt)
+      .Input(on_value, ScalarType::Int)
+      .Input(off_value, ScalarType::Int)
+      .Output(result)
+      .Attr("axis", axis)
+      .Run();
+  
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/PadKernelNpu.cpp b/src/aten/src/ATen/native/npu/PadKernelNpu.cpp
index d2d338f12527d66c3a3dd2d6eb413621981dbee3..0aeda8ced95bb8778854d2989848502e8de534e7 100644
--- a/src/aten/src/ATen/native/npu/PadKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/PadKernelNpu.cpp
@@ -1,47 +1,47 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& pad_out_npu(
-    Tensor& output,
-    const Tensor& input,
-    IntArrayRef paddings) {
-  SmallVector<int64_t, N> paddingsVector = array_to_small_vector(paddings);
-  paddingsVector.resize(2 * input.dim(), 0);
-
-  OpCommand cmd;
-  cmd.Name("Pad")
-      .Input(input)
-      .Input(paddingsVector)
-      .Output(output)
-      .Run();
-  return output;
-}
-
-Tensor pad_npu(const Tensor& input, IntArrayRef paddings) {
-  auto outputSize = pad_npu_output_size(input, paddings);
-  Tensor output = OpPreparation::ApplyTensor(input, outputSize);
-  pad_out_npu(output, input, paddings);
-  return output;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& pad_out_npu(
+    Tensor& output,
+    const Tensor& input,
+    IntArrayRef paddings) {
+  SmallVector<int64_t, N> paddingsVector = array_to_small_vector(paddings);
+  paddingsVector.resize(2 * input.dim(), 0);
+
+  OpCommand cmd;
+  cmd.Name("Pad")
+      .Input(input)
+      .Input(paddingsVector)
+      .Output(output)
+      .Run();
+  return output;
+}
+
+Tensor pad_npu(const Tensor& input, IntArrayRef paddings) {
+  auto outputSize = pad_npu_output_size(input, paddings);
+  Tensor output = OpPreparation::ApplyTensor(input, outputSize);
+  pad_out_npu(output, input, paddings);
+  return output;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/PdistKernelNpu.cpp b/src/aten/src/ATen/native/npu/PdistKernelNpu.cpp
index 5063ed5b35fe75cd2ede349f7d2b19ba36fe3ddd..c6dd231d07c2483f2f5585870f161acb3a8893bf 100644
--- a/src/aten/src/ATen/native/npu/PdistKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/PdistKernelNpu.cpp
@@ -1,95 +1,95 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<NPUTensorDesc, N> pdist_npu_input(
-    const SmallVector<Tensor, N>& inputTensor){
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
-}
-
-SmallVector<NPUTensorDesc, N> pdist_npu_output(
-    const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N>  pdist_npu_attr(float p_value) {
-  NPUAttrDesc P = NPUAttrDesc("p", p_value);
-  SmallVector<NPUAttrDesc, N> attrs = {P};
-  return attrs;
-}
-
-Tensor& pdist_out_npu(   
-    Tensor& result, 
-    const Tensor& self,
-    float p) {
-  // constructs the input and output NPUTensorDesc
-  auto inputs = pdist_npu_input({self});
-  auto outputs = pdist_npu_output({result});
-
-  // constructs the attr of the NPUAttrDesc
-  auto attrs = pdist_npu_attr(p);
-  
-  // executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("Pdist", inputs, outputs, attrs);
-  
-  return result;
-}
-
-Tensor pdist_npu(const Tensor& self, double p) {
-  TORCH_CHECK(self.dim() == 2,
-      "pdist only supports 2D tensors, got: ", self.dim(), "D");
-  TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes");
-  TORCH_CHECK(p >= 0, "pdist only supports non-negative p values");
-  return at::_pdist_forward(self, p);
-}
-
-Tensor _pdist_forward_npu(const Tensor& self, double p) {
-  Tensor result;
-  if (self.size(0) <= 1) {
-    result = at::empty_with_format(
-        {0},
-        self.options(),
-        CalcuOpUtil::get_tensor_npu_format(self));   
-  } else {
-    // double is not supported in NPU,  type of P needs to be converted from double to float.
-    float p_float;
-    if (std::isinf(p)) {
-      p_float = std::numeric_limits<float>::infinity();
-    } else {
-      TORCH_CHECK(p <= std::numeric_limits<float>::max(), "npu dose not support float64" );
-      p_float = (float) p;
-    }
-    auto outputSize =  pdist_npu_output_size(self, p_float);
-    result = at::empty_with_format(
-        outputSize,
-        self.options(),
-        CalcuOpUtil::get_tensor_npu_format(self));
-    if(self.size(1) == 0){
-      result.fill_(0);
-    } else {
-      pdist_out_npu(result, self, p_float);
-    }  
-  }
-  return result;
-}
-
-} // namespace native
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/NpuUtils.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<NPUTensorDesc, N> pdist_npu_input(
+    const SmallVector<Tensor, N>& inputTensor){
+  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
+}
+
+SmallVector<NPUTensorDesc, N> pdist_npu_output(
+    const SmallVector<Tensor, N>& outputTensor) {
+  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
+}
+
+SmallVector<NPUAttrDesc, N>  pdist_npu_attr(float p_value) {
+  NPUAttrDesc P = NPUAttrDesc("p", p_value);
+  SmallVector<NPUAttrDesc, N> attrs = {P};
+  return attrs;
+}
+
+Tensor& pdist_out_npu(   
+    Tensor& result, 
+    const Tensor& self,
+    float p) {
+  // constructs the input and output NPUTensorDesc
+  auto inputs = pdist_npu_input({self});
+  auto outputs = pdist_npu_output({result});
+
+  // constructs the attr of the NPUAttrDesc
+  auto attrs = pdist_npu_attr(p);
+  
+  // executing the NPU operator
+  CalcuOpUtil::execute_npu_operate("Pdist", inputs, outputs, attrs);
+  
+  return result;
+}
+
+Tensor pdist_npu(const Tensor& self, double p) {
+  TORCH_CHECK(self.dim() == 2,
+      "pdist only supports 2D tensors, got: ", self.dim(), "D");
+  TORCH_CHECK(at::isFloatingType(self.scalar_type()), "pdist only supports floating-point dtypes");
+  TORCH_CHECK(p >= 0, "pdist only supports non-negative p values");
+  return at::_pdist_forward(self, p);
+}
+
+Tensor _pdist_forward_npu(const Tensor& self, double p) {
+  Tensor result;
+  if (self.size(0) <= 1) {
+    result = at::empty_with_format(
+        {0},
+        self.options(),
+        CalcuOpUtil::get_tensor_npu_format(self));   
+  } else {
+    // double is not supported in NPU,  type of P needs to be converted from double to float.
+    float p_float;
+    if (std::isinf(p)) {
+      p_float = std::numeric_limits<float>::infinity();
+    } else {
+      TORCH_CHECK(p <= std::numeric_limits<float>::max(), "npu dose not support float64" );
+      p_float = (float) p;
+    }
+    auto outputSize =  pdist_npu_output_size(self, p_float);
+    result = at::empty_with_format(
+        outputSize,
+        self.options(),
+        CalcuOpUtil::get_tensor_npu_format(self));
+    if(self.size(1) == 0){
+      result.fill_(0);
+    } else {
+      pdist_out_npu(result, self, p_float);
+    }  
+  }
+  return result;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp b/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp
index 52b248ae36364df35ba901f8a936b037b7d6376f..41e3974f122a3c68ba861351b6b402204ecf59da 100644
--- a/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/PreluKernelNpu.cpp
@@ -1,39 +1,39 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor prelu_npu(const Tensor& self, const Tensor& weight_) {
-  auto input = self.contiguous();
-  auto weight = weight_.contiguous();
-
-  // calculate the output size
-  auto outputSize = input_same_output_size(self);
-  Tensor result = OpPreparation::ApplyTensor(input, outputSize);
-  
-  OpCommand cmd;
-  cmd.Name("PRelu")
-     .Input(self)
-     .Input(weight)
-     .Output(result)
-     .Run();
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor prelu_npu(const Tensor& self, const Tensor& weight_) {
+  auto input = self.contiguous();
+  auto weight = weight_.contiguous();
+
+  // calculate the output size
+  auto outputSize = input_same_output_size(self);
+  Tensor result = OpPreparation::ApplyTensor(input, outputSize);
+  
+  OpCommand cmd;
+  cmd.Name("PRelu")
+     .Input(self)
+     .Input(weight)
+     .Output(result)
+     .Run();
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp b/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp
index 2875adc7aff695009560413b3f9e5a38e7cfb3ea..f186d78b1e929f33f43a93aedf7838c0af756d71 100644
--- a/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/PtIouKernelNpu.cpp
@@ -1,45 +1,45 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor ptiou_npu(
-    const Tensor& bboxes,
-    const Tensor& gtboxes,
-    int64_t mode) {
-  auto outputSize = {gtboxes.size(0), bboxes.size(0)};
-  Tensor overlap = OpPreparation::ApplyTensor(bboxes, outputSize);
-  string modeStr = "iou";
-  if (mode == 1) {
-    modeStr = "iof";
-  }
-  OpCommand cmd;
-  cmd.Name("PtIou")
-      .Input(bboxes)
-      .Input(gtboxes)
-      .Output(overlap)
-      .Attr("mode", modeStr)
-      .Run();
-
-  return overlap;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor ptiou_npu(
+    const Tensor& bboxes,
+    const Tensor& gtboxes,
+    int64_t mode) {
+  auto outputSize = {gtboxes.size(0), bboxes.size(0)};
+  Tensor overlap = OpPreparation::ApplyTensor(bboxes, outputSize);
+  string modeStr = "iou";
+  if (mode == 1) {
+    modeStr = "iof";
+  }
+  OpCommand cmd;
+  cmd.Name("PtIou")
+      .Input(bboxes)
+      .Input(gtboxes)
+      .Output(overlap)
+      .Attr("mode", modeStr)
+      .Run();
+
+  return overlap;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/QrKernelNpu.cpp b/src/aten/src/ATen/native/npu/QrKernelNpu.cpp
index a1212802cff3b79915fd17c36ea4e7eac94ebfca..b94c6824e69f4989a7d1933ad95b2ce789c14d74 100644
--- a/src/aten/src/ATen/native/npu/QrKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/QrKernelNpu.cpp
@@ -1,102 +1,102 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-std::tuple<SmallVector<int64_t, N>, SmallVector<int64_t, N>> 
-qr_npu_output_size(
-  const Tensor& self,
-  bool some)
-{
-  int m = self.size(-2);
-  int n = self.size(-1);
-  auto k = std::min<int>(m, n);
-  auto shape = array_to_small_vector(self.sizes());
-  SmallVector<int64_t, N> Qsize(shape.begin(), shape.end()-2);
-  SmallVector<int64_t, N> Rsize(shape.begin(), shape.end()-2);
-  // allocate size
-  if(some){
-      Qsize.insert(Qsize.end(), {m, k});
-      Rsize.insert(Rsize.end(), {k, n});
-  } else {
-      Qsize.insert(Qsize.end(), {m, m});
-      Rsize.insert(Rsize.end(), {m, n});
-  }
-  return std::tie(Qsize, Rsize);
-}
-
-static inline void qr_check(
-    const Tensor& self){
-  TORCH_CHECK(
-      self.ndimension() >= 2,
-      "Expected nonempty least 2D tensor, but got a tensor with sizes ",
-      self.dim());
-}
-
-std::tuple<Tensor&, Tensor&> qr_out_npu_nocheck(
-    Tensor& Q,
-    Tensor& R,
-    const Tensor& self,
-    bool some){
-  bool full_matrices = !some;
-  OpCommand cmd;
-  cmd.Name("Qr")
-      .Input(self)
-      .Output(Q)
-      .Output(R)
-      .Attr("full_matrices", full_matrices)
-      .Run();
-  return std::tie(Q, R);
-}
-
-std::tuple<Tensor&, Tensor&> qr_out_npu(
-    Tensor& Q,
-    Tensor& R,
-    const Tensor& self,
-    bool some){
- qr_check(self);
- auto sizes = qr_npu_output_size(self, some);
- OpPreparation::CheckOut(
-     {self},
-     Q,
-     self,
-     std::get<0>(sizes));
-  OpPreparation::CheckOut(
-     {self},
-     R,
-     self,
-     std::get<1>(sizes));
-  return qr_out_npu_nocheck(Q, R, self, some);
-}
-
-std::tuple<Tensor, Tensor> qr_npu(
-    const Tensor& self,
-    bool some){
-  qr_check(self);
-  auto sizes = qr_npu_output_size(self, some);
-  Tensor Q = OpPreparation::ApplyTensor(self, std::get<0>(sizes));
-  Tensor R = OpPreparation::ApplyTensor(self, std::get<1>(sizes));
-
-  qr_out_npu(Q, R, self, some);
-  return std::tie(Q, R);
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+std::tuple<SmallVector<int64_t, N>, SmallVector<int64_t, N>> 
+qr_npu_output_size(
+  const Tensor& self,
+  bool some)
+{
+  int m = self.size(-2);
+  int n = self.size(-1);
+  auto k = std::min<int>(m, n);
+  auto shape = array_to_small_vector(self.sizes());
+  SmallVector<int64_t, N> Qsize(shape.begin(), shape.end()-2);
+  SmallVector<int64_t, N> Rsize(shape.begin(), shape.end()-2);
+  // allocate size
+  if(some){
+      Qsize.insert(Qsize.end(), {m, k});
+      Rsize.insert(Rsize.end(), {k, n});
+  } else {
+      Qsize.insert(Qsize.end(), {m, m});
+      Rsize.insert(Rsize.end(), {m, n});
+  }
+  return std::tie(Qsize, Rsize);
+}
+
+static inline void qr_check(
+    const Tensor& self){
+  TORCH_CHECK(
+      self.ndimension() >= 2,
+      "Expected nonempty least 2D tensor, but got a tensor with sizes ",
+      self.dim());
+}
+
+std::tuple<Tensor&, Tensor&> qr_out_npu_nocheck(
+    Tensor& Q,
+    Tensor& R,
+    const Tensor& self,
+    bool some){
+  bool full_matrices = !some;
+  OpCommand cmd;
+  cmd.Name("Qr")
+      .Input(self)
+      .Output(Q)
+      .Output(R)
+      .Attr("full_matrices", full_matrices)
+      .Run();
+  return std::tie(Q, R);
+}
+
+std::tuple<Tensor&, Tensor&> qr_out_npu(
+    Tensor& Q,
+    Tensor& R,
+    const Tensor& self,
+    bool some){
+ qr_check(self);
+ auto sizes = qr_npu_output_size(self, some);
+ OpPreparation::CheckOut(
+     {self},
+     Q,
+     self,
+     std::get<0>(sizes));
+  OpPreparation::CheckOut(
+     {self},
+     R,
+     self,
+     std::get<1>(sizes));
+  return qr_out_npu_nocheck(Q, R, self, some);
+}
+
+std::tuple<Tensor, Tensor> qr_npu(
+    const Tensor& self,
+    bool some){
+  qr_check(self);
+  auto sizes = qr_npu_output_size(self, some);
+  Tensor Q = OpPreparation::ApplyTensor(self, std::get<0>(sizes));
+  Tensor R = OpPreparation::ApplyTensor(self, std::get<1>(sizes));
+
+  qr_out_npu(Q, R, self, some);
+  return std::tie(Q, R);
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/ReflectionPad2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/ReflectionPad2dKernelNpu.cpp
deleted file mode 100644
index c3daebe725a216729aa2f7ed9f0114e0c13f74bd..0000000000000000000000000000000000000000
--- a/src/aten/src/ATen/native/npu/ReflectionPad2dKernelNpu.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-SmallVector<NPUTensorDesc, N> reflection_pad2d_npu_input(SmallVector<Tensor, N> inputs) {
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputs);
-}
-
-SmallVector<NPUTensorDesc, N> reflection_pad2d_npu_output(const SmallVector<Tensor, N> &outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N> reflection_pad2d_npu_attr(const Tensor& input, IntArrayRef paddingSize) {
-  int64_t pad_l = 0;
-  int64_t pad_r = 0;
-  int64_t pad_t = 0;
-  int64_t pad_b = 0;
-  int64_t pad_zeros = 0;
-
-  TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4");
-
-  pad_l = paddingSize[0];
-  pad_r = paddingSize[1];
-  pad_t = paddingSize[2];
-  pad_b = paddingSize[3];
-
-  SmallVector<int64_t, SIZE> vectorInt = {};
-  SmallVector<SmallVector<int64_t, SIZE>, SIZE> vectorVectorInt = {};
-  SmallVector<IntArrayRef, SIZE> vectorListInt = {};
-  SmallVector<int64_t, SIZE> paddingsVector = array_to_small_vector(paddingSize);
-  paddingsVector.resize(input.dim(), 0);
-
-  for (int i = 0; i < paddingsVector.size(); i ++) {
-    if (i<2) {
-      vectorInt.emplace_back(pad_zeros);
-      vectorInt.emplace_back(pad_zeros);
-    }
-    else if (i == 2) {
-       vectorInt.emplace_back(pad_t);
-       vectorInt.emplace_back(pad_b);
-    }
-    else {
-      vectorInt.emplace_back(pad_l);
-      vectorInt.emplace_back(pad_r);
-    }
-    vectorVectorInt.emplace_back(vectorInt);
-    vectorInt.clear();
-    vectorListInt.emplace_back(IntArrayRef(vectorVectorInt.back()));
-  }
-  int64_t constant_values = 0;
-  // string mode = "constant";
-  string mode = "reflect";
-  bool padding_contiguous = true;
-  NPUAttrDesc npuAttrConstantValues = NPUAttrDesc("constant_values", constant_values);
-  NPUAttrDesc npuAttrMode = NPUAttrDesc("mode", mode);
-  NPUAttrDesc npuAttrPaddingContiguous = NPUAttrDesc("padding_contiguous", padding_contiguous);
-  NPUAttrDesc npuAttrPadding = NPUAttrDesc("paddings", vectorListInt);
-  SmallVector<NPUAttrDesc, N> attrs = {
-      npuAttrPadding,
-      npuAttrConstantValues,
-      npuAttrMode,
-      npuAttrPaddingContiguous
-  };
-  return attrs;
-}
-
-Tensor& reflection_pad2d_out_npu_nocheck(Tensor& out, const Tensor& self, IntArrayRef padding) {
-  //constructs the input and output NPUTensorDesc
-  auto inputs = reflection_pad2d_npu_input({self});
-  auto outputs = reflection_pad2d_npu_output({out});
-
-  //constructs the attr of the NPUAttrDesc
-  auto attrs = reflection_pad2d_npu_attr(self, padding);
-
-  //executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("PadV3D", inputs, outputs, attrs);
-
-  return out;
-}
-
-Tensor& reflection_pad2d_out_npu(Tensor& result, const Tensor& self, IntArrayRef padding){
-  //calculate the output size
-  auto outputSize = reflection_pad2d_npu_output_size(self, padding);
-  //construct the output tensor of the NPU
-  result = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-  OpPreparation::CheckOut(
-  {self},
-  result,
-  CalcuOpUtil::get_tensor_npu_format(self),
-  self.scalar_type(),
-  outputSize);
-  reflection_pad2d_out_npu_nocheck(result, self, padding);
-
-  return result;
-}
-
-Tensor reflection_pad2d_npu(const Tensor& self, IntArrayRef padding) {
-  //calculate the output size
-  auto outputSize = reflection_pad2d_npu_output_size(self, padding);
-  //construct the output tensor of the NPU
-  Tensor out = at::empty_with_format(outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-  //calculate the output result of the NPU
-  reflection_pad2d_out_npu_nocheck(out, self, padding);
-
-  return out;
-}
-}
-} // namespace at::native
diff --git a/src/aten/src/ATen/native/npu/ReplicationPad2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/ReplicationPad2dKernelNpu.cpp
old mode 100644
new mode 100755
diff --git a/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp b/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp
index 3485b0608a1a9d1b591dcba3fc1466e5ed388410..9c653b13bd6f1e144e45e70508ea8ac069462cd9 100644
--- a/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/ScatterV1KernelNpu.cpp
@@ -1,50 +1,50 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& scatter_out_npu(
-    Tensor& output,
-    const Tensor& self,
-    const Tensor& indices,
-    const Tensor& updates,
-    int64_t dim) {
-  OpCommand cmd;
-  cmd.Name("ArgMaxGrad")
-      .Input(self)
-      .Input(indices)
-      .Input(updates)
-      .Output(output)
-      .Attr("dimension", dim)
-      .Run();
-  
-  return output;
-}
-
-Tensor scatter_npu(const Tensor& self, const Tensor& indices, const Tensor& updates, int64_t dim) {
-  Tensor outputs = OpPreparation::ApplyTensor(self);
-  scatter_out_npu(outputs, self, indices, updates, dim);
-
-  return outputs;
-}
-
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& scatter_out_npu(
+    Tensor& output,
+    const Tensor& self,
+    const Tensor& indices,
+    const Tensor& updates,
+    int64_t dim) {
+  OpCommand cmd;
+  cmd.Name("ArgMaxGrad")
+      .Input(self)
+      .Input(indices)
+      .Input(updates)
+      .Output(output)
+      .Attr("dimension", dim)
+      .Run();
+  
+  return output;
+}
+
+Tensor scatter_npu(const Tensor& self, const Tensor& indices, const Tensor& updates, int64_t dim) {
+  Tensor outputs = OpPreparation::ApplyTensor(self);
+  scatter_out_npu(outputs, self, indices, updates, dim);
+
+  return outputs;
+}
+
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp b/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp
index 4779f9418561b7339feeba7e8100bb9d41652e4e..276ef231dcbf61885e816c1e61250b7701f574d9 100644
--- a/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SliceKernelNpu.cpp
@@ -1,78 +1,78 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& slice_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    IntArrayRef offsets,
-    IntArrayRef size) {
-
-  SmallVector<int64_t, N> offsetVec = array_to_small_vector(offsets);
-  SmallVector<int64_t, N> sizeVec = array_to_small_vector(size);
-  
-  if (!c10::npu::OptionsManager::CheckDynamicEnable()) {
-    OpCommand cmd;
-    cmd.Name("Slice")
-        .Input(self)
-        .Input(offsetVec)
-        .Input(sizeVec)
-        .Output(result)
-        .Run();
-  } else {
-    SmallVector<int64_t, N> offsetsList = array_to_small_vector(offsets);
-    SmallVector<int64_t, N> sizeList = array_to_small_vector(size);
-    OpDynamicCommand cmd;
-    cmd.Name("SliceD")
-        .Input(self)
-        .Output(result)
-        .Attr("offsets", offsets)
-        .Attr("size", size);
-    Tensor offsetCpuTensor = from_blob((void*)offsetVec.data(), {offsetVec.size()}, at::kLong).to(at::kInt);
-    Tensor offsetNpuTensor = CalcuOpUtil::copy_tensor_host_to_device(offsetCpuTensor);
-    Tensor sizeCpuTensor = from_blob((void*)sizeVec.data(), {sizeVec.size()}, at::kLong);
-    Tensor sizeNpuTensor = CalcuOpUtil::copy_tensor_host_to_device(sizeCpuTensor);
-    cmd.DynamicName("Slice")
-        .DynamicInput(self)
-        .DynamicInput(offsetsList, at::kLong, at::kInt, "offsets", true, FIXED_CONST_VALUE)
-        .DynamicInput(sizeList, at::kLong, at::kInt, "size", true, FIXED_CONST_VALUE)
-        .DynamicOutput(result)
-        .DynamicOpRun();
-  }
-  return result;
-}
-
-Tensor slice_npu(const Tensor& self, IntArrayRef offsets, IntArrayRef size) {
-  // calculate the output size
-  SmallVector<int64_t, SIZE> outputSize = 
-      CalcuOpUtil::ConvertIntArrayRefToSmallVector(size);
-  // construct the output tensor of the NPU
-  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
-
-  // calculate the output result of the NPU
-  slice_out_npu(result, self, offsets, size);
-
-  return result;
-}
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& slice_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    IntArrayRef offsets,
+    IntArrayRef size) {
+
+  SmallVector<int64_t, N> offsetVec = array_to_small_vector(offsets);
+  SmallVector<int64_t, N> sizeVec = array_to_small_vector(size);
+  
+  if (!c10::npu::OptionsManager::CheckDynamicEnable()) {
+    OpCommand cmd;
+    cmd.Name("Slice")
+        .Input(self)
+        .Input(offsetVec)
+        .Input(sizeVec)
+        .Output(result)
+        .Run();
+  } else {
+    SmallVector<int64_t, N> offsetsList = array_to_small_vector(offsets);
+    SmallVector<int64_t, N> sizeList = array_to_small_vector(size);
+    OpDynamicCommand cmd;
+    cmd.Name("SliceD")
+        .Input(self)
+        .Output(result)
+        .Attr("offsets", offsets)
+        .Attr("size", size);
+    Tensor offsetCpuTensor = from_blob((void*)offsetVec.data(), {offsetVec.size()}, at::kLong).to(at::kInt);
+    Tensor offsetNpuTensor = CalcuOpUtil::copy_tensor_host_to_device(offsetCpuTensor);
+    Tensor sizeCpuTensor = from_blob((void*)sizeVec.data(), {sizeVec.size()}, at::kLong);
+    Tensor sizeNpuTensor = CalcuOpUtil::copy_tensor_host_to_device(sizeCpuTensor);
+    cmd.DynamicName("Slice")
+        .DynamicInput(self)
+        .DynamicInput(offsetsList, at::kLong, at::kInt, "offsets", true, FIXED_CONST_VALUE)
+        .DynamicInput(sizeList, at::kLong, at::kInt, "size", true, FIXED_CONST_VALUE)
+        .DynamicOutput(result)
+        .DynamicOpRun();
+  }
+  return result;
+}
+
+Tensor slice_npu(const Tensor& self, IntArrayRef offsets, IntArrayRef size) {
+  // calculate the output size
+  SmallVector<int64_t, SIZE> outputSize = 
+      CalcuOpUtil::ConvertIntArrayRefToSmallVector(size);
+  // construct the output tensor of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
+
+  // calculate the output result of the NPU
+  slice_out_npu(result, self, offsets, size);
+
+  return result;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/SlogdetKernelNpu.cpp b/src/aten/src/ATen/native/npu/SlogdetKernelNpu.cpp
index 8283cf0d52e7279b30e7c3b37e036985686efc0f..223bb3de9f9b8c69734413bb5b4a6b7b2b92cebd 100644
--- a/src/aten/src/ATen/native/npu/SlogdetKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SlogdetKernelNpu.cpp
@@ -1,56 +1,56 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-tuple<Tensor&, Tensor&> slogdet_out_npu(
-    Tensor& sign,
-    Tensor& y,
-    const Tensor& self) {
-  OpCommand cmd;
-  cmd.Name("LogMatrixDeterminant")
-      .Input(self)
-      .Output(sign)
-      .Output(y)
-      .Run();
-
-  return std::tie(sign, y);
-}
-
-tuple<Tensor, Tensor> slogdet_npu(const Tensor& self) {
-
-  TORCH_CHECK(self.dim() >= 2, "input must be at least 2 dimensions");
-
-  // calculate the output size
-  auto outputSize = array_to_small_vector(self.sizes());
-  outputSize.erase(outputSize.end() - 2, outputSize.end());
-
-  // construct the output tensor of the NPU
-  Tensor sign = OpPreparation::ApplyTensor(self, outputSize);
-  Tensor y = OpPreparation::ApplyTensor(self, outputSize);
-  
-  // calculate the output result of the NPU
-  slogdet_out_npu(sign, y, self);
-
-  return std::tie(sign, y);
-}
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+tuple<Tensor&, Tensor&> slogdet_out_npu(
+    Tensor& sign,
+    Tensor& y,
+    const Tensor& self) {
+  OpCommand cmd;
+  cmd.Name("LogMatrixDeterminant")
+      .Input(self)
+      .Output(sign)
+      .Output(y)
+      .Run();
+
+  return std::tie(sign, y);
+}
+
+tuple<Tensor, Tensor> slogdet_npu(const Tensor& self) {
+
+  TORCH_CHECK(self.dim() >= 2, "input must be at least 2 dimensions");
+
+  // calculate the output size
+  auto outputSize = array_to_small_vector(self.sizes());
+  outputSize.erase(outputSize.end() - 2, outputSize.end());
+
+  // construct the output tensor of the NPU
+  Tensor sign = OpPreparation::ApplyTensor(self, outputSize);
+  Tensor y = OpPreparation::ApplyTensor(self, outputSize);
+  
+  // calculate the output result of the NPU
+  slogdet_out_npu(sign, y, self);
+
+  return std::tie(sign, y);
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/SlowConvDilated2DKernelNpu.cpp b/src/aten/src/ATen/native/npu/SlowConvDilated2DKernelNpu.cpp
index 8c10534e9b561409ab43a4d0d43690c75b4b0b96..ca32f0180a2ebf1046985b966245d34035c09796 100644
--- a/src/aten/src/ATen/native/npu/SlowConvDilated2DKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SlowConvDilated2DKernelNpu.cpp
@@ -1,69 +1,69 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor slow_conv_dilated2d_npu(
-    const Tensor& self,
-    const Tensor& weight,
-    IntArrayRef kernel_size,
-    const Tensor& bias,
-    IntArrayRef stride,
-    IntArrayRef padding,
-    IntArrayRef dilation) {
-  
-  if (stride[0] == 0) {
-    AT_ERROR("slow_conv_dilated2d_npu_output_size: stride[0] can not be zero");
-  }
-  if (padding[0] < 0 || padding[1] < 0){
-    AT_ERROR("slow_conv_dilated2d_npu_output_size: padding can not be less than zero");
-  }
-  auto outputSize = slow_conv_dilated2d_npu_output_size(
-      self, weight, stride, padding, dilation);
-  // construct the output tensor of the NPU
-  Tensor result =
-      at::empty_with_format(outputSize, self.options(), ACL_FORMAT_NC1HWC0);
-  
-  int64_t groups = 1;
-  string dataFormat = "NCHW";
-  SmallVector<int64_t,N> stridesSize = {1,1,stride[0],stride[1]};
-  SmallVector<int64_t, N> paddings = {
-      padding[0], padding[0], padding[1], padding[1]};
-  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1]};
-
-  // calculate the output result of the NPU
-  OpCommand cmd;
-  cmd.Name("Conv2D")
-      .Input(self)
-      .Input(weight);
-  if (bias.defined()){
-     cmd.Input(bias);
-  } 
-  cmd.Output(result)
-      .Attr("strides", stridesSize)
-      .Attr("pads", paddings)
-      .Attr("dilations", dilations)
-      .Attr("groups",groups)
-      .Attr("data_format",dataFormat)
-      .Run();
-      
-  return result;
-}
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor slow_conv_dilated2d_npu(
+    const Tensor& self,
+    const Tensor& weight,
+    IntArrayRef kernel_size,
+    const Tensor& bias,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation) {
+  
+  if (stride[0] == 0) {
+    AT_ERROR("slow_conv_dilated2d_npu_output_size: stride[0] can not be zero");
+  }
+  if (padding[0] < 0 || padding[1] < 0){
+    AT_ERROR("slow_conv_dilated2d_npu_output_size: padding can not be less than zero");
+  }
+  auto outputSize = slow_conv_dilated2d_npu_output_size(
+      self, weight, stride, padding, dilation);
+  // construct the output tensor of the NPU
+  Tensor result =
+      at::empty_with_format(outputSize, self.options(), ACL_FORMAT_NC1HWC0);
+  
+  int64_t groups = 1;
+  string dataFormat = "NCHW";
+  SmallVector<int64_t,N> stridesSize = {1,1,stride[0],stride[1]};
+  SmallVector<int64_t, N> paddings = {
+      padding[0], padding[0], padding[1], padding[1]};
+  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1]};
+
+  // calculate the output result of the NPU
+  OpCommand cmd;
+  cmd.Name("Conv2D")
+      .Input(self)
+      .Input(weight);
+  if (bias.defined()){
+     cmd.Input(bias);
+  } 
+  cmd.Output(result)
+      .Attr("strides", stridesSize)
+      .Attr("pads", paddings)
+      .Attr("dilations", dilations)
+      .Attr("groups",groups)
+      .Attr("data_format",dataFormat)
+      .Run();
+      
+  return result;
+}
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/SlowConvTranspose2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/SlowConvTranspose2dKernelNpu.cpp
index 5e2d8af1e230cb1e82d6c0972fb6912d63e890d2..92a0f12a19a729cdd167dbe7d1d24ef36a9844f6 100644
--- a/src/aten/src/ATen/native/npu/SlowConvTranspose2dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SlowConvTranspose2dKernelNpu.cpp
@@ -1,230 +1,230 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<int64_t, SIZE> slow_conv_transpose2d_npu_output_size(
-    const Tensor & self, 
-    const Tensor & weight, 
-    IntArrayRef kernel_size, 
-    const Tensor & bias, 
-    IntArrayRef stride, 
-    IntArrayRef padding, 
-    IntArrayRef output_padding, 
-    IntArrayRef dilation) {
-  int ndim = self.dim();
-  int dimh = 1;
-  int dimw = 2;
-
-  if (ndim == 4) {
-    dimh++;
-    dimw++;
-  }
-
-  TORCH_CHECK(
-      self.numel() != 0 && (ndim == 3 || ndim == 4),
-      "non-empty 3D or 4D input tensor expected but got a tensor with size ",
-      self.sizes());
-  int64_t N = self.size(0);
-  int64_t Co = weight.size(1);
-  int64_t H = self.size(dimh);
-  int64_t W = self.size(dimw);
-  
-
-  int64_t Ho = (H - 1) * stride[0] - 2 * padding[0] +
-      dilation[0] * (kernel_size[0] - 1) + output_padding[0] + 1;
-  int64_t Wo = (W - 1) * stride[1] - 2 * padding[1] +
-      dilation[1] * (kernel_size[1] - 1) + output_padding[1] + 1;
-
-  SmallVector<int64_t, SIZE> outputSize = {N, Co, Ho, Wo};
-
-  return outputSize;
-}
-
-static inline void slow_conv_transpose2d_shape_check_npu(
-    const Tensor & self, 
-    const Tensor & weight, 
-    IntArrayRef kernel_size, 
-    const Tensor & bias, 
-    IntArrayRef stride, 
-    IntArrayRef padding, 
-    IntArrayRef output_padding, 
-    IntArrayRef dilation) {
-  TORCH_CHECK(
-      kernel_size[0] > 0 && kernel_size[1] > 0,
-      "kernel size should be greater than zero, but got kernel_height: ",
-      kernel_size[0],
-      " kernel_width: ",
-      kernel_size[1]);
-  TORCH_CHECK(
-      stride[0] > 0 && stride[1] > 0,
-      "stride should be greater than zero, but got stride_height: ",
-      stride[0],
-      " stride_width: ",
-      stride[1]);
-  TORCH_CHECK(
-      dilation[0] > 0 && dilation[1] > 0,
-      "dilation should be greater than zero, but got dilation_height: ",
-      dilation[0],
-      ", dilation_width: ",
-      dilation[1]);
-  TORCH_CHECK(
-      (output_padding[1] < stride[1] ||
-       output_padding[1] < dilation[1]) &&
-      (output_padding[0] < stride[0] ||
-       output_padding[0] < dilation[0]),
-      "output padding must be smaller than either stride or dilation, but got output_padding_height: ",
-      output_padding[0],
-      " output_padding_width: ",
-      output_padding[1],
-      " stride_height: ",
-      stride[0],
-      " stride_width: ",
-      stride[1],
-      " dilation_height: ",
-      dilation[0],
-      " dilation_width: ",
-      dilation[1]);
-
-  TORCH_CHECK(
-        weight.numel() != 0 && (weight.dim() == 2 || weight.dim() == 4),
-        "non-empty 2D or 4D weight tensor expected, but got: ",
-        weight.sizes());
-  if (bias.defined()) {
-      check_dim_size(bias, 1, 0, weight.size(1));
-  }
-  
-  TORCH_CHECK(
-      kernel_size.size() == 2,
-      "It is expected kernel_size equals to 2, but got size ",
-      kernel_size.size());
-
-  TORCH_CHECK(
-      dilation.size() == 2,
-      "It is expected dilation equals to 2, but got size ",
-      dilation.size());
-
-  TORCH_CHECK(
-      padding.size() == 2,
-      "It is expected padding equals to 2, but got size ",
-      padding.size());
-
-  TORCH_CHECK(
-      stride.size() == 2,
-      "It is expected stride equals to 2, but got size ",
-      stride.size());
-
-  TORCH_CHECK(
-      output_padding.size() == 2,
-      "It is expected stride equals to 2, but got size ",
-      output_padding.size());
-}
-
-Tensor& slow_conv_transpose2d_out_npu(
-    Tensor& out, 
-    const Tensor & self, 
-    const Tensor & weight, 
-    IntArrayRef kernel_size, 
-    const Tensor & bias, 
-    IntArrayRef stride, 
-    IntArrayRef padding, 
-    IntArrayRef output_padding, 
-    IntArrayRef dilation) {
-  slow_conv_transpose2d_shape_check_npu(
-    self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
-
-  auto outputSize = slow_conv_transpose2d_npu_output_size(
-      self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
-  if (!out.sizes().equals(outputSize)) {
-    out.resize_(outputSize);
-  }
-
-  SmallVector<int64_t, N> paddings = {
-      padding[0], padding[0], padding[1], padding[1]};
-  SmallVector<int64_t, N> stridesSize = {1, 1, stride[0], stride[1]};
-  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1]};
-  SmallVector<int64_t, N> outputpadding = {
-      output_padding[0], output_padding[0], output_padding[1], output_padding[1]};
-  string dataFormat = "NCHW";
-  int64_t groups = 1;
-  SmallVector<int64_t, N> sizeVec = array_to_small_vector(out.sizes());
-  if (!c10::npu::OptionsManager::CheckDynamicEnable()) {
-    OpCommand cmd;
-    cmd.Name("Conv2DTranspose")
-        .Input(sizeVec, at::kInt)
-        .Input(self)
-        .Input(weight);
-    if (bias.defined()){
-      cmd.Input(bias);
-    }
-    cmd.Output(out)
-        .Attr("pads", paddings)
-        .Attr("output_padding", outputpadding)
-        .Attr("strides", stridesSize)
-        .Attr("dilations", dilations)
-        .Attr("groups", groups)
-        .Attr("data_format", dataFormat)
-        .Run();
-  } else {
-    OpCommand cmd;
-    cmd.Name("Conv2DTransposeD")
-        .Input(self)
-        .Input(weight);
-    if (bias.defined()){
-      cmd.Input(bias);
-    }
-    cmd.Output(out)
-        .Attr("input_size", sizeVec)
-        .Attr("pads", paddings)
-        .Attr("output_padding", outputpadding)
-        .Attr("strides", stridesSize)
-        .Attr("dilations", dilations)
-        .Attr("groups", groups)
-        .Attr("data_format", dataFormat)
-        .Run();
-  }
-
-  return out;
-}
-
-Tensor slow_conv_transpose2d_npu(
-    const Tensor & self, 
-    const Tensor & weight, 
-    IntArrayRef kernel_size, 
-    const Tensor & bias, 
-    IntArrayRef stride, 
-    IntArrayRef padding, 
-    IntArrayRef output_padding, 
-    IntArrayRef dilation) {
-  // calculate the output size
-  auto outputSize = slow_conv_transpose2d_npu_output_size(
-      self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
-
-  // construct the output tensor of the NPU
-  Tensor result =
-      at::empty_with_format(outputSize, self.options(), ACL_FORMAT_NC1HWC0);
-
-  // calculate the output result of the NPU
-  slow_conv_transpose2d_out_npu(
-      result, self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
-
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<int64_t, SIZE> slow_conv_transpose2d_npu_output_size(
+    const Tensor & self, 
+    const Tensor & weight, 
+    IntArrayRef kernel_size, 
+    const Tensor & bias, 
+    IntArrayRef stride, 
+    IntArrayRef padding, 
+    IntArrayRef output_padding, 
+    IntArrayRef dilation) {
+  int ndim = self.dim();
+  int dimh = 1;
+  int dimw = 2;
+
+  if (ndim == 4) {
+    dimh++;
+    dimw++;
+  }
+
+  TORCH_CHECK(
+      self.numel() != 0 && (ndim == 3 || ndim == 4),
+      "non-empty 3D or 4D input tensor expected but got a tensor with size ",
+      self.sizes());
+  int64_t N = self.size(0);
+  int64_t Co = weight.size(1);
+  int64_t H = self.size(dimh);
+  int64_t W = self.size(dimw);
+  
+
+  int64_t Ho = (H - 1) * stride[0] - 2 * padding[0] +
+      dilation[0] * (kernel_size[0] - 1) + output_padding[0] + 1;
+  int64_t Wo = (W - 1) * stride[1] - 2 * padding[1] +
+      dilation[1] * (kernel_size[1] - 1) + output_padding[1] + 1;
+
+  SmallVector<int64_t, SIZE> outputSize = {N, Co, Ho, Wo};
+
+  return outputSize;
+}
+
+static inline void slow_conv_transpose2d_shape_check_npu(
+    const Tensor & self, 
+    const Tensor & weight, 
+    IntArrayRef kernel_size, 
+    const Tensor & bias, 
+    IntArrayRef stride, 
+    IntArrayRef padding, 
+    IntArrayRef output_padding, 
+    IntArrayRef dilation) {
+  TORCH_CHECK(
+      kernel_size[0] > 0 && kernel_size[1] > 0,
+      "kernel size should be greater than zero, but got kernel_height: ",
+      kernel_size[0],
+      " kernel_width: ",
+      kernel_size[1]);
+  TORCH_CHECK(
+      stride[0] > 0 && stride[1] > 0,
+      "stride should be greater than zero, but got stride_height: ",
+      stride[0],
+      " stride_width: ",
+      stride[1]);
+  TORCH_CHECK(
+      dilation[0] > 0 && dilation[1] > 0,
+      "dilation should be greater than zero, but got dilation_height: ",
+      dilation[0],
+      ", dilation_width: ",
+      dilation[1]);
+  TORCH_CHECK(
+      (output_padding[1] < stride[1] ||
+       output_padding[1] < dilation[1]) &&
+      (output_padding[0] < stride[0] ||
+       output_padding[0] < dilation[0]),
+      "output padding must be smaller than either stride or dilation, but got output_padding_height: ",
+      output_padding[0],
+      " output_padding_width: ",
+      output_padding[1],
+      " stride_height: ",
+      stride[0],
+      " stride_width: ",
+      stride[1],
+      " dilation_height: ",
+      dilation[0],
+      " dilation_width: ",
+      dilation[1]);
+
+  TORCH_CHECK(
+        weight.numel() != 0 && (weight.dim() == 2 || weight.dim() == 4),
+        "non-empty 2D or 4D weight tensor expected, but got: ",
+        weight.sizes());
+  if (bias.defined()) {
+      check_dim_size(bias, 1, 0, weight.size(1));
+  }
+  
+  TORCH_CHECK(
+      kernel_size.size() == 2,
+      "It is expected kernel_size equals to 2, but got size ",
+      kernel_size.size());
+
+  TORCH_CHECK(
+      dilation.size() == 2,
+      "It is expected dilation equals to 2, but got size ",
+      dilation.size());
+
+  TORCH_CHECK(
+      padding.size() == 2,
+      "It is expected padding equals to 2, but got size ",
+      padding.size());
+
+  TORCH_CHECK(
+      stride.size() == 2,
+      "It is expected stride equals to 2, but got size ",
+      stride.size());
+
+  TORCH_CHECK(
+      output_padding.size() == 2,
+      "It is expected stride equals to 2, but got size ",
+      output_padding.size());
+}
+
+Tensor& slow_conv_transpose2d_out_npu(
+    Tensor& out, 
+    const Tensor & self, 
+    const Tensor & weight, 
+    IntArrayRef kernel_size, 
+    const Tensor & bias, 
+    IntArrayRef stride, 
+    IntArrayRef padding, 
+    IntArrayRef output_padding, 
+    IntArrayRef dilation) {
+  slow_conv_transpose2d_shape_check_npu(
+    self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
+
+  auto outputSize = slow_conv_transpose2d_npu_output_size(
+      self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
+  if (!out.sizes().equals(outputSize)) {
+    out.resize_(outputSize);
+  }
+
+  SmallVector<int64_t, N> paddings = {
+      padding[0], padding[0], padding[1], padding[1]};
+  SmallVector<int64_t, N> stridesSize = {1, 1, stride[0], stride[1]};
+  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1]};
+  SmallVector<int64_t, N> outputpadding = {
+      output_padding[0], output_padding[0], output_padding[1], output_padding[1]};
+  string dataFormat = "NCHW";
+  int64_t groups = 1;
+  SmallVector<int64_t, N> sizeVec = array_to_small_vector(out.sizes());
+  if (!c10::npu::OptionsManager::CheckDynamicEnable()) {
+    OpCommand cmd;
+    cmd.Name("Conv2DTranspose")
+        .Input(sizeVec, at::kInt)
+        .Input(self)
+        .Input(weight);
+    if (bias.defined()){
+      cmd.Input(bias);
+    }
+    cmd.Output(out)
+        .Attr("pads", paddings)
+        .Attr("output_padding", outputpadding)
+        .Attr("strides", stridesSize)
+        .Attr("dilations", dilations)
+        .Attr("groups", groups)
+        .Attr("data_format", dataFormat)
+        .Run();
+  } else {
+    OpCommand cmd;
+    cmd.Name("Conv2DTransposeD")
+        .Input(self)
+        .Input(weight);
+    if (bias.defined()){
+      cmd.Input(bias);
+    }
+    cmd.Output(out)
+        .Attr("input_size", sizeVec)
+        .Attr("pads", paddings)
+        .Attr("output_padding", outputpadding)
+        .Attr("strides", stridesSize)
+        .Attr("dilations", dilations)
+        .Attr("groups", groups)
+        .Attr("data_format", dataFormat)
+        .Run();
+  }
+
+  return out;
+}
+
+Tensor slow_conv_transpose2d_npu(
+    const Tensor & self, 
+    const Tensor & weight, 
+    IntArrayRef kernel_size, 
+    const Tensor & bias, 
+    IntArrayRef stride, 
+    IntArrayRef padding, 
+    IntArrayRef output_padding, 
+    IntArrayRef dilation) {
+  // calculate the output size
+  auto outputSize = slow_conv_transpose2d_npu_output_size(
+      self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
+
+  // construct the output tensor of the NPU
+  Tensor result =
+      at::empty_with_format(outputSize, self.options(), ACL_FORMAT_NC1HWC0);
+
+  // calculate the output result of the NPU
+  slow_conv_transpose2d_out_npu(
+      result, self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp
index 60a6b4cbf43406d9562de8f3059ff6e1e1859308..bbf589375b03b94d72463ea5f4159956ef155b1f 100644
--- a/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SoftMarginLossBackwardKernelNpu.cpp
@@ -1,61 +1,61 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& soft_margin_loss_backward_out_npu(
-    Tensor& grad_input,
-    const Tensor& grad_output,
-    const Tensor& input,
-    const Tensor& target,
-    int64_t reduction) {
-  string reductionStr;
-  if (reduction == Reduction::None) {
-    reductionStr = "none";
-  } else if (reduction == Reduction::Mean) {
-    reductionStr = "mean";
-  } else if (reduction == Reduction::Sum) {
-    reductionStr = "sum";
-  }
-
-  // calculate the output result of the NPU
-  OpCommand cmd;
-  cmd.Name("SoftMarginLossGrad")
-      .Input(input)
-      .Input(target)
-      .Input(grad_output)
-      .Output(grad_input)
-      .Attr("reduction", reductionStr)
-      .Run();
-
-  return grad_input;
-}
-
-Tensor soft_margin_loss_backward_npu(
-    const Tensor& grad_output,
-    const Tensor& input,
-    const Tensor& target,
-    int64_t reduction) {
-  Tensor grad_input = OpPreparation::ApplyTensor(input);
-  soft_margin_loss_backward_out_npu(
-      grad_input, grad_output, input, target, reduction);
-  return grad_input;
-}
-
-} // namespace native
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& soft_margin_loss_backward_out_npu(
+    Tensor& grad_input,
+    const Tensor& grad_output,
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction) {
+  string reductionStr;
+  if (reduction == Reduction::None) {
+    reductionStr = "none";
+  } else if (reduction == Reduction::Mean) {
+    reductionStr = "mean";
+  } else if (reduction == Reduction::Sum) {
+    reductionStr = "sum";
+  }
+
+  // calculate the output result of the NPU
+  OpCommand cmd;
+  cmd.Name("SoftMarginLossGrad")
+      .Input(input)
+      .Input(target)
+      .Input(grad_output)
+      .Output(grad_input)
+      .Attr("reduction", reductionStr)
+      .Run();
+
+  return grad_input;
+}
+
+Tensor soft_margin_loss_backward_npu(
+    const Tensor& grad_output,
+    const Tensor& input,
+    const Tensor& target,
+    int64_t reduction) {
+  Tensor grad_input = OpPreparation::ApplyTensor(input);
+  soft_margin_loss_backward_out_npu(
+      grad_input, grad_output, input, target, reduction);
+  return grad_input;
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/SoftMarginLossKernelNpu.cpp b/src/aten/src/ATen/native/npu/SoftMarginLossKernelNpu.cpp
index 4247b5c3d1bfcb829857be7f5e5d451b9e3fb98f..28896488ad0669d900ad082756682717620ecf14 100644
--- a/src/aten/src/ATen/native/npu/SoftMarginLossKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/SoftMarginLossKernelNpu.cpp
@@ -1,87 +1,87 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/NpuUtils.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<NPUTensorDesc, N> soft_margin_loss_npu_input(
-    const SmallVector<Tensor, N>& inputTensor) {
-  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
-}
-
-SmallVector<NPUTensorDesc, N> soft_margin_loss_npu_output(
-    const SmallVector<Tensor, N>& outputTensor) {
-  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
-}
-
-SmallVector<NPUAttrDesc, N> soft_margin_loss_npu_attr(
-    int64_t reduction) {
-  string reductionStr;
-  if (reduction == Reduction::None) {
-    reductionStr = "none";
-  } else if (reduction == Reduction::Mean) {
-    reductionStr = "mean";
-  } else if (reduction == Reduction::Sum) {
-    reductionStr = "sum";
-  }
-
-  NPUAttrDesc npuAttrReduction = NPUAttrDesc("reduction", reductionStr);
-  SmallVector<NPUAttrDesc, N> attrs = {npuAttrReduction};
-  return attrs;
-}
-
-Tensor& soft_margin_loss_out_npu(Tensor& result, const Tensor& self, const Tensor& target, int64_t reduction) {
-// constructs the input and output NPUTensorDesc
-  Tensor target_broadcast = target;
-  if(target.sizes() != self.sizes()) {
-    target_broadcast = broadcast_npu(target, self.sizes());
-  }
-  auto inputs = soft_margin_loss_npu_input({self, target_broadcast});
-  auto outputs = soft_margin_loss_npu_output({result});
-
-// constructs the attr of the NPUAttrDesc
-  auto attrs = soft_margin_loss_npu_attr(reduction);
-
-// executing the NPU operator
-  CalcuOpUtil::execute_npu_operate("SoftMarginLoss", inputs, outputs, attrs);
-  return result;
-}
-
-Tensor soft_margin_loss_npu(const Tensor& self, const Tensor& target, int64_t reduction) {
-// calculate the output size
-  auto outputSize = soft_margin_loss_npu_output_size(
-      self,
-      target,
-      reduction);
-
-// construct the output tensor of the NPU
-  Tensor result = at::empty_with_format(
-    outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
-
-// calculate the output result of the NPU
-  soft_margin_loss_out_npu(result, self, target, reduction);
-  if (reduction == Reduction::None) {
-    return result;
-  } else {
-    return result.reshape({});
-  }
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/NpuUtils.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<NPUTensorDesc, N> soft_margin_loss_npu_input(
+    const SmallVector<Tensor, N>& inputTensor) {
+  return CalcuOpUtil::create_npu_input_tensor_desc(inputTensor);
+}
+
+SmallVector<NPUTensorDesc, N> soft_margin_loss_npu_output(
+    const SmallVector<Tensor, N>& outputTensor) {
+  return CalcuOpUtil::create_npu_output_tensor_desc(outputTensor);
+}
+
+SmallVector<NPUAttrDesc, N> soft_margin_loss_npu_attr(
+    int64_t reduction) {
+  string reductionStr;
+  if (reduction == Reduction::None) {
+    reductionStr = "none";
+  } else if (reduction == Reduction::Mean) {
+    reductionStr = "mean";
+  } else if (reduction == Reduction::Sum) {
+    reductionStr = "sum";
+  }
+
+  NPUAttrDesc npuAttrReduction = NPUAttrDesc("reduction", reductionStr);
+  SmallVector<NPUAttrDesc, N> attrs = {npuAttrReduction};
+  return attrs;
+}
+
+Tensor& soft_margin_loss_out_npu(Tensor& result, const Tensor& self, const Tensor& target, int64_t reduction) {
+// constructs the input and output NPUTensorDesc
+  Tensor target_broadcast = target;
+  if(target.sizes() != self.sizes()) {
+    target_broadcast = broadcast_npu(target, self.sizes());
+  }
+  auto inputs = soft_margin_loss_npu_input({self, target_broadcast});
+  auto outputs = soft_margin_loss_npu_output({result});
+
+// constructs the attr of the NPUAttrDesc
+  auto attrs = soft_margin_loss_npu_attr(reduction);
+
+// executing the NPU operator
+  CalcuOpUtil::execute_npu_operate("SoftMarginLoss", inputs, outputs, attrs);
+  return result;
+}
+
+Tensor soft_margin_loss_npu(const Tensor& self, const Tensor& target, int64_t reduction) {
+// calculate the output size
+  auto outputSize = soft_margin_loss_npu_output_size(
+      self,
+      target,
+      reduction);
+
+// construct the output tensor of the NPU
+  Tensor result = at::empty_with_format(
+    outputSize, self.options(), CalcuOpUtil::get_tensor_npu_format(self));
+
+// calculate the output result of the NPU
+  soft_margin_loss_out_npu(result, self, target, reduction);
+  if (reduction == Reduction::None) {
+    return result;
+  } else {
+    return result.reshape({});
+  }
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp
index 91b36d5d73f1b92e9d497f0ae1731b2426d3bfe1..b71d4a1c309aeef4aac5c04ad9aaf2b71b18e0df 100644
--- a/src/aten/src/ATen/native/npu/StdKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/StdKernelNpu.cpp
@@ -1,185 +1,185 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/utils/CalcuOpUtil.h"
-
-namespace at { 
-namespace native {
-using namespace at::native::npu;
-
-tuple<Tensor&, Tensor&> std_mean_out_npu_nocheck(
-    Tensor& resultStd, 
-    Tensor& resultMean, 
-    const Tensor& self, 
-    IntArrayRef dim, 
-    bool unbiased, 
-    bool keepdim) {
-  // executing the NPU operator 
-  OpCommand cmd1;
-  cmd1.Name("ReduceMeanD")
-      .Input(self)
-      .Output(resultMean)
-      .Attr("axes", dim)
-      .Attr("keep_dims", keepdim)
-      .Run();
-  Tensor resultMeanCopy = resultMean;
-  if (resultMean.dim() != 0 && keepdim == false) {
-    auto dimVector = array_to_small_vector(dim);
-    std::sort(dimVector.begin(), dimVector.end());
-    for (int64_t i = 0; i < dimVector.size(); i++) {
-      resultMeanCopy = resultMeanCopy.unsqueeze(dimVector[i]);
-    }
-  }
-  resultMeanCopy = resultMeanCopy.expand(self.sizes());
-  OpCommand cmd2;
-  cmd2.Name("ReduceStdWithMean")
-      .Input(self)
-      .Input(resultMeanCopy)
-      .Output(resultStd)
-      .Attr("dim", dim)
-      .Attr("unbiased", unbiased)
-      .Attr("keepdim", keepdim)
-      .Run();
-
-  return std::tie(resultStd, resultMean);
-}
-
-Tensor& std_out_npu(
-    Tensor& result, 
-    const Tensor& self, 
-    DimnameList dim, 
-    bool unbiased, 
-    bool keepdim) {
-  return std_out_npu(result, self, dimnames_to_positions(self, dim), unbiased, keepdim);
-}
-
-Tensor& std_out_npu(
-    Tensor& result, 
-    const Tensor& self, 
-    IntArrayRef dim, 
-    bool unbiased, 
-    bool keepdim) {
-  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
-  Tensor meanResult = OpPreparation::ApplyTensor(self, outputSize);
-
-  OpPreparation::CheckOut(
-      {self}, 
-      result, 
-      ACL_FORMAT_ND,
-      self.scalar_type(),
-      outputSize);
-
-  // executing the NPU operator
-  std_mean_out_npu_nocheck(result, meanResult, self, dim, unbiased, keepdim);
-
-  return result;
-}
-
-tuple<Tensor&, Tensor&> std_mean_out_npu(
-    Tensor& result1, 
-    Tensor& result2, 
-    const Tensor& self, 
-    IntArrayRef dim, 
-    bool unbiased, 
-    bool keepdim) {
-  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
-
-  OpPreparation::CheckOut(
-      {self}, 
-      result1, 
-      ACL_FORMAT_ND,
-      self.scalar_type(),
-      outputSize);
-  OpPreparation::CheckOut(
-      {self}, 
-      result2, 
-      ACL_FORMAT_ND,
-      self.scalar_type(),
-      outputSize);
-      
-  // executing the NPU operator
-  std_mean_out_npu_nocheck(result1, result2, self, dim, unbiased, keepdim);
-
-  return std::tie(result1, result2);
-}
-
-Tensor std_dim_npu(
-    const Tensor & self, 
-    IntArrayRef dim, 
-    bool unbiased, 
-    bool keepdim) {
-  // calculate the output size
-  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
-
-  // construct the output tensor of the NPU
-  Tensor result1 = OpPreparation::ApplyTensor(self, outputSize);
-  Tensor result2 = OpPreparation::ApplyTensor(self, outputSize);
-
-  // calculate the output result of the NPU
-  std_mean_out_npu(result1, result2, self, dim, unbiased, keepdim);
-  return result1;
-}
-
-Tensor std_npu(
-    const Tensor & self, 
-    bool unbiased) {
-  SmallVector<int64_t, SIZE> dims = CalcuOpUtil::get_dimlist_for_tensor(self);
-  return std_dim_npu(self, dims, unbiased, false);
-}
-
-tuple <Tensor, Tensor> std_mean_npu(
-    const Tensor & self, 
-    bool unbiased) {
-  SmallVector<int64_t, SIZE> dims = CalcuOpUtil::get_dimlist_for_tensor(self);
-  return std_mean_dim_npu(self, dims, unbiased, false);
-}
-
-tuple <Tensor, Tensor> std_mean_dim_npu(
-    const Tensor & self, 
-    IntArrayRef dim, 
-    bool unbiased, 
-    bool keepdim) {
-  // calculate the output size
-  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
-
-  // construct the output tensor of the NPU
-  Tensor result1 = OpPreparation::ApplyTensor(self, outputSize);
-  Tensor result2 = OpPreparation::ApplyTensor(self, outputSize);
-
-  // calculate the output result of the NPU
-  std_mean_out_npu(result1, result2, self, dim, unbiased, keepdim);
-  return std::tie(result1, result2);
-}
-
-tuple <Tensor, Tensor> std_mean_names_npu(
-    const Tensor & self, 
-    DimnameList dim, 
-    bool unbiased, 
-    bool keepdim) {
-  return std_mean_dim_npu(self, dimnames_to_positions(self, dim), unbiased, keepdim);
-}
-
-Tensor std_names_npu(
-    const Tensor & self, 
-    DimnameList dim, 
-    bool unbiased, 
-    bool keepdim) {
-  return std_dim_npu(self, dimnames_to_positions(self, dim), unbiased, keepdim);
-}
-
-} // namespace native
-} // namespace at::native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/CalcuOpUtil.h"
+
+namespace at { 
+namespace native {
+using namespace at::native::npu;
+
+tuple<Tensor&, Tensor&> std_mean_out_npu_nocheck(
+    Tensor& resultStd, 
+    Tensor& resultMean, 
+    const Tensor& self, 
+    IntArrayRef dim, 
+    bool unbiased, 
+    bool keepdim) {
+  // executing the NPU operator 
+  OpCommand cmd1;
+  cmd1.Name("ReduceMeanD")
+      .Input(self)
+      .Output(resultMean)
+      .Attr("axes", dim)
+      .Attr("keep_dims", keepdim)
+      .Run();
+  Tensor resultMeanCopy = resultMean;
+  if (resultMean.dim() != 0 && keepdim == false) {
+    auto dimVector = array_to_small_vector(dim);
+    std::sort(dimVector.begin(), dimVector.end());
+    for (int64_t i = 0; i < dimVector.size(); i++) {
+      resultMeanCopy = resultMeanCopy.unsqueeze(dimVector[i]);
+    }
+  }
+  resultMeanCopy = resultMeanCopy.expand(self.sizes());
+  OpCommand cmd2;
+  cmd2.Name("ReduceStdWithMean")
+      .Input(self)
+      .Input(resultMeanCopy)
+      .Output(resultStd)
+      .Attr("dim", dim)
+      .Attr("unbiased", unbiased)
+      .Attr("keepdim", keepdim)
+      .Run();
+
+  return std::tie(resultStd, resultMean);
+}
+
+Tensor& std_out_npu(
+    Tensor& result, 
+    const Tensor& self, 
+    DimnameList dim, 
+    bool unbiased, 
+    bool keepdim) {
+  return std_out_npu(result, self, dimnames_to_positions(self, dim), unbiased, keepdim);
+}
+
+Tensor& std_out_npu(
+    Tensor& result, 
+    const Tensor& self, 
+    IntArrayRef dim, 
+    bool unbiased, 
+    bool keepdim) {
+  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
+  Tensor meanResult = OpPreparation::ApplyTensor(self, outputSize);
+
+  OpPreparation::CheckOut(
+      {self}, 
+      result, 
+      ACL_FORMAT_ND,
+      self.scalar_type(),
+      outputSize);
+
+  // executing the NPU operator
+  std_mean_out_npu_nocheck(result, meanResult, self, dim, unbiased, keepdim);
+
+  return result;
+}
+
+tuple<Tensor&, Tensor&> std_mean_out_npu(
+    Tensor& result1, 
+    Tensor& result2, 
+    const Tensor& self, 
+    IntArrayRef dim, 
+    bool unbiased, 
+    bool keepdim) {
+  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
+
+  OpPreparation::CheckOut(
+      {self}, 
+      result1, 
+      ACL_FORMAT_ND,
+      self.scalar_type(),
+      outputSize);
+  OpPreparation::CheckOut(
+      {self}, 
+      result2, 
+      ACL_FORMAT_ND,
+      self.scalar_type(),
+      outputSize);
+      
+  // executing the NPU operator
+  std_mean_out_npu_nocheck(result1, result2, self, dim, unbiased, keepdim);
+
+  return std::tie(result1, result2);
+}
+
+Tensor std_dim_npu(
+    const Tensor & self, 
+    IntArrayRef dim, 
+    bool unbiased, 
+    bool keepdim) {
+  // calculate the output size
+  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
+
+  // construct the output tensor of the NPU
+  Tensor result1 = OpPreparation::ApplyTensor(self, outputSize);
+  Tensor result2 = OpPreparation::ApplyTensor(self, outputSize);
+
+  // calculate the output result of the NPU
+  std_mean_out_npu(result1, result2, self, dim, unbiased, keepdim);
+  return result1;
+}
+
+Tensor std_npu(
+    const Tensor & self, 
+    bool unbiased) {
+  SmallVector<int64_t, SIZE> dims = CalcuOpUtil::get_dimlist_for_tensor(self);
+  return std_dim_npu(self, dims, unbiased, false);
+}
+
+tuple <Tensor, Tensor> std_mean_npu(
+    const Tensor & self, 
+    bool unbiased) {
+  SmallVector<int64_t, SIZE> dims = CalcuOpUtil::get_dimlist_for_tensor(self);
+  return std_mean_dim_npu(self, dims, unbiased, false);
+}
+
+tuple <Tensor, Tensor> std_mean_dim_npu(
+    const Tensor & self, 
+    IntArrayRef dim, 
+    bool unbiased, 
+    bool keepdim) {
+  // calculate the output size
+  auto outputSize = reduce_ops_npu_output_size(self, dim, keepdim);
+
+  // construct the output tensor of the NPU
+  Tensor result1 = OpPreparation::ApplyTensor(self, outputSize);
+  Tensor result2 = OpPreparation::ApplyTensor(self, outputSize);
+
+  // calculate the output result of the NPU
+  std_mean_out_npu(result1, result2, self, dim, unbiased, keepdim);
+  return std::tie(result1, result2);
+}
+
+tuple <Tensor, Tensor> std_mean_names_npu(
+    const Tensor & self, 
+    DimnameList dim, 
+    bool unbiased, 
+    bool keepdim) {
+  return std_mean_dim_npu(self, dimnames_to_positions(self, dim), unbiased, keepdim);
+}
+
+Tensor std_names_npu(
+    const Tensor & self, 
+    DimnameList dim, 
+    bool unbiased, 
+    bool keepdim) {
+  return std_dim_npu(self, dimnames_to_positions(self, dim), unbiased, keepdim);
+}
+
+} // namespace native
+} // namespace at::native
diff --git a/src/aten/src/ATen/native/npu/TanKernelNpu.cpp b/src/aten/src/ATen/native/npu/TanKernelNpu.cpp
index 316263404daf412bb2abb82afc7ea7b66763ad30..a87735f0ef94f91259043aaadbf6abc647e5b038 100644
--- a/src/aten/src/ATen/native/npu/TanKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/TanKernelNpu.cpp
@@ -1,54 +1,54 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& tan_out_npu(Tensor& result, const Tensor& self) {
-  OpCommand cmd;
-  cmd.Name("Tan")
-      .Input(self)
-      .Output(result)
-      .Run();
-
-  return result;
-}
-
-Tensor tan_npu(const Tensor& self) {
-  Tensor result = OpPreparation::ApplyTensor(self);
-  tan_out_npu(result, self);
-  return result;
-}
-
-Tensor& tan_npu_(Tensor& self) {
-  OpPreparation::CheckMemory({self}, {self});
-  if (!NpuUtils::check_match(&self)) {
-    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
-    Tensor result = tan_out_npu(contiguousSelf, contiguousSelf);
-    NpuUtils::format_fresh_view(self, result);
-  } else {
-      tan_out_npu(self, self);
-    }
-
-  return self;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& tan_out_npu(Tensor& result, const Tensor& self) {
+  OpCommand cmd;
+  cmd.Name("Tan")
+      .Input(self)
+      .Output(result)
+      .Run();
+
+  return result;
+}
+
+Tensor tan_npu(const Tensor& self) {
+  Tensor result = OpPreparation::ApplyTensor(self);
+  tan_out_npu(result, self);
+  return result;
+}
+
+Tensor& tan_npu_(Tensor& self) {
+  OpPreparation::CheckMemory({self}, {self});
+  if (!NpuUtils::check_match(&self)) {
+    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    Tensor result = tan_out_npu(contiguousSelf, contiguousSelf);
+    NpuUtils::format_fresh_view(self, result);
+  } else {
+      tan_out_npu(self, self);
+    }
+
+  return self;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp b/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp
index 678854c4a391045571ecfa69175acda9d8ecdde1..35fda36df01bb4bb50978e7ab29e924ed0af9cbf 100644
--- a/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/TrilKernelNpu.cpp
@@ -1,63 +1,63 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace  at::native::npu;
-
-Tensor& tril_out_npu(Tensor& result, const Tensor& self, int64_t diagonal){
-  OpCommand cmd;
-  cmd.Name("Tril")
-      .Input(self)
-      .Output(result)
-      .Attr("diagonal", diagonal)
-      .Run();
-  return result;
-}
-
-Tensor tril_npu(const Tensor& self, int64_t diagonal){
-  auto selfCopy = self.npu_format_cast(ACL_FORMAT_NCHW);
-  auto is_last_two_dims = [&selfCopy](){
-      auto selfStorage = selfCopy.storage().get_npu_desc().storage_sizes_;
-      if (selfStorage.size() <= 1){
-          return false;
-      }
-      return true;
-  };
-  
-  TORCH_CHECK(is_last_two_dims(), "tril require tensor should be last two dims");
-  Tensor result = OpPreparation::ApplyTensor(selfCopy);
-  tril_out_npu(result, selfCopy, diagonal);
-  return result;
-}
-
-Tensor& tril_npu_(Tensor& self, int64_t diagonal){
-  OpPreparation::CheckMemory({self}, {self});  
-  self.npu_format_cast_(ACL_FORMAT_NCHW);
-  if(!NpuUtils::check_match(&self)){
-    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
-    tril_out_npu(contiguousSelf, contiguousSelf, diagonal);
-    NpuUtils::format_fresh_view(self, contiguousSelf);
-  } else {
-    tril_out_npu(self, self, diagonal);
-  }
-  return self;
-}
-
-} // native
-} // at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace  at::native::npu;
+
+Tensor& tril_out_npu(Tensor& result, const Tensor& self, int64_t diagonal){
+  OpCommand cmd;
+  cmd.Name("Tril")
+      .Input(self)
+      .Output(result)
+      .Attr("diagonal", diagonal)
+      .Run();
+  return result;
+}
+
+Tensor tril_npu(const Tensor& self, int64_t diagonal){
+  auto selfCopy = self.npu_format_cast(ACL_FORMAT_NCHW);
+  auto is_last_two_dims = [&selfCopy](){
+      auto selfStorage = selfCopy.storage().get_npu_desc().storage_sizes_;
+      if (selfStorage.size() <= 1){
+          return false;
+      }
+      return true;
+  };
+  
+  TORCH_CHECK(is_last_two_dims(), "tril require tensor should be last two dims");
+  Tensor result = OpPreparation::ApplyTensor(selfCopy);
+  tril_out_npu(result, selfCopy, diagonal);
+  return result;
+}
+
+Tensor& tril_npu_(Tensor& self, int64_t diagonal){
+  OpPreparation::CheckMemory({self}, {self});  
+  self.npu_format_cast_(ACL_FORMAT_NCHW);
+  if(!NpuUtils::check_match(&self)){
+    Tensor contiguousSelf = NpuUtils::format_contiguous(self);
+    tril_out_npu(contiguousSelf, contiguousSelf, diagonal);
+    NpuUtils::format_fresh_view(self, contiguousSelf);
+  } else {
+    tril_out_npu(self, self, diagonal);
+  }
+  return self;
+}
+
+} // native
+} // at
diff --git a/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp b/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp
index 944f92fd8295f24439392940b7e7bd8e8fcbaa7f..2ca0f1130263bf3ba3ec54aec87feeed4459c623 100644
--- a/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/UpsampleNearest1dKernelNpu.cpp
@@ -1,78 +1,78 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<int64_t, SIZE> upsample_nearest1d_npu_output_size(
-    const Tensor& input,
-    IntArrayRef output_size,
-    c10::optional<double> scales){
-  SmallVector<int64_t, SIZE> outputSize;
-  int64_t N = input.size(0);
-  int64_t C = input.size(1);
-  int64_t W;
-  if(output_size.size() != 0) {
-    W = output_size[0];
-  } else {
-    float temp_scales = (float)scales.value();
-    W = temp_scales * input.size(2);
-  }
-  outputSize = {N, C, W};
-  return outputSize;
-}
-
-Tensor& upsample_nearest1d_out_npu(
-    Tensor& result,
-    const Tensor& self,
-    IntArrayRef output_size,
-    c10::optional<double> scales) {
-
-  OpCommand cmd;
-  cmd.Name("UpsampleNearest1d")
-  
-      .Input(self)
-      .Output(result)
-      .Attr("output_size", output_size);
-      if (scales.has_value()) {
-        cmd.Attr("scales", static_cast<float>(scales.value()));
-      }
-      cmd.Run();
-
-  return result;
-}
-
-Tensor upsample_nearest1d_npu(
-    const Tensor& self,
-    IntArrayRef output_size,
-    c10::optional<double> scales) {
-  // calculate the output size
-  SmallVector<int64_t, SIZE> outputSize = upsample_nearest1d_npu_output_size(self, output_size, scales);
-
-  // construct the output tensor of the NPU
-  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
-
-  // calculate the output result of the NPU
-  upsample_nearest1d_out_npu(result, self, output_size, scales);
-
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<int64_t, SIZE> upsample_nearest1d_npu_output_size(
+    const Tensor& input,
+    IntArrayRef output_size,
+    c10::optional<double> scales){
+  SmallVector<int64_t, SIZE> outputSize;
+  int64_t N = input.size(0);
+  int64_t C = input.size(1);
+  int64_t W;
+  if(output_size.size() != 0) {
+    W = output_size[0];
+  } else {
+    float temp_scales = (float)scales.value();
+    W = temp_scales * input.size(2);
+  }
+  outputSize = {N, C, W};
+  return outputSize;
+}
+
+Tensor& upsample_nearest1d_out_npu(
+    Tensor& result,
+    const Tensor& self,
+    IntArrayRef output_size,
+    c10::optional<double> scales) {
+
+  OpCommand cmd;
+  cmd.Name("UpsampleNearest1d")
+  
+      .Input(self)
+      .Output(result)
+      .Attr("output_size", output_size);
+      if (scales.has_value()) {
+        cmd.Attr("scales", static_cast<float>(scales.value()));
+      }
+      cmd.Run();
+
+  return result;
+}
+
+Tensor upsample_nearest1d_npu(
+    const Tensor& self,
+    IntArrayRef output_size,
+    c10::optional<double> scales) {
+  // calculate the output size
+  SmallVector<int64_t, SIZE> outputSize = upsample_nearest1d_npu_output_size(self, output_size, scales);
+
+  // construct the output tensor of the NPU
+  Tensor result = OpPreparation::ApplyTensor(self, outputSize);
+
+  // calculate the output result of the NPU
+  upsample_nearest1d_out_npu(result, self, output_size, scales);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp b/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp
index 757bc2e258b143db8fa7b2928c10c4a8357c2feb..33d9e0cd815918a05d215c457ae6ca935498e08f 100644
--- a/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/_Unique2KernelNpu.cpp
@@ -1,96 +1,96 @@
-// Copyright (c) 2020, Huawei Technologies.All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> _unique2_out_npu(
-    Tensor& y,
-    Tensor& yOutputSize,
-    Tensor& yInverse,
-    Tensor& yCounts,
-    const Tensor& self,
-    bool sorted,
-    bool return_inverse,
-    bool return_counts) {
-  OpCommand cmd;
-  cmd.Name("UniqueWithCountsAndSorting")
-     .Input(self)
-     .Output(y)
-     .Output(yOutputSize)
-     .Output(yInverse)
-     .Output(yCounts)
-     .Attr("sorted", sorted)
-     .Attr("return_inverse", true)
-     .Attr("return_counts", true)
-     .Run();
-
-  return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>(y, yOutputSize, yInverse, yCounts);
-}
-
-tuple<Tensor, Tensor, Tensor> _unique2_npu(
-    const Tensor& self,
-    bool sorted,
-    bool return_inverse,
-    bool return_counts) {
-  if(self.numel() == 0){
-    Tensor result= OpPreparation::ApplyTensor(self, {0});
-    Tensor yInverse = OpPreparation::ApplyTensor({0}, self.options().dtype(kLong), self);
-    Tensor yCounts = OpPreparation::ApplyTensor({0}, self.options().dtype(kLong), self);
-    return std::tie(result, yInverse, yCounts);
-  }
-  
-  auto yInverseSize = input_same_output_size(self);
-  auto outputSizes = tuple<SmallVector<int64_t, SIZE>, SmallVector<int64_t, SIZE>, IntArrayRef>(
-    {self.numel()}, {1}, yInverseSize);
-
-  Tensor selfCopy = self;
-  if (self.scalar_type() == ScalarType::Half) {
-    selfCopy = self.to(ScalarType::Float);
-  }
- 
-  Tensor y = OpPreparation::ApplyTensor(selfCopy, std::get<0>(outputSizes));
-  Tensor yOutputSize = at::empty_with_format(std::get<1>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND);
-  Tensor yInverse = at::empty_with_format(std::get<2>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND);
-  Tensor yCounts = at::empty_with_format(std::get<0>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND);
-  
-  _unique2_out_npu(y, yOutputSize, yInverse, yCounts, selfCopy, sorted, return_inverse, return_counts);
-  
-  int64_t count = yOutputSize[0].item().toLong();
-  Tensor result = y.slice(0, 0, count, 1);
-  result = NpuUtils::format_contiguous(result);
-
-  if (self.scalar_type() == ScalarType::Half) {
-    result = result.to(ScalarType::Half);
-  }
-
-  if (return_counts) {
-    yCounts = yCounts.slice(0, 0, count, 1);
-    yCounts = NpuUtils::format_contiguous(yCounts);
-  } else {
-    yCounts = at::empty({0}, self.options().dtype(kLong));
-  }
-  
-  if (!(return_counts || return_inverse)) {
-    yInverse = at::empty({0}, self.options().dtype(kLong));
-  }
-  
-  return std::tuple<Tensor, Tensor, Tensor>(result, yInverse, yCounts);
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020, Huawei Technologies.All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+std::tuple<Tensor&, Tensor&, Tensor&, Tensor&> _unique2_out_npu(
+    Tensor& y,
+    Tensor& yOutputSize,
+    Tensor& yInverse,
+    Tensor& yCounts,
+    const Tensor& self,
+    bool sorted,
+    bool return_inverse,
+    bool return_counts) {
+  OpCommand cmd;
+  cmd.Name("UniqueWithCountsAndSorting")
+     .Input(self)
+     .Output(y)
+     .Output(yOutputSize)
+     .Output(yInverse)
+     .Output(yCounts)
+     .Attr("sorted", sorted)
+     .Attr("return_inverse", true)
+     .Attr("return_counts", true)
+     .Run();
+
+  return std::tuple<Tensor&, Tensor&, Tensor&, Tensor&>(y, yOutputSize, yInverse, yCounts);
+}
+
+tuple<Tensor, Tensor, Tensor> _unique2_npu(
+    const Tensor& self,
+    bool sorted,
+    bool return_inverse,
+    bool return_counts) {
+  if(self.numel() == 0){
+    Tensor result= OpPreparation::ApplyTensor(self, {0});
+    Tensor yInverse = OpPreparation::ApplyTensor({0}, self.options().dtype(kLong), self);
+    Tensor yCounts = OpPreparation::ApplyTensor({0}, self.options().dtype(kLong), self);
+    return std::tie(result, yInverse, yCounts);
+  }
+  
+  auto yInverseSize = input_same_output_size(self);
+  auto outputSizes = tuple<SmallVector<int64_t, SIZE>, SmallVector<int64_t, SIZE>, IntArrayRef>(
+    {self.numel()}, {1}, yInverseSize);
+
+  Tensor selfCopy = self;
+  if (self.scalar_type() == ScalarType::Half) {
+    selfCopy = self.to(ScalarType::Float);
+  }
+ 
+  Tensor y = OpPreparation::ApplyTensor(selfCopy, std::get<0>(outputSizes));
+  Tensor yOutputSize = at::empty_with_format(std::get<1>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND);
+  Tensor yInverse = at::empty_with_format(std::get<2>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND);
+  Tensor yCounts = at::empty_with_format(std::get<0>(outputSizes), self.options().dtype(kLong), ACL_FORMAT_ND);
+  
+  _unique2_out_npu(y, yOutputSize, yInverse, yCounts, selfCopy, sorted, return_inverse, return_counts);
+  
+  int64_t count = yOutputSize[0].item().toLong();
+  Tensor result = y.slice(0, 0, count, 1);
+  result = NpuUtils::format_contiguous(result);
+
+  if (self.scalar_type() == ScalarType::Half) {
+    result = result.to(ScalarType::Half);
+  }
+
+  if (return_counts) {
+    yCounts = yCounts.slice(0, 0, count, 1);
+    yCounts = NpuUtils::format_contiguous(yCounts);
+  } else {
+    yCounts = at::empty({0}, self.options().dtype(kLong));
+  }
+  
+  if (!(return_counts || return_inverse)) {
+    yInverse = at::empty({0}, self.options().dtype(kLong));
+  }
+  
+  return std::tuple<Tensor, Tensor, Tensor>(result, yInverse, yCounts);
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/common/FormatCastHelper.cpp b/src/aten/src/ATen/native/npu/common/FormatCastHelper.cpp
index 28909262503eb84f9cd6357f353f64b89df92e5c..bf72d425e5a4d981a20e0ea2eace74b3bb3d568b 100644
--- a/src/aten/src/ATen/native/npu/common/FormatCastHelper.cpp
+++ b/src/aten/src/ATen/native/npu/common/FormatCastHelper.cpp
@@ -36,7 +36,9 @@ void FormatCastHelper::format_cast_as_base_format(const Tensor& src, aclFormat f
   AT_ASSERT(FormatHelper::IsBaseFormatType(src), "src format must be base format");
   
   auto& src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-  src_desc.storage_sizes_ = FormatHelper::GetSizeOfBaseFormat(src, format);
+  // due to CANN principle : if the ori format of a tensor is the
+  // same as the npu format, then its base shape must be same as storage shape
+  // so we should not change the storage shape when format cast between base format
   src_desc.origin_format_ = format;
   src_desc.npu_format_ = format;
   return;
diff --git a/src/aten/src/ATen/native/npu/convolution/Conv3dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/convolution/Conv3dBackwardKernelNpu.cpp
index 9d2970bd36c55913115a97d187f5153c8e5c71c5..0d2c525df5ccdeaed479ce874f62e777ab63a009 100644
--- a/src/aten/src/ATen/native/npu/convolution/Conv3dBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/convolution/Conv3dBackwardKernelNpu.cpp
@@ -1,163 +1,163 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-// 
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// 
-// https://opensource.org/licenses/BSD-3-Clause
-// 
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor conv3d_backward_inputmask(Tensor &gradInput, const Tensor &input,
-                                     const Tensor &grad, const Tensor &weight,
-                                     IntArrayRef stride, IntArrayRef padding,
-                                     IntArrayRef dilation, int64_t groups) {
-  SmallVector<int64_t, N> stridesSize = {1, 1, stride[0], stride[1], stride[2]};
-  SmallVector<int64_t, N> paddings = {padding[0], padding[0], padding[1],
-                                      padding[1], padding[2], padding[2]};
-  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1], dilation[2]};
-  IntArrayRef inputSize = input.sizes();
-  Tensor weightCast = weight.to(grad.dtype());
-  
-  OpCommand cmd;
-  if (!c10::npu::OptionsManager::CheckDynamicEnable()) {
-    cmd.Name("Conv3DBackpropInput")
-      .Input(inputSize, at::kInt)
-      .Input(weightCast)
-      .Input(grad)
-      .Output(gradInput)
-      .Attr("strides", stridesSize)
-      .Attr("pads", paddings)
-      .Attr("dilations", dilations)
-      .Attr("groups", groups)
-      .Attr("data_format", (string) "NCDHW")
-      .Run(); 
-  } else {
-    cmd.Name("Conv3DBackpropInputD")
-      .Input(weightCast)
-      .Input(grad)
-      .Output(gradInput)
-      .Attr("input_size", inputSize)
-      .Attr("strides", stridesSize)
-      .Attr("pads", paddings)
-      .Attr("dilations", dilations)
-      .Attr("groups", groups)
-      .Attr("data_format", (string) "NCDHW")
-      .Run();
-  }
-  return gradInput;
-}
-
-Tensor conv3d_backward_weightmask(Tensor &gradWeight, const Tensor &input,
-                                      const Tensor &grad, const Tensor &weight,
-                                      IntArrayRef stride, IntArrayRef padding,
-                                      IntArrayRef dilation, int64_t groups) {
-  SmallVector<int64_t, N> stridesSize = {1, 1, stride[0], stride[1], stride[2]};
-  SmallVector<int64_t, N> paddings = {padding[0], padding[0], padding[1],
-                                      padding[1], padding[2], padding[2]};
-  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1], dilation[2]};
-  IntArrayRef inputSize = weight.sizes();
-
-  OpCommand cmd;
-  if (!c10::npu::OptionsManager::CheckDynamicEnable()) {
-    cmd.Name("Conv3DBackpropFilter")
-      .Input(input)
-      .Input(inputSize, at::kInt)
-      .Input(grad)
-      .Output(gradWeight)
-      .Attr("strides", stridesSize)
-      .Attr("pads", paddings)
-      .Attr("dilations", dilations)
-      .Attr("groups", groups)
-      .Attr("data_format", (string) "NCDHW")
-      .Run();
-  } else {
-    cmd.Name("Conv3DBackpropFilterD")
-      .Input(input)
-      .Input(grad)
-      .Output(gradWeight)
-      .Attr("filter_size", inputSize)
-      .Attr("strides", stridesSize)
-      .Attr("pads", paddings)
-      .Attr("dilations", dilations)
-      .Attr("groups", groups)
-      .Attr("data_format", (string) "NCDHW")
-      .Run();
-  }
-
-  return gradWeight;
-}
-
-Tensor conv3d_backward_biasmask(Tensor &gradBias, const Tensor &input,
-                                    const Tensor &grad, const Tensor &weight,
-                                    IntArrayRef stride, IntArrayRef padding,
-                                    IntArrayRef dilation, int64_t groups) {
-  // constructs the input and output NPUTensorDesc
-  if (input.numel() == input.size(0) * input.size(1) * input.size(2)) {
-    Tensor gradView =
-        grad.contiguous().view({grad.size(0), grad.size(1), grad.size(2)});
-    at::sum_out(gradBias, gradView, SmallVector<int64_t, N>{0});
-  } else {
-    Tensor gradView =
-        grad.contiguous().view({grad.size(0), grad.size(1), grad.size(2), -1});
-    at::sum_out(gradBias, gradView, SmallVector<int64_t, N>{0, 2, 3});
-  }
-
-  return gradBias;
-}
-
-//interface
-tuple<Tensor, Tensor, Tensor>
-conv3d_backward_npu(const Tensor &input, const Tensor &grad,
-                    const Tensor &weight, IntArrayRef stride,
-                    IntArrayRef padding, IntArrayRef dilation, int64_t groups,
-                    std::array<bool, 3> grad_input_mask) {
-
-  Tensor gradInput;
-  Tensor gradWeight;
-  Tensor gradBias;
- 
-  if (grad_input_mask[0]) {
-    //format should be NDC1HWC0
-    gradInput = at::empty_with_format(
-        input.sizes(), input.options(), ACL_FORMAT_NDC1HWC0);
-    
-    conv3d_backward_inputmask(
-        gradInput, input, grad, weight, stride, padding, dilation, groups);
-  }
-
-  if (grad_input_mask[1]) {
-    //format should be FRACTAL_Z_3D
-    gradWeight = at::empty_with_format(
-        weight.sizes(), weight.options().dtype(kFloat), ACL_FRACTAL_Z_3D);
-    
-    conv3d_backward_weightmask(
-        gradWeight, input, grad, weight, stride, padding, dilation, groups);
-  }
-
-  if (grad_input_mask[2]) {
-    //format should be NCHW, gradias.size = grad.size(1)
-    gradBias = at::empty_with_format(
-        {grad.size(1)}, grad.options(), ACL_FORMAT_NCHW);
-    
-    conv3d_backward_biasmask(
-        gradBias, input, grad, weight, stride, padding, dilation, groups);
-  }
-
-  return std::make_tuple(gradInput, gradWeight, gradBias);
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+// 
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+// https://opensource.org/licenses/BSD-3-Clause
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor conv3d_backward_inputmask(Tensor &gradInput, const Tensor &input,
+                                     const Tensor &grad, const Tensor &weight,
+                                     IntArrayRef stride, IntArrayRef padding,
+                                     IntArrayRef dilation, int64_t groups) {
+  SmallVector<int64_t, N> stridesSize = {1, 1, stride[0], stride[1], stride[2]};
+  SmallVector<int64_t, N> paddings = {padding[0], padding[0], padding[1],
+                                      padding[1], padding[2], padding[2]};
+  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1], dilation[2]};
+  IntArrayRef inputSize = input.sizes();
+  Tensor weightCast = weight.to(grad.dtype());
+  
+  OpCommand cmd;
+  if (!c10::npu::OptionsManager::CheckDynamicEnable()) {
+    cmd.Name("Conv3DBackpropInput")
+      .Input(inputSize, at::kInt)
+      .Input(weightCast)
+      .Input(grad)
+      .Output(gradInput)
+      .Attr("strides", stridesSize)
+      .Attr("pads", paddings)
+      .Attr("dilations", dilations)
+      .Attr("groups", groups)
+      .Attr("data_format", (string) "NCDHW")
+      .Run(); 
+  } else {
+    cmd.Name("Conv3DBackpropInputD")
+      .Input(weightCast)
+      .Input(grad)
+      .Output(gradInput)
+      .Attr("input_size", inputSize)
+      .Attr("strides", stridesSize)
+      .Attr("pads", paddings)
+      .Attr("dilations", dilations)
+      .Attr("groups", groups)
+      .Attr("data_format", (string) "NCDHW")
+      .Run();
+  }
+  return gradInput;
+}
+
+Tensor conv3d_backward_weightmask(Tensor &gradWeight, const Tensor &input,
+                                      const Tensor &grad, const Tensor &weight,
+                                      IntArrayRef stride, IntArrayRef padding,
+                                      IntArrayRef dilation, int64_t groups) {
+  SmallVector<int64_t, N> stridesSize = {1, 1, stride[0], stride[1], stride[2]};
+  SmallVector<int64_t, N> paddings = {padding[0], padding[0], padding[1],
+                                      padding[1], padding[2], padding[2]};
+  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1], dilation[2]};
+  IntArrayRef inputSize = weight.sizes();
+
+  OpCommand cmd;
+  if (!c10::npu::OptionsManager::CheckDynamicEnable()) {
+    cmd.Name("Conv3DBackpropFilter")
+      .Input(input)
+      .Input(inputSize, at::kInt)
+      .Input(grad)
+      .Output(gradWeight)
+      .Attr("strides", stridesSize)
+      .Attr("pads", paddings)
+      .Attr("dilations", dilations)
+      .Attr("groups", groups)
+      .Attr("data_format", (string) "NCDHW")
+      .Run();
+  } else {
+    cmd.Name("Conv3DBackpropFilterD")
+      .Input(input)
+      .Input(grad)
+      .Output(gradWeight)
+      .Attr("filter_size", inputSize)
+      .Attr("strides", stridesSize)
+      .Attr("pads", paddings)
+      .Attr("dilations", dilations)
+      .Attr("groups", groups)
+      .Attr("data_format", (string) "NCDHW")
+      .Run();
+  }
+
+  return gradWeight;
+}
+
+Tensor conv3d_backward_biasmask(Tensor &gradBias, const Tensor &input,
+                                    const Tensor &grad, const Tensor &weight,
+                                    IntArrayRef stride, IntArrayRef padding,
+                                    IntArrayRef dilation, int64_t groups) {
+  // constructs the input and output NPUTensorDesc
+  if (input.numel() == input.size(0) * input.size(1) * input.size(2)) {
+    Tensor gradView =
+        grad.contiguous().view({grad.size(0), grad.size(1), grad.size(2)});
+    at::sum_out(gradBias, gradView, SmallVector<int64_t, N>{0});
+  } else {
+    Tensor gradView =
+        grad.contiguous().view({grad.size(0), grad.size(1), grad.size(2), -1});
+    at::sum_out(gradBias, gradView, SmallVector<int64_t, N>{0, 2, 3});
+  }
+
+  return gradBias;
+}
+
+//interface
+tuple<Tensor, Tensor, Tensor>
+conv3d_backward_npu(const Tensor &input, const Tensor &grad,
+                    const Tensor &weight, IntArrayRef stride,
+                    IntArrayRef padding, IntArrayRef dilation, int64_t groups,
+                    std::array<bool, 3> grad_input_mask) {
+
+  Tensor gradInput;
+  Tensor gradWeight;
+  Tensor gradBias;
+ 
+  if (grad_input_mask[0]) {
+    //format should be NDC1HWC0
+    gradInput = at::empty_with_format(
+        input.sizes(), input.options(), ACL_FORMAT_NDC1HWC0);
+    
+    conv3d_backward_inputmask(
+        gradInput, input, grad, weight, stride, padding, dilation, groups);
+  }
+
+  if (grad_input_mask[1]) {
+    //format should be FRACTAL_Z_3D
+    gradWeight = at::empty_with_format(
+        weight.sizes(), weight.options().dtype(kFloat), ACL_FRACTAL_Z_3D);
+    
+    conv3d_backward_weightmask(
+        gradWeight, input, grad, weight, stride, padding, dilation, groups);
+  }
+
+  if (grad_input_mask[2]) {
+    //format should be NCHW, gradias.size = grad.size(1)
+    gradBias = at::empty_with_format(
+        {grad.size(1)}, grad.options(), ACL_FORMAT_NCHW);
+    
+    conv3d_backward_biasmask(
+        gradBias, input, grad, weight, stride, padding, dilation, groups);
+  }
+
+  return std::make_tuple(gradInput, gradWeight, gradBias);
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp
index f1cd79906cbef0e93bd36a73524de2bba9a988a6..8e51890c37b8f5880240cf5f57cb0344dfa647f1 100644
--- a/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/convolution/Conv3dKernelNpu.cpp
@@ -1,102 +1,102 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-// 
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// 
-// https://opensource.org/licenses/BSD-3-Clause
-// 
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
-#include "ATen/native/npu/utils/OpAdapter.h"
-#include "ATen/native/npu/utils/OpTemplate.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-SmallVector<int64_t, SIZE>
-conv3d_npu_output_size(const Tensor &input, const Tensor &weight,
-                       const Tensor &bias, IntArrayRef stride,
-                       IntArrayRef padding, IntArrayRef dilation,
-                       int64_t groups) {
-  int64_t N = input.size(0);
-  int64_t D = input.size(2);
-  int64_t H = input.size(3);
-  int64_t W = input.size(4);
-  int64_t Co = weight.size(0);
-  auto kernel_size = weight.sizes().slice(2);
-  int64_t Do = 
-      (D + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1;
-  int64_t Ho = 
-      (H + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1;
-  int64_t Wo = 
-      (W + 2 * padding[2] - dilation[2] * (kernel_size[2] - 1) - 1) / stride[2] + 1;
-
-  SmallVector<int64_t, SIZE> outputSize = {N, Co, Do, Ho, Wo};
-
-  return outputSize;
-}
-
-Tensor &conv3d_out_npu_nocheck(Tensor &result, const Tensor &input,
-                               const Tensor &weight, const Tensor &bias,
-                               IntArrayRef stride, IntArrayRef padding,
-                               IntArrayRef dilation, int64_t groups) {
-  Tensor filter = weight.to(input.dtype());
-  SmallVector<Tensor, N> inputTensor = {input, filter, bias};
-  SmallVector<int64_t, N> stridesSize = {1, 1, stride[0], stride[1], stride[2]};
-  SmallVector<int64_t, N> paddings = {padding[0], padding[0], padding[1],
-                                      padding[1], padding[2], padding[2]};
-  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1], dilation[2]};
-
-  OpCommand cmd;
-  cmd.Name("Conv3D");
-  cmd.Input(input);
-  cmd.Input(filter);
-  if (bias.defined()) {
-    cmd.Input(bias);
-  }
-  cmd.Output(result);
-  cmd.Attr("strides", stridesSize);
-  cmd.Attr("pads", paddings);
-  cmd.Attr("dilations", dilations);
-  cmd.Attr("groups", groups);
-  cmd.Attr("data_format", (string) "NCDHW");
-  cmd.Run();
-
-  return result;
-}
-
-Tensor &conv3d_out_npu(Tensor &result, const Tensor &input,
-                       const Tensor &weight, const Tensor &bias,
-                       IntArrayRef stride, IntArrayRef padding,
-                       IntArrayRef dilation, int64_t groups) {
-  OpPipeWithDefinedOut pipe;
-  return pipe.CheckMemory({input, weight, bias}, {result})
-             .Func([&input, &weight, &bias, stride, padding, dilation, groups](Tensor &result) {
-                 conv3d_out_npu_nocheck(
-                     result, input, weight, bias, stride, padding, dilation, groups);
-              })
-             .Call(result);
-}
-
-Tensor conv3d_npu(const Tensor &input, const Tensor &weight, const Tensor &bias,
-                  IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation,
-                  int64_t groups) {
-  auto outputSize = conv3d_npu_output_size(
-      input, weight, bias, stride, padding, dilation, groups);
-  Tensor result = OpPreparation::ApplyTensor(input, outputSize);
-  conv3d_out_npu(result, input, weight, bias, stride, padding, dilation, groups);
-
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+// 
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+// https://opensource.org/licenses/BSD-3-Clause
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/KernelNpuOutputSize.h"
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include "ATen/native/npu/utils/OpTemplate.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+SmallVector<int64_t, SIZE>
+conv3d_npu_output_size(const Tensor &input, const Tensor &weight,
+                       const Tensor &bias, IntArrayRef stride,
+                       IntArrayRef padding, IntArrayRef dilation,
+                       int64_t groups) {
+  int64_t N = input.size(0);
+  int64_t D = input.size(2);
+  int64_t H = input.size(3);
+  int64_t W = input.size(4);
+  int64_t Co = weight.size(0);
+  auto kernel_size = weight.sizes().slice(2);
+  int64_t Do = 
+      (D + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0] + 1;
+  int64_t Ho = 
+      (H + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1] + 1;
+  int64_t Wo = 
+      (W + 2 * padding[2] - dilation[2] * (kernel_size[2] - 1) - 1) / stride[2] + 1;
+
+  SmallVector<int64_t, SIZE> outputSize = {N, Co, Do, Ho, Wo};
+
+  return outputSize;
+}
+
+Tensor &conv3d_out_npu_nocheck(Tensor &result, const Tensor &input,
+                               const Tensor &weight, const Tensor &bias,
+                               IntArrayRef stride, IntArrayRef padding,
+                               IntArrayRef dilation, int64_t groups) {
+  Tensor filter = weight.to(input.dtype());
+  SmallVector<Tensor, N> inputTensor = {input, filter, bias};
+  SmallVector<int64_t, N> stridesSize = {1, 1, stride[0], stride[1], stride[2]};
+  SmallVector<int64_t, N> paddings = {padding[0], padding[0], padding[1],
+                                      padding[1], padding[2], padding[2]};
+  SmallVector<int64_t, N> dilations = {1, 1, dilation[0], dilation[1], dilation[2]};
+
+  OpCommand cmd;
+  cmd.Name("Conv3D");
+  cmd.Input(input);
+  cmd.Input(filter);
+  if (bias.defined()) {
+    cmd.Input(bias);
+  }
+  cmd.Output(result);
+  cmd.Attr("strides", stridesSize);
+  cmd.Attr("pads", paddings);
+  cmd.Attr("dilations", dilations);
+  cmd.Attr("groups", groups);
+  cmd.Attr("data_format", (string) "NCDHW");
+  cmd.Run();
+
+  return result;
+}
+
+Tensor &conv3d_out_npu(Tensor &result, const Tensor &input,
+                       const Tensor &weight, const Tensor &bias,
+                       IntArrayRef stride, IntArrayRef padding,
+                       IntArrayRef dilation, int64_t groups) {
+  OpPipeWithDefinedOut pipe;
+  return pipe.CheckMemory({input, weight, bias}, {result})
+             .Func([&input, &weight, &bias, stride, padding, dilation, groups](Tensor &result) {
+                 conv3d_out_npu_nocheck(
+                     result, input, weight, bias, stride, padding, dilation, groups);
+              })
+             .Call(result);
+}
+
+Tensor conv3d_npu(const Tensor &input, const Tensor &weight, const Tensor &bias,
+                  IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation,
+                  int64_t groups) {
+  auto outputSize = conv3d_npu_output_size(
+      input, weight, bias, stride, padding, dilation, groups);
+  Tensor result = OpPreparation::ApplyTensor(input, outputSize);
+  conv3d_out_npu(result, input, weight, bias, stride, padding, dilation, groups);
+
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/convolution/DeformableConv2dBackwardKernelNpu.cpp b/src/aten/src/ATen/native/npu/convolution/DeformableConv2dBackwardKernelNpu.cpp
index a5f8a2cad0b7f418dd78eea651705dbc7fe78be1..3b0f3d2ba2bb97343ccc9e18a0ed742684303561 100644
--- a/src/aten/src/ATen/native/npu/convolution/DeformableConv2dBackwardKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/convolution/DeformableConv2dBackwardKernelNpu.cpp
@@ -1,74 +1,74 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-tuple<Tensor, Tensor, Tensor, Tensor> deformable_conv2d_backward_npu(
-    const Tensor& input,
-    const Tensor& grad_output,
-    const Tensor& offset_out,
-    const Tensor& weight,
-    const Tensor& offset,
-    IntArrayRef kernel_size,
-    IntArrayRef stride,
-    IntArrayRef padding,
-    IntArrayRef dilation,
-    int64_t groups,
-    int64_t deformable_groups,
-    bool modulated) {
-  // construct the output tensor of the NPU
-  Tensor grad_input = OpPreparation::ApplyTensorWithFormat(input, ACL_FORMAT_NCHW);
-  Tensor grad_offset = OpPreparation::ApplyTensorWithFormat(offset, ACL_FORMAT_NCHW);
-
-  // deformable_conv2d_backward includes conv2d_backward and DeformableOffsetsGrad
-  SmallVector<int64_t, SIZE> conv2dStride = array_to_small_vector(kernel_size);
-  SmallVector<int64_t, SIZE> conv2dPadding = {0, 0, 0, 0};
-  SmallVector<int64_t, SIZE> conv2dDilation = {1, 1};
-  auto conv2dBackwardOutput = at::npu_conv2d_backward(
-      offset_out, grad_output, weight, conv2dStride, conv2dPadding, conv2dDilation, groups, {true, true, true});
-
-  // DeformableOffsetsGrad's input 'grad' is the output[0] of conv2d_backward
-  Tensor deformableOffsetsBackwardInput = get<0>(conv2dBackwardOutput);
-  Tensor grad_weight = get<1>(conv2dBackwardOutput);
-  Tensor grad_bias = get<2>(conv2dBackwardOutput);
-
-  string dataFormat = "NCHW";
-  // calculate the output result of the NPU
-  OpCommand cmd;
-  cmd.Name("DeformableOffsetsGrad")
-      .Input(deformableOffsetsBackwardInput)
-      .Input(input)
-      .Input(offset)
-      .Output(grad_input)
-      .Output(grad_offset)
-      .Attr("strides", stride)
-      .Attr("pads", padding)
-      .Attr("ksize", kernel_size)
-      .Attr("dilations", dilation)
-      .Attr("data_format",dataFormat)
-      .Attr("deformable_groups", deformable_groups)
-      .Attr("modulated",modulated)
-      .Run();
-      
-  return std::tie(grad_input, grad_weight, grad_offset, grad_bias);
-}
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+tuple<Tensor, Tensor, Tensor, Tensor> deformable_conv2d_backward_npu(
+    const Tensor& input,
+    const Tensor& grad_output,
+    const Tensor& offset_out,
+    const Tensor& weight,
+    const Tensor& offset,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t deformable_groups,
+    bool modulated) {
+  // construct the output tensor of the NPU
+  Tensor grad_input = OpPreparation::ApplyTensorWithFormat(input, ACL_FORMAT_NCHW);
+  Tensor grad_offset = OpPreparation::ApplyTensorWithFormat(offset, ACL_FORMAT_NCHW);
+
+  // deformable_conv2d_backward includes conv2d_backward and DeformableOffsetsGrad
+  SmallVector<int64_t, SIZE> conv2dStride = array_to_small_vector(kernel_size);
+  SmallVector<int64_t, SIZE> conv2dPadding = {0, 0, 0, 0};
+  SmallVector<int64_t, SIZE> conv2dDilation = {1, 1};
+  auto conv2dBackwardOutput = at::npu_conv2d_backward(
+      offset_out, grad_output, weight, conv2dStride, conv2dPadding, conv2dDilation, groups, {true, true, true});
+
+  // DeformableOffsetsGrad's input 'grad' is the output[0] of conv2d_backward
+  Tensor deformableOffsetsBackwardInput = get<0>(conv2dBackwardOutput);
+  Tensor grad_weight = get<1>(conv2dBackwardOutput);
+  Tensor grad_bias = get<2>(conv2dBackwardOutput);
+
+  string dataFormat = "NCHW";
+  // calculate the output result of the NPU
+  OpCommand cmd;
+  cmd.Name("DeformableOffsetsGrad")
+      .Input(deformableOffsetsBackwardInput)
+      .Input(input)
+      .Input(offset)
+      .Output(grad_input)
+      .Output(grad_offset)
+      .Attr("strides", stride)
+      .Attr("pads", padding)
+      .Attr("ksize", kernel_size)
+      .Attr("dilations", dilation)
+      .Attr("data_format",dataFormat)
+      .Attr("deformable_groups", deformable_groups)
+      .Attr("modulated",modulated)
+      .Run();
+      
+  return std::tie(grad_input, grad_weight, grad_offset, grad_bias);
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/convolution/DeformableConv2dKernelNpu.cpp b/src/aten/src/ATen/native/npu/convolution/DeformableConv2dKernelNpu.cpp
index 106c81a2b14592516309c9c744ce366c924f483a..5648d4fc2e17b24937fed6166386c95e9bf5fca7 100644
--- a/src/aten/src/ATen/native/npu/convolution/DeformableConv2dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/convolution/DeformableConv2dKernelNpu.cpp
@@ -1,68 +1,68 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-tuple<Tensor, Tensor> deformable_conv2d_npu(
-    const Tensor& input,
-    const Tensor& weight,
-    const Tensor& offset,
-    const Tensor& bias,
-    IntArrayRef kernel_size,
-    IntArrayRef stride,
-    IntArrayRef padding,
-    IntArrayRef dilation,
-    int64_t groups,
-    int64_t deformable_groups,
-    bool modulated) {
-  // calculate the output size
-  auto outputSize = deformable_conv2d_npu_output_size(
-      input, weight, offset, bias, kernel_size, stride, padding, dilation, groups, deformable_groups, modulated);
-
-  // construct the output tensor of the NPU
-  Tensor deformableOffsetsOutput = OpPreparation::ApplyTensorWithFormat(outputSize, input.options(), ACL_FORMAT_NCHW);
-
-  string dataFormat = "NCHW";
-  // calculate the output result of the NPU
-  OpCommand cmd;
-  cmd.Name("DeformableOffsets")
-      .Input(input)
-      .Input(offset)
-      .Output(deformableOffsetsOutput)
-      .Attr("ksize", kernel_size)
-      .Attr("strides", stride)
-      .Attr("pads", padding)
-      .Attr("dilations", dilation)
-      .Attr("deformable_groups", deformable_groups)
-      .Attr("data_format",dataFormat)
-      .Attr("modulated",modulated)
-      .Run();
-  
-  SmallVector<int64_t, SIZE> conv2dStride = array_to_small_vector(kernel_size);
-  SmallVector<int64_t, SIZE> conv2dPadding = {0, 0, 0, 0};
-  SmallVector<int64_t, SIZE> conv2dDilation = {1, 1};
-  Tensor conv2dOutput = at::npu_conv2d(
-      deformableOffsetsOutput, weight, bias, conv2dStride, conv2dPadding, conv2dDilation, groups);
-
-  return std::tie(conv2dOutput, deformableOffsetsOutput);
-}
-
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+tuple<Tensor, Tensor> deformable_conv2d_npu(
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& offset,
+    const Tensor& bias,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    int64_t groups,
+    int64_t deformable_groups,
+    bool modulated) {
+  // calculate the output size
+  auto outputSize = deformable_conv2d_npu_output_size(
+      input, weight, offset, bias, kernel_size, stride, padding, dilation, groups, deformable_groups, modulated);
+
+  // construct the output tensor of the NPU
+  Tensor deformableOffsetsOutput = OpPreparation::ApplyTensorWithFormat(outputSize, input.options(), ACL_FORMAT_NCHW);
+
+  string dataFormat = "NCHW";
+  // calculate the output result of the NPU
+  OpCommand cmd;
+  cmd.Name("DeformableOffsets")
+      .Input(input)
+      .Input(offset)
+      .Output(deformableOffsetsOutput)
+      .Attr("ksize", kernel_size)
+      .Attr("strides", stride)
+      .Attr("pads", padding)
+      .Attr("dilations", dilation)
+      .Attr("deformable_groups", deformable_groups)
+      .Attr("data_format",dataFormat)
+      .Attr("modulated",modulated)
+      .Run();
+  
+  SmallVector<int64_t, SIZE> conv2dStride = array_to_small_vector(kernel_size);
+  SmallVector<int64_t, SIZE> conv2dPadding = {0, 0, 0, 0};
+  SmallVector<int64_t, SIZE> conv2dDilation = {1, 1};
+  Tensor conv2dOutput = at::npu_conv2d(
+      deformableOffsetsOutput, weight, bias, conv2dStride, conv2dPadding, conv2dDilation, groups);
+
+  return std::tie(conv2dOutput, deformableOffsetsOutput);
+}
+
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/dynamicstrategy/LayerNormGardStrategy.cpp b/src/aten/src/ATen/native/npu/dynamicstrategy/LayerNormGardStrategy.cpp
index f845336972f25c1dc619358c5db55db474afa8bf..e3213ce7e68192154d890131d3c5cdd4012aa8bf 100644
--- a/src/aten/src/ATen/native/npu/dynamicstrategy/LayerNormGardStrategy.cpp
+++ b/src/aten/src/ATen/native/npu/dynamicstrategy/LayerNormGardStrategy.cpp
@@ -1,87 +1,87 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <c10/util/SmallVector.h>
-#include <ATen/native/npu/dynamicstrategy/Strategy.h>
-#include "ATen/native/npu/utils/NpuUtils.h"
-#include <third_party/acl/inc/acl/acl_base.h>
-#include <ATen/native/npu/frame/InputInfoLib.h>
-
-namespace at {
-namespace native {
-namespace npu {
-
-class LayerNormGardStrategy : public DescStrategyBase
-{
-public:
-  virtual void CreateInputDescInfo(ACL_PARAMS& params,
-    DynamicCompileShape& compileShape) override;
-
-  virtual void CreateOutputDescInfo(ACL_PARAMS& params,
-    DynamicCompileShape& compileShape) override;
-};
-
-// create input shape
-void LayerNormGardStrategy::CreateInputDescInfo(ACL_PARAMS& params, 
-    DynamicCompileShape& compileShape) {
-  for (int64_t i = 0; i < params.input_num; ++i) {
-    aclTensorDesc* desc = const_cast<aclTensorDesc*>(params.input_desc[i]);
-    aclFormat storageFormat = params.inputFormats[i];
-    if (i < 2) {
-      FormatShape shape = {-1, -1, -1};
-      aclGetTensorDescDimV2(desc, 2, &shape[2]);
-      FormatShape storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
-      compileShape.inputShape.emplace_back(shape);
-      compileShape.inputStorageShape.emplace_back(storageShape);
-    } else if (i == 2 || i == 3) {
-      FormatShape shape = {-1, -1, 1};
-      FormatShape storageShape = {-1, -1, 1};
-      compileShape.inputShape.emplace_back(shape);
-      compileShape.inputStorageShape.emplace_back(storageShape);
-    } else {
-      FormatShape shape = {-1};
-      aclGetTensorDescDimV2(desc, 0, &shape[0]);
-      FormatShape storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
-      compileShape.inputShape.emplace_back(shape);
-      compileShape.inputStorageShape.emplace_back(storageShape);
-    }
-  }
-}
-
-void LayerNormGardStrategy::CreateOutputDescInfo(ACL_PARAMS& params, 
-    DynamicCompileShape& compileShape) { 
-  // create output shape
-  for (int64_t i = 0; i < params.output_num; ++i) {
-    aclTensorDesc* desc = const_cast<aclTensorDesc*>(params.output_desc[i]);
-    aclFormat storageFormat = params.inputFormats[i];
-    if (i == 0) {
-      FormatShape shape = {-1, -1, -1};
-      aclGetTensorDescDimV2(desc, 2, &shape[2]);
-      FormatShape storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
-      compileShape.outputShape.emplace_back(shape);
-      compileShape.outputStorageShape.emplace_back(storageShape);
-    } else {
-      FormatShape shape = {-1};
-      aclGetTensorDescDimV2(desc, 0, &shape[0]);
-      FormatShape storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
-      compileShape.outputShape.emplace_back(shape);
-      compileShape.outputStorageShape.emplace_back(storageShape);
-    }
-  }
-}
-REGISTER_DYNAMIC_SHAPE_OPT(LayerNormGrad, LayerNormGardStrategy)
-
-} // namespace npu
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <c10/util/SmallVector.h>
+#include <ATen/native/npu/dynamicstrategy/Strategy.h>
+#include "ATen/native/npu/utils/NpuUtils.h"
+#include <third_party/acl/inc/acl/acl_base.h>
+#include <ATen/native/npu/frame/InputInfoLib.h>
+
+namespace at {
+namespace native {
+namespace npu {
+
+class LayerNormGardStrategy : public DescStrategyBase
+{
+public:
+  virtual void CreateInputDescInfo(ACL_PARAMS& params,
+    DynamicCompileShape& compileShape) override;
+
+  virtual void CreateOutputDescInfo(ACL_PARAMS& params,
+    DynamicCompileShape& compileShape) override;
+};
+
+// create input shape
+void LayerNormGardStrategy::CreateInputDescInfo(ACL_PARAMS& params, 
+    DynamicCompileShape& compileShape) {
+  for (int64_t i = 0; i < params.input_num; ++i) {
+    aclTensorDesc* desc = const_cast<aclTensorDesc*>(params.input_desc[i]);
+    aclFormat storageFormat = params.inputFormats[i];
+    if (i < 2) {
+      FormatShape shape = {-1, -1, -1};
+      aclGetTensorDescDimV2(desc, 2, &shape[2]);
+      FormatShape storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
+      compileShape.inputShape.emplace_back(shape);
+      compileShape.inputStorageShape.emplace_back(storageShape);
+    } else if (i == 2 || i == 3) {
+      FormatShape shape = {-1, -1, 1};
+      FormatShape storageShape = {-1, -1, 1};
+      compileShape.inputShape.emplace_back(shape);
+      compileShape.inputStorageShape.emplace_back(storageShape);
+    } else {
+      FormatShape shape = {-1};
+      aclGetTensorDescDimV2(desc, 0, &shape[0]);
+      FormatShape storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
+      compileShape.inputShape.emplace_back(shape);
+      compileShape.inputStorageShape.emplace_back(storageShape);
+    }
+  }
+}
+
+void LayerNormGardStrategy::CreateOutputDescInfo(ACL_PARAMS& params, 
+    DynamicCompileShape& compileShape) { 
+  // create output shape
+  for (int64_t i = 0; i < params.output_num; ++i) {
+    aclTensorDesc* desc = const_cast<aclTensorDesc*>(params.output_desc[i]);
+    aclFormat storageFormat = params.inputFormats[i];
+    if (i == 0) {
+      FormatShape shape = {-1, -1, -1};
+      aclGetTensorDescDimV2(desc, 2, &shape[2]);
+      FormatShape storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
+      compileShape.outputShape.emplace_back(shape);
+      compileShape.outputStorageShape.emplace_back(storageShape);
+    } else {
+      FormatShape shape = {-1};
+      aclGetTensorDescDimV2(desc, 0, &shape[0]);
+      FormatShape storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
+      compileShape.outputShape.emplace_back(shape);
+      compileShape.outputStorageShape.emplace_back(storageShape);
+    }
+  }
+}
+REGISTER_DYNAMIC_SHAPE_OPT(LayerNormGrad, LayerNormGardStrategy)
+
+} // namespace npu
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/dynamicstrategy/RoiAlignBackwardStrategy.cpp b/src/aten/src/ATen/native/npu/dynamicstrategy/RoiAlignBackwardStrategy.cpp
index cc7cb6d77405ded9b50af4a06b79a12854a9e780..f69e3601f3c0a810edda5a86c59dd163d4d55c05 100644
--- a/src/aten/src/ATen/native/npu/dynamicstrategy/RoiAlignBackwardStrategy.cpp
+++ b/src/aten/src/ATen/native/npu/dynamicstrategy/RoiAlignBackwardStrategy.cpp
@@ -1,76 +1,76 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <c10/util/SmallVector.h>
-#include <ATen/native/npu/dynamicstrategy/Strategy.h>
-#include "ATen/native/npu/utils/NpuUtils.h"
-#include <third_party/acl/inc/acl/acl_base.h>
-#include <ATen/native/npu/frame/InputInfoLib.h>
-
-namespace at {
-namespace native {
-namespace npu {
-
-class ROIAlignBackwardStrategy : public DescStrategyBase
-{
-public:
-  virtual void CreateInputDescInfo(ACL_PARAMS& params,
-    DynamicCompileShape& compileShape) override;
-
-  virtual void CreateOutputDescInfo(ACL_PARAMS& params,
-    DynamicCompileShape& compileShape) override;
-};
-
-void ROIAlignBackwardStrategy::CreateInputDescInfo(ACL_PARAMS& params,
-  DynamicCompileShape& compileShape) {
-  CreateDefaultDescInfo(params.input_desc,
-    params.input_num,
-    params.inputDims,
-    params.inputFormats,
-    compileShape.inputShape,
-    compileShape.inputStorageShape);
-}
-
-void ROIAlignBackwardStrategy::CreateOutputDescInfo(ACL_PARAMS& params,
-  DynamicCompileShape& compileShape) {
-  for (int64_t i = 0; i < params.output_num; ++i) {
-    aclTensorDesc* desc = const_cast<aclTensorDesc*>(params.output_desc[i]);
-
-    int64_t dim = (int64_t)aclGetTensorDescNumDims(desc);
-    dim = (dim == 0) ? 1 : dim;
-
-    int64_t storageDim = (params.outputDims[i] == 0) ? 1 : params.outputDims[i];
-    aclFormat storageFormat = params.outputFormats[i];
-
-    FormatShape shape(dim, -1);
-    FormatShape storageShape(storageDim, -1);  
-
-    // fix all dims
-    aclGetTensorDescDimV2(desc, 0, &shape[0]);
-    aclGetTensorDescDimV2(desc, 1, &shape[1]);
-    aclGetTensorDescDimV2(desc, 2, &shape[2]);
-    aclGetTensorDescDimV2(desc, 3, &shape[3]);
-
-    storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
-    
-    compileShape.outputShape.emplace_back(shape);
-    compileShape.outputStorageShape.emplace_back(storageShape);
-  }
-}
-
-REGISTER_DYNAMIC_SHAPE_OPT(ROIAlignGrad, ROIAlignBackwardStrategy)
-
-} // namespace npu
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <c10/util/SmallVector.h>
+#include <ATen/native/npu/dynamicstrategy/Strategy.h>
+#include "ATen/native/npu/utils/NpuUtils.h"
+#include <third_party/acl/inc/acl/acl_base.h>
+#include <ATen/native/npu/frame/InputInfoLib.h>
+
+namespace at {
+namespace native {
+namespace npu {
+
+class ROIAlignBackwardStrategy : public DescStrategyBase
+{
+public:
+  virtual void CreateInputDescInfo(ACL_PARAMS& params,
+    DynamicCompileShape& compileShape) override;
+
+  virtual void CreateOutputDescInfo(ACL_PARAMS& params,
+    DynamicCompileShape& compileShape) override;
+};
+
+void ROIAlignBackwardStrategy::CreateInputDescInfo(ACL_PARAMS& params,
+  DynamicCompileShape& compileShape) {
+  CreateDefaultDescInfo(params.input_desc,
+    params.input_num,
+    params.inputDims,
+    params.inputFormats,
+    compileShape.inputShape,
+    compileShape.inputStorageShape);
+}
+
+void ROIAlignBackwardStrategy::CreateOutputDescInfo(ACL_PARAMS& params,
+  DynamicCompileShape& compileShape) {
+  for (int64_t i = 0; i < params.output_num; ++i) {
+    aclTensorDesc* desc = const_cast<aclTensorDesc*>(params.output_desc[i]);
+
+    int64_t dim = (int64_t)aclGetTensorDescNumDims(desc);
+    dim = (dim == 0) ? 1 : dim;
+
+    int64_t storageDim = (params.outputDims[i] == 0) ? 1 : params.outputDims[i];
+    aclFormat storageFormat = params.outputFormats[i];
+
+    FormatShape shape(dim, -1);
+    FormatShape storageShape(storageDim, -1);  
+
+    // fix all dims
+    aclGetTensorDescDimV2(desc, 0, &shape[0]);
+    aclGetTensorDescDimV2(desc, 1, &shape[1]);
+    aclGetTensorDescDimV2(desc, 2, &shape[2]);
+    aclGetTensorDescDimV2(desc, 3, &shape[3]);
+
+    storageShape = FormatHelper::GetStorageSizes(storageFormat, shape);
+    
+    compileShape.outputShape.emplace_back(shape);
+    compileShape.outputStorageShape.emplace_back(storageShape);
+  }
+}
+
+REGISTER_DYNAMIC_SHAPE_OPT(ROIAlignGrad, ROIAlignBackwardStrategy)
+
+} // namespace npu
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/dynamicstrategy/RoiAlignStrategy.cpp b/src/aten/src/ATen/native/npu/dynamicstrategy/RoiAlignStrategy.cpp
index 716a38ae19e343567db771f3f443323a4b2bbda1..513ac6f52d73bf7e696379898da148a1aabe4993 100644
--- a/src/aten/src/ATen/native/npu/dynamicstrategy/RoiAlignStrategy.cpp
+++ b/src/aten/src/ATen/native/npu/dynamicstrategy/RoiAlignStrategy.cpp
@@ -1,86 +1,86 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <c10/util/SmallVector.h>
-#include <ATen/native/npu/dynamicstrategy/Strategy.h>
-#include "ATen/native/npu/utils/NpuUtils.h"
-#include <third_party/acl/inc/acl/acl_base.h>
-#include <ATen/native/npu/frame/InputInfoLib.h>
-
-
-namespace at {
-namespace native {
-namespace npu {
-
-class ROIAlignStrategy : public DescStrategyBase
-{
-public:
-  virtual void CreateInputDescInfo(ACL_PARAMS& params,
-    DynamicCompileShape& compileShape) override;
-
-  virtual void CreateOutputDescInfo(ACL_PARAMS& params,
-    DynamicCompileShape& compileShape) override;
-};
-
-void ROIAlignStrategy::CreateInputDescInfo(ACL_PARAMS& params,
-  DynamicCompileShape& compileShape) {
-  CreateDefaultDescInfo(params.input_desc,
-    params.input_num,
-    params.inputDims,
-    params.inputFormats,
-    compileShape.inputShape,
-    compileShape.inputStorageShape);
-}
-
-void ROIAlignStrategy::CreateOutputDescInfo(ACL_PARAMS& params,
-  DynamicCompileShape& compileShape) {
-  for (int64_t i = 0; i < params.output_num; ++i) {
-    aclTensorDesc* desc = const_cast<aclTensorDesc*>(params.output_desc[i]);
-
-    int64_t dim = (int64_t)aclGetTensorDescNumDims(desc);
-    dim = (dim == 0) ? 1 : dim;
-
-    int64_t storageDim = (params.outputDims[i] == 0) ? 1 : params.outputDims[i];
-    aclFormat storageFormat = params.outputFormats[i];
-
-    FormatShape shape(dim, -1);
-    FormatShape storageShape(storageDim, -1);
-
-    // fix height dim value
-    int64_t index_h = dim - 2;
-    aclGetTensorDescDimV2(desc, index_h, &shape[index_h]);
-
-    // fix width dim value
-    int64_t index_w = dim - 1;
-    aclGetTensorDescDimV2(desc, index_w, &shape[index_w]);    
-
-    if (storageFormat == ACL_FORMAT_NC1HWC0) {
-        storageShape[storageDim - 3] = shape[dim - 2];
-        storageShape[storageDim - 2] = shape[dim - 1];
-        storageShape[storageDim - 1] = 16;
-    } else {
-        storageShape[storageDim - 2] = shape[dim - 2];
-        storageShape[storageDim - 1] = shape[dim - 1];
-    }
-    
-    compileShape.outputShape.emplace_back(shape);
-    compileShape.outputStorageShape.emplace_back(storageShape);
-  }
-}
-
-REGISTER_DYNAMIC_SHAPE_OPT(ROIAlign, ROIAlignStrategy)
-
-} // namespace npu
-} // namespace native
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <c10/util/SmallVector.h>
+#include <ATen/native/npu/dynamicstrategy/Strategy.h>
+#include "ATen/native/npu/utils/NpuUtils.h"
+#include <third_party/acl/inc/acl/acl_base.h>
+#include <ATen/native/npu/frame/InputInfoLib.h>
+
+
+namespace at {
+namespace native {
+namespace npu {
+
+class ROIAlignStrategy : public DescStrategyBase
+{
+public:
+  virtual void CreateInputDescInfo(ACL_PARAMS& params,
+    DynamicCompileShape& compileShape) override;
+
+  virtual void CreateOutputDescInfo(ACL_PARAMS& params,
+    DynamicCompileShape& compileShape) override;
+};
+
+void ROIAlignStrategy::CreateInputDescInfo(ACL_PARAMS& params,
+  DynamicCompileShape& compileShape) {
+  CreateDefaultDescInfo(params.input_desc,
+    params.input_num,
+    params.inputDims,
+    params.inputFormats,
+    compileShape.inputShape,
+    compileShape.inputStorageShape);
+}
+
+void ROIAlignStrategy::CreateOutputDescInfo(ACL_PARAMS& params,
+  DynamicCompileShape& compileShape) {
+  for (int64_t i = 0; i < params.output_num; ++i) {
+    aclTensorDesc* desc = const_cast<aclTensorDesc*>(params.output_desc[i]);
+
+    int64_t dim = (int64_t)aclGetTensorDescNumDims(desc);
+    dim = (dim == 0) ? 1 : dim;
+
+    int64_t storageDim = (params.outputDims[i] == 0) ? 1 : params.outputDims[i];
+    aclFormat storageFormat = params.outputFormats[i];
+
+    FormatShape shape(dim, -1);
+    FormatShape storageShape(storageDim, -1);
+
+    // fix height dim value
+    int64_t index_h = dim - 2;
+    aclGetTensorDescDimV2(desc, index_h, &shape[index_h]);
+
+    // fix width dim value
+    int64_t index_w = dim - 1;
+    aclGetTensorDescDimV2(desc, index_w, &shape[index_w]);    
+
+    if (storageFormat == ACL_FORMAT_NC1HWC0) {
+        storageShape[storageDim - 3] = shape[dim - 2];
+        storageShape[storageDim - 2] = shape[dim - 1];
+        storageShape[storageDim - 1] = 16;
+    } else {
+        storageShape[storageDim - 2] = shape[dim - 2];
+        storageShape[storageDim - 1] = shape[dim - 1];
+    }
+    
+    compileShape.outputShape.emplace_back(shape);
+    compileShape.outputStorageShape.emplace_back(storageShape);
+  }
+}
+
+REGISTER_DYNAMIC_SHAPE_OPT(ROIAlign, ROIAlignStrategy)
+
+} // namespace npu
+} // namespace native
 } // namespace at
\ No newline at end of file
diff --git a/src/aten/src/ATen/native/npu/frame/FormatHelper.cpp b/src/aten/src/ATen/native/npu/frame/FormatHelper.cpp
index 018f6e2707fd4960e8afde21a86203b7e7ff29ee..d13bccb4c09be79ef1197124baf5395a6e64231d 100644
--- a/src/aten/src/ATen/native/npu/frame/FormatHelper.cpp
+++ b/src/aten/src/ATen/native/npu/frame/FormatHelper.cpp
@@ -59,22 +59,6 @@ std::unordered_map<aclFormat, FormatHelper::FormatInfo> FormatHelper::info = {
   {ACL_FRACTAL_Z_3D,        (FormatInfo){ACL_FRACTAL_Z_3D,      ACL_FORMAT_NCDHW,   InferShapeOfFZ3D,       "FRACTAL_Z_3D", true}},
 };
 
-std::unordered_map<aclFormat, std::unordered_map<aclFormat, baseFormatConverter>> FormatHelper::base_format_convert_info = {
-  {ACL_FORMAT_ND,      {
-                          {ACL_FORMAT_NCHW,    InferShapeNDToNCHW},
-                          {ACL_FORMAT_NCDHW,    InferShapeNDToNCDHW},
-                       }
-  },
-  {ACL_FORMAT_NCHW,    {
-                          {ACL_FORMAT_ND,      InferShapeNCHWToND},
-                       }
-  },
-  {ACL_FORMAT_NCDHW,    {
-                          {ACL_FORMAT_ND,      InferShapeNCDHWToND},
-                       }
-  },
-};
-
 bool FormatHelper::IsPadded(const Tensor* tensor) {
   auto format = tensor->storage().unsafeGetStorageImpl()->npu_desc_.npu_format_;
   return IsPadded(format);
@@ -136,20 +120,6 @@ FormatShape FormatHelper::GetStorageSizes(NPUStorageDesc desc) {
   return GetStorageSizes(format, ori_size);
 }
 
-FormatShape FormatHelper::GetSizeOfBaseFormat(const Tensor& src, aclFormat dst_format) {
-  auto src_format = GetBaseFormat(src);
-  auto itr = base_format_convert_info.find(src_format);
-  if (itr != base_format_convert_info.end()) {
-    auto next_itr = itr->second.find(dst_format);
-    if (next_itr != itr->second.end()) {
-      auto src_desc = src.storage().unsafeGetStorageImpl()->npu_desc_;
-      return next_itr->second(src_desc.storage_sizes_, src_desc.base_sizes_);
-    }
-  }
-  AT_ERROR("unsupport InferShape from ", GetFormatName(src_format), " to ", GetFormatName(dst_format));
-  return {};
-}
-
 // 
 namespace {
 FormatShape InferShapeLessTo4(IntArrayRef dims) {
diff --git a/src/aten/src/ATen/native/npu/frame/FormatHelper.h b/src/aten/src/ATen/native/npu/frame/FormatHelper.h
index 862ff1b7d33d6d034dfda7b8923ed6ef993e2872..9f0d1f024239fcce5cc7dc5136f3634d2334e8a0 100644
--- a/src/aten/src/ATen/native/npu/frame/FormatHelper.h
+++ b/src/aten/src/ATen/native/npu/frame/FormatHelper.h
@@ -48,7 +48,6 @@ public:
   static FormatShape GetStorageSizes(aclFormat format, sizeType ori_size);
   // GetStorageSizes used to calculate the storage sizes of op at npu device at different format.
   static FormatShape GetStorageSizes(NPUStorageDesc desc);
-  static FormatShape GetSizeOfBaseFormat(const Tensor& src, aclFormat dst_format);
 
 private:
   static bool IsPadded(aclFormat format);
@@ -64,7 +63,6 @@ private:
     bool isPadded = false;
   } FormatInfo;
   static std::unordered_map<aclFormat, FormatInfo> info;
-  static std::unordered_map<aclFormat, std::unordered_map<aclFormat, baseFormatConverter>> base_format_convert_info;
 }; // class FormatHelper
 
 // template impl
diff --git a/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp b/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp
index 1985cbffbbc309661fae435fda629967cf31f0ca..46abd15c008bb4f760ea7aea96c530489dea9c2e 100644
--- a/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp
+++ b/src/aten/src/ATen/native/npu/interface/EnvVariables.cpp
@@ -42,9 +42,7 @@ REGISTER_OPTION_HOOK(ACL_OP_COMPILER_CACHE_DIR, [](const std::string& val) {
   aclSetCompileopt(aclCompileOpt::ACL_OP_COMPILER_CACHE_DIR, val.c_str());
  })
 REGISTER_OPTION_HOOK(NPU_FUZZY_COMPILE_BLACKLIST, [](const std::string& val) { 
-  if (CheckFuzzyEnable()) {
     FuzzyCompileBlacklist::GetInstance().RegisterBlacklist(val);
-  }
  })
 
  REGISTER_OPTION_INIT_BY_ENV(PROFILING_MODE)
diff --git a/src/aten/src/ATen/native/npu/pooling/AvgPool3dKernelNpu.cpp b/src/aten/src/ATen/native/npu/pooling/AvgPool3dKernelNpu.cpp
index 5f5aa51aa94d838603630bbe3b2d52689366f024..f93424e278279034ea0d1b8fab7be3cbc3eb88c4 100644
--- a/src/aten/src/ATen/native/npu/pooling/AvgPool3dKernelNpu.cpp
+++ b/src/aten/src/ATen/native/npu/pooling/AvgPool3dKernelNpu.cpp
@@ -1,177 +1,177 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "ATen/native/npu/utils/OpAdapter.h"
-#include <ATen/native/Pool.h>
-
-namespace at {
-namespace native {
-using namespace at::native::npu;
-
-Tensor& avg_pool3d_out_npu(
-    Tensor& out,
-    const Tensor& self,
-    IntArrayRef kernel_size,
-    IntArrayRef stride,
-    IntArrayRef padding,
-    bool ceil_mode,
-    bool count_include_pad,
-    c10::optional<int64_t> divisor_override) {
-  SmallVector<int64_t, N> pads = {0, 0, 0, padding[0], padding[1], padding[2]};
-
-  Tensor input = self;
-  if (self.ndimension() == 4) {
-    input = input.unsqueeze(0);
-    out = out.unsqueeze(0);
-  }
-
-  // calculate the output size
-  int D = self.size(-3);
-  int H = self.size(-2);
-  int W = self.size(-1);
-
-  int64_t D_size = ceil_mode
-      ? (CeilDiv(D + 2 * padding[0] - kernel_size[0], stride[0]) + 1)
-      : ((D + 2 * padding[0] - kernel_size[0]) / stride[0] + 1);
-  int64_t H_size = ceil_mode
-      ? (CeilDiv(H + 2 * padding[1] - kernel_size[1], stride[1]) + 1)
-      : ((H + 2 * padding[1] - kernel_size[1]) / stride[1] + 1);
-  int64_t W_size = ceil_mode
-      ? (CeilDiv(W + 2 * padding[2] - kernel_size[2], stride[2]) + 1)
-      : ((W + 2 * padding[2] - kernel_size[2]) / stride[2] + 1);
-
-  SmallVector<int64_t, SIZE> outputSize = {input.size(0), input.size(1), D_size, H_size, W_size};
-  OpPreparation::CheckOut(
-      {self},
-      out,
-      ACL_FORMAT_NCDHW,
-      out.scalar_type(),
-      outputSize);
-
-  OpCommand cmd;
-  cmd.Name("AvgPool3D")
-      .Input(input)
-      .Output(out)
-      .Attr("ksize", kernel_size)
-      .Attr("strides", stride)
-      .Attr("pads", pads)
-      .Attr("ceil_mode", ceil_mode)
-      .Attr("count_include_pad", count_include_pad);
-
-  if (divisor_override.has_value()) {
-    cmd.Attr("divisor_override", divisor_override.value());
-  }
-
-  cmd.Attr("data_format", (string)"NCDHW")
-      .Run();
-
-  if (self.ndimension() == 4) {
-    out = out.squeeze(0);
-  }
-  return out;
-}
-
-Tensor avg_pool3d_npu(
-    const Tensor& self,
-    IntArrayRef kernel_size,
-    IntArrayRef stride,
-    IntArrayRef padding,
-    bool ceil_mode,
-    bool count_include_pad,
-    c10::optional<int64_t> divisor_override) {
-
-  // #20866, #22032: Guarantee this for the official C++ API?
-  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 3,
-    "avg_pool3d: kernel_size must be a single int, or a tuple of three ints");
-  const int kT = safe_downcast<int, int64_t>(kernel_size[0]);
-  const int kH = kernel_size.size() == 1 ? kT : safe_downcast<int, int64_t>(kernel_size[1]);
-  const int kW = kernel_size.size() == 1 ? kT : safe_downcast<int, int64_t>(kernel_size[2]);
-  SmallVector<int64_t, SIZE> kernel_sizes = {kT, kH, kW};
-  IntArrayRef kernel_sizess = IntArrayRef(kernel_sizes);
-
-  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 3,
-    "avg_pool3d: stride must be omitted, a single int, or a tuple of three ints");
-  const int dT = stride.empty() ? kT : safe_downcast<int, int64_t>(stride[0]);
-  const int dH = stride.empty() ? kH :
-                 stride.size() == 1 ? dT : safe_downcast<int, int64_t>(stride[1]);
-  const int dW = stride.empty() ? kW :
-                 stride.size() == 1 ? dT : safe_downcast<int, int64_t>(stride[2]);
-
-  SmallVector<int64_t, SIZE> strides = {dT, dH, dW};
-  IntArrayRef stridess = IntArrayRef(strides);
-
-  TORCH_CHECK(padding.size() == 1 || padding.size() == 3,
-    "avg_pool3d: padding must be a single int, or a tuple of three ints");
-  const int padT = safe_downcast<int, int64_t>(padding[0]);
-  const int padH = padding.size() == 1 ? padT : safe_downcast<int, int64_t>(padding[1]);
-  const int padW = padding.size() == 1 ? padT : safe_downcast<int, int64_t>(padding[2]);
-  SmallVector<int64_t, SIZE> paddings = {padT, padH, padW};
-  IntArrayRef paddingss = IntArrayRef(paddings);
-
-  TORCH_CHECK((self.ndimension() == 4 || self.ndimension() == 5),
-    "non-empty 4D or 5D (batch mode) tensor expected for input");
-
-  TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
-    "divisor must be not zero");
-
-  const int64_t nslices = self.size(-4);
-  const int64_t itime = self.size(-3);
-  const int64_t iheight = self.size(-2);
-  const int64_t iwidth = self.size(-1);
-
-  const int64_t otime = pooling_output_shape<int64_t>(itime, kT, padT, dT, 1, ceil_mode);
-  const int64_t oheight = pooling_output_shape<int64_t>(iheight, kH, padH, dH, 1, ceil_mode);
-  const int64_t owidth = pooling_output_shape<int64_t>(iwidth, kW, padW, dW, 1, ceil_mode);
-
-  pool3d_shape_check(
-    self,
-    nslices,
-    kT, kH, kW,
-    dT, dH, dW,
-    padT, padH, padW,
-    1, 1, 1,
-    itime, iheight, iwidth,
-    otime, oheight, owidth,
-    /*check_input_size=*/ true);
-
-  Tensor input = self;
-  if (self.ndimension() == 4) {
-    input = self.unsqueeze(0);
-  }
-
-  SmallVector<int64_t, SIZE> outputSize = {input.size(0), input.size(1), otime, oheight, owidth};
-
-  Tensor result = OpPreparation::ApplyTensorWithFormat(input, outputSize, ACL_FORMAT_NCDHW);
-
-  // calculate the output result of the NPU
-  avg_pool3d_out_npu(
-      result,
-      input,
-      kernel_sizess,
-      stridess,
-      paddingss,
-      ceil_mode,
-      count_include_pad,
-      divisor_override);
-
-  if (self.ndimension() == 4) {
-    result = result.squeeze(0);
-  }
-  return result;
-}
-
-} // namespace native
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ATen/native/npu/utils/OpAdapter.h"
+#include <ATen/native/Pool.h>
+
+namespace at {
+namespace native {
+using namespace at::native::npu;
+
+Tensor& avg_pool3d_out_npu(
+    Tensor& out,
+    const Tensor& self,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    bool ceil_mode,
+    bool count_include_pad,
+    c10::optional<int64_t> divisor_override) {
+  SmallVector<int64_t, N> pads = {0, 0, 0, padding[0], padding[1], padding[2]};
+
+  Tensor input = self;
+  if (self.ndimension() == 4) {
+    input = input.unsqueeze(0);
+    out = out.unsqueeze(0);
+  }
+
+  // calculate the output size
+  int D = self.size(-3);
+  int H = self.size(-2);
+  int W = self.size(-1);
+
+  int64_t D_size = ceil_mode
+      ? (CeilDiv(D + 2 * padding[0] - kernel_size[0], stride[0]) + 1)
+      : ((D + 2 * padding[0] - kernel_size[0]) / stride[0] + 1);
+  int64_t H_size = ceil_mode
+      ? (CeilDiv(H + 2 * padding[1] - kernel_size[1], stride[1]) + 1)
+      : ((H + 2 * padding[1] - kernel_size[1]) / stride[1] + 1);
+  int64_t W_size = ceil_mode
+      ? (CeilDiv(W + 2 * padding[2] - kernel_size[2], stride[2]) + 1)
+      : ((W + 2 * padding[2] - kernel_size[2]) / stride[2] + 1);
+
+  SmallVector<int64_t, SIZE> outputSize = {input.size(0), input.size(1), D_size, H_size, W_size};
+  OpPreparation::CheckOut(
+      {self},
+      out,
+      ACL_FORMAT_NCDHW,
+      out.scalar_type(),
+      outputSize);
+
+  OpCommand cmd;
+  cmd.Name("AvgPool3D")
+      .Input(input)
+      .Output(out)
+      .Attr("ksize", kernel_size)
+      .Attr("strides", stride)
+      .Attr("pads", pads)
+      .Attr("ceil_mode", ceil_mode)
+      .Attr("count_include_pad", count_include_pad);
+
+  if (divisor_override.has_value()) {
+    cmd.Attr("divisor_override", divisor_override.value());
+  }
+
+  cmd.Attr("data_format", (string)"NCDHW")
+      .Run();
+
+  if (self.ndimension() == 4) {
+    out = out.squeeze(0);
+  }
+  return out;
+}
+
+Tensor avg_pool3d_npu(
+    const Tensor& self,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    bool ceil_mode,
+    bool count_include_pad,
+    c10::optional<int64_t> divisor_override) {
+
+  // #20866, #22032: Guarantee this for the official C++ API?
+  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 3,
+    "avg_pool3d: kernel_size must be a single int, or a tuple of three ints");
+  const int kT = safe_downcast<int, int64_t>(kernel_size[0]);
+  const int kH = kernel_size.size() == 1 ? kT : safe_downcast<int, int64_t>(kernel_size[1]);
+  const int kW = kernel_size.size() == 1 ? kT : safe_downcast<int, int64_t>(kernel_size[2]);
+  SmallVector<int64_t, SIZE> kernel_sizes = {kT, kH, kW};
+  IntArrayRef kernel_sizess = IntArrayRef(kernel_sizes);
+
+  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 3,
+    "avg_pool3d: stride must be omitted, a single int, or a tuple of three ints");
+  const int dT = stride.empty() ? kT : safe_downcast<int, int64_t>(stride[0]);
+  const int dH = stride.empty() ? kH :
+                 stride.size() == 1 ? dT : safe_downcast<int, int64_t>(stride[1]);
+  const int dW = stride.empty() ? kW :
+                 stride.size() == 1 ? dT : safe_downcast<int, int64_t>(stride[2]);
+
+  SmallVector<int64_t, SIZE> strides = {dT, dH, dW};
+  IntArrayRef stridess = IntArrayRef(strides);
+
+  TORCH_CHECK(padding.size() == 1 || padding.size() == 3,
+    "avg_pool3d: padding must be a single int, or a tuple of three ints");
+  const int padT = safe_downcast<int, int64_t>(padding[0]);
+  const int padH = padding.size() == 1 ? padT : safe_downcast<int, int64_t>(padding[1]);
+  const int padW = padding.size() == 1 ? padT : safe_downcast<int, int64_t>(padding[2]);
+  SmallVector<int64_t, SIZE> paddings = {padT, padH, padW};
+  IntArrayRef paddingss = IntArrayRef(paddings);
+
+  TORCH_CHECK((self.ndimension() == 4 || self.ndimension() == 5),
+    "non-empty 4D or 5D (batch mode) tensor expected for input");
+
+  TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
+    "divisor must be not zero");
+
+  const int64_t nslices = self.size(-4);
+  const int64_t itime = self.size(-3);
+  const int64_t iheight = self.size(-2);
+  const int64_t iwidth = self.size(-1);
+
+  const int64_t otime = pooling_output_shape<int64_t>(itime, kT, padT, dT, 1, ceil_mode);
+  const int64_t oheight = pooling_output_shape<int64_t>(iheight, kH, padH, dH, 1, ceil_mode);
+  const int64_t owidth = pooling_output_shape<int64_t>(iwidth, kW, padW, dW, 1, ceil_mode);
+
+  pool3d_shape_check(
+    self,
+    nslices,
+    kT, kH, kW,
+    dT, dH, dW,
+    padT, padH, padW,
+    1, 1, 1,
+    itime, iheight, iwidth,
+    otime, oheight, owidth,
+    /*check_input_size=*/ true);
+
+  Tensor input = self;
+  if (self.ndimension() == 4) {
+    input = self.unsqueeze(0);
+  }
+
+  SmallVector<int64_t, SIZE> outputSize = {input.size(0), input.size(1), otime, oheight, owidth};
+
+  Tensor result = OpPreparation::ApplyTensorWithFormat(input, outputSize, ACL_FORMAT_NCDHW);
+
+  // calculate the output result of the NPU
+  avg_pool3d_out_npu(
+      result,
+      input,
+      kernel_sizess,
+      stridess,
+      paddingss,
+      ceil_mode,
+      count_include_pad,
+      divisor_override);
+
+  if (self.ndimension() == 4) {
+    result = result.squeeze(0);
+  }
+  return result;
+}
+
+} // namespace native
+} // namespace at
diff --git a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp
index a49aa9b9945afa46ac43295733adce86ded1ece0..412d1fc32b7bca4bb8f5d7bcac31eee8458a5bc8 100644
--- a/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp
+++ b/src/aten/src/ATen/native/npu/utils/CalcuOpUtil.cpp
@@ -347,7 +347,18 @@ NPUStatus CalcuOpUtil::CreateAclTensorDescInfo(
         input[i].tensorDescType == NPUTensorDesc::TensorDescType::TENSOR) {
       Tensor* aclInput = &input[i].tensor;
       SmallVector<int64_t, 5> dims;
-      dims = aclInput->storage().get_npu_desc().base_sizes_;
+      if (opName == "MatMul") {
+        auto dims_pre = aclInput->sizes();
+        if (attrs[i].boolAttrValue == 1) {
+          dims.push_back(dims_pre[1]);
+          dims.push_back(dims_pre[0]);
+        } else if (attrs[i].boolAttrValue == 0) {
+          dims.push_back(dims_pre[0]);
+          dims.push_back(dims_pre[1]);
+        }
+      } else {
+        dims = aclInput->storage().get_npu_desc().base_sizes_;
+      }
       auto storageDims = aclInput->storage().get_npu_desc().storage_sizes_;
       int64_t numel = 1;
       for (int j = 0; j < storageDims.size(); j++) {
diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp
index 773f25ab306403e43169c5a489625a18d9aea2a5..10672bf1137986a0becedc8714eae163b64b7190 100644
--- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp
+++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.cpp
@@ -333,12 +333,6 @@ SmallVector<int64_t, SIZE> embedding_dense_backward_npu_output_size(
   return {num_weights, grad_output.size(-1)};
 }
 
-SmallVector<int64_t, SIZE> embedding_renorm_mid_npu_output_size(
-    const Tensor& self,
-    const Tensor& indices){
-  return {indices.size(0), self.size(1)};
-}
-
 SmallVector<int64_t, SIZE> equal_npu_output_size(void) {
   int64_t outputshape = 1;
   SmallVector<int64_t, SIZE> outputSize = {outputshape};
diff --git a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h
index 9290da7ddd91ee55d3e88cf46fc065973ab0a4be..b676141652f53263bae5911302824bbe69d66c8b 100644
--- a/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h
+++ b/src/aten/src/ATen/native/npu/utils/KernelNpuOutputSize.h
@@ -182,10 +182,6 @@ SmallVector<int64_t, SIZE> embedding_dense_backward_npu_output_size(
     int64_t padding_idx, 
     bool scale_grad_by_freq);
 
-SmallVector<int64_t, SIZE> embedding_renorm_mid_npu_output_size(
-    const Tensor& self,
-    const Tensor& indices);
-
 SmallVector<int64_t, SIZE> index_npu_output_size(
   const Tensor& self, 
   TensorList indices);
diff --git a/src/aten/src/ATen/npu/NPUGenerator.cpp b/src/aten/src/ATen/npu/NPUGenerator.cpp
index 93f75473c1eb61cc513b88ef86c09ead04a89690..2609b15f68d3eb4fa31342029c9c549726f967ad 100644
--- a/src/aten/src/ATen/npu/NPUGenerator.cpp
+++ b/src/aten/src/ATen/npu/NPUGenerator.cpp
@@ -1,195 +1,195 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/npu/NPUGenerator.h>
-#include <c10/npu/NPUFunctions.h>
-
-namespace at {
-
-namespace npu { namespace detail {
-
-// Ensures we only call npuGetDeviceCount only once.
-static std::once_flag num_npu_init_flag;
-
-// Total number of npus in the system.
-static int64_t num_npus;
-
-// Ensures default_gens_npu is initialized once.
-static std::deque<std::once_flag> npu_gens_init_flag;
-
-// Default, global NPU generators, one per NPU.
-static std::vector<std::shared_ptr<NPUGenerator>> default_gens_npu;
-
-/* 
-* Populates the global variables related to NPU generators
-* Warning: this function must only be called once!
-*/
-static void initNPUGenVector(){
-  num_npus = c10::npu::device_count();
-  npu_gens_init_flag.resize(num_npus);
-  default_gens_npu.resize(num_npus);
-}
-
-/**
- * PyTorch maintains a collection of default generators that get
- * initialized once. The purpose of these default generators is to
- * maintain a global running state of the pseudo random number generation,
- * when a user does not explicitly mention any generator.
- * getDefaultNPUGenerator gets the default generator for a particular
- * npu device.
- */
-NPUGenerator* getDefaultNPUGenerator(DeviceIndex device_index) {
-  std::call_once(num_npu_init_flag, initNPUGenVector);
-  DeviceIndex idx = device_index;
-  if (idx == -1) {
-    idx = c10::npu::current_device();
-  } else {
-    TORCH_CHECK(idx >= 0 && idx < num_npus);
-  }
-  std::call_once(npu_gens_init_flag[idx], [&] {
-    default_gens_npu[idx] = std::make_shared<NPUGenerator>(idx);
-    default_gens_npu[idx]->seed();
-  });
-  return default_gens_npu[idx].get();
-}
-
-/**
- * Utility to create a NPUGenerator. Returns a shared_ptr
- */
-std::shared_ptr<NPUGenerator> createNPUGenerator(DeviceIndex device_index) {
-  std::call_once(num_npu_init_flag, initNPUGenVector);
-  DeviceIndex idx = device_index;
-  if (idx == -1) {
-    idx = c10::npu::current_device();
-  }
-  TORCH_CHECK(idx >= 0 && idx < num_npus, "The device_index is invalid.");
-  auto gen = std::make_shared<NPUGenerator>(idx);
-  gen->set_current_seed(default_rng_seed_val);
-  gen->set_philox_offset_per_thread(0);
-  return gen;
-}
-
-} // namespace detail
-} // namespace npu
-
-
-/**
- * NPUGenerator class implementation
- */
-NPUGenerator::NPUGenerator(DeviceIndex device_index)
-  : Generator{Device(DeviceType::NPU, device_index),
-              DispatchKeySet(c10::DispatchKey::NPUTensorId)} { }
-
-/**
- * Sets the seed to be used by curandStatePhilox4_32_10
- * Resets the philox_offset_per_thread_ to 0
- * 
- * See Note [Acquire lock when using random generators]
- */
-void NPUGenerator::set_current_seed(uint64_t seed) {
-  seed_ = seed;
-  philox_offset_per_thread_ = 0;
-}
-
-/**
- * Gets the current seed of NPUGenerator.
- */
-uint64_t NPUGenerator::current_seed() const {
-  return seed_;
-}
-
-/**
- * Gets a nondeterministic random number from /dev/urandom or time,
- * seeds the CPUGenerator with it and then returns that number.
- * 
- * FIXME: You can move this function to Generator.cpp if the algorithm
- * in getNonDeterministicRandom is unified for both CPU and NPU
- */
-uint64_t NPUGenerator::seed() {
-  auto random = at::detail::getNonDeterministicRandom(true);
-  this->set_current_seed(random);
-  return random;
-}
-
-/**
- * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10
- * 
- * See Note [Acquire lock when using random generators]
- */
-void NPUGenerator::set_philox_offset_per_thread(uint64_t offset) {
-  philox_offset_per_thread_ = offset;
-}
-
-/**
- * Gets the current philox_offset_per_thread_ of NPUGenerator.
- */
-uint64_t NPUGenerator::philox_offset_per_thread() {
-  return philox_offset_per_thread_;
-}
-
-/**
- * Gets the seed and philox offset value to be used in
- * curandStatePhilox4_32_10
- * 
- * Each kernel using philox has to sensibly increment offset
- * for future users of philox. So it gets the "old" value for
- * itself (before add), and tells subsequent users which offset
- * they should use, since only the kernel knows how many randoms
- * it intends to generate. 
- * 
- * Increment should be at least the number of curand() random numbers used in
- * each thread. It is the user's responsibility to make sure that the increment
- * for philox is never smaller than the number of curand() calls. Increment
- * value > the number of curand() calls won't harm but anything less would mean
- * that you would be reusing random values from previous calls.
- * 
- * See Note [Acquire lock when using random generators]
- */
-std::pair<uint64_t, uint64_t> NPUGenerator::philox_engine_inputs(uint64_t increment) {
-  uint64_t offset = this->philox_offset_per_thread_;
-  this->philox_offset_per_thread_ += increment;
-  return std::make_pair(this->seed_, offset);
-}
-
-/*
- * Gets the DeviceType of NPUGenerator.
- * Used for type checking during run time.
- */
-DeviceType NPUGenerator::device_type() {
-  return DeviceType::NPU;
-}
-
-/**
- * Public clone method implementation
- * 
- * See Note [Acquire lock when using random generators]
- */
-std::shared_ptr<NPUGenerator> NPUGenerator::clone() const {
-  return std::shared_ptr<NPUGenerator>(this->clone_impl());
-}
-
-/**
- * Private clone method implementation
- * 
- * See Note [Acquire lock when using random generators]
- */
-NPUGenerator* NPUGenerator::clone_impl() const {
-  auto gen = new NPUGenerator(this->device().index());
-  gen->set_current_seed(this->seed_);
-  gen->set_philox_offset_per_thread(this->philox_offset_per_thread_);
-  return gen;
-}
-} // namespace at
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <ATen/npu/NPUGenerator.h>
+#include <c10/npu/NPUFunctions.h>
+
+namespace at {
+
+namespace npu { namespace detail {
+
+// Ensures we only call npuGetDeviceCount only once.
+static std::once_flag num_npu_init_flag;
+
+// Total number of npus in the system.
+static int64_t num_npus;
+
+// Ensures default_gens_npu is initialized once.
+static std::deque<std::once_flag> npu_gens_init_flag;
+
+// Default, global NPU generators, one per NPU.
+static std::vector<std::shared_ptr<NPUGenerator>> default_gens_npu;
+
+/* 
+* Populates the global variables related to NPU generators
+* Warning: this function must only be called once!
+*/
+static void initNPUGenVector(){
+  num_npus = c10::npu::device_count();
+  npu_gens_init_flag.resize(num_npus);
+  default_gens_npu.resize(num_npus);
+}
+
+/**
+ * PyTorch maintains a collection of default generators that get
+ * initialized once. The purpose of these default generators is to
+ * maintain a global running state of the pseudo random number generation,
+ * when a user does not explicitly mention any generator.
+ * getDefaultNPUGenerator gets the default generator for a particular
+ * npu device.
+ */
+NPUGenerator* getDefaultNPUGenerator(DeviceIndex device_index) {
+  std::call_once(num_npu_init_flag, initNPUGenVector);
+  DeviceIndex idx = device_index;
+  if (idx == -1) {
+    idx = c10::npu::current_device();
+  } else {
+    TORCH_CHECK(idx >= 0 && idx < num_npus);
+  }
+  std::call_once(npu_gens_init_flag[idx], [&] {
+    default_gens_npu[idx] = std::make_shared<NPUGenerator>(idx);
+    default_gens_npu[idx]->seed();
+  });
+  return default_gens_npu[idx].get();
+}
+
+/**
+ * Utility to create a NPUGenerator. Returns a shared_ptr
+ */
+std::shared_ptr<NPUGenerator> createNPUGenerator(DeviceIndex device_index) {
+  std::call_once(num_npu_init_flag, initNPUGenVector);
+  DeviceIndex idx = device_index;
+  if (idx == -1) {
+    idx = c10::npu::current_device();
+  }
+  TORCH_CHECK(idx >= 0 && idx < num_npus, "The device_index is invalid.");
+  auto gen = std::make_shared<NPUGenerator>(idx);
+  gen->set_current_seed(default_rng_seed_val);
+  gen->set_philox_offset_per_thread(0);
+  return gen;
+}
+
+} // namespace detail
+} // namespace npu
+
+
+/**
+ * NPUGenerator class implementation
+ */
+NPUGenerator::NPUGenerator(DeviceIndex device_index)
+  : Generator{Device(DeviceType::NPU, device_index),
+              DispatchKeySet(c10::DispatchKey::NPUTensorId)} { }
+
+/**
+ * Sets the seed to be used by curandStatePhilox4_32_10
+ * Resets the philox_offset_per_thread_ to 0
+ * 
+ * See Note [Acquire lock when using random generators]
+ */
+void NPUGenerator::set_current_seed(uint64_t seed) {
+  seed_ = seed;
+  philox_offset_per_thread_ = 0;
+}
+
+/**
+ * Gets the current seed of NPUGenerator.
+ */
+uint64_t NPUGenerator::current_seed() const {
+  return seed_;
+}
+
+/**
+ * Gets a nondeterministic random number from /dev/urandom or time,
+ * seeds the CPUGenerator with it and then returns that number.
+ * 
+ * FIXME: You can move this function to Generator.cpp if the algorithm
+ * in getNonDeterministicRandom is unified for both CPU and NPU
+ */
+uint64_t NPUGenerator::seed() {
+  auto random = at::detail::getNonDeterministicRandom(true);
+  this->set_current_seed(random);
+  return random;
+}
+
+/**
+ * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10
+ * 
+ * See Note [Acquire lock when using random generators]
+ */
+void NPUGenerator::set_philox_offset_per_thread(uint64_t offset) {
+  philox_offset_per_thread_ = offset;
+}
+
+/**
+ * Gets the current philox_offset_per_thread_ of NPUGenerator.
+ */
+uint64_t NPUGenerator::philox_offset_per_thread() {
+  return philox_offset_per_thread_;
+}
+
+/**
+ * Gets the seed and philox offset value to be used in
+ * curandStatePhilox4_32_10
+ * 
+ * Each kernel using philox has to sensibly increment offset
+ * for future users of philox. So it gets the "old" value for
+ * itself (before add), and tells subsequent users which offset
+ * they should use, since only the kernel knows how many randoms
+ * it intends to generate. 
+ * 
+ * Increment should be at least the number of curand() random numbers used in
+ * each thread. It is the user's responsibility to make sure that the increment
+ * for philox is never smaller than the number of curand() calls. Increment
+ * value > the number of curand() calls won't harm but anything less would mean
+ * that you would be reusing random values from previous calls.
+ * 
+ * See Note [Acquire lock when using random generators]
+ */
+std::pair<uint64_t, uint64_t> NPUGenerator::philox_engine_inputs(uint64_t increment) {
+  uint64_t offset = this->philox_offset_per_thread_;
+  this->philox_offset_per_thread_ += increment;
+  return std::make_pair(this->seed_, offset);
+}
+
+/*
+ * Gets the DeviceType of NPUGenerator.
+ * Used for type checking during run time.
+ */
+DeviceType NPUGenerator::device_type() {
+  return DeviceType::NPU;
+}
+
+/**
+ * Public clone method implementation
+ * 
+ * See Note [Acquire lock when using random generators]
+ */
+std::shared_ptr<NPUGenerator> NPUGenerator::clone() const {
+  return std::shared_ptr<NPUGenerator>(this->clone_impl());
+}
+
+/**
+ * Private clone method implementation
+ * 
+ * See Note [Acquire lock when using random generators]
+ */
+NPUGenerator* NPUGenerator::clone_impl() const {
+  auto gen = new NPUGenerator(this->device().index());
+  gen->set_current_seed(this->seed_);
+  gen->set_philox_offset_per_thread(this->philox_offset_per_thread_);
+  return gen;
+}
+} // namespace at
diff --git a/src/aten/src/ATen/npu/NPUGenerator.h b/src/aten/src/ATen/npu/NPUGenerator.h
index 5ae9ba3da6fc343474a8193bae865e2109e155bf..896a96c5a6b9e4005a5d6879b301606b04118b35 100644
--- a/src/aten/src/ATen/npu/NPUGenerator.h
+++ b/src/aten/src/ATen/npu/NPUGenerator.h
@@ -1,53 +1,53 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <ATen/core/Generator.h>
-
-namespace at {
-
-struct TORCH_NPU_API NPUGenerator : public Generator {
-  // Constructors
-  NPUGenerator(DeviceIndex device_index = -1);
-  ~NPUGenerator() = default;
-
-  // NPUGenerator methods
-  std::shared_ptr<NPUGenerator> clone() const;
-  void set_current_seed(uint64_t seed) override;
-  uint64_t current_seed() const override;
-  uint64_t seed() override;
-  void set_philox_offset_per_thread(uint64_t offset);
-  uint64_t philox_offset_per_thread();
-  std::pair<uint64_t, uint64_t> philox_engine_inputs(uint64_t increment);
-  static DeviceType device_type();
-
-private:
-  NPUGenerator* clone_impl() const override;
-  uint64_t seed_ = default_rng_seed_val;
-  uint64_t philox_offset_per_thread_ = 0;
-};
-
-namespace npu {
-namespace detail {
-
-  TORCH_NPU_API NPUGenerator* getDefaultNPUGenerator(DeviceIndex device_index = -1);
-  TORCH_NPU_API std::shared_ptr<NPUGenerator> createNPUGenerator(DeviceIndex device_index = -1);
-
-} // namespace detail
-} // namespace npu
-} // namespace at
-
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ATen/core/Generator.h>
+
+namespace at {
+
+struct TORCH_NPU_API NPUGenerator : public Generator {
+  // Constructors
+  NPUGenerator(DeviceIndex device_index = -1);
+  ~NPUGenerator() = default;
+
+  // NPUGenerator methods
+  std::shared_ptr<NPUGenerator> clone() const;
+  void set_current_seed(uint64_t seed) override;
+  uint64_t current_seed() const override;
+  uint64_t seed() override;
+  void set_philox_offset_per_thread(uint64_t offset);
+  uint64_t philox_offset_per_thread();
+  std::pair<uint64_t, uint64_t> philox_engine_inputs(uint64_t increment);
+  static DeviceType device_type();
+
+private:
+  NPUGenerator* clone_impl() const override;
+  uint64_t seed_ = default_rng_seed_val;
+  uint64_t philox_offset_per_thread_ = 0;
+};
+
+namespace npu {
+namespace detail {
+
+  TORCH_NPU_API NPUGenerator* getDefaultNPUGenerator(DeviceIndex device_index = -1);
+  TORCH_NPU_API std::shared_ptr<NPUGenerator> createNPUGenerator(DeviceIndex device_index = -1);
+
+} // namespace detail
+} // namespace npu
+} // namespace at
+
diff --git a/src/aten/src/ATen/utils/LoadUtils.cpp b/src/aten/src/ATen/utils/LoadUtils.cpp
index 10ed418f73741cbe46f7c78e5193da07fd1013a1..52ecc66d213e539a47b7ee51ebf839287f3fa050 100644
--- a/src/aten/src/ATen/utils/LoadUtils.cpp
+++ b/src/aten/src/ATen/utils/LoadUtils.cpp
@@ -689,17 +689,23 @@ namespace at {
 
   }
 
+  void ZeroStrideClear(Tensor& dst, Tensor& src) {
+    auto strides = dst.strides().vec();
+    auto position = std::find(strides.begin(), strides.end(), 0);
+    if (position != strides.end()) {
+      dst = dst.select(position - strides.begin(), 0);
+      src = src.select(position - strides.begin(), 0);
+    } else {
+      return;
+    }
+    ZeroStrideClear(dst, src);
+  }
+
   // when the stride of some dim is zero, the tensor may has been "expand", copy should only
   // process on any axis of that dim
   // To do: is this kind of copy matches other zero stride cases?
   void CopyMaybeWithZeroStride(Tensor dst, Tensor src) {
-    auto strides = dst.strides().vec();
-    for (int i = 0; i < strides.size(); i++) {
-      if (strides[i] == 0) {
-        dst = dst.select(i, 0);
-        src = src.select(i, 0);
-      }
-    }
+    ZeroStrideClear(dst, src);
     dst.copy_(src);
   }
 
diff --git a/src/third_party/hccl/inc/hccl/hccl.h b/src/third_party/hccl/inc/hccl/hccl.h
index 311e78f2cbe8b97e0545b075a5e2ebef15ec855c..9606e89443003765b4d5506b93aeadd8dd29bb0c 100644
--- a/src/third_party/hccl/inc/hccl/hccl.h
+++ b/src/third_party/hccl/inc/hccl/hccl.h
@@ -1,133 +1,133 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hccl.h
- * @brief HCCL API
- */
-
-#ifndef HCCL_H_
-#define HCCL_H_
-
-#include <hccl/hccl_types.h>
-#include <acl/acl.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif // __cplusplus
-
-/**
- * @brief Initialize HCCL.
- *
- * @param clusterInfo A string identifying the cluster info file path, include file name.
- * @param rank A integer identifying the identify for the rank.
- * @param comm A pointer identifying the initialized communication resource.
- * @return HcclResult
- * @see HcclCommDestroy()
- */
-extern HcclResult HcclCommInitClusterInfo(const char *clusterInfo, uint32_t rank, HcclComm *comm);
-
-/**
- * @brief Get hccl root info.
- *
- * @param rootInfo A pointer identifying the hccl root info.
- * @return HcclResult
- */
-extern HcclResult HcclGetRootInfo(HcclRootInfo *rootInfo);
-
-/**
- * @brief Initialize HCCL with root info.
- *
- * @param nRanks A integer identifying the rank size of the cluster.
- * @param rootInfo A struct identifying the hccl root info.
- * @param rank A integer identifying the identify for the rank.
- * @param comm A pointer identifying the initialized communication resource.
- * @return HcclResult
- * @see HcclCommDestroy()
- */
-extern HcclResult HcclCommInitRootInfo(uint32_t nRanks, const HcclRootInfo *rootInfo, uint32_t rank, HcclComm *comm);
-
-/**
- * @brief AllReduce operator.
- *
- * @param sendBuf A pointer identifying the input data address of the operator.
- * @param recvBuf A pointer identifying the output data address of the operator.
- * @param count An integer(u64) identifying the number of the output data.
- * @param dataType The data type of the operator, must be one of the following types: int8, int16, int32, float16, float32.
- * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
- * @param comm A pointer identifying the communication resource based on.
- * @param stream A pointer identifying the stream information.
- * @return HcclResult 
- */
-extern HcclResult HcclAllReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType dataType, 
-HcclReduceOp op, HcclComm comm, aclrtStream stream);
-
-/**
- * @brief Broadcast operator.
- *
- * @param buf A pointer identifying the data address of the operator.
- * @param count An integer(u64) identifying the number of the data.
- * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
- * @param root An integer(u32) identifying the the root rank in the operator.
- * @param comm A pointer identifying the communication resource based on
- * @param stream A pointer identifying the stream information.
- * @return HcclResult 
- */
-extern HcclResult HcclBroadcast(void *buf, uint64_t count, HcclDataType dataType, uint32_t root, HcclComm comm, 
-aclrtStream stream);
-
-/**
- * @brief ReduceScatter operator.
- *
- * @param sendBuf A pointer identifying the input data address of the operator.
- * @param recvBuf A pointer identifying the output data address of the operator.
- * @param recvCount An integer(u64) identifying the number of the output data.
- * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
- * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
- * @param comm A pointer identifying the communication resource based on.
- * @param stream A pointer identifying the stream information.
- * @return HcclResult 
- */
-extern HcclResult HcclReduceScatter(void *sendBuf, void *recvBuf, uint64_t recvCount, HcclDataType dataType, 
-HcclReduceOp op, HcclComm comm, aclrtStream stream);
-
-/**
- * @brief AllGather operator.
- *
- * @param sendBuf A pointer identifying the input data address of the operator.
- * @param recvBuf A pointer identifying the output data address of the operator.
- * @param sendCount An integer(u64) identifying the number of the input data.
- * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
- * @param comm A pointer identifying the communication resource based on.
- * @param stream A pointer identifying the stream information.
- * @return HcclResult 
- */
-extern HcclResult HcclAllGather(void *sendBuf, void *recvBuf, uint64_t sendCount, HcclDataType dataType, 
-HcclComm comm, aclrtStream stream);
-
-/**
- * @brief Destroy HCCL comm
- *
- * @param comm A pointer identifying the communication resource targetting
- * @return HcclResult
- * @see HcclCommInitClusterInfo()
- */
-extern HcclResult HcclCommDestroy(HcclComm comm);
-
-#ifdef __cplusplus
-}
-#endif // __cplusplus
-#endif // HCCL_H_
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file hccl.h
+ * @brief HCCL API
+ */
+
+#ifndef HCCL_H_
+#define HCCL_H_
+
+#include <hccl/hccl_types.h>
+#include <acl/acl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/**
+ * @brief Initialize HCCL.
+ *
+ * @param clusterInfo A string identifying the cluster info file path, include file name.
+ * @param rank A integer identifying the identify for the rank.
+ * @param comm A pointer identifying the initialized communication resource.
+ * @return HcclResult
+ * @see HcclCommDestroy()
+ */
+extern HcclResult HcclCommInitClusterInfo(const char *clusterInfo, uint32_t rank, HcclComm *comm);
+
+/**
+ * @brief Get hccl root info.
+ *
+ * @param rootInfo A pointer identifying the hccl root info.
+ * @return HcclResult
+ */
+extern HcclResult HcclGetRootInfo(HcclRootInfo *rootInfo);
+
+/**
+ * @brief Initialize HCCL with root info.
+ *
+ * @param nRanks A integer identifying the rank size of the cluster.
+ * @param rootInfo A struct identifying the hccl root info.
+ * @param rank A integer identifying the identify for the rank.
+ * @param comm A pointer identifying the initialized communication resource.
+ * @return HcclResult
+ * @see HcclCommDestroy()
+ */
+extern HcclResult HcclCommInitRootInfo(uint32_t nRanks, const HcclRootInfo *rootInfo, uint32_t rank, HcclComm *comm);
+
+/**
+ * @brief AllReduce operator.
+ *
+ * @param sendBuf A pointer identifying the input data address of the operator.
+ * @param recvBuf A pointer identifying the output data address of the operator.
+ * @param count An integer(u64) identifying the number of the output data.
+ * @param dataType The data type of the operator, must be one of the following types: int8, int16, int32, float16, float32.
+ * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
+ * @param comm A pointer identifying the communication resource based on.
+ * @param stream A pointer identifying the stream information.
+ * @return HcclResult 
+ */
+extern HcclResult HcclAllReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType dataType, 
+HcclReduceOp op, HcclComm comm, aclrtStream stream);
+
+/**
+ * @brief Broadcast operator.
+ *
+ * @param buf A pointer identifying the data address of the operator.
+ * @param count An integer(u64) identifying the number of the data.
+ * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
+ * @param root An integer(u32) identifying the the root rank in the operator.
+ * @param comm A pointer identifying the communication resource based on
+ * @param stream A pointer identifying the stream information.
+ * @return HcclResult 
+ */
+extern HcclResult HcclBroadcast(void *buf, uint64_t count, HcclDataType dataType, uint32_t root, HcclComm comm, 
+aclrtStream stream);
+
+/**
+ * @brief ReduceScatter operator.
+ *
+ * @param sendBuf A pointer identifying the input data address of the operator.
+ * @param recvBuf A pointer identifying the output data address of the operator.
+ * @param recvCount An integer(u64) identifying the number of the output data.
+ * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
+ * @param op The reduction type of the operator, must be one of the following types: sum, min, max, prod.
+ * @param comm A pointer identifying the communication resource based on.
+ * @param stream A pointer identifying the stream information.
+ * @return HcclResult 
+ */
+extern HcclResult HcclReduceScatter(void *sendBuf, void *recvBuf, uint64_t recvCount, HcclDataType dataType, 
+HcclReduceOp op, HcclComm comm, aclrtStream stream);
+
+/**
+ * @brief AllGather operator.
+ *
+ * @param sendBuf A pointer identifying the input data address of the operator.
+ * @param recvBuf A pointer identifying the output data address of the operator.
+ * @param sendCount An integer(u64) identifying the number of the input data.
+ * @param dataType The data type of the operator, must be one of the following types: int8, int32, float16, float32.
+ * @param comm A pointer identifying the communication resource based on.
+ * @param stream A pointer identifying the stream information.
+ * @return HcclResult 
+ */
+extern HcclResult HcclAllGather(void *sendBuf, void *recvBuf, uint64_t sendCount, HcclDataType dataType, 
+HcclComm comm, aclrtStream stream);
+
+/**
+ * @brief Destroy HCCL comm
+ *
+ * @param comm A pointer identifying the communication resource targetting
+ * @return HcclResult
+ * @see HcclCommInitClusterInfo()
+ */
+extern HcclResult HcclCommDestroy(HcclComm comm);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // HCCL_H_
diff --git a/src/third_party/hccl/inc/hccl/hccl_types.h b/src/third_party/hccl/inc/hccl/hccl_types.h
index 3fe701c044d354f9a128b5229f39fd7610a9b9b9..29ab1a95cbaebad063021900247c94bb63ed377c 100644
--- a/src/third_party/hccl/inc/hccl/hccl_types.h
+++ b/src/third_party/hccl/inc/hccl/hccl_types.h
@@ -1,100 +1,100 @@
-/**
- * Copyright 2019-2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file hccl_types.h
- * @brief HCCL data type definition 
- * 
- */
- 
-#ifndef HCCL_TYPES_H_
-#define HCCL_TYPES_H_
-
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif // __cplusplus
-
-/**
- * @brief HCCL functions return value definition
- */
-typedef enum {
-    HCCL_SUCCESS = 0,               /**< success */
-    HCCL_E_PARA = 1,                /**< parameter error */
-    HCCL_E_PTR = 2,                 /**< empty pointer */
-    HCCL_E_MEMORY = 3,              /**< memory error */
-    HCCL_E_INTERNAL = 4,            /**< internal error */
-    HCCL_E_NOT_SUPPORT = 5,         /**< not support feature */
-    HCCL_E_NOT_FOUND = 6,           /**< not found specific resource */
-    HCCL_E_UNAVAIL = 7,             /**< resource unavailable */
-    HCCL_E_SYSCALL = 8,             /**< call system interface error */
-    HCCL_E_TIMEOUT = 9,             /**< timeout */
-    HCCL_E_OPEN_FILE_FAILURE = 10,  /**< open file fail */
-    HCCL_E_TCP_CONNECT = 11,        /**< tcp connect fail */
-    HCCL_E_ROCE_CONNECT = 12,       /**< roce connect fail */
-    HCCL_E_TCP_TRANSFER = 13,       /**< tcp transfer fail */
-    HCCL_E_ROCE_TRANSFER = 14,      /**< roce transfer fail */
-    HCCL_E_RUNTIME = 15,            /**< call runtime api fail */
-    HCCL_E_DRV = 16,                /**< call driver api fail */
-    HCCL_E_PROFILING = 17,          /**< call profiling api fail */
-    HCCL_E_CCE = 18,                /**< call cce api fail */
-    HCCL_E_NETWORK = 19,            /**< call network api fail */
-    HCCL_E_RESERVED                 /**< reserved */
-} HcclResult;
-
-/**
- * @brief handle to HCCL communicator
- */
-typedef void *HcclComm;
-
-/**
- * @brief HCCL Reduction opperation
- */
-typedef enum {
-    HCCL_REDUCE_SUM = 0,    /**< sum */
-    HCCL_REDUCE_PROD = 1,   /**< prod */
-    HCCL_REDUCE_MAX = 2,    /**< max */
-    HCCL_REDUCE_MIN = 3,    /**< min */
-    HCCL_REDUCE_RESERVED    /**< reserved */
-} HcclReduceOp;
-
-/**
- * @brief HCCL data type
- */
-typedef enum {
-    HCCL_DATA_TYPE_INT8 = 0,    /**< int8 */
-    HCCL_DATA_TYPE_INT16 = 1,   /**< int16 */
-    HCCL_DATA_TYPE_INT32 = 2,   /**< int32 */
-    HCCL_DATA_TYPE_FP16 = 3,    /**< fp16 */
-    HCCL_DATA_TYPE_FP32 = 4,    /**< fp32 */
-    HCCL_DATA_TYPE_INT64 = 5,    /**< int 64 */
-    HCCL_DATA_TYPE_RESERVED     /**< reserved */
-} HcclDataType;
-
-const uint32_t HCCL_ROOT_INFO_BYTES =  4108; // 4108: root info length
-
-/**
- * @brief HCCL root info
- */
-typedef struct HcclRootInfoDef {
-    char internal[HCCL_ROOT_INFO_BYTES];
-} HcclRootInfo;
-
-#ifdef __cplusplus
-}
-#endif // __cplusplus
-#endif // HCCL_TYPES_H_
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file hccl_types.h
+ * @brief HCCL data type definition 
+ * 
+ */
+ 
+#ifndef HCCL_TYPES_H_
+#define HCCL_TYPES_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/**
+ * @brief HCCL functions return value definition
+ */
+typedef enum {
+    HCCL_SUCCESS = 0,               /**< success */
+    HCCL_E_PARA = 1,                /**< parameter error */
+    HCCL_E_PTR = 2,                 /**< empty pointer */
+    HCCL_E_MEMORY = 3,              /**< memory error */
+    HCCL_E_INTERNAL = 4,            /**< internal error */
+    HCCL_E_NOT_SUPPORT = 5,         /**< not support feature */
+    HCCL_E_NOT_FOUND = 6,           /**< not found specific resource */
+    HCCL_E_UNAVAIL = 7,             /**< resource unavailable */
+    HCCL_E_SYSCALL = 8,             /**< call system interface error */
+    HCCL_E_TIMEOUT = 9,             /**< timeout */
+    HCCL_E_OPEN_FILE_FAILURE = 10,  /**< open file fail */
+    HCCL_E_TCP_CONNECT = 11,        /**< tcp connect fail */
+    HCCL_E_ROCE_CONNECT = 12,       /**< roce connect fail */
+    HCCL_E_TCP_TRANSFER = 13,       /**< tcp transfer fail */
+    HCCL_E_ROCE_TRANSFER = 14,      /**< roce transfer fail */
+    HCCL_E_RUNTIME = 15,            /**< call runtime api fail */
+    HCCL_E_DRV = 16,                /**< call driver api fail */
+    HCCL_E_PROFILING = 17,          /**< call profiling api fail */
+    HCCL_E_CCE = 18,                /**< call cce api fail */
+    HCCL_E_NETWORK = 19,            /**< call network api fail */
+    HCCL_E_RESERVED                 /**< reserved */
+} HcclResult;
+
+/**
+ * @brief handle to HCCL communicator
+ */
+typedef void *HcclComm;
+
+/**
+ * @brief HCCL Reduction opperation
+ */
+typedef enum {
+    HCCL_REDUCE_SUM = 0,    /**< sum */
+    HCCL_REDUCE_PROD = 1,   /**< prod */
+    HCCL_REDUCE_MAX = 2,    /**< max */
+    HCCL_REDUCE_MIN = 3,    /**< min */
+    HCCL_REDUCE_RESERVED    /**< reserved */
+} HcclReduceOp;
+
+/**
+ * @brief HCCL data type
+ */
+typedef enum {
+    HCCL_DATA_TYPE_INT8 = 0,    /**< int8 */
+    HCCL_DATA_TYPE_INT16 = 1,   /**< int16 */
+    HCCL_DATA_TYPE_INT32 = 2,   /**< int32 */
+    HCCL_DATA_TYPE_FP16 = 3,    /**< fp16 */
+    HCCL_DATA_TYPE_FP32 = 4,    /**< fp32 */
+    HCCL_DATA_TYPE_INT64 = 5,    /**< int 64 */
+    HCCL_DATA_TYPE_RESERVED     /**< reserved */
+} HcclDataType;
+
+const uint32_t HCCL_ROOT_INFO_BYTES =  4108; // 4108: root info length
+
+/**
+ * @brief HCCL root info
+ */
+typedef struct HcclRootInfoDef {
+    char internal[HCCL_ROOT_INFO_BYTES];
+} HcclRootInfo;
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // HCCL_TYPES_H_
diff --git a/src/tools/autograd/derivatives.yaml b/src/tools/autograd/derivatives.yaml
index 1db83b1c5a6a2870f5721b3d2483ec24b45e2ab3..ee68e09e8dccd12c5bd3023a5cc16d06814822e5 100644
--- a/src/tools/autograd/derivatives.yaml
+++ b/src/tools/autograd/derivatives.yaml
@@ -1691,4 +1691,7 @@
 
 - name: npu_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
   input, weight: npu_linear_backward(grad, input, weight)
-  bias: maybe_multiply(grad, 1)
\ No newline at end of file
+  bias: maybe_multiply(grad, 1)
+
+- name: npu_giou(Tensor self, Tensor gtboxes, bool trans=False, bool is_cross=False, int mode=0) -> Tensor
+  self, gtboxes: npu_giou_backward(grad, self, gtboxes, trans, is_cross, mode)
\ No newline at end of file
diff --git a/src/torch/lib/c10d/HCCLUtils.hpp b/src/torch/lib/c10d/HCCLUtils.hpp
index 6f19a66b4b5d1f03dd3936e09ed3e5ccc313e1d9..46e98d8b3e49cdb83b22db388f0f64ee15e3f02f 100644
--- a/src/torch/lib/c10d/HCCLUtils.hpp
+++ b/src/torch/lib/c10d/HCCLUtils.hpp
@@ -1,79 +1,79 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <c10/npu/npu_log.h>
-#include <memory>
-
-#define C10D_HCCL_CHECK(cmd)                                        \
-  do {                                                              \
-    HcclResult error = cmd;                                         \
-    if (error != HCCL_SUCCESS) {                                    \
-      std::string err = "HCCL error in: " + std::string(__FILE__) + \
-          std::to_string(__LINE__) + ", " + std::to_string(error);  \
-      throw std::runtime_error(err);                                \
-    }                                                               \
-  } while (0)
-
-namespace c10d {
-
-// RAII wrapper for HCCL communicator
-class HCCLComm {
- public:
-  explicit HCCLComm(HcclComm hcclComm) : hcclComm_(hcclComm) {}
-
-  HCCLComm() : HCCLComm(nullptr) {}
-
-  ~HCCLComm() noexcept {
-    if (hcclComm_) {
-      HcclCommDestroy(hcclComm_);
-    }
-  }
-
-  static std::shared_ptr<HCCLComm> create(
-      int numRanks,
-      int rank,
-      HcclRootInfo& rootInfo) {
-    auto comm = std::make_shared<HCCLComm>();
-    C10D_HCCL_CHECK(
-        HcclCommInitRootInfo(numRanks, &rootInfo, rank, &(comm->hcclComm_)));
-    return comm;
-  }
-
-  // Must not be copyable
-  HCCLComm(const HCCLComm&) = delete;
-  HCCLComm& operator=(const HCCLComm&) = delete;
-
-  // Move constructable
-  HCCLComm(HCCLComm&& other) {
-    std::swap(hcclComm_, other.hcclComm_);
-  }
-
-  // Move assignable
-  HCCLComm& operator=(HCCLComm&& other) {
-    std::swap(hcclComm_, other.hcclComm_);
-    return *this;
-  }
-
-  HcclComm getHcclComm() const{
-    return hcclComm_;
-  }
-
- protected:
-  HcclComm hcclComm_;
-};
-} // namespace c10d
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <c10/npu/npu_log.h>
+#include <memory>
+
+#define C10D_HCCL_CHECK(cmd)                                        \
+  do {                                                              \
+    HcclResult error = cmd;                                         \
+    if (error != HCCL_SUCCESS) {                                    \
+      std::string err = "HCCL error in: " + std::string(__FILE__) + \
+          std::to_string(__LINE__) + ", " + std::to_string(error);  \
+      throw std::runtime_error(err);                                \
+    }                                                               \
+  } while (0)
+
+namespace c10d {
+
+// RAII wrapper for HCCL communicator
+class HCCLComm {
+ public:
+  explicit HCCLComm(HcclComm hcclComm) : hcclComm_(hcclComm) {}
+
+  HCCLComm() : HCCLComm(nullptr) {}
+
+  ~HCCLComm() noexcept {
+    if (hcclComm_) {
+      HcclCommDestroy(hcclComm_);
+    }
+  }
+
+  static std::shared_ptr<HCCLComm> create(
+      int numRanks,
+      int rank,
+      HcclRootInfo& rootInfo) {
+    auto comm = std::make_shared<HCCLComm>();
+    C10D_HCCL_CHECK(
+        HcclCommInitRootInfo(numRanks, &rootInfo, rank, &(comm->hcclComm_)));
+    return comm;
+  }
+
+  // Must not be copyable
+  HCCLComm(const HCCLComm&) = delete;
+  HCCLComm& operator=(const HCCLComm&) = delete;
+
+  // Move constructable
+  HCCLComm(HCCLComm&& other) {
+    std::swap(hcclComm_, other.hcclComm_);
+  }
+
+  // Move assignable
+  HCCLComm& operator=(HCCLComm&& other) {
+    std::swap(hcclComm_, other.hcclComm_);
+    return *this;
+  }
+
+  HcclComm getHcclComm() const{
+    return hcclComm_;
+  }
+
+ protected:
+  HcclComm hcclComm_;
+};
+} // namespace c10d
diff --git a/src/torch/lib/c10d/ProcessGroupHCCL.cpp b/src/torch/lib/c10d/ProcessGroupHCCL.cpp
index d821404dacef4bb6c5fdb1ae28d70012b8d9ea74..a94f0e593103c28df39596a1b0285aedb8c056a6 100644
--- a/src/torch/lib/c10d/ProcessGroupHCCL.cpp
+++ b/src/torch/lib/c10d/ProcessGroupHCCL.cpp
@@ -1,774 +1,774 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION.
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <c10/npu/NPUCachingAllocator.h>
-#include <c10/npu/NPUGuard.h>
-#include <c10/npu/NPUStream.h>
-#include <c10d/ProcessGroupHCCL.hpp>
-#include <c10d/Utils.hpp>
-#include <third_party/acl/inc/acl/acl.h>
-#include <third_party/acl/inc/acl/acl_base.h>
-#include <torch/csrc/autograd/record_function.h>
-#include <map>
-#include <tuple>
-#include <unordered_set>
-
-namespace c10d {
-namespace {
-using hcclUs = std::chrono::steady_clock::time_point;
-#define DURATION_US(x) \
-  (std::chrono::duration_cast<std::chrono::microseconds>(x))
-#define TIME_NOW() ({ std::chrono::steady_clock::now(); })
-
-// HCCL ReduceOp mapping
-std::map<ReduceOp, HcclReduceOp> hcclOp = {
-    {ReduceOp::MIN, HCCL_REDUCE_MIN},
-    {ReduceOp::MAX, HCCL_REDUCE_MAX},
-    {ReduceOp::SUM, HCCL_REDUCE_SUM},
-    {ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
-};
-
-// HCCL DataType mapping
-std::map<at::ScalarType, HcclDataType> hcclDataType = {
-    {at::kChar, HCCL_DATA_TYPE_INT8},
-    {at::kFloat, HCCL_DATA_TYPE_FP32},
-    {at::kInt, HCCL_DATA_TYPE_INT32},
-    {at::kHalf, HCCL_DATA_TYPE_FP16},
-    {at::kShort, HCCL_DATA_TYPE_INT16},
-    {at::kLong, HCCL_DATA_TYPE_INT64},
-};
-
-// Helper function that gets the data type and issues error if not supported
-HcclDataType getHcclDataType(at::ScalarType type) {
-  try {
-    return hcclDataType.at(type);
-  } catch (std::out_of_range& e) {
-    throw std::runtime_error("Unsupported data type for HCCL process group");
-  }
-}
-
-// Get the deviceList String from the list of devices
-std::string getKeyFromDevices(const std::vector<at::Device>& devices) {
-  std::string deviceList;
-  for (auto& device : devices) {
-    if (deviceList.empty()) {
-      deviceList = std::to_string(device.index());
-    } else {
-      deviceList += "," + std::to_string(device.index());
-    }
-  }
-  return deviceList;
-}
-
-// Get the list of devices from list of tensors
-std::vector<at::Device> getDeviceList(const std::vector<at::Tensor>& tensors) {
-  std::vector<at::Device> res;
-  res.reserve(tensors.size());
-  for (auto& tensor : tensors) {
-    res.push_back(tensor.device());
-  }
-  return res;
-}
-
-// [Sync Streams] Helper that lets the input hcclStreams to wait for the current
-// stream. HCCL communications run on hcclStreams, but input tensors are
-// allocated on different streams (i.e., current streams). Communications on
-// hcclStreams cannot start before pending input tensor ops on current streams
-// finish. Otherwise, ops on two streams might read/write same tensors
-// concurrently.
-
-// The synchronization above alone is not enough. We also need to make sure
-// input tensors are not freed before their usages on hcclStreams finish. This
-// can be achieved by calling ::recordStream,
-// which remembers the usage stream (hcclStream), creates an event on the usage
-// stream when GC attempts to free the input tensor, and delays GC until that
-// event is done.
-void syncStreams(
-    const std::vector<at::Device>& devices,
-    std::vector<at::npu::NPUEvent>& hcclEvents,
-    std::vector<c10::npu::NPUStream>& hcclStreams) {
-  for (size_t i = 0; i < devices.size(); ++i) {
-    c10::npu::NPUStream& hcclStream = hcclStreams[i];
-    at::npu::NPUEvent& hcclEvent = hcclEvents[i];
-    hcclEvent.record(c10::npu::getCurrentNPUStream(devices[i].index()));
-    hcclEvent.block(hcclStream);
-  }
-}
-
-// exit call back for allreduce error
-void exceptionCallback(aclrtExceptionInfo* exceptionInfo) {
-  std::string err = "AllReduce error in:" + std::string(__FILE__) + ": " +
-      std::to_string(__LINE__);
-  throw std::runtime_error(err);
-}
-} // namespace
-
-constexpr int64_t kSynchronizeBusyWaitMillis = 10;
-constexpr int64_t maxOpNumPerSyncPoint = 2;
-const int64_t ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis = 10 * 1000;
-ProcessGroupHCCL::WorkHCCL::WorkHCCL(const std::vector<at::Device>& devices)
-    : devices_(devices), workStartTime_(std::chrono::steady_clock::now()) {
-  // Creates the npu event wrappers
-  // Note: The actual events are lazily created when first recorded to with
-  // DEFAULT_FLAGS = npuEventDisableTiming.
-  npuEvents_.resize(devices.size());
-  hcclComms_.resize(devices.size());
-}
-
-ProcessGroupHCCL::WorkHCCL::~WorkHCCL() {}
-
-bool ProcessGroupHCCL::WorkHCCL::isCompleted() {
-  checkAndSetException();
-  return exception() || finishedNPUExecutionInternal();
-}
-
-bool ProcessGroupHCCL::WorkHCCL::isSuccess() const {
-  if (exception()) {
-    // Already detected an exception.
-    return false;
-  }
-  // TODO support checkForHCCLErrors
-  return finishedNPUExecutionInternal();
-}
-
-void ProcessGroupHCCL::WorkHCCL::checkAndSetException() {
-  if (exception()) {
-    // We already have an exception.
-    return;
-  }
-  // TODO support checkForHCCLErrors
-}
-
-// Helper that checks if the HCCL kernels are completed on the NPU
-bool ProcessGroupHCCL::WorkHCCL::finishedNPUExecution() {
-  checkAndSetException();
-  return finishedNPUExecutionInternal();
-}
-
-// check if HCCL task is finished
-bool ProcessGroupHCCL::WorkHCCL::finishedNPUExecutionInternal() const {
-  for (size_t i = 0; i < devices_.size(); ++i) {
-    // Checking Event completed by Eventquery
-    aclrtEventStatus status;
-    auto ret = aclrtQueryEvent(npuEvents_[i], &status);
-    if (ret != ACL_ERROR_NONE || status == ACL_EVENT_STATUS_NOT_READY) {
-      return false;
-    }
-  }
-  return true;
-}
-
-void ProcessGroupHCCL::WorkHCCL::checkAndThrowException() {
-  // Set the appropriate exception if found.
-  checkAndSetException();
-
-  // Throw an exception, only if we have a valid exception.
-  if (exception()) {
-    std::rethrow_exception(exception());
-  }
-}
-
-// Waiting on the work's corresponding NPU events
-void ProcessGroupHCCL::WorkHCCL::synchronize() {
-  for (size_t i = 0; i < devices_.size(); ++i) {
-    auto currentStream = at::npu::getCurrentNPUStream(devices_[i].index());
-    // Block the current stream on the HCCL stream
-    npuEvents_[i].block(currentStream);
-    // If we use the work to do barrier, we should block here
-    if (!barrierTensors_.empty()) {
-      c10::npu::NPUGuard npuGuard(devices_[i]);
-      c10::npu::npuSynchronizeDevice();
-    }
-  }
-
-  // In case of blocking, wait for the operation to complete.
-  if (blockingWait_) {
-    // Wait for the operation to complete.
-    while (!isCompleted()) {
-      auto currentTimepoint = std::chrono::steady_clock::now();
-      if (std::chrono::duration_cast<std::chrono::milliseconds>(
-              currentTimepoint - workStartTime_) > opTimeout_) {
-        throw std::runtime_error("Operation timed out!");
-      }
-      // Check for errors and throw appropriate exception.
-      checkAndThrowException(); // TODO support checkAndThrowException
-      std::this_thread::sleep_for(
-          std::chrono::milliseconds(kSynchronizeBusyWaitMillis));
-    }
-    checkAndThrowException(); // TODO support checkAndThrowException
-  }
-}
-
-// Same as calling synchronize().
-bool ProcessGroupHCCL::WorkHCCL::wait() {
-  synchronize();
-  // Always return true, because abort API is not implemented.
-  return true;
-}
-
-ProcessGroupHCCL::ProcessGroupHCCL(
-    const std::shared_ptr<Store>& store,
-    int rank,
-    int size,
-    const std::chrono::milliseconds& opTimeout)
-    : ProcessGroup(rank, size),
-      store_(store),
-      hcclCommCounter_(0),
-      terminateWatchdog_(false),
-      opTimeout_(opTimeout) {
-  char* blockingWait = getenv(HCCL_BLOCKING_WAIT);
-  try {
-    if (blockingWait != nullptr) {
-      auto val = std::stoi(blockingWait);
-      if (val == 1) {
-        // Make wait() and synchronize() a blocking call.
-        blockingWait_ = true;
-      } else if (val != 0) {
-        throw std::runtime_error(
-            "Invalid value for environment variable: " +
-            std::string(HCCL_BLOCKING_WAIT));
-      }
-    }
-  } catch (std::exception& e) {
-    throw std::runtime_error(
-        "Invalid value for environment variable: " +
-        std::string(HCCL_BLOCKING_WAIT));
-  }
-}
-
-ProcessGroupHCCL::~ProcessGroupHCCL() {}
-
-void ProcessGroupHCCL::broadcastMasterID(HcclRootInfo* hcclID) {
-  // For every HCCL communicator that we create we need to broadcast
-  // a unique ID from rank 0 to all other ranks. This broadcast is
-  // done by rank 0 setting a key in the store and all other ranks
-  // retrieving the contents of that key. A single process group
-  // may create multiple HCCL communicators, so we use a sequence
-  // number to differentiate between them.
-  std::string storeKey = std::to_string(hcclCommCounter_++);
-  if (rank_ == 0) {
-    auto vec = std::vector<uint8_t>(
-        reinterpret_cast<uint8_t*>(hcclID),
-        reinterpret_cast<uint8_t*>(hcclID) + HCCL_ROOT_INFO_BYTES);
-    store_->set(storeKey, vec);
-  } else {
-    auto vec = store_->get(storeKey);
-    TORCH_CHECK(vec.size() == HCCL_ROOT_INFO_BYTES);
-    std::memcpy(hcclID, vec.data(), vec.size());
-  }
-}
-
-/*
-void ProcessGroupHCCL::fluxLimit (
-    const std::string& devicesKey,
-    const int index) {
-  // event sync every two allreduce
-  if ((++collectiveCnts_[devicesKey][index]) < maxOpNumPerSyncPoint) {
-    return;
-  }
-  // sync with last sync point
-  at::npu::NPUEvent &fluxEvent = rateCtrlEvents_[devicesKey][index];
-  if (fluxEvent.isCreated()) {
-    // printf("synchronize point reached. begin event sync\r\n");
-    while(!fluxEvent.query()) {
-      std::this_thread::sleep_for(
-          std::chrono::milliseconds(1));
-    }
-    fluxEvent.synchronize();
-  } else {
-    // printf("fluxEvent[%s][%d] is not created\r\n", devicesKey.c_str(),
-index);
-  }
-  // record new sync point
-  c10::npu::NPUStream& hcclStream = hcclStreams_[devicesKey][index];
-  fluxEvent.record(hcclStream);
-
-  // clear collective count
-  collectiveCnts_[devicesKey][index] = 0;
-}
-*/
-
-std::vector<std::shared_ptr<HCCLComm>>& ProcessGroupHCCL::getHCCLComm(
-    const std::string& devicesKey,
-    const std::vector<at::Device>& devices) {
-  // Sanity check
-  if (devicesKey.empty()) {
-    throw std::runtime_error(
-        "Not able to create/get the HCCL Communicator since "
-        "the NPU devices are not known");
-  }
-
-  for (auto& device : devices) {
-    usedDeviceIdxs_.insert(device.index());
-  }
-
-  {
-    std::lock_guard<std::mutex> lock(devHCCLCommMapLock_);
-    if (devHCCLCommMap_.find(devicesKey) != devHCCLCommMap_.end()) {
-      // Reuse the cached communicator if there is one.
-      return devHCCLCommMap_[devicesKey];
-    }
-  }
-
-  // HCCL communicator not cached, create a new entry
-  std::vector<std::shared_ptr<HCCLComm>> hcclComms;
-  hcclComms.resize(devices.size());
-
-  HcclRootInfo hcclID;
-  if (rank_ == 0) {
-    C10D_HCCL_CHECK(HcclGetRootInfo(&hcclID));
-  }
-  broadcastMasterID(&hcclID);
-
-  c10::npu::OptionalNPUGuard npuGuard;
-  std::vector<c10::npu::NPUStream> streamVal;
-  streamVal.reserve(devices.size());
-
-  for (size_t i = 0; i < devices.size(); ++i) {
-    int numRanks = getSize();
-    int rank = getRank() * devices.size() + i;
-
-    npuGuard.set_index(devices[i].index());
-    hcclComms[i] = HCCLComm::create(numRanks, rank, hcclID);
-
-    // Creates the HCCL streams
-    streamVal.push_back(c10::npu::getNPUStreamFromPool(devices[i].index()));
-  }
-
-  hcclStreams_.emplace(devicesKey, std::move(streamVal));
-
-  // Note: these events are created with the (default) cudaEventDisableTiming
-  // flag This flag provides the best performance when used with
-  // StreamWaitEvent() and EventQuery(). Since we here don't measure the
-  // performance using npuEvent, this should be set.
-  hcclEvents_.emplace(
-      std::piecewise_construct,
-      std::make_tuple(devicesKey),
-      std::make_tuple(devices.size()));
-
-  // stream length is 1024,
-  rateCtrlEvents_.emplace(
-      std::piecewise_construct,
-      std::make_tuple(devicesKey),
-      std::make_tuple(devices.size()));
-
-  // record collectiveCnts.
-  collectiveCnts_.emplace(
-      std::piecewise_construct,
-      std::make_tuple(devicesKey),
-      std::make_tuple(devices.size()));
-
-  // Hold the lock before modifying the cache.
-  std::lock_guard<std::mutex> lock(devHCCLCommMapLock_);
-
-  // Move the NCCL resource to cache
-  devHCCLCommMap_.emplace(devicesKey, std::move(hcclComms));
-  return devHCCLCommMap_[devicesKey];
-}
-
-namespace {
-
-// Check that all `tensors' have the same type and shape and are distributed
-// across distinct NPUs.
-void check_npu_tensors(const std::vector<at::Tensor>& tensors) {
-  // HCCL support one NPU per process only
-  if (tensors.size() != 1) {
-    throw std::runtime_error(
-        "Tensor list mustn't be larger than the number of available NPUs");
-  }
-  // HCCL support contiguous tensor only
-  if (!tensors[0].is_contiguous()) {
-    throw std::runtime_error("Tensors must be contiguous");
-  }
-}
-
-// Flatten each list in `tensor_lists' for a gather or scatter operation, and
-// ensure compatibility with the corresponding tensor in `other'.
-std::vector<at::Tensor> flatten_for_scatter_gather(
-    std::vector<std::vector<at::Tensor>>& tensor_lists,
-    std::vector<at::Tensor>& other,
-    size_t world_size) {
-  if (tensor_lists.size() != other.size()) {
-    throw std::runtime_error(
-        "Tensor list operands to scatter/gather must have the same length");
-  }
-  const auto num_devices = tensor_lists.size();
-
-  std::vector<at::Tensor> flattened;
-  flattened.resize(num_devices);
-
-  for (auto i = size_t{}; i < num_devices; ++i) {
-    if (tensor_lists[i].size() != world_size * num_devices) {
-      throw std::runtime_error(
-          "Tensor list input to scatter/gather must match number of collective"
-          " participants");
-    }
-
-    // Only check device match for the first tensor in the list; the call to
-    // newLikeFlat() below will check the rest.
-    if (tensor_lists[i].front().get_device() != other[i].get_device()) {
-      throw std::runtime_error(
-          "Corresponding input/output tensors to scatter/gather must all reside"
-          " on the same device");
-    }
-
-    for (const auto& t : tensor_lists[i]) {
-      if (t.numel() != other[i].numel()) {
-        throw std::runtime_error(
-            "All tensor operands to scatter/gather must have the same size");
-      }
-    }
-    // Flatten the tensors (from all ranks) into a single big tensor.
-    flattened[i] = newLikeFlat(tensor_lists, i);
-  }
-  return flattened;
-}
-
-} // namespace
-
-std::shared_ptr<ProcessGroupHCCL::WorkHCCL> ProcessGroupHCCL::initWork(
-    std::vector<at::Device> devices) {
-  if (devices.size() != 1) {
-    throw std::runtime_error(
-        "ProcessGroupHCCL support one device per process only");
-  }
-  return std::make_shared<ProcessGroupHCCL::WorkHCCL>(devices);
-}
-
-template <typename Fn, typename PreProcess, typename PostProcess>
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::collective(
-    std::vector<at::Tensor>& inputs,
-    std::vector<at::Tensor>& outputs,
-    Fn fn,
-    PreProcess pre,
-    PostProcess post) {
-  const auto devices = getDeviceList(inputs);
-  const auto key = getKeyFromDevices(devices);
-  auto& hcclComms = getHCCLComm(key, devices);
-  // First let HCCL streams wait for input tensors allocation streams
-  syncStreams(devices, hcclEvents_[key], hcclStreams_[key]);
-  // Work itself will create the events on all NPUs of tensors
-  auto work = initWork(devices);
-
-  c10::npu::OptionalNPUGuard npuGuard;
-  pre(hcclStreams_[key]);
-
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    npuGuard.set_index(devices[i].index());
-    c10::npu::NPUStream& hcclStream = hcclStreams_[key][i];
-
-    // Both `inputs' and `outputs' are created on a worker stream and used in
-    // different hcclStreams.  Hence, both must record the hcclStream to
-    // prevent being freed before the collective finishes.
-    //
-    // We only record `inputs' here, and leave recording `outputs' to `fn' for
-    // operations where `inputs' and `outputs' are not the same.
-    //
-    // See [Sync Streams].
-    c10::npu::NPUCachingAllocator::recordStream(
-        inputs[i].storage().data_ptr(), hcclStream);
-  }
-  {
-    for (size_t i = 0; i < inputs.size(); ++i) {
-      npuGuard.set_index(devices[i].index());
-      // to avoid to much task pushed to the stream, leading to stream overflow
-      // insert sync point
-      // fluxLimit(key, i);
-      c10::npu::NPUStream& hcclStream = hcclStreams_[key][i];
-      hcclUs startut = TIME_NOW();
-      C10D_HCCL_CHECK(
-          fn(inputs[i], outputs[i], hcclComms[i]->getHcclComm(), hcclStream));
-    }
-  }
-  post(hcclStreams_[key]);
-
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    c10::npu::NPUStream& hcclStream = hcclStreams_[key][i];
-    work->npuEvents_[i].record(hcclStream);
-    work->hcclComms_[i] = hcclComms[i];
-    work->blockingWait_ = blockingWait_;
-    work->opTimeout_ = opTimeout_;
-  }
-
-  return work;
-}
-
-template <typename Fn>
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::collective(
-    std::vector<at::Tensor>& inputs,
-    std::vector<at::Tensor>& outputs,
-    Fn fn) {
-  return collective(
-      inputs,
-      outputs,
-      fn,
-      [](std::vector<c10::npu::NPUStream>&) {},
-      [](std::vector<c10::npu::NPUStream>&) {});
-}
-
-int g_allreduceID = 0;
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::allreduce(
-    std::vector<at::Tensor>& tensors,
-    const AllreduceOptions& opts) {
-  check_npu_tensors(tensors);
-  return collective(
-      tensors,
-      tensors,
-      [&](at::Tensor& input,
-          at::Tensor& output,
-          HcclComm comm,
-          c10::npu::NPUStream& stream) {
-        aclrtSetExceptionInfoCallback(exceptionCallback);
-        RECORD_FUNCTION("HcclAllreduce", std::vector<c10::IValue>({input}));
-        return HcclAllReduce(
-            input.data_ptr(),
-            output.data_ptr(),
-            input.storage().unsafeGetStorageImpl()->numel(),
-            getHcclDataType(input.scalar_type()),
-            hcclOp[opts.reduceOp],
-            comm,
-            stream.stream());
-      });
-}
-int g_broadcastID = 100000;
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::broadcast(
-    std::vector<at::Tensor>& tensors,
-    const BroadcastOptions& opts) {
-  check_npu_tensors(tensors);
-  return collective(
-      tensors,
-      tensors,
-      [&](at::Tensor& input,
-          at::Tensor& output,
-          HcclComm comm,
-          c10::npu::NPUStream& stream) {
-        RECORD_FUNCTION("HcclBroadcast", std::vector<c10::IValue>({input}));
-        const auto root = opts.rootRank * tensors.size() + opts.rootTensor;
-        return HcclBroadcast(
-            input.data_ptr(),
-            input.storage().unsafeGetStorageImpl()->numel(),
-            getHcclDataType(input.scalar_type()),
-            root,
-            comm,
-            stream.stream());
-      });
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::allreduce_coalesced(
-    std::vector<at::Tensor>& /* unused */,
-    const AllreduceCoalescedOptions& /* unused */) {
-  throw std::runtime_error(
-      "ProcessGroupHCCL does not support allreduce_coalesced");
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::reduce(
-    std::vector<at::Tensor>& /* unused */,
-    const ReduceOptions& /* unused */) {
-  throw std::runtime_error("ProcessGroupHCCL does not support reduce");
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::allgather(
-    std::vector<std::vector<at::Tensor>>& outputTensors,
-    std::vector<at::Tensor>& inputTensors,
-    const AllgatherOptions& opts) {
-  check_npu_tensors(inputTensors);
-  auto outputFlattened =
-      flatten_for_scatter_gather(outputTensors, inputTensors, size_);
-  check_npu_tensors(outputFlattened);
-
-  return collective(
-      inputTensors,
-      outputFlattened,
-      [&](at::Tensor& input,
-          at::Tensor& output,
-          HcclComm comm,
-          c10::npu::NPUStream& stream) {
-        RECORD_FUNCTION("HcclAllgather", std::vector<c10::IValue>({input}));
-        c10::npu::NPUCachingAllocator::recordStream(
-            output.storage().data_ptr(), stream);
-        return HcclAllGather(
-            input.data_ptr(),
-            output.data_ptr(),
-            input.storage().unsafeGetStorageImpl()->numel(),
-            getHcclDataType(input.scalar_type()),
-            comm,
-            stream.stream());
-      },
-      [&](std::vector<c10::npu::NPUStream>& hcclStreams) {},
-      [&](std::vector<c10::npu::NPUStream>& hcclStreams) {
-        // Copy the flattened output tensors to the outputs.
-        for (size_t i = 0; i < outputTensors.size(); ++i) {
-          c10::npu::NPUStreamGuard guard(hcclStreams[i]);
-          for (size_t j = 0; j < outputTensors[0].size(); ++j) {
-            // See [Sync Streams].
-            c10::npu::NPUCachingAllocator::recordStream(
-                outputTensors[i][j].storage().data_ptr(), hcclStreams[i]);
-
-            outputTensors[i][j].copy_(outputFlattened[i][j], true);
-          }
-        }
-      });
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::allgather_base(
-    at::Tensor& /*unused */,
-    at::Tensor& /*unused */,
-    const AllgatherOptions& /*unused */) {
-  throw std::runtime_error("ProcessGroupHCCL does not support allgather_base");
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::reduce_scatter(
-    std::vector<at::Tensor>& outputTensors,
-    std::vector<std::vector<at::Tensor>>& inputTensors,
-    const ReduceScatterOptions& opts) {
-  check_npu_tensors(outputTensors);
-
-  auto inputFlattened =
-      flatten_for_scatter_gather(inputTensors, outputTensors, size_);
-  check_npu_tensors(inputFlattened);
-
-  return collective(
-      inputFlattened,
-      outputTensors,
-      [&](at::Tensor& input,
-          at::Tensor& output,
-          HcclComm comm,
-          c10::npu::NPUStream& stream) {
-        RECORD_FUNCTION("HcclReduceScatter", std::vector<c10::IValue>({input}));
-        c10::npu::NPUCachingAllocator::recordStream(
-            output.storage().data_ptr(), stream);
-        return HcclReduceScatter(
-            input.data_ptr(),
-            output.data_ptr(),
-            output.numel(),
-            getHcclDataType(input.scalar_type()),
-            hcclOp[opts.reduceOp],
-            comm,
-            stream.stream());
-      },
-      [&](std::vector<c10::npu::NPUStream>& hcclStreams) {
-        // Copy the input tensors to the flattened inputs.
-        for (size_t i = 0; i < inputTensors.size(); ++i) {
-          c10::npu::NPUStreamGuard guard(hcclStreams[i]);
-          for (size_t j = 0; j < inputTensors[0].size(); ++j) {
-            // See [Sync Streams].
-            c10::npu::NPUCachingAllocator::recordStream(
-                inputTensors[i][j].storage().data_ptr(), hcclStreams[i]);
-
-            inputFlattened[i][j].copy_(inputTensors[i][j], true);
-          }
-        }
-      },
-      [&](std::vector<c10::npu::NPUStream>& hcclStreams) {});
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::barrier(
-    const BarrierOptions& opts) {
-  std::vector<at::Device> devices;
-  if (usedDeviceIdxs_.empty()) {
-    auto numNPUs = c10::npu::device_count();
-    int16_t deviceIdx = static_cast<int16_t>(rank_ % numNPUs);
-    devices.push_back(at::Device(at::DeviceType::NPU, deviceIdx));
-  } else {
-    for (auto usedDeviceIdx : usedDeviceIdxs_) {
-      devices.push_back(at::Device(at::DeviceType::NPU, usedDeviceIdx));
-    }
-  }
-
-  std::vector<at::Tensor> barrierTensors;
-  barrierTensors.reserve(devices.size());
-
-  at::npu::OptionalNPUGuard npuGuard;
-  for (auto& device : devices) {
-    npuGuard.set_index(device.index());
-    barrierTensors.push_back(at::empty(
-        {1},
-        at::TensorOptions().device(at::DeviceType::NPU).dtype(at::kFloat)));
-  }
-
-  auto work = BarrierInside(barrierTensors);
-
-  // Work will take over barrierTensors
-  auto hcclWork = dynamic_cast<ProcessGroupHCCL::WorkHCCL*>(work.get());
-  TORCH_CHECK(hcclWork);
-  hcclWork->barrierTensors_ = std::move(barrierTensors);
-
-  return work;
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::BarrierInside(
-    std::vector<at::Tensor>& tensors) {
-    check_npu_tensors(tensors);
-
-  return collective(
-      tensors,
-      tensors,
-      [&](at::Tensor& input,
-          at::Tensor& output,
-          HcclComm comm,
-          c10::npu::NPUStream& stream) {
-        aclrtSetExceptionInfoCallback(exceptionCallback);
-        auto ret = c10::npu::hccl::hccl_barrier(comm, stream.stream());
-        if (ret == HcclResult::HCCL_E_NOT_SUPPORT) {
-          return HcclAllReduce(
-            input.data_ptr(),
-            output.data_ptr(),
-            input.storage().unsafeGetStorageImpl()->numel(),
-            getHcclDataType(input.scalar_type()),
-            hcclOp[ReduceOp::SUM],
-            comm,
-            stream.stream());
-        }
-        else {
-          return ret;
-        }
-    });
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::gather(
-    std::vector<std::vector<at::Tensor>>& /* unused */,
-    std::vector<at::Tensor>& /* unused */,
-    const GatherOptions& /* unused */) {
-  throw std::runtime_error("ProcessGroupHCCL does not support gather");
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::scatter(
-    std::vector<at::Tensor>& /* unused */,
-    std::vector<std::vector<at::Tensor>>& /* unused */,
-    const ScatterOptions& /* unused */) {
-  throw std::runtime_error("ProcessGroupHCCL does not support scatter");
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::send(
-    std::vector<at::Tensor>& /* unused */,
-    int /* unused */,
-    int /* unused */) {
-  throw std::runtime_error("ProcessGroupHCCL does not support send");
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::recv(
-    std::vector<at::Tensor>& /* unused */,
-    int /* unused */,
-    int /* unused */) {
-  throw std::runtime_error("ProcessGroupHCCL does not support recv");
-}
-
-std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::recvAnysource(
-    std::vector<at::Tensor>& /* unused */,
-    int /* unused */) {
-  throw std::runtime_error("ProcessGroupHCCL does not support recv");
-}
-} // namespace c10d
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION.
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <c10/npu/NPUCachingAllocator.h>
+#include <c10/npu/NPUGuard.h>
+#include <c10/npu/NPUStream.h>
+#include <c10d/ProcessGroupHCCL.hpp>
+#include <c10d/Utils.hpp>
+#include <third_party/acl/inc/acl/acl.h>
+#include <third_party/acl/inc/acl/acl_base.h>
+#include <torch/csrc/autograd/record_function.h>
+#include <map>
+#include <tuple>
+#include <unordered_set>
+
+namespace c10d {
+namespace {
+using hcclUs = std::chrono::steady_clock::time_point;
+#define DURATION_US(x) \
+  (std::chrono::duration_cast<std::chrono::microseconds>(x))
+#define TIME_NOW() ({ std::chrono::steady_clock::now(); })
+
+// HCCL ReduceOp mapping
+std::map<ReduceOp, HcclReduceOp> hcclOp = {
+    {ReduceOp::MIN, HCCL_REDUCE_MIN},
+    {ReduceOp::MAX, HCCL_REDUCE_MAX},
+    {ReduceOp::SUM, HCCL_REDUCE_SUM},
+    {ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
+};
+
+// HCCL DataType mapping
+std::map<at::ScalarType, HcclDataType> hcclDataType = {
+    {at::kChar, HCCL_DATA_TYPE_INT8},
+    {at::kFloat, HCCL_DATA_TYPE_FP32},
+    {at::kInt, HCCL_DATA_TYPE_INT32},
+    {at::kHalf, HCCL_DATA_TYPE_FP16},
+    {at::kShort, HCCL_DATA_TYPE_INT16},
+    {at::kLong, HCCL_DATA_TYPE_INT64},
+};
+
+// Helper function that gets the data type and issues error if not supported
+HcclDataType getHcclDataType(at::ScalarType type) {
+  try {
+    return hcclDataType.at(type);
+  } catch (std::out_of_range& e) {
+    throw std::runtime_error("Unsupported data type for HCCL process group");
+  }
+}
+
+// Get the deviceList String from the list of devices
+std::string getKeyFromDevices(const std::vector<at::Device>& devices) {
+  std::string deviceList;
+  for (auto& device : devices) {
+    if (deviceList.empty()) {
+      deviceList = std::to_string(device.index());
+    } else {
+      deviceList += "," + std::to_string(device.index());
+    }
+  }
+  return deviceList;
+}
+
+// Get the list of devices from list of tensors
+std::vector<at::Device> getDeviceList(const std::vector<at::Tensor>& tensors) {
+  std::vector<at::Device> res;
+  res.reserve(tensors.size());
+  for (auto& tensor : tensors) {
+    res.push_back(tensor.device());
+  }
+  return res;
+}
+
+// [Sync Streams] Helper that lets the input hcclStreams to wait for the current
+// stream. HCCL communications run on hcclStreams, but input tensors are
+// allocated on different streams (i.e., current streams). Communications on
+// hcclStreams cannot start before pending input tensor ops on current streams
+// finish. Otherwise, ops on two streams might read/write same tensors
+// concurrently.
+
+// The synchronization above alone is not enough. We also need to make sure
+// input tensors are not freed before their usages on hcclStreams finish. This
+// can be achieved by calling ::recordStream,
+// which remembers the usage stream (hcclStream), creates an event on the usage
+// stream when GC attempts to free the input tensor, and delays GC until that
+// event is done.
+void syncStreams(
+    const std::vector<at::Device>& devices,
+    std::vector<at::npu::NPUEvent>& hcclEvents,
+    std::vector<c10::npu::NPUStream>& hcclStreams) {
+  for (size_t i = 0; i < devices.size(); ++i) {
+    c10::npu::NPUStream& hcclStream = hcclStreams[i];
+    at::npu::NPUEvent& hcclEvent = hcclEvents[i];
+    hcclEvent.record(c10::npu::getCurrentNPUStream(devices[i].index()));
+    hcclEvent.block(hcclStream);
+  }
+}
+
+// exit call back for allreduce error
+void exceptionCallback(aclrtExceptionInfo* exceptionInfo) {
+  std::string err = "AllReduce error in:" + std::string(__FILE__) + ": " +
+      std::to_string(__LINE__);
+  throw std::runtime_error(err);
+}
+} // namespace
+
+constexpr int64_t kSynchronizeBusyWaitMillis = 10;
+constexpr int64_t maxOpNumPerSyncPoint = 2;
+const int64_t ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis = 10 * 1000;
+ProcessGroupHCCL::WorkHCCL::WorkHCCL(const std::vector<at::Device>& devices)
+    : devices_(devices), workStartTime_(std::chrono::steady_clock::now()) {
+  // Creates the npu event wrappers
+  // Note: The actual events are lazily created when first recorded to with
+  // DEFAULT_FLAGS = npuEventDisableTiming.
+  npuEvents_.resize(devices.size());
+  hcclComms_.resize(devices.size());
+}
+
+ProcessGroupHCCL::WorkHCCL::~WorkHCCL() {}
+
+bool ProcessGroupHCCL::WorkHCCL::isCompleted() {
+  checkAndSetException();
+  return exception() || finishedNPUExecutionInternal();
+}
+
+bool ProcessGroupHCCL::WorkHCCL::isSuccess() const {
+  if (exception()) {
+    // Already detected an exception.
+    return false;
+  }
+  // TODO support checkForHCCLErrors
+  return finishedNPUExecutionInternal();
+}
+
+void ProcessGroupHCCL::WorkHCCL::checkAndSetException() {
+  if (exception()) {
+    // We already have an exception.
+    return;
+  }
+  // TODO support checkForHCCLErrors
+}
+
+// Helper that checks if the HCCL kernels are completed on the NPU
+bool ProcessGroupHCCL::WorkHCCL::finishedNPUExecution() {
+  checkAndSetException();
+  return finishedNPUExecutionInternal();
+}
+
+// check if HCCL task is finished
+bool ProcessGroupHCCL::WorkHCCL::finishedNPUExecutionInternal() const {
+  for (size_t i = 0; i < devices_.size(); ++i) {
+    // Checking Event completed by Eventquery
+    aclrtEventStatus status;
+    auto ret = aclrtQueryEvent(npuEvents_[i], &status);
+    if (ret != ACL_ERROR_NONE || status == ACL_EVENT_STATUS_NOT_READY) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void ProcessGroupHCCL::WorkHCCL::checkAndThrowException() {
+  // Set the appropriate exception if found.
+  checkAndSetException();
+
+  // Throw an exception, only if we have a valid exception.
+  if (exception()) {
+    std::rethrow_exception(exception());
+  }
+}
+
+// Waiting on the work's corresponding NPU events
+void ProcessGroupHCCL::WorkHCCL::synchronize() {
+  for (size_t i = 0; i < devices_.size(); ++i) {
+    auto currentStream = at::npu::getCurrentNPUStream(devices_[i].index());
+    // Block the current stream on the HCCL stream
+    npuEvents_[i].block(currentStream);
+    // If we use the work to do barrier, we should block here
+    if (!barrierTensors_.empty()) {
+      c10::npu::NPUGuard npuGuard(devices_[i]);
+      c10::npu::npuSynchronizeDevice();
+    }
+  }
+
+  // In case of blocking, wait for the operation to complete.
+  if (blockingWait_) {
+    // Wait for the operation to complete.
+    while (!isCompleted()) {
+      auto currentTimepoint = std::chrono::steady_clock::now();
+      if (std::chrono::duration_cast<std::chrono::milliseconds>(
+              currentTimepoint - workStartTime_) > opTimeout_) {
+        throw std::runtime_error("Operation timed out!");
+      }
+      // Check for errors and throw appropriate exception.
+      checkAndThrowException(); // TODO support checkAndThrowException
+      std::this_thread::sleep_for(
+          std::chrono::milliseconds(kSynchronizeBusyWaitMillis));
+    }
+    checkAndThrowException(); // TODO support checkAndThrowException
+  }
+}
+
+// Same as calling synchronize().
+bool ProcessGroupHCCL::WorkHCCL::wait() {
+  synchronize();
+  // Always return true, because abort API is not implemented.
+  return true;
+}
+
+ProcessGroupHCCL::ProcessGroupHCCL(
+    const std::shared_ptr<Store>& store,
+    int rank,
+    int size,
+    const std::chrono::milliseconds& opTimeout)
+    : ProcessGroup(rank, size),
+      store_(store),
+      hcclCommCounter_(0),
+      terminateWatchdog_(false),
+      opTimeout_(opTimeout) {
+  char* blockingWait = getenv(HCCL_BLOCKING_WAIT);
+  try {
+    if (blockingWait != nullptr) {
+      auto val = std::stoi(blockingWait);
+      if (val == 1) {
+        // Make wait() and synchronize() a blocking call.
+        blockingWait_ = true;
+      } else if (val != 0) {
+        throw std::runtime_error(
+            "Invalid value for environment variable: " +
+            std::string(HCCL_BLOCKING_WAIT));
+      }
+    }
+  } catch (std::exception& e) {
+    throw std::runtime_error(
+        "Invalid value for environment variable: " +
+        std::string(HCCL_BLOCKING_WAIT));
+  }
+}
+
+ProcessGroupHCCL::~ProcessGroupHCCL() {}
+
+void ProcessGroupHCCL::broadcastMasterID(HcclRootInfo* hcclID) {
+  // For every HCCL communicator that we create we need to broadcast
+  // a unique ID from rank 0 to all other ranks. This broadcast is
+  // done by rank 0 setting a key in the store and all other ranks
+  // retrieving the contents of that key. A single process group
+  // may create multiple HCCL communicators, so we use a sequence
+  // number to differentiate between them.
+  std::string storeKey = std::to_string(hcclCommCounter_++);
+  if (rank_ == 0) {
+    auto vec = std::vector<uint8_t>(
+        reinterpret_cast<uint8_t*>(hcclID),
+        reinterpret_cast<uint8_t*>(hcclID) + HCCL_ROOT_INFO_BYTES);
+    store_->set(storeKey, vec);
+  } else {
+    auto vec = store_->get(storeKey);
+    TORCH_CHECK(vec.size() == HCCL_ROOT_INFO_BYTES);
+    std::memcpy(hcclID, vec.data(), vec.size());
+  }
+}
+
+/*
+void ProcessGroupHCCL::fluxLimit (
+    const std::string& devicesKey,
+    const int index) {
+  // event sync every two allreduce
+  if ((++collectiveCnts_[devicesKey][index]) < maxOpNumPerSyncPoint) {
+    return;
+  }
+  // sync with last sync point
+  at::npu::NPUEvent &fluxEvent = rateCtrlEvents_[devicesKey][index];
+  if (fluxEvent.isCreated()) {
+    // printf("synchronize point reached. begin event sync\r\n");
+    while(!fluxEvent.query()) {
+      std::this_thread::sleep_for(
+          std::chrono::milliseconds(1));
+    }
+    fluxEvent.synchronize();
+  } else {
+    // printf("fluxEvent[%s][%d] is not created\r\n", devicesKey.c_str(),
+index);
+  }
+  // record new sync point
+  c10::npu::NPUStream& hcclStream = hcclStreams_[devicesKey][index];
+  fluxEvent.record(hcclStream);
+
+  // clear collective count
+  collectiveCnts_[devicesKey][index] = 0;
+}
+*/
+
+std::vector<std::shared_ptr<HCCLComm>>& ProcessGroupHCCL::getHCCLComm(
+    const std::string& devicesKey,
+    const std::vector<at::Device>& devices) {
+  // Sanity check
+  if (devicesKey.empty()) {
+    throw std::runtime_error(
+        "Not able to create/get the HCCL Communicator since "
+        "the NPU devices are not known");
+  }
+
+  for (auto& device : devices) {
+    usedDeviceIdxs_.insert(device.index());
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(devHCCLCommMapLock_);
+    if (devHCCLCommMap_.find(devicesKey) != devHCCLCommMap_.end()) {
+      // Reuse the cached communicator if there is one.
+      return devHCCLCommMap_[devicesKey];
+    }
+  }
+
+  // HCCL communicator not cached, create a new entry
+  std::vector<std::shared_ptr<HCCLComm>> hcclComms;
+  hcclComms.resize(devices.size());
+
+  HcclRootInfo hcclID;
+  if (rank_ == 0) {
+    C10D_HCCL_CHECK(HcclGetRootInfo(&hcclID));
+  }
+  broadcastMasterID(&hcclID);
+
+  c10::npu::OptionalNPUGuard npuGuard;
+  std::vector<c10::npu::NPUStream> streamVal;
+  streamVal.reserve(devices.size());
+
+  for (size_t i = 0; i < devices.size(); ++i) {
+    int numRanks = getSize();
+    int rank = getRank() * devices.size() + i;
+
+    npuGuard.set_index(devices[i].index());
+    hcclComms[i] = HCCLComm::create(numRanks, rank, hcclID);
+
+    // Creates the HCCL streams
+    streamVal.push_back(c10::npu::getNPUStreamFromPool(devices[i].index()));
+  }
+
+  hcclStreams_.emplace(devicesKey, std::move(streamVal));
+
+  // Note: these events are created with the (default) cudaEventDisableTiming
+  // flag This flag provides the best performance when used with
+  // StreamWaitEvent() and EventQuery(). Since we here don't measure the
+  // performance using npuEvent, this should be set.
+  hcclEvents_.emplace(
+      std::piecewise_construct,
+      std::make_tuple(devicesKey),
+      std::make_tuple(devices.size()));
+
+  // stream length is 1024,
+  rateCtrlEvents_.emplace(
+      std::piecewise_construct,
+      std::make_tuple(devicesKey),
+      std::make_tuple(devices.size()));
+
+  // record collectiveCnts.
+  collectiveCnts_.emplace(
+      std::piecewise_construct,
+      std::make_tuple(devicesKey),
+      std::make_tuple(devices.size()));
+
+  // Hold the lock before modifying the cache.
+  std::lock_guard<std::mutex> lock(devHCCLCommMapLock_);
+
+  // Move the NCCL resource to cache
+  devHCCLCommMap_.emplace(devicesKey, std::move(hcclComms));
+  return devHCCLCommMap_[devicesKey];
+}
+
+namespace {
+
+// Check that all `tensors' have the same type and shape and are distributed
+// across distinct NPUs.
+void check_npu_tensors(const std::vector<at::Tensor>& tensors) {
+  // HCCL support one NPU per process only
+  if (tensors.size() != 1) {
+    throw std::runtime_error(
+        "Tensor list mustn't be larger than the number of available NPUs");
+  }
+  // HCCL support contiguous tensor only
+  if (!tensors[0].is_contiguous()) {
+    throw std::runtime_error("Tensors must be contiguous");
+  }
+}
+
+// Flatten each list in `tensor_lists' for a gather or scatter operation, and
+// ensure compatibility with the corresponding tensor in `other'.
+std::vector<at::Tensor> flatten_for_scatter_gather(
+    std::vector<std::vector<at::Tensor>>& tensor_lists,
+    std::vector<at::Tensor>& other,
+    size_t world_size) {
+  if (tensor_lists.size() != other.size()) {
+    throw std::runtime_error(
+        "Tensor list operands to scatter/gather must have the same length");
+  }
+  const auto num_devices = tensor_lists.size();
+
+  std::vector<at::Tensor> flattened;
+  flattened.resize(num_devices);
+
+  for (auto i = size_t{}; i < num_devices; ++i) {
+    if (tensor_lists[i].size() != world_size * num_devices) {
+      throw std::runtime_error(
+          "Tensor list input to scatter/gather must match number of collective"
+          " participants");
+    }
+
+    // Only check device match for the first tensor in the list; the call to
+    // newLikeFlat() below will check the rest.
+    if (tensor_lists[i].front().get_device() != other[i].get_device()) {
+      throw std::runtime_error(
+          "Corresponding input/output tensors to scatter/gather must all reside"
+          " on the same device");
+    }
+
+    for (const auto& t : tensor_lists[i]) {
+      if (t.numel() != other[i].numel()) {
+        throw std::runtime_error(
+            "All tensor operands to scatter/gather must have the same size");
+      }
+    }
+    // Flatten the tensors (from all ranks) into a single big tensor.
+    flattened[i] = newLikeFlat(tensor_lists, i);
+  }
+  return flattened;
+}
+
+} // namespace
+
+std::shared_ptr<ProcessGroupHCCL::WorkHCCL> ProcessGroupHCCL::initWork(
+    std::vector<at::Device> devices) {
+  if (devices.size() != 1) {
+    throw std::runtime_error(
+        "ProcessGroupHCCL support one device per process only");
+  }
+  return std::make_shared<ProcessGroupHCCL::WorkHCCL>(devices);
+}
+
+template <typename Fn, typename PreProcess, typename PostProcess>
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::collective(
+    std::vector<at::Tensor>& inputs,
+    std::vector<at::Tensor>& outputs,
+    Fn fn,
+    PreProcess pre,
+    PostProcess post) {
+  const auto devices = getDeviceList(inputs);
+  const auto key = getKeyFromDevices(devices);
+  auto& hcclComms = getHCCLComm(key, devices);
+  // First let HCCL streams wait for input tensors allocation streams
+  syncStreams(devices, hcclEvents_[key], hcclStreams_[key]);
+  // Work itself will create the events on all NPUs of tensors
+  auto work = initWork(devices);
+
+  c10::npu::OptionalNPUGuard npuGuard;
+  pre(hcclStreams_[key]);
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    npuGuard.set_index(devices[i].index());
+    c10::npu::NPUStream& hcclStream = hcclStreams_[key][i];
+
+    // Both `inputs' and `outputs' are created on a worker stream and used in
+    // different hcclStreams.  Hence, both must record the hcclStream to
+    // prevent being freed before the collective finishes.
+    //
+    // We only record `inputs' here, and leave recording `outputs' to `fn' for
+    // operations where `inputs' and `outputs' are not the same.
+    //
+    // See [Sync Streams].
+    c10::npu::NPUCachingAllocator::recordStream(
+        inputs[i].storage().data_ptr(), hcclStream);
+  }
+  {
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      npuGuard.set_index(devices[i].index());
+      // to avoid to much task pushed to the stream, leading to stream overflow
+      // insert sync point
+      // fluxLimit(key, i);
+      c10::npu::NPUStream& hcclStream = hcclStreams_[key][i];
+      hcclUs startut = TIME_NOW();
+      C10D_HCCL_CHECK(
+          fn(inputs[i], outputs[i], hcclComms[i]->getHcclComm(), hcclStream));
+    }
+  }
+  post(hcclStreams_[key]);
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    c10::npu::NPUStream& hcclStream = hcclStreams_[key][i];
+    work->npuEvents_[i].record(hcclStream);
+    work->hcclComms_[i] = hcclComms[i];
+    work->blockingWait_ = blockingWait_;
+    work->opTimeout_ = opTimeout_;
+  }
+
+  return work;
+}
+
+template <typename Fn>
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::collective(
+    std::vector<at::Tensor>& inputs,
+    std::vector<at::Tensor>& outputs,
+    Fn fn) {
+  return collective(
+      inputs,
+      outputs,
+      fn,
+      [](std::vector<c10::npu::NPUStream>&) {},
+      [](std::vector<c10::npu::NPUStream>&) {});
+}
+
+int g_allreduceID = 0;
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::allreduce(
+    std::vector<at::Tensor>& tensors,
+    const AllreduceOptions& opts) {
+  check_npu_tensors(tensors);
+  return collective(
+      tensors,
+      tensors,
+      [&](at::Tensor& input,
+          at::Tensor& output,
+          HcclComm comm,
+          c10::npu::NPUStream& stream) {
+        aclrtSetExceptionInfoCallback(exceptionCallback);
+        RECORD_FUNCTION("HcclAllreduce", std::vector<c10::IValue>({input}));
+        return HcclAllReduce(
+            input.data_ptr(),
+            output.data_ptr(),
+            input.storage().unsafeGetStorageImpl()->numel(),
+            getHcclDataType(input.scalar_type()),
+            hcclOp[opts.reduceOp],
+            comm,
+            stream.stream());
+      });
+}
+int g_broadcastID = 100000;
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::broadcast(
+    std::vector<at::Tensor>& tensors,
+    const BroadcastOptions& opts) {
+  check_npu_tensors(tensors);
+  return collective(
+      tensors,
+      tensors,
+      [&](at::Tensor& input,
+          at::Tensor& output,
+          HcclComm comm,
+          c10::npu::NPUStream& stream) {
+        RECORD_FUNCTION("HcclBroadcast", std::vector<c10::IValue>({input}));
+        const auto root = opts.rootRank * tensors.size() + opts.rootTensor;
+        return HcclBroadcast(
+            input.data_ptr(),
+            input.storage().unsafeGetStorageImpl()->numel(),
+            getHcclDataType(input.scalar_type()),
+            root,
+            comm,
+            stream.stream());
+      });
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::allreduce_coalesced(
+    std::vector<at::Tensor>& /* unused */,
+    const AllreduceCoalescedOptions& /* unused */) {
+  throw std::runtime_error(
+      "ProcessGroupHCCL does not support allreduce_coalesced");
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::reduce(
+    std::vector<at::Tensor>& /* unused */,
+    const ReduceOptions& /* unused */) {
+  throw std::runtime_error("ProcessGroupHCCL does not support reduce");
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::allgather(
+    std::vector<std::vector<at::Tensor>>& outputTensors,
+    std::vector<at::Tensor>& inputTensors,
+    const AllgatherOptions& opts) {
+  check_npu_tensors(inputTensors);
+  auto outputFlattened =
+      flatten_for_scatter_gather(outputTensors, inputTensors, size_);
+  check_npu_tensors(outputFlattened);
+
+  return collective(
+      inputTensors,
+      outputFlattened,
+      [&](at::Tensor& input,
+          at::Tensor& output,
+          HcclComm comm,
+          c10::npu::NPUStream& stream) {
+        RECORD_FUNCTION("HcclAllgather", std::vector<c10::IValue>({input}));
+        c10::npu::NPUCachingAllocator::recordStream(
+            output.storage().data_ptr(), stream);
+        return HcclAllGather(
+            input.data_ptr(),
+            output.data_ptr(),
+            input.storage().unsafeGetStorageImpl()->numel(),
+            getHcclDataType(input.scalar_type()),
+            comm,
+            stream.stream());
+      },
+      [&](std::vector<c10::npu::NPUStream>& hcclStreams) {},
+      [&](std::vector<c10::npu::NPUStream>& hcclStreams) {
+        // Copy the flattened output tensors to the outputs.
+        for (size_t i = 0; i < outputTensors.size(); ++i) {
+          c10::npu::NPUStreamGuard guard(hcclStreams[i]);
+          for (size_t j = 0; j < outputTensors[0].size(); ++j) {
+            // See [Sync Streams].
+            c10::npu::NPUCachingAllocator::recordStream(
+                outputTensors[i][j].storage().data_ptr(), hcclStreams[i]);
+
+            outputTensors[i][j].copy_(outputFlattened[i][j], true);
+          }
+        }
+      });
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::allgather_base(
+    at::Tensor& /*unused */,
+    at::Tensor& /*unused */,
+    const AllgatherOptions& /*unused */) {
+  throw std::runtime_error("ProcessGroupHCCL does not support allgather_base");
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::reduce_scatter(
+    std::vector<at::Tensor>& outputTensors,
+    std::vector<std::vector<at::Tensor>>& inputTensors,
+    const ReduceScatterOptions& opts) {
+  check_npu_tensors(outputTensors);
+
+  auto inputFlattened =
+      flatten_for_scatter_gather(inputTensors, outputTensors, size_);
+  check_npu_tensors(inputFlattened);
+
+  return collective(
+      inputFlattened,
+      outputTensors,
+      [&](at::Tensor& input,
+          at::Tensor& output,
+          HcclComm comm,
+          c10::npu::NPUStream& stream) {
+        RECORD_FUNCTION("HcclReduceScatter", std::vector<c10::IValue>({input}));
+        c10::npu::NPUCachingAllocator::recordStream(
+            output.storage().data_ptr(), stream);
+        return HcclReduceScatter(
+            input.data_ptr(),
+            output.data_ptr(),
+            output.numel(),
+            getHcclDataType(input.scalar_type()),
+            hcclOp[opts.reduceOp],
+            comm,
+            stream.stream());
+      },
+      [&](std::vector<c10::npu::NPUStream>& hcclStreams) {
+        // Copy the input tensors to the flattened inputs.
+        for (size_t i = 0; i < inputTensors.size(); ++i) {
+          c10::npu::NPUStreamGuard guard(hcclStreams[i]);
+          for (size_t j = 0; j < inputTensors[0].size(); ++j) {
+            // See [Sync Streams].
+            c10::npu::NPUCachingAllocator::recordStream(
+                inputTensors[i][j].storage().data_ptr(), hcclStreams[i]);
+
+            inputFlattened[i][j].copy_(inputTensors[i][j], true);
+          }
+        }
+      },
+      [&](std::vector<c10::npu::NPUStream>& hcclStreams) {});
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::barrier(
+    const BarrierOptions& opts) {
+  std::vector<at::Device> devices;
+  if (usedDeviceIdxs_.empty()) {
+    auto numNPUs = c10::npu::device_count();
+    int16_t deviceIdx = static_cast<int16_t>(rank_ % numNPUs);
+    devices.push_back(at::Device(at::DeviceType::NPU, deviceIdx));
+  } else {
+    for (auto usedDeviceIdx : usedDeviceIdxs_) {
+      devices.push_back(at::Device(at::DeviceType::NPU, usedDeviceIdx));
+    }
+  }
+
+  std::vector<at::Tensor> barrierTensors;
+  barrierTensors.reserve(devices.size());
+
+  at::npu::OptionalNPUGuard npuGuard;
+  for (auto& device : devices) {
+    npuGuard.set_index(device.index());
+    barrierTensors.push_back(at::empty(
+        {1},
+        at::TensorOptions().device(at::DeviceType::NPU).dtype(at::kFloat)));
+  }
+
+  auto work = BarrierInside(barrierTensors);
+
+  // Work will take over barrierTensors
+  auto hcclWork = dynamic_cast<ProcessGroupHCCL::WorkHCCL*>(work.get());
+  TORCH_CHECK(hcclWork);
+  hcclWork->barrierTensors_ = std::move(barrierTensors);
+
+  return work;
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::BarrierInside(
+    std::vector<at::Tensor>& tensors) {
+    check_npu_tensors(tensors);
+
+  return collective(
+      tensors,
+      tensors,
+      [&](at::Tensor& input,
+          at::Tensor& output,
+          HcclComm comm,
+          c10::npu::NPUStream& stream) {
+        aclrtSetExceptionInfoCallback(exceptionCallback);
+        auto ret = c10::npu::hccl::hccl_barrier(comm, stream.stream());
+        if (ret == HcclResult::HCCL_E_NOT_SUPPORT) {
+          return HcclAllReduce(
+            input.data_ptr(),
+            output.data_ptr(),
+            input.storage().unsafeGetStorageImpl()->numel(),
+            getHcclDataType(input.scalar_type()),
+            hcclOp[ReduceOp::SUM],
+            comm,
+            stream.stream());
+        }
+        else {
+          return ret;
+        }
+    });
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::gather(
+    std::vector<std::vector<at::Tensor>>& /* unused */,
+    std::vector<at::Tensor>& /* unused */,
+    const GatherOptions& /* unused */) {
+  throw std::runtime_error("ProcessGroupHCCL does not support gather");
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::scatter(
+    std::vector<at::Tensor>& /* unused */,
+    std::vector<std::vector<at::Tensor>>& /* unused */,
+    const ScatterOptions& /* unused */) {
+  throw std::runtime_error("ProcessGroupHCCL does not support scatter");
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::send(
+    std::vector<at::Tensor>& /* unused */,
+    int /* unused */,
+    int /* unused */) {
+  throw std::runtime_error("ProcessGroupHCCL does not support send");
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::recv(
+    std::vector<at::Tensor>& /* unused */,
+    int /* unused */,
+    int /* unused */) {
+  throw std::runtime_error("ProcessGroupHCCL does not support recv");
+}
+
+std::shared_ptr<ProcessGroup::Work> ProcessGroupHCCL::recvAnysource(
+    std::vector<at::Tensor>& /* unused */,
+    int /* unused */) {
+  throw std::runtime_error("ProcessGroupHCCL does not support recv");
+}
+} // namespace c10d
diff --git a/src/torch/lib/c10d/ProcessGroupHCCL.hpp b/src/torch/lib/c10d/ProcessGroupHCCL.hpp
index 76c0253e5c5843836a87f709db1d00350aa90c02..3c7e830e073810ae170b9019e6a5a18e2414eb6e 100644
--- a/src/torch/lib/c10d/ProcessGroupHCCL.hpp
+++ b/src/torch/lib/c10d/ProcessGroupHCCL.hpp
@@ -1,392 +1,392 @@
-// Copyright (c) 2020 Huawei Technologies Co., Ltd
-// Copyright (c) 2019, Facebook CORPORATION. 
-// All rights reserved.
-//
-// Licensed under the BSD 3-Clause License  (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <mutex>
-#include <thread>
-#include <unordered_map>
-
-#include <third_party/hccl/inc/hccl/hccl.h>
-#include <c10d/HCCLUtils.hpp>
-#include <c10d/ProcessGroup.hpp>
-#include <c10d/Store.hpp>
-
-#include <aten/src/ATen/npu/NPUEvent.h>
-#include <c10/npu/interface/HcclInterface.h>
-namespace c10d {
-// Environment variable which controls whether or not wait() is blocking or
-// non-blocking.
-constexpr const char* HCCL_BLOCKING_WAIT = "HCCL_BLOCKING_WAIT";
-
-// ProcessGroupHCCL implements HCCL bindings for c10d.
-//
-// All functions of the class are expected to be called in the same order
-// across all processes in the process group.  This is the only way that we
-// can guarantee to match up the same calls among all processes.
-//
-// All HCCL functions provided by this class are asynchronous functions. More
-// specifically, each HCCL call is scheduled on a separate runtime stream that
-// is different from the current runtime stream. This is for the purpose of
-// achieving potentially concurrency and better performance. As a result,
-// it is the callers' responsibilty to make sure that the runtime stream their
-// code works on needs to wait for the HCCL operation from
-// this class.
-//
-// This can be done by calling:
-//
-// either WorkHCCL::wait() or WorkHCCL::synchronize(), both achieves the same
-// functionality and are synonyms.
-//
-// Also note that WorkHCCL::finishedGPUExecution() is a helper function only
-// provided by ProcessGroupHCCL to check if the HCCL operation of WorkHCCL has
-// finished execution on the NPU (not just scheduled).
-//
-// Example on using the HCCL process group
-//
-//   ProcessGroupHCCL pg(store, rank, size);
-//   std::shared_ptr<WorkNCCL> work = pg.allreduce(tensors);
-//
-//   // At this point, HCCL kernel has already by queued successfully
-//   // Now, let current stream wait for the HCCL to finish, this function is
-//   // async operation as well
-//
-//   work->wait()
-//
-//   // Now continue on other work in the current stream.
-class ProcessGroupHCCL : public ProcessGroup {
- public:
-  class WorkHCCL : public ProcessGroup::Work {
-   public:
-    // Constructor takes a list of NPU devices to adapt framework
-    // But HCCL support one device only!!!
-    WorkHCCL(const std::vector<at::Device>& devices);
-    virtual ~WorkHCCL();
-
-    // Checks if request has completed. In this specific case of HCCL, it checks
-    // if the HCCL operation has completed on the NPU in its own HCCL stream.
-    // Non-blocking operation.
-    bool isCompleted() override;
-
-    bool isSuccess() const override;
-
-    // Same as calling synchronize() for HCCL work.
-    bool wait() override;
-
-    // Temporarily not implemented
-    // void abort() override;
-
-    // Let current stream wait on the completing of the HCCL work
-    // Throws on exceptions. Blocking operation, which will wait for work
-    // completion.
-    void synchronize() override;
-
-    // Helper function that checks if the HCCL have finished
-    // execution on the NPUs
-    bool finishedNPUExecution();
-
-   protected:
-    // The cached list of NPU devices to operate on.
-    // HCCL support one device per rank only
-    std::vector<at::Device> devices_;
-
-    // The NPU events tracking this work item on multiple NPU devices
-    std::vector<at::npu::NPUEvent> npuEvents_;
-
-    // The HCCL communicators used for this work item.
-    std::vector<std::shared_ptr<HCCLComm>> hcclComms_;
-
-    // Tensors used for barrier op
-    std::vector<at::Tensor> barrierTensors_;
-
-    // Clone of blockingWait_ from ProcessGroupHCCL.
-    bool blockingWait_ = false;
-
-    // Clone of opTimeout_ from ProcessGroupHCCL.
-    std::chrono::milliseconds opTimeout_;
-
-    // Time point representing when the work started.
-    std::chrono::time_point<std::chrono::steady_clock> workStartTime_;
-
-    // Temporarily not implemented
-    // virtual std::exception_ptr checkForHCCLErrors(const
-    // std::vector<std::shared_ptr<HCCLComm>>& hcclComms) const;
-
-   private:
-    // Checks for HCCL errors and sets an appropriate exception_ptr.
-    void checkAndSetException();
-
-    // Checks for HCCL errors and throws an appropriate exception.
-    void checkAndThrowException();
-
-    // Just checks whether NPU execution has completed, without modifying
-    // exception_ptr.
-    bool finishedNPUExecutionInternal() const;
-
-    // Temporarily not implemented
-    // std::shared_ptr<Store> store_;
-
-    friend class ProcessGroupHCCL;
-  };
-
-  // If you wish to create multiple process groups, each with a potentially
-  // different rank and size, you can do so by passing a new store instance
-  // to each one. If you have only a single store object, you can
-  // use the `c10d::PrefixStore` to derive scoped instances.
-  // This is also what the Python API in torch.distributed does.
-  //
-  // The process group instance keeps a reference to the store because
-  // it may be used long after the constructor runs. In fact, the constructor
-  // doesn't create any HCCL communicators. A single HCCL communicator can
-  // only be used on a specific set of devices, and are therefore created
-  // on-demand when a collective runs. If another collective is executed later,
-  // against a different set of devices, the process group creates another NCCL
-  // communicator. These HCCL communicators are cached and reused if possible.
-  //
-  ProcessGroupHCCL(
-      const std::shared_ptr<Store>& store,
-      int rank,
-      int size,
-      const std::chrono::milliseconds& opTimeout =
-          std::chrono::milliseconds(kProcessGroupHCCLOpTimeoutMillis));
-
-  // This constructor includes the deprecated `groupName` argument.
-  // If you have existing code that uses the `groupName`, you can replace
-  // it by specifying a `c10d::PrefixStore(groupName, store)` for store.
-  C10_DEPRECATED ProcessGroupHCCL(
-      const std::shared_ptr<Store>& store,
-      int rank,
-      int size,
-      const std::string& groupName,
-      const std::chrono::milliseconds& opTimeout =
-          std::chrono::milliseconds(kProcessGroupHCCLOpTimeoutMillis))
-      : ProcessGroupHCCL(store, rank, size, opTimeout) {}
-
-  virtual ~ProcessGroupHCCL();
-
-  std::shared_ptr<ProcessGroup::Work> broadcast(
-      std::vector<at::Tensor>& tensors,
-      const BroadcastOptions& opts = BroadcastOptions()) override;
-
-  std::shared_ptr<ProcessGroup::Work> allreduce(
-      std::vector<at::Tensor>& tensors,
-      const AllreduceOptions& opts = AllreduceOptions()) override;
-
-  std::shared_ptr<ProcessGroup::Work> allreduce_coalesced(
-      std::vector<at::Tensor>& tensors,
-      const AllreduceCoalescedOptions& opts =
-          AllreduceCoalescedOptions()) override;
-
-  std::shared_ptr<ProcessGroup::Work> reduce(
-      std::vector<at::Tensor>& tensors,
-      const ReduceOptions& opts = ReduceOptions()) override;
-
-  std::shared_ptr<ProcessGroup::Work> allgather(
-      std::vector<std::vector<at::Tensor>>& outputTensors,
-      std::vector<at::Tensor>& inputTensors,
-      const AllgatherOptions& opts = AllgatherOptions()) override;
-
-  std::shared_ptr<ProcessGroup::Work> allgather_base(
-      at::Tensor& outputbuffer,
-      at::Tensor& inputbuffer,
-      const AllgatherOptions& opts = AllgatherOptions()) override;
-
-  std::shared_ptr<ProcessGroup::Work> reduce_scatter(
-      std::vector<at::Tensor>& outputTensors,
-      std::vector<std::vector<at::Tensor>>& inputTensors,
-      const ReduceScatterOptions& opts = ReduceScatterOptions()) override;
-
-  std::shared_ptr<ProcessGroup::Work> barrier(
-      const BarrierOptions& opts = BarrierOptions()) override;
-
-/**
-    HCCL barrier API for ProcessGroupHCCL Class.
-    */
-  std::shared_ptr<ProcessGroup::Work> BarrierInside(
-      std::vector<at::Tensor>& Tensors);
-
-  // Unsupported Ops
-  std::shared_ptr<ProcessGroup::Work> gather(
-      std::vector<std::vector<at::Tensor>>& outputTensors,
-      std::vector<at::Tensor>& inputTensors,
-      const GatherOptions& opts = GatherOptions()) override;
-
-  std::shared_ptr<ProcessGroup::Work> scatter(
-      std::vector<at::Tensor>& outputTensors,
-      std::vector<std::vector<at::Tensor>>& inputTensors,
-      const ScatterOptions& opts = ScatterOptions()) override;
-
-  std::shared_ptr<ProcessGroup::Work> send(
-      std::vector<at::Tensor>& tensors,
-      int dstRank,
-      int tag) override;
-
-  std::shared_ptr<ProcessGroup::Work> recv(
-      std::vector<at::Tensor>& tensors,
-      int srcRank,
-      int tag) override;
-
-  std::shared_ptr<ProcessGroup::Work> recvAnysource(
-      std::vector<at::Tensor>& tensors,
-      int tag) override;
-
-  static const int64_t kProcessGroupHCCLOpTimeoutMillis;
-
- protected:
-  // Helper that broadcasts HCCL Master ID to all ranks through the store
-  void broadcastMasterID(HcclRootInfo* hcclID);
-
-  // Helper that either looks up the cached HCCL communicators or creates
-  // a new set of NCCL communicators as a cache entry
-  std::vector<std::shared_ptr<HCCLComm>>& getHCCLComm(
-      const std::string& devicesKey,
-      const std::vector<at::Device>& devices);
-
-  // Temporarily not implemented
-  // virtual std::exception_ptr checkForHCCLErrors(const
-  // std::vector<std::shared_ptr<HCCLComm>>& hcclComms);
-
-  virtual std::shared_ptr<ProcessGroupHCCL::WorkHCCL> initWork(
-      std::vector<at::Device> devices);
-
- private:
-  // Helper that encapsulates work shared across all collective communication
-  // primitives.  The callbacks have the following signatures:
-  //
-  //    HcclResult fn(at::Tensor& input, at::Tensor& output,
-  //                    ncclComm_t, at::cuda::CUDAStream&);
-  //    void {pre,post}(std::vector<at::cuda::CUDAStream&>);
-  template <typename Fn>
-  std::shared_ptr<ProcessGroup::Work> collective(
-      std::vector<at::Tensor>& input,
-      std::vector<at::Tensor>& output,
-      Fn fn);
-  template <typename Fn, typename PreProcess, typename PostProcess>
-  std::shared_ptr<ProcessGroup::Work> collective(
-      std::vector<at::Tensor>& input,
-      std::vector<at::Tensor>& output,
-      Fn fn,
-      PreProcess pre,
-      PostProcess post);
-
-  // Temporarily not implemented
-  // static std::exception_ptr checkForHCCLErrorsInternal(const
-  // std::vector<std::shared_ptr<HCCLComm>>& hcclComms); void
-  // ncclCommWatchdog(); void ncclCommWatchdogInternal();
-
-  // Limit the number of tasks issued to the HCCL stream.
-  // This interface will introduce RTS bug,
-  // so we withdraw it temporarily.
-  // void fluxLimit ( const std::string& key, const int index);
-
- protected:
-  static const int64_t kWatchdogThreadSleepMillis;
-
-  // The store is used to broadcast the HCCL Master ID of rank 0.
-  std::shared_ptr<Store> store_;
-
-  // The number of HCCL communicators that have been created during
-  // the lifetime of this process group. This sequence number is
-  // used to scope keys used in the store.
-  uint64_t hcclCommCounter_{0};
-
-  // The HCCL communicator that the process group has cached.
-  // The key is a list of NPU devices that an operation is operating on
-  // The NPU devices are stored in a device sequence and the cache NCCL
-  // communicator is associated with this NPU device sequence
-  //
-  // e.g. If the process group op only uses device 0, then the value of
-  // the used device string stored (value of the hashmap) would be "0".
-  //
-  //      If the process group op uses device 0 - 7 and the each tensor of the
-  //      input tensor list is on device, 0, 1, 2, 3, 4, 5, 6, 7 separately,
-  //      then the value of the used device string (key) stored would be
-  //      "0,1,2,3,4,5,6,7"
-  //
-  //      If the process group op uses device 0 - 7 and the each tensor of the
-  //      input tensor list is on device, 0, 4, 5, 6, 7, 1, 2, 3 separately,
-  //      then the value of the used device string stored would be
-  //      "0,4,5,6,7,1,2,3"
-  //
-  //      Note that the order of the device for the tensor list matters.
-  std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLComm>>>
-      devHCCLCommMap_;
-
-  // Temporarily not implemented
-  // std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLComm>>>
-  // hcclIdToCommMap_;
-
-  // Mutex to guard devNCCLCommMap_.
-  std::mutex devHCCLCommMapLock_;
-
-  // Watchdog thread which looks for errors on the cached NCCL communicators.
-  std::thread hcclCommWatchdogThread_;
-
-  // Whether or not we should terminate the watchdog thread.
-  std::atomic<bool> terminateWatchdog_;
-
-  // Condition variable to control how long the watchdog thread waits.
-  std::condition_variable watchdogCV_;
-
-  // Mutex for watchdog.
-  std::mutex watchdogCVMutex_;
-
-  // The NPU steams used by NCCL kernels
-  std::unordered_map<std::string, std::vector<c10::npu::NPUStream>>
-      hcclStreams_;
-
-  // The NPU events used to sync HCCL streams
-  std::unordered_map<std::string, std::vector<at::npu::NPUEvent>> hcclEvents_;
-
-  // The NPU events used to control task rate to protect streams
-  std::unordered_map<std::string, std::vector<at::npu::NPUEvent>>
-      rateCtrlEvents_;
-  std::unordered_map<std::string, std::vector<uint64_t>> collectiveCnts_;
-
-  // Device Indexes used for all collectives in this group
-  std::set<int> usedDeviceIdxs_;
-
-  // map from the key: "group name + pg counter (ID)" to the
-  // HCCL Master ID count. This needs to be group and pg specific
-  //
-  // For each process group, we need a uniform unique HCCL Master ID counter to
-  // ensure that HCCL operation in this process group can be completed
-  // successfully. Since each process group ID belongs to a group name, the key
-  // to this map is a combination of group name and ProcessGroupHCCL ID.
-  static std::unordered_map<std::string, ssize_t> pgUniqueHCCLIDCnt_;
-
-  // map from group name to the pg counter (ID) within that group
-  //
-  // For each group with the "group name" (which is the key), we need to
-  // keep track of a unique process group ID when creating a new
-  // ProcessGroupNCCL for this "group name". Therefore, the value of this
-  // map keeps the unique ProcessGroupHCCL's ID for a specific group with
-  // the "group name". The reason we need a per-group process group ID counter
-  // is that different group can have different ranks and we need ensure that
-  // each group has its own uniform process group ID for all its ranks.
-  static std::unordered_map<std::string, ssize_t> processGroupCounterMap_;
-
-  // Whether or not wait() and synchronize() are blocking operations that wait
-  // for the operation to complete.
-  bool blockingWait_ = false;
-
-  // Timeout for operations. This is only used when blockingWait_ is enabled.
-  std::chrono::milliseconds opTimeout_;
-
-  // Temporarily not implemented
-  // std::unordered_set<std::string> abortedComms_;
-};
+// Copyright (c) 2020 Huawei Technologies Co., Ltd
+// Copyright (c) 2019, Facebook CORPORATION. 
+// All rights reserved.
+//
+// Licensed under the BSD 3-Clause License  (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+
+#include <third_party/hccl/inc/hccl/hccl.h>
+#include <c10d/HCCLUtils.hpp>
+#include <c10d/ProcessGroup.hpp>
+#include <c10d/Store.hpp>
+
+#include <aten/src/ATen/npu/NPUEvent.h>
+#include <c10/npu/interface/HcclInterface.h>
+namespace c10d {
+// Environment variable which controls whether or not wait() is blocking or
+// non-blocking.
+constexpr const char* HCCL_BLOCKING_WAIT = "HCCL_BLOCKING_WAIT";
+
+// ProcessGroupHCCL implements HCCL bindings for c10d.
+//
+// All functions of the class are expected to be called in the same order
+// across all processes in the process group.  This is the only way that we
+// can guarantee to match up the same calls among all processes.
+//
+// All HCCL functions provided by this class are asynchronous functions. More
+// specifically, each HCCL call is scheduled on a separate runtime stream that
+// is different from the current runtime stream. This is for the purpose of
+// achieving potentially concurrency and better performance. As a result,
+// it is the callers' responsibilty to make sure that the runtime stream their
+// code works on needs to wait for the HCCL operation from
+// this class.
+//
+// This can be done by calling:
+//
+// either WorkHCCL::wait() or WorkHCCL::synchronize(), both achieves the same
+// functionality and are synonyms.
+//
+// Also note that WorkHCCL::finishedGPUExecution() is a helper function only
+// provided by ProcessGroupHCCL to check if the HCCL operation of WorkHCCL has
+// finished execution on the NPU (not just scheduled).
+//
+// Example on using the HCCL process group
+//
+//   ProcessGroupHCCL pg(store, rank, size);
+//   std::shared_ptr<WorkNCCL> work = pg.allreduce(tensors);
+//
+//   // At this point, HCCL kernel has already by queued successfully
+//   // Now, let current stream wait for the HCCL to finish, this function is
+//   // async operation as well
+//
+//   work->wait()
+//
+//   // Now continue on other work in the current stream.
+class ProcessGroupHCCL : public ProcessGroup {
+ public:
+  class WorkHCCL : public ProcessGroup::Work {
+   public:
+    // Constructor takes a list of NPU devices to adapt framework
+    // But HCCL support one device only!!!
+    WorkHCCL(const std::vector<at::Device>& devices);
+    virtual ~WorkHCCL();
+
+    // Checks if request has completed. In this specific case of HCCL, it checks
+    // if the HCCL operation has completed on the NPU in its own HCCL stream.
+    // Non-blocking operation.
+    bool isCompleted() override;
+
+    bool isSuccess() const override;
+
+    // Same as calling synchronize() for HCCL work.
+    bool wait() override;
+
+    // Temporarily not implemented
+    // void abort() override;
+
+    // Let current stream wait on the completing of the HCCL work
+    // Throws on exceptions. Blocking operation, which will wait for work
+    // completion.
+    void synchronize() override;
+
+    // Helper function that checks if the HCCL have finished
+    // execution on the NPUs
+    bool finishedNPUExecution();
+
+   protected:
+    // The cached list of NPU devices to operate on.
+    // HCCL support one device per rank only
+    std::vector<at::Device> devices_;
+
+    // The NPU events tracking this work item on multiple NPU devices
+    std::vector<at::npu::NPUEvent> npuEvents_;
+
+    // The HCCL communicators used for this work item.
+    std::vector<std::shared_ptr<HCCLComm>> hcclComms_;
+
+    // Tensors used for barrier op
+    std::vector<at::Tensor> barrierTensors_;
+
+    // Clone of blockingWait_ from ProcessGroupHCCL.
+    bool blockingWait_ = false;
+
+    // Clone of opTimeout_ from ProcessGroupHCCL.
+    std::chrono::milliseconds opTimeout_;
+
+    // Time point representing when the work started.
+    std::chrono::time_point<std::chrono::steady_clock> workStartTime_;
+
+    // Temporarily not implemented
+    // virtual std::exception_ptr checkForHCCLErrors(const
+    // std::vector<std::shared_ptr<HCCLComm>>& hcclComms) const;
+
+   private:
+    // Checks for HCCL errors and sets an appropriate exception_ptr.
+    void checkAndSetException();
+
+    // Checks for HCCL errors and throws an appropriate exception.
+    void checkAndThrowException();
+
+    // Just checks whether NPU execution has completed, without modifying
+    // exception_ptr.
+    bool finishedNPUExecutionInternal() const;
+
+    // Temporarily not implemented
+    // std::shared_ptr<Store> store_;
+
+    friend class ProcessGroupHCCL;
+  };
+
+  // If you wish to create multiple process groups, each with a potentially
+  // different rank and size, you can do so by passing a new store instance
+  // to each one. If you have only a single store object, you can
+  // use the `c10d::PrefixStore` to derive scoped instances.
+  // This is also what the Python API in torch.distributed does.
+  //
+  // The process group instance keeps a reference to the store because
+  // it may be used long after the constructor runs. In fact, the constructor
+  // doesn't create any HCCL communicators. A single HCCL communicator can
+  // only be used on a specific set of devices, and are therefore created
+  // on-demand when a collective runs. If another collective is executed later,
+  // against a different set of devices, the process group creates another NCCL
+  // communicator. These HCCL communicators are cached and reused if possible.
+  //
+  ProcessGroupHCCL(
+      const std::shared_ptr<Store>& store,
+      int rank,
+      int size,
+      const std::chrono::milliseconds& opTimeout =
+          std::chrono::milliseconds(kProcessGroupHCCLOpTimeoutMillis));
+
+  // This constructor includes the deprecated `groupName` argument.
+  // If you have existing code that uses the `groupName`, you can replace
+  // it by specifying a `c10d::PrefixStore(groupName, store)` for store.
+  C10_DEPRECATED ProcessGroupHCCL(
+      const std::shared_ptr<Store>& store,
+      int rank,
+      int size,
+      const std::string& groupName,
+      const std::chrono::milliseconds& opTimeout =
+          std::chrono::milliseconds(kProcessGroupHCCLOpTimeoutMillis))
+      : ProcessGroupHCCL(store, rank, size, opTimeout) {}
+
+  virtual ~ProcessGroupHCCL();
+
+  std::shared_ptr<ProcessGroup::Work> broadcast(
+      std::vector<at::Tensor>& tensors,
+      const BroadcastOptions& opts = BroadcastOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Work> allreduce(
+      std::vector<at::Tensor>& tensors,
+      const AllreduceOptions& opts = AllreduceOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Work> allreduce_coalesced(
+      std::vector<at::Tensor>& tensors,
+      const AllreduceCoalescedOptions& opts =
+          AllreduceCoalescedOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Work> reduce(
+      std::vector<at::Tensor>& tensors,
+      const ReduceOptions& opts = ReduceOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Work> allgather(
+      std::vector<std::vector<at::Tensor>>& outputTensors,
+      std::vector<at::Tensor>& inputTensors,
+      const AllgatherOptions& opts = AllgatherOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Work> allgather_base(
+      at::Tensor& outputbuffer,
+      at::Tensor& inputbuffer,
+      const AllgatherOptions& opts = AllgatherOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Work> reduce_scatter(
+      std::vector<at::Tensor>& outputTensors,
+      std::vector<std::vector<at::Tensor>>& inputTensors,
+      const ReduceScatterOptions& opts = ReduceScatterOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Work> barrier(
+      const BarrierOptions& opts = BarrierOptions()) override;
+
+/**
+    HCCL barrier API for ProcessGroupHCCL Class.
+    */
+  std::shared_ptr<ProcessGroup::Work> BarrierInside(
+      std::vector<at::Tensor>& Tensors);
+
+  // Unsupported Ops
+  std::shared_ptr<ProcessGroup::Work> gather(
+      std::vector<std::vector<at::Tensor>>& outputTensors,
+      std::vector<at::Tensor>& inputTensors,
+      const GatherOptions& opts = GatherOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Work> scatter(
+      std::vector<at::Tensor>& outputTensors,
+      std::vector<std::vector<at::Tensor>>& inputTensors,
+      const ScatterOptions& opts = ScatterOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Work> send(
+      std::vector<at::Tensor>& tensors,
+      int dstRank,
+      int tag) override;
+
+  std::shared_ptr<ProcessGroup::Work> recv(
+      std::vector<at::Tensor>& tensors,
+      int srcRank,
+      int tag) override;
+
+  std::shared_ptr<ProcessGroup::Work> recvAnysource(
+      std::vector<at::Tensor>& tensors,
+      int tag) override;
+
+  static const int64_t kProcessGroupHCCLOpTimeoutMillis;
+
+ protected:
+  // Helper that broadcasts HCCL Master ID to all ranks through the store
+  void broadcastMasterID(HcclRootInfo* hcclID);
+
+  // Helper that either looks up the cached HCCL communicators or creates
+  // a new set of NCCL communicators as a cache entry
+  std::vector<std::shared_ptr<HCCLComm>>& getHCCLComm(
+      const std::string& devicesKey,
+      const std::vector<at::Device>& devices);
+
+  // Temporarily not implemented
+  // virtual std::exception_ptr checkForHCCLErrors(const
+  // std::vector<std::shared_ptr<HCCLComm>>& hcclComms);
+
+  virtual std::shared_ptr<ProcessGroupHCCL::WorkHCCL> initWork(
+      std::vector<at::Device> devices);
+
+ private:
+  // Helper that encapsulates work shared across all collective communication
+  // primitives.  The callbacks have the following signatures:
+  //
+  //    HcclResult fn(at::Tensor& input, at::Tensor& output,
+  //                    ncclComm_t, at::cuda::CUDAStream&);
+  //    void {pre,post}(std::vector<at::cuda::CUDAStream&>);
+  template <typename Fn>
+  std::shared_ptr<ProcessGroup::Work> collective(
+      std::vector<at::Tensor>& input,
+      std::vector<at::Tensor>& output,
+      Fn fn);
+  template <typename Fn, typename PreProcess, typename PostProcess>
+  std::shared_ptr<ProcessGroup::Work> collective(
+      std::vector<at::Tensor>& input,
+      std::vector<at::Tensor>& output,
+      Fn fn,
+      PreProcess pre,
+      PostProcess post);
+
+  // Temporarily not implemented
+  // static std::exception_ptr checkForHCCLErrorsInternal(const
+  // std::vector<std::shared_ptr<HCCLComm>>& hcclComms); void
+  // ncclCommWatchdog(); void ncclCommWatchdogInternal();
+
+  // Limit the number of tasks issued to the HCCL stream.
+  // This interface will introduce RTS bug,
+  // so we withdraw it temporarily.
+  // void fluxLimit ( const std::string& key, const int index);
+
+ protected:
+  static const int64_t kWatchdogThreadSleepMillis;
+
+  // The store is used to broadcast the HCCL Master ID of rank 0.
+  std::shared_ptr<Store> store_;
+
+  // The number of HCCL communicators that have been created during
+  // the lifetime of this process group. This sequence number is
+  // used to scope keys used in the store.
+  uint64_t hcclCommCounter_{0};
+
+  // The HCCL communicator that the process group has cached.
+  // The key is a list of NPU devices that an operation is operating on
+  // The NPU devices are stored in a device sequence and the cache NCCL
+  // communicator is associated with this NPU device sequence
+  //
+  // e.g. If the process group op only uses device 0, then the value of
+  // the used device string stored (value of the hashmap) would be "0".
+  //
+  //      If the process group op uses device 0 - 7 and the each tensor of the
+  //      input tensor list is on device, 0, 1, 2, 3, 4, 5, 6, 7 separately,
+  //      then the value of the used device string (key) stored would be
+  //      "0,1,2,3,4,5,6,7"
+  //
+  //      If the process group op uses device 0 - 7 and the each tensor of the
+  //      input tensor list is on device, 0, 4, 5, 6, 7, 1, 2, 3 separately,
+  //      then the value of the used device string stored would be
+  //      "0,4,5,6,7,1,2,3"
+  //
+  //      Note that the order of the device for the tensor list matters.
+  std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLComm>>>
+      devHCCLCommMap_;
+
+  // Temporarily not implemented
+  // std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLComm>>>
+  // hcclIdToCommMap_;
+
+  // Mutex to guard devNCCLCommMap_.
+  std::mutex devHCCLCommMapLock_;
+
+  // Watchdog thread which looks for errors on the cached NCCL communicators.
+  std::thread hcclCommWatchdogThread_;
+
+  // Whether or not we should terminate the watchdog thread.
+  std::atomic<bool> terminateWatchdog_;
+
+  // Condition variable to control how long the watchdog thread waits.
+  std::condition_variable watchdogCV_;
+
+  // Mutex for watchdog.
+  std::mutex watchdogCVMutex_;
+
+  // The NPU steams used by NCCL kernels
+  std::unordered_map<std::string, std::vector<c10::npu::NPUStream>>
+      hcclStreams_;
+
+  // The NPU events used to sync HCCL streams
+  std::unordered_map<std::string, std::vector<at::npu::NPUEvent>> hcclEvents_;
+
+  // The NPU events used to control task rate to protect streams
+  std::unordered_map<std::string, std::vector<at::npu::NPUEvent>>
+      rateCtrlEvents_;
+  std::unordered_map<std::string, std::vector<uint64_t>> collectiveCnts_;
+
+  // Device Indexes used for all collectives in this group
+  std::set<int> usedDeviceIdxs_;
+
+  // map from the key: "group name + pg counter (ID)" to the
+  // HCCL Master ID count. This needs to be group and pg specific
+  //
+  // For each process group, we need a uniform unique HCCL Master ID counter to
+  // ensure that HCCL operation in this process group can be completed
+  // successfully. Since each process group ID belongs to a group name, the key
+  // to this map is a combination of group name and ProcessGroupHCCL ID.
+  static std::unordered_map<std::string, ssize_t> pgUniqueHCCLIDCnt_;
+
+  // map from group name to the pg counter (ID) within that group
+  //
+  // For each group with the "group name" (which is the key), we need to
+  // keep track of a unique process group ID when creating a new
+  // ProcessGroupNCCL for this "group name". Therefore, the value of this
+  // map keeps the unique ProcessGroupHCCL's ID for a specific group with
+  // the "group name". The reason we need a per-group process group ID counter
+  // is that different group can have different ranks and we need ensure that
+  // each group has its own uniform process group ID for all its ranks.
+  static std::unordered_map<std::string, ssize_t> processGroupCounterMap_;
+
+  // Whether or not wait() and synchronize() are blocking operations that wait
+  // for the operation to complete.
+  bool blockingWait_ = false;
+
+  // Timeout for operations. This is only used when blockingWait_ is enabled.
+  std::chrono::milliseconds opTimeout_;
+
+  // Temporarily not implemented
+  // std::unordered_set<std::string> abortedComms_;
+};
 } // namespace c10d
\ No newline at end of file
diff --git a/src/torch/npu/random.py b/src/torch/npu/random.py
index d84fb925e5f198a242b49a3625906021e8cf6205..9cee1829563453c84b12ae485e01ce8f6c4895e6 100644
--- a/src/torch/npu/random.py
+++ b/src/torch/npu/random.py
@@ -1,110 +1,110 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-from . import _lazy_init, _lazy_call, device_count, current_device
-
-__all__ = ['manual_seed', 'manual_seed_all',
-           'seed', 'seed_all', 'initial_seed']
-
-
-def manual_seed(seed):
-    r"""Sets the seed for generating random numbers for the current NPU.
-    It's safe to call this function if NPU is not available; in that
-    case, it is silently ignored.
-
-    Args:
-        seed (int): The desired seed.
-
-    .. warning::
-        If you are working with a multi-NPU model, this function is insufficient
-        to get determinism.  To seed all NPUs, use :func:`manual_seed_all`.
-    """
-    seed = int(seed)
-
-    def cb():
-        idx = current_device()
-        default_generator = torch.npu.default_generators[idx]
-        default_generator.manual_seed(seed)
-
-    _lazy_call(cb)
-
-
-def manual_seed_all(seed):
-    r"""Sets the seed for generating random numbers on all NPUs.
-    It's safe to call this function if NPU is not available; in that
-    case, it is silently ignored.
-
-    Args:
-        seed (int): The desired seed.
-    """
-    seed = int(seed)
-
-    def cb():
-        for i in range(device_count()):
-            default_generator = torch.npu.default_generators[i]
-            default_generator.manual_seed(seed)
-
-    _lazy_call(cb)
-
-
-def seed():
-    r"""Sets the seed for generating random numbers to a random number for the current NPU.
-    It's safe to call this function if NPU is not available; in that
-    case, it is silently ignored.
-
-    .. warning::
-        If you are working with a multi-NPU model, this function will only initialize
-        the seed on one NPU.  To initialize all NPUs, use :func:`seed_all`.
-    """
-    def cb():
-        idx = current_device()
-        default_generator = torch.npu.default_generators[idx]
-        default_generator.seed()
-
-    _lazy_call(cb)
-
-
-def seed_all():
-    r"""Sets the seed for generating random numbers to a random number on all NPUs.
-    It's safe to call this function if NPU is not available; in that
-    case, it is silently ignored.
-    """
-    def cb():
-        random_seed = 0
-        seeded = False
-        for i in range(device_count()):
-            default_generator = torch.npu.default_generators[i]
-            if not seeded:
-                default_generator.seed()
-                random_seed = default_generator.initial_seed()
-                seeded = True
-            else:
-                default_generator.manual_seed(random_seed)
-
-    _lazy_call(cb)
-
-
-def initial_seed():
-    r"""Returns the current random seed of the current NPU.
-
-    .. warning::
-        This function eagerly initializes NPU.
-    """
-    _lazy_init()
-    idx = current_device()
-    default_generator = torch.npu.default_generators[idx]
-    return default_generator.initial_seed()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from . import _lazy_init, _lazy_call, device_count, current_device
+
+__all__ = ['manual_seed', 'manual_seed_all',
+           'seed', 'seed_all', 'initial_seed']
+
+
+def manual_seed(seed):
+    r"""Sets the seed for generating random numbers for the current NPU.
+    It's safe to call this function if NPU is not available; in that
+    case, it is silently ignored.
+
+    Args:
+        seed (int): The desired seed.
+
+    .. warning::
+        If you are working with a multi-NPU model, this function is insufficient
+        to get determinism.  To seed all NPUs, use :func:`manual_seed_all`.
+    """
+    seed = int(seed)
+
+    def cb():
+        idx = current_device()
+        default_generator = torch.npu.default_generators[idx]
+        default_generator.manual_seed(seed)
+
+    _lazy_call(cb)
+
+
+def manual_seed_all(seed):
+    r"""Sets the seed for generating random numbers on all NPUs.
+    It's safe to call this function if NPU is not available; in that
+    case, it is silently ignored.
+
+    Args:
+        seed (int): The desired seed.
+    """
+    seed = int(seed)
+
+    def cb():
+        for i in range(device_count()):
+            default_generator = torch.npu.default_generators[i]
+            default_generator.manual_seed(seed)
+
+    _lazy_call(cb)
+
+
+def seed():
+    r"""Sets the seed for generating random numbers to a random number for the current NPU.
+    It's safe to call this function if NPU is not available; in that
+    case, it is silently ignored.
+
+    .. warning::
+        If you are working with a multi-NPU model, this function will only initialize
+        the seed on one NPU.  To initialize all NPUs, use :func:`seed_all`.
+    """
+    def cb():
+        idx = current_device()
+        default_generator = torch.npu.default_generators[idx]
+        default_generator.seed()
+
+    _lazy_call(cb)
+
+
+def seed_all():
+    r"""Sets the seed for generating random numbers to a random number on all NPUs.
+    It's safe to call this function if NPU is not available; in that
+    case, it is silently ignored.
+    """
+    def cb():
+        random_seed = 0
+        seeded = False
+        for i in range(device_count()):
+            default_generator = torch.npu.default_generators[i]
+            if not seeded:
+                default_generator.seed()
+                random_seed = default_generator.initial_seed()
+                seeded = True
+            else:
+                default_generator.manual_seed(random_seed)
+
+    _lazy_call(cb)
+
+
+def initial_seed():
+    r"""Returns the current random seed of the current NPU.
+
+    .. warning::
+        This function eagerly initializes NPU.
+    """
+    _lazy_init()
+    idx = current_device()
+    default_generator = torch.npu.default_generators[idx]
+    return default_generator.initial_seed()
diff --git a/test/test_npu/test_adaptive_avg_pool2d.py b/test/test_npu/test_adaptive_avg_pool2d.py
index 45aca180e5b868c5cb5ec7566f96d31b4b4043cf..c356fcc97a00eb968f883d66fdc8cca6f1c1c8b2 100644
--- a/test/test_npu/test_adaptive_avg_pool2d.py
+++ b/test/test_npu/test_adaptive_avg_pool2d.py
@@ -1,74 +1,74 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestAdaptiveAvgPool2d(TestCase):
-    def cpu_op_exec(self, input, output_size):
-        m = nn.AdaptiveAvgPool2d(output_size)
-        output= m(input)
-        return output.numpy()
-
-    def npu_op_exec(self, input, output_size):
-        m = nn.AdaptiveAvgPool2d(output_size).npu()
-        output = m(input)
-        return output.cpu().numpy()
-
-    def test_adaptiveAvgPool2d_shape_format_fp16(self, device):
-        format_list = [0, 3]
-        shape_list = [(32, 16, 16),
-                      (16, 1024, 256),
-                      (1024, 464, 11, 9),
-                      (1, 2048, 15, 15)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        # TODO(Ascend): tbe operator has problem in precision and (x, 1) case and so on.
-        output_list = [(4, 4), (3, 5), (1), (1, None), (None, 2)]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            for output_size in output_list:
-                cpu_output = self.cpu_op_exec(cpu_input, output_size)
-                npu_output = self.npu_op_exec(npu_input, output_size)
-                cpu_output = cpu_output.astype(np.float16)
-                self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_adaptiveAvgPool2d_shape_format_fp32(self, device):
-        format_list = [0, 3]
-        shape_list = [(32, 16, 16),
-                      (16, 1024, 256),
-                      (1024, 464, 11, 9),
-                      (1, 2048, 15, 15)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        output_list = [(4, 4), (3, 5), (1), (1, None), (None, 2)]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            for output_size in output_list:
-                cpu_output = self.cpu_op_exec(cpu_input, output_size)
-                npu_output = self.npu_op_exec(npu_input, output_size)
-                self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestAdaptiveAvgPool2d, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestAdaptiveAvgPool2d(TestCase):
+    def cpu_op_exec(self, input, output_size):
+        m = nn.AdaptiveAvgPool2d(output_size)
+        output= m(input)
+        return output.numpy()
+
+    def npu_op_exec(self, input, output_size):
+        m = nn.AdaptiveAvgPool2d(output_size).npu()
+        output = m(input)
+        return output.cpu().numpy()
+
+    def test_adaptiveAvgPool2d_shape_format_fp16(self, device):
+        format_list = [0, 3]
+        shape_list = [(32, 16, 16),
+                      (16, 1024, 256),
+                      (1024, 464, 11, 9),
+                      (1, 2048, 15, 15)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        # TODO(Ascend): tbe operator has problem in precision and (x, 1) case and so on.
+        output_list = [(4, 4), (3, 5), (1), (1, None), (None, 2)]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            for output_size in output_list:
+                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                npu_output = self.npu_op_exec(npu_input, output_size)
+                cpu_output = cpu_output.astype(np.float16)
+                self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_adaptiveAvgPool2d_shape_format_fp32(self, device):
+        format_list = [0, 3]
+        shape_list = [(32, 16, 16),
+                      (16, 1024, 256),
+                      (1024, 464, 11, 9),
+                      (1, 2048, 15, 15)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        output_list = [(4, 4), (3, 5), (1), (1, None), (None, 2)]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            for output_size in output_list:
+                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                npu_output = self.npu_op_exec(npu_input, output_size)
+                self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestAdaptiveAvgPool2d, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_all.py b/test/test_npu/test_all.py
index b68f9abad8c450140f0ab1f54eee797085f6f051..e12d08e7f5e9fa4cfdbed6714cf9e1094f5810bd 100644
--- a/test/test_npu/test_all.py
+++ b/test/test_npu/test_all.py
@@ -1,85 +1,85 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestAll(TestCase):
-    def cpu_op_exec1(self, input):
-        output = torch.all(input)
-        output = output.numpy()
-        return output
-    
-    def npu_op_exec1(self, input):
-        input = input.to("npu")
-        output = torch.all(input)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec3(self, input, axis, keepdim):
-        output = torch.all(input, axis, keepdim)
-        output = output.numpy()
-        return output
-    
-    def npu_op_exec3(self, input, axis, keepdim):
-        input = input.to("npu")
-        output = torch.all(input, axis, keepdim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def test_all_noaxis_notkeedim_bool(self, device):
-        shape_format = [
-            [np.bool_, -1, (4, 2, 5)],
-            [np.bool_, -1, (7, 4, 5, 8)]
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 2)
-            cpu_output = self.cpu_op_exec1(cpu_input)
-            npu_output = self.npu_op_exec1(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_all_axis_notkeedim_bool(self, device):
-        shape_format = [
-            [[np.bool_, -1, (4, 2, 5)], 3],
-            [[np.bool_, -1, (4, 2, 5, 8)], 4]
-        ]
-        for item in shape_format:
-            for i in range(item[1]):
-                cpu_input, npu_input = create_common_tensor(item[0], 0, 2)
-                cpu_output = self.cpu_op_exec3(cpu_input, i, False)
-                npu_output = self.npu_op_exec3(npu_input, i, False)
-                self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_all_axis_keedim_bool(self, device):
-        shape_format = [
-            [[np.bool_, -1, (4, 2, 5)], 3],
-            [[np.bool_, -1, (4, 2, 5, 8)], 4]
-        ]
-        for item in shape_format:
-            for i in range(item[1]):
-                cpu_input, npu_input = create_common_tensor(item[0], 0, 2)
-                cpu_output = self.cpu_op_exec3(cpu_input, i, True)
-                npu_output = self.npu_op_exec3(npu_input, i, True)
-                self.assertRtolEqual(cpu_output, npu_output)   
-
-instantiate_device_type_tests(TestAll, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestAll(TestCase):
+    def cpu_op_exec1(self, input):
+        output = torch.all(input)
+        output = output.numpy()
+        return output
+    
+    def npu_op_exec1(self, input):
+        input = input.to("npu")
+        output = torch.all(input)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec3(self, input, axis, keepdim):
+        output = torch.all(input, axis, keepdim)
+        output = output.numpy()
+        return output
+    
+    def npu_op_exec3(self, input, axis, keepdim):
+        input = input.to("npu")
+        output = torch.all(input, axis, keepdim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def test_all_noaxis_notkeedim_bool(self, device):
+        shape_format = [
+            [np.bool_, -1, (4, 2, 5)],
+            [np.bool_, -1, (7, 4, 5, 8)]
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 2)
+            cpu_output = self.cpu_op_exec1(cpu_input)
+            npu_output = self.npu_op_exec1(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_all_axis_notkeedim_bool(self, device):
+        shape_format = [
+            [[np.bool_, -1, (4, 2, 5)], 3],
+            [[np.bool_, -1, (4, 2, 5, 8)], 4]
+        ]
+        for item in shape_format:
+            for i in range(item[1]):
+                cpu_input, npu_input = create_common_tensor(item[0], 0, 2)
+                cpu_output = self.cpu_op_exec3(cpu_input, i, False)
+                npu_output = self.npu_op_exec3(npu_input, i, False)
+                self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_all_axis_keedim_bool(self, device):
+        shape_format = [
+            [[np.bool_, -1, (4, 2, 5)], 3],
+            [[np.bool_, -1, (4, 2, 5, 8)], 4]
+        ]
+        for item in shape_format:
+            for i in range(item[1]):
+                cpu_input, npu_input = create_common_tensor(item[0], 0, 2)
+                cpu_output = self.cpu_op_exec3(cpu_input, i, True)
+                npu_output = self.npu_op_exec3(npu_input, i, True)
+                self.assertRtolEqual(cpu_output, npu_output)   
+
+instantiate_device_type_tests(TestAll, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_col2im.py b/test/test_npu/test_col2im.py
index c045583737586837428566927057bbf9a9c1527d..719507f77c52b53fdb377c25d4ffba7b9daf8b80 100644
--- a/test/test_npu/test_col2im.py
+++ b/test/test_npu/test_col2im.py
@@ -1,56 +1,56 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestCol2ImBackward(TestCase):
-
-    def cpu_op_exec(self,input1, output_size, ksizes, strides, dilates, padding):
-        output = torch._C._nn.col2im(input1, output_size, ksizes, dilates, padding, strides)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1,output_size, ksizes, strides, dilates,padding):
-        output = torch._C._nn.col2im(input1, output_size, ksizes, dilates, padding, strides)
-        output = output.to("cpu") 
-        output = output.numpy()       
-        return output
-
-    def test_col2im_shape_format(self, device):
-        shape_format = [
-               [ [np.float32, 0, (4,4)], (4,5), (2,2), (2,2), (1,1), (0,0)],
-               [ [np.float32, 3, (2, 8,30 )], (4,5), (2,2), (1,1), (1,1), (1,1)],
-               [ [np.float32, 4, ( 12, 20)], (12,6), (2,3), (1,1), (2,2), (0,0)],
-               [ [np.float32, 29, ( 1,12, 12)], (4,5), (2,2), (1,1), (1,1), (0,0)],
-               [ [np.float16, 29, ( 1,12, 12)], (4,5), (2,2), (1,1), (1,1), (0,0)],
-        ]
-         
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 1, 20)
-            cpu_output = self.cpu_op_exec(cpu_input, item[1], item[2], item[3], item[4], item[5])
-            npu_output = self.npu_op_exec(npu_input, item[1], item[2], item[3], item[4], item[5])
-            self.assertRtolEqual(cpu_output, npu_output)
-           
-
-
-instantiate_device_type_tests(TestCol2ImBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestCol2ImBackward(TestCase):
+
+    def cpu_op_exec(self,input1, output_size, ksizes, strides, dilates, padding):
+        output = torch._C._nn.col2im(input1, output_size, ksizes, dilates, padding, strides)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1,output_size, ksizes, strides, dilates,padding):
+        output = torch._C._nn.col2im(input1, output_size, ksizes, dilates, padding, strides)
+        output = output.to("cpu") 
+        output = output.numpy()       
+        return output
+
+    def test_col2im_shape_format(self, device):
+        shape_format = [
+               [ [np.float32, 0, (4,4)], (4,5), (2,2), (2,2), (1,1), (0,0)],
+               [ [np.float32, 3, (2, 8,30 )], (4,5), (2,2), (1,1), (1,1), (1,1)],
+               [ [np.float32, 4, ( 12, 20)], (12,6), (2,3), (1,1), (2,2), (0,0)],
+               [ [np.float32, 29, ( 1,12, 12)], (4,5), (2,2), (1,1), (1,1), (0,0)],
+               [ [np.float16, 29, ( 1,12, 12)], (4,5), (2,2), (1,1), (1,1), (0,0)],
+        ]
+         
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 1, 20)
+            cpu_output = self.cpu_op_exec(cpu_input, item[1], item[2], item[3], item[4], item[5])
+            npu_output = self.npu_op_exec(npu_input, item[1], item[2], item[3], item[4], item[5])
+            self.assertRtolEqual(cpu_output, npu_output)
+           
+
+
+instantiate_device_type_tests(TestCol2ImBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    torch.npu.set_device("npu:5")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_constant_pad_nd.py b/test/test_npu/test_constant_pad_nd.py
index 59d0bbae99a66d27dda0293e97094747ba29010b..b84c2ccd542b405e6d7a7b1b2f032e755ce1dd6d 100644
--- a/test/test_npu/test_constant_pad_nd.py
+++ b/test/test_npu/test_constant_pad_nd.py
@@ -1,70 +1,70 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestConstantPadNd(TestCase):
-    
-    def op_exec_cpu(self, input1, pad_shape):
-        output = torch.constant_pad_nd(input1, pad_shape)
-        output = output.numpy()
-        
-        return output
-
-    def op_exec_npu(self, input1, pad_shape):
-        input1 = input1.to("npu")
-        output = torch.constant_pad_nd(input1, pad_shape)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-        
-    def test_constant_pad_nd_shape_format(self, device):
-        shape_format = [  
-            [[np.float32, 3, (25, 32, 1, 1)], (1,1)],
-            [[np.float32, 0, [25, 32, 11, 11]], (2,2,2,2)],
-            [[np.float32, 0, [25, 3, 22, 22]],(2,2,2,2,20,20)],
-            [[np.float16, 3, [25, 12, 7, 7]], (20,20,20,20)],
-            [[np.float16, 0, [25, 3, 22, 22]], (20,20,20,20,5,5,5,5)],
-            [[np.float16, 4, (2, 3, 3, 3)], (1,1,1,20,5,5,5,5)],
-            [[np.float16, 4, [100, 20, 7, 7]], (0,0,0,0,0,0,0,0)],
-            [[np.float16, 0, [2,3,4,5]], (1,0,1,0,1,0,1,0)],
-            [[np.float16, 4, [2]],(0,1)],
-            [[np.float16, 0, [20,20]],(0,1,0,2)],
-            [[np.float16, 0, [20,20,20]],(1,1,1,1) ],
-            [[np.float16, 3, [1,1,1,1]], (1,1)],
-            [[np.float16, 3, [1]], (1,1)],
-            [[np.float16, 0, [50, 24, 56, 56]], (100, 100, 100, 100, 100, 100, 100, 100)],
-        ]
-
-        for item in shape_format:
-            input_cpu, input_npu = create_common_tensor(item[0], 1, 1)
-            pad_shape = item[1]
-            cpu_output = self.op_exec_cpu(input_cpu, pad_shape)
-            npu_output = self.op_exec_npu(input_npu, pad_shape)
-            
-            
-            self.assertRtolEqual(cpu_output, npu_output)
-            
-
-
-instantiate_device_type_tests(TestConstantPadNd, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestConstantPadNd(TestCase):
+    
+    def op_exec_cpu(self, input1, pad_shape):
+        output = torch.constant_pad_nd(input1, pad_shape)
+        output = output.numpy()
+        
+        return output
+
+    def op_exec_npu(self, input1, pad_shape):
+        input1 = input1.to("npu")
+        output = torch.constant_pad_nd(input1, pad_shape)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+        
+    def test_constant_pad_nd_shape_format(self, device):
+        shape_format = [  
+            [[np.float32, 3, (25, 32, 1, 1)], (1,1)],
+            [[np.float32, 0, [25, 32, 11, 11]], (2,2,2,2)],
+            [[np.float32, 0, [25, 3, 22, 22]],(2,2,2,2,20,20)],
+            [[np.float16, 3, [25, 12, 7, 7]], (20,20,20,20)],
+            [[np.float16, 0, [25, 3, 22, 22]], (20,20,20,20,5,5,5,5)],
+            [[np.float16, 4, (2, 3, 3, 3)], (1,1,1,20,5,5,5,5)],
+            [[np.float16, 4, [100, 20, 7, 7]], (0,0,0,0,0,0,0,0)],
+            [[np.float16, 0, [2,3,4,5]], (1,0,1,0,1,0,1,0)],
+            [[np.float16, 4, [2]],(0,1)],
+            [[np.float16, 0, [20,20]],(0,1,0,2)],
+            [[np.float16, 0, [20,20,20]],(1,1,1,1) ],
+            [[np.float16, 3, [1,1,1,1]], (1,1)],
+            [[np.float16, 3, [1]], (1,1)],
+            [[np.float16, 0, [50, 24, 56, 56]], (100, 100, 100, 100, 100, 100, 100, 100)],
+        ]
+
+        for item in shape_format:
+            input_cpu, input_npu = create_common_tensor(item[0], 1, 1)
+            pad_shape = item[1]
+            cpu_output = self.op_exec_cpu(input_cpu, pad_shape)
+            npu_output = self.op_exec_npu(input_npu, pad_shape)
+            
+            
+            self.assertRtolEqual(cpu_output, npu_output)
+            
+
+
+instantiate_device_type_tests(TestConstantPadNd, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_convolution_backward.py b/test/test_npu/test_convolution_backward.py
index ded1c9a232a8fc5d6797bf2fe65c248cf9c24bf9..0e4a0334071543e8339c9793b84059c91d666a1b 100644
--- a/test/test_npu/test_convolution_backward.py
+++ b/test/test_npu/test_convolution_backward.py
@@ -1,96 +1,96 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestConv2dBackward(TestCase):
-    weight_grad = []
-    input_grad = []
-
-    def getWeightGrad(self, grad):
-        self.weight_grad.append(grad.to("cpu"))
-
-    def getInputGrad(self, grad):
-        self.input_grad.append(grad.to("cpu"))
-
-    def cpu_op_exec(self, input1, weight, padding = 0, stride = 1, bias1 = None):
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-        weight.requires_grad = True
-        weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        bias1.requires_grad = True
-
-        res_forward = nn.functional.conv2d(input1, weight, bias1, stride, padding)
-        grads = torch.ones_like(res_forward).float()
-        res_forward.backward(grads, retain_graph=True)
-        res_forward = res_forward.detach().numpy()
-        return res_forward
-
-    def npu_op_exec(self, input1, weight, padding = 0, stride = 1, bias1 = None):
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-        weight.requires_grad = True
-        weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        bias1 = bias1.to("npu")
-        bias1.requires_grad = True
-
-        res_forward = nn.functional.conv2d(input1, weight, bias1, stride, padding)
-        grads = torch.ones_like(res_forward).float()
-        grads = grads.to("npu")
-        res_forward.backward(grads, retain_graph=True)
-        res_forward = res_forward.to("cpu")
-        res_forward = res_forward.detach().numpy()
-        return res_forward
-
-    def test_conv2d_backward_shape_format(self, device):
-        shape_format = [  # input, weight, padding, stride
-            [[np.float32, 0, (1, 4, 5, 5)], [np.float32, 0, (4, 4, 3, 3)], 0, (1, 1)],
-            [[np.float32, 0, (1, 8, 3, 3)], [np.float32, 0, (8, 8, 1, 1)], 0, (2, 1)],
-            [[np.float32, 0, (1024, 2048, 6, 6)], [np.float32, 0, (2048, 2048, 3, 3)], 0, (1, 2)],
-            [[np.float32, 0, (512, 256, 4, 4)], [np.float32, 0, (256, 256, 2, 2)], 0, (2, 2)],
-            [[np.float32, 0, (128, 4, 3, 3)], [np.float32, 0, (4, 4, 2, 2)], 0, (3, 1)],
-            [[np.float32, 0, (2, 64, 3, 3)], [np.float32, 0, (64, 64, 3, 3)], 0, (1, 3)],
-            [[np.float32, 0, (64, 2, 8, 8)], [np.float32, 0, (2, 2, 1, 1)], 0, (3, 3)],
-            [[np.float32, 0, (32, 16, 4, 4)], [np.float32, 0, (16, 16, 3, 3)], 0, (2, 1)],
-            [[np.float32, 0, (1024, 8, 3, 3)], [np.float32, 0, (8, 8, 1, 1)], 0, (1, 2)],
-            [[np.float32, 0, (1, 8, 512, 512)], [np.float32, 0, (8, 8, 3, 3)], 0, (2, 2)],
-            [[np.float32, 0, (1, 2, 1, 1)], [np.float32, 0, (1, 1, 2, 2)], 0, (1, 1)],
-        ]
-
-        for item in shape_format:
-            self.weight_grad.clear()
-            self.input_grad.clear() 
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -2, 2)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], -2, 2)
-            cpu_bias = torch.randn(item[1][2][0])
-            npu_bias = copy.deepcopy(cpu_bias)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, item[2], item[3], cpu_bias)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2, item[2], item[3], npu_bias)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(self.input_grad[0], self.input_grad[1])
-            self.assertRtolEqual(self.weight_grad[0], self.weight_grad[1])
-
-
-instantiate_device_type_tests(TestConv2dBackward, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestConv2dBackward(TestCase):
+    weight_grad = []
+    input_grad = []
+
+    def getWeightGrad(self, grad):
+        self.weight_grad.append(grad.to("cpu"))
+
+    def getInputGrad(self, grad):
+        self.input_grad.append(grad.to("cpu"))
+
+    def cpu_op_exec(self, input1, weight, padding = 0, stride = 1, bias1 = None):
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+        weight.requires_grad = True
+        weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        bias1.requires_grad = True
+
+        res_forward = nn.functional.conv2d(input1, weight, bias1, stride, padding)
+        grads = torch.ones_like(res_forward).float()
+        res_forward.backward(grads, retain_graph=True)
+        res_forward = res_forward.detach().numpy()
+        return res_forward
+
+    def npu_op_exec(self, input1, weight, padding = 0, stride = 1, bias1 = None):
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+        weight.requires_grad = True
+        weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        bias1 = bias1.to("npu")
+        bias1.requires_grad = True
+
+        res_forward = nn.functional.conv2d(input1, weight, bias1, stride, padding)
+        grads = torch.ones_like(res_forward).float()
+        grads = grads.to("npu")
+        res_forward.backward(grads, retain_graph=True)
+        res_forward = res_forward.to("cpu")
+        res_forward = res_forward.detach().numpy()
+        return res_forward
+
+    def test_conv2d_backward_shape_format(self, device):
+        shape_format = [  # input, weight, padding, stride
+            [[np.float32, 0, (1, 4, 5, 5)], [np.float32, 0, (4, 4, 3, 3)], 0, (1, 1)],
+            [[np.float32, 0, (1, 8, 3, 3)], [np.float32, 0, (8, 8, 1, 1)], 0, (2, 1)],
+            [[np.float32, 0, (1024, 2048, 6, 6)], [np.float32, 0, (2048, 2048, 3, 3)], 0, (1, 2)],
+            [[np.float32, 0, (512, 256, 4, 4)], [np.float32, 0, (256, 256, 2, 2)], 0, (2, 2)],
+            [[np.float32, 0, (128, 4, 3, 3)], [np.float32, 0, (4, 4, 2, 2)], 0, (3, 1)],
+            [[np.float32, 0, (2, 64, 3, 3)], [np.float32, 0, (64, 64, 3, 3)], 0, (1, 3)],
+            [[np.float32, 0, (64, 2, 8, 8)], [np.float32, 0, (2, 2, 1, 1)], 0, (3, 3)],
+            [[np.float32, 0, (32, 16, 4, 4)], [np.float32, 0, (16, 16, 3, 3)], 0, (2, 1)],
+            [[np.float32, 0, (1024, 8, 3, 3)], [np.float32, 0, (8, 8, 1, 1)], 0, (1, 2)],
+            [[np.float32, 0, (1, 8, 512, 512)], [np.float32, 0, (8, 8, 3, 3)], 0, (2, 2)],
+            [[np.float32, 0, (1, 2, 1, 1)], [np.float32, 0, (1, 1, 2, 2)], 0, (1, 1)],
+        ]
+
+        for item in shape_format:
+            self.weight_grad.clear()
+            self.input_grad.clear() 
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -2, 2)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], -2, 2)
+            cpu_bias = torch.randn(item[1][2][0])
+            npu_bias = copy.deepcopy(cpu_bias)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, item[2], item[3], cpu_bias)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, item[2], item[3], npu_bias)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(self.input_grad[0], self.input_grad[1])
+            self.assertRtolEqual(self.weight_grad[0], self.weight_grad[1])
+
+
+instantiate_device_type_tests(TestConv2dBackward, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_cosine_embedding_loss.py b/test/test_npu/test_cosine_embedding_loss.py
index fc44237c9900307d559d701afeee7f570dde5bf6..443c57ec8c397cf141c0ed86019dd47ff94f9d58 100644
--- a/test/test_npu/test_cosine_embedding_loss.py
+++ b/test/test_npu/test_cosine_embedding_loss.py
@@ -1,99 +1,99 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-import random
-
-
-class TestCosineEmbeddingLoss(TestCase):
-    def generate_target(self, shape, dtype):
-        target = np.random.randint(2, size=shape, dtype=dtype)
-        target = target*2-1
-        target = torch.from_numpy(target)
-        return target
-
-    def cpu_op_exec(self, input1, input2, target, margin, reduction):
-        output = torch.nn.functional.cosine_embedding_loss(
-            input1, input2, target, margin=margin, reduction=reduction)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2, target, margin, reduction):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        target = target.to("npu")
-        output = torch.nn.functional.cosine_embedding_loss(
-            input1, input2, target, margin=margin, reduction=reduction)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_cosine_embedding_loss_common_shape_format(self, device):
-        shape_format = [
-            [[np.float32, -1, (5, 3)], [np.float32, -1, (5, 3)],
-             [np.int32, (5, )], 'sum'],
-            [[np.float32, 0, (16, 4, 3)], [np.float32, 0,
-                                            (16, 4, 3)], [np.int32, (16, 3)], 'mean'],
-            [[np.float32, 3, (64, 10, 10)], [np.float32, 3,
-                                              (64, 10, 10)], [np.int32, (64, 10)], 'none'],
-        ]
-        for item1, item2, target, reduction in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item1, 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item2, 1, 100)
-            target = self.generate_target(target[1], target[0])
-            margin = np.random.uniform(0, 1)
-            cpu_output = self.cpu_op_exec(
-                cpu_input1, cpu_input2, target, margin, reduction)
-            npu_output = self.npu_op_exec(
-                npu_input1, npu_input2, target, margin, reduction)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_cosine_embedding_loss_float16_shape_format(self, device):
-        def cpu_op_exec_fp16(input1, input2, target, margin, reduction):
-            input1 = input1.to(torch.float32)
-            input2 = input2.to(torch.float32)
-            output = torch.nn.functional.cosine_embedding_loss(
-                input1, input2, target, margin=margin, reduction=reduction)
-            output = output.numpy().astype(np.float16)
-            return output
-
-        shape_format = [
-            [[np.float16, 3, (4, 1, 3)], [np.float16, 3,
-                                           (4, 1, 3)], [np.int32, (4, 3)], 'sum'],
-            [[np.float16, -1, (16, 8)], [np.float16, -1, (16, 8)],
-             [np.int32, (16, )], 'mean'],
-            [[np.float16, 4, (64, 10, 10)], [np.float16, 3,
-                                              (64, 10, 10)], [np.int32, (64, 10)], 'none']
-        ]
-
-        for item1, item2, target, reduction in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item1, 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item2, 1, 100)
-            target = self.generate_target(target[1], target[0])
-            margin = np.random.uniform(0, 1)
-            cpu_output = cpu_op_exec_fp16(
-                cpu_input1, cpu_input2, target, margin, reduction)
-            npu_output = self.npu_op_exec(
-                npu_input1, npu_input2, target, margin, reduction)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(
-    TestCosineEmbeddingLoss, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+import random
+
+
+class TestCosineEmbeddingLoss(TestCase):
+    def generate_target(self, shape, dtype):
+        target = np.random.randint(2, size=shape, dtype=dtype)
+        target = target*2-1
+        target = torch.from_numpy(target)
+        return target
+
+    def cpu_op_exec(self, input1, input2, target, margin, reduction):
+        output = torch.nn.functional.cosine_embedding_loss(
+            input1, input2, target, margin=margin, reduction=reduction)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2, target, margin, reduction):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        target = target.to("npu")
+        output = torch.nn.functional.cosine_embedding_loss(
+            input1, input2, target, margin=margin, reduction=reduction)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_cosine_embedding_loss_common_shape_format(self, device):
+        shape_format = [
+            [[np.float32, -1, (5, 3)], [np.float32, -1, (5, 3)],
+             [np.int32, (5, )], 'sum'],
+            [[np.float32, 0, (16, 4, 3)], [np.float32, 0,
+                                            (16, 4, 3)], [np.int32, (16, 3)], 'mean'],
+            [[np.float32, 3, (64, 10, 10)], [np.float32, 3,
+                                              (64, 10, 10)], [np.int32, (64, 10)], 'none'],
+        ]
+        for item1, item2, target, reduction in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item1, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item2, 1, 100)
+            target = self.generate_target(target[1], target[0])
+            margin = np.random.uniform(0, 1)
+            cpu_output = self.cpu_op_exec(
+                cpu_input1, cpu_input2, target, margin, reduction)
+            npu_output = self.npu_op_exec(
+                npu_input1, npu_input2, target, margin, reduction)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_cosine_embedding_loss_float16_shape_format(self, device):
+        def cpu_op_exec_fp16(input1, input2, target, margin, reduction):
+            input1 = input1.to(torch.float32)
+            input2 = input2.to(torch.float32)
+            output = torch.nn.functional.cosine_embedding_loss(
+                input1, input2, target, margin=margin, reduction=reduction)
+            output = output.numpy().astype(np.float16)
+            return output
+
+        shape_format = [
+            [[np.float16, 3, (4, 1, 3)], [np.float16, 3,
+                                           (4, 1, 3)], [np.int32, (4, 3)], 'sum'],
+            [[np.float16, -1, (16, 8)], [np.float16, -1, (16, 8)],
+             [np.int32, (16, )], 'mean'],
+            [[np.float16, 4, (64, 10, 10)], [np.float16, 3,
+                                              (64, 10, 10)], [np.int32, (64, 10)], 'none']
+        ]
+
+        for item1, item2, target, reduction in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item1, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item2, 1, 100)
+            target = self.generate_target(target[1], target[0])
+            margin = np.random.uniform(0, 1)
+            cpu_output = cpu_op_exec_fp16(
+                cpu_input1, cpu_input2, target, margin, reduction)
+            npu_output = self.npu_op_exec(
+                npu_input1, npu_input2, target, margin, reduction)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(
+    TestCosineEmbeddingLoss, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:5")
+    run_tests()
diff --git a/test/test_npu/test_cudnn_rnn_backward.py b/test/test_npu/test_cudnn_rnn_backward.py
index 68730689e79359904915b793d0d6e131508b80e1..1dd236c74e1727500bedd8371c91370a4ce2a815 100644
--- a/test/test_npu/test_cudnn_rnn_backward.py
+++ b/test/test_npu/test_cudnn_rnn_backward.py
@@ -1,95 +1,95 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import random
-import numpy as np
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestCudnnRnnBackward(TestCase):
-    def generate_bool(self):
-        scalar = random.randint(1, 2)
-        return scalar == 1
-
-    def generate_int(self, min_d, max_d):
-        scalar = random.randint(min_d, max_d)
-        return scalar
-
-    def cpu_op_exec(self, input1, vocab_size, num_hiddens, num_steps, batch_size, first, drop, bid):
-        input1.requires_grad_(True)
-        m = torch.nn.RNN(input_size=vocab_size, hidden_size=num_hiddens, batch_first=first, dropout=drop, bidirectional=bid)
-        state = None
-        output, _ = m(input1, state)
-        w = torch.ones_like(output)
-        output = output.backward(w)
-        return input1.grad
-
-    def npu_op_exec(self, input1, vocab_size, num_hiddens, num_steps, batch_size, first, drop, bid):
-        input1.requires_grad_(True)
-        m = torch.nn.RNN(input_size=vocab_size, hidden_size=num_hiddens, batch_first=first, dropout=drop, bidirectional=bid)
-        m = m.npu()
-        state = None
-        output, _ = m(input1, state)
-        w = torch.ones_like(output)
-        output = output.backward(w)
-        out = input1.grad
-        out = out.to("cpu")
-        return out
-
-    def test_cudnn_rnn_backward_common_shape_format(self, device):
-        npu_vocab_size = self.generate_int(1, 10)
-        npu_num_hiddens = self.generate_int(1, 10)
-        npu_num_step = self.generate_int(1, 10)
-        npu_batch_size = self.generate_int(1, 10)
-        first = self.generate_bool()
-        drop = self.generate_int(0, 1)
-        bid = self.generate_bool()
-        shape_format = [
-            [[np.float32, -1, (npu_num_step, npu_batch_size, npu_vocab_size)]],
-            [[np.float32, 0, (npu_num_step, npu_batch_size, npu_vocab_size)]],
-            [[np.float32, 3, (npu_num_step, npu_batch_size, npu_vocab_size)]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_result = self.cpu_op_exec(cpu_input1, npu_vocab_size, npu_num_hiddens, npu_num_step, npu_batch_size, first, drop, bid)
-            npu_result = self.npu_op_exec(npu_input1, npu_vocab_size, npu_num_hiddens, npu_num_step, npu_batch_size, first, drop, bid)
-            self.assertRtolEqual(cpu_result.numpy(), npu_result.numpy());
-
-    def test_cudnn_rnn_backward_float16_shape_format(self, device):
-        npu_vocab_size = self.generate_int(1, 10)
-        npu_num_hiddens = self.generate_int(1, 10)
-        npu_num_step = self.generate_int(1, 10)
-        npu_batch_size = self.generate_int(1, 10)
-        first = self.generate_bool()
-        drop = self.generate_int(0, 1)
-        bid = self.generate_bool()
-        shape_format = [
-            [[np.float16, -1, (npu_num_step, npu_batch_size, npu_vocab_size)]],
-            [[np.float16, 0, (npu_num_step, npu_batch_size, npu_vocab_size)]],
-            [[np.float16, 3, (npu_num_step, npu_batch_size, npu_vocab_size)]],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_result = self.cpu_op_exec(cpu_input1, npu_vocab_size, npu_num_hiddens, npu_num_step, npu_batch_size, first, drop, bid)
-            npu_result = self.npu_op_exec(npu_input1, npu_vocab_size, npu_num_hiddens, npu_num_step, npu_batch_size, first, drop, bid)
-            self.assertRtolEqual(cpu_result.numpy().astype(np.float16), npu_result.numpy().astype(np.float16));
-
-instantiate_device_type_tests(TestCudnnRnnBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import random
+import numpy as np
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestCudnnRnnBackward(TestCase):
+    def generate_bool(self):
+        scalar = random.randint(1, 2)
+        return scalar == 1
+
+    def generate_int(self, min_d, max_d):
+        scalar = random.randint(min_d, max_d)
+        return scalar
+
+    def cpu_op_exec(self, input1, vocab_size, num_hiddens, num_steps, batch_size, first, drop, bid):
+        input1.requires_grad_(True)
+        m = torch.nn.RNN(input_size=vocab_size, hidden_size=num_hiddens, batch_first=first, dropout=drop, bidirectional=bid)
+        state = None
+        output, _ = m(input1, state)
+        w = torch.ones_like(output)
+        output = output.backward(w)
+        return input1.grad
+
+    def npu_op_exec(self, input1, vocab_size, num_hiddens, num_steps, batch_size, first, drop, bid):
+        input1.requires_grad_(True)
+        m = torch.nn.RNN(input_size=vocab_size, hidden_size=num_hiddens, batch_first=first, dropout=drop, bidirectional=bid)
+        m = m.npu()
+        state = None
+        output, _ = m(input1, state)
+        w = torch.ones_like(output)
+        output = output.backward(w)
+        out = input1.grad
+        out = out.to("cpu")
+        return out
+
+    def test_cudnn_rnn_backward_common_shape_format(self, device):
+        npu_vocab_size = self.generate_int(1, 10)
+        npu_num_hiddens = self.generate_int(1, 10)
+        npu_num_step = self.generate_int(1, 10)
+        npu_batch_size = self.generate_int(1, 10)
+        first = self.generate_bool()
+        drop = self.generate_int(0, 1)
+        bid = self.generate_bool()
+        shape_format = [
+            [[np.float32, -1, (npu_num_step, npu_batch_size, npu_vocab_size)]],
+            [[np.float32, 0, (npu_num_step, npu_batch_size, npu_vocab_size)]],
+            [[np.float32, 3, (npu_num_step, npu_batch_size, npu_vocab_size)]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_result = self.cpu_op_exec(cpu_input1, npu_vocab_size, npu_num_hiddens, npu_num_step, npu_batch_size, first, drop, bid)
+            npu_result = self.npu_op_exec(npu_input1, npu_vocab_size, npu_num_hiddens, npu_num_step, npu_batch_size, first, drop, bid)
+            self.assertRtolEqual(cpu_result.numpy(), npu_result.numpy());
+
+    def test_cudnn_rnn_backward_float16_shape_format(self, device):
+        npu_vocab_size = self.generate_int(1, 10)
+        npu_num_hiddens = self.generate_int(1, 10)
+        npu_num_step = self.generate_int(1, 10)
+        npu_batch_size = self.generate_int(1, 10)
+        first = self.generate_bool()
+        drop = self.generate_int(0, 1)
+        bid = self.generate_bool()
+        shape_format = [
+            [[np.float16, -1, (npu_num_step, npu_batch_size, npu_vocab_size)]],
+            [[np.float16, 0, (npu_num_step, npu_batch_size, npu_vocab_size)]],
+            [[np.float16, 3, (npu_num_step, npu_batch_size, npu_vocab_size)]],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_result = self.cpu_op_exec(cpu_input1, npu_vocab_size, npu_num_hiddens, npu_num_step, npu_batch_size, first, drop, bid)
+            npu_result = self.npu_op_exec(npu_input1, npu_vocab_size, npu_num_hiddens, npu_num_step, npu_batch_size, first, drop, bid)
+            self.assertRtolEqual(cpu_result.numpy().astype(np.float16), npu_result.numpy().astype(np.float16));
+
+instantiate_device_type_tests(TestCudnnRnnBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    torch.npu.set_device("npu:5")
+    run_tests()
diff --git a/test/test_npu/test_det.py b/test/test_npu/test_det.py
index 4ab0228b148b4919cddc4050253dbb9dba8af948..657b82473b44bba2dd1671e935f3733f80dc2f49 100644
--- a/test/test_npu/test_det.py
+++ b/test/test_npu/test_det.py
@@ -1,67 +1,67 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-# pylint: disable=unused-variable, unused-argument
-
-class TestDet(TestCase):
-    def generate_data(self, min_val, max_val, shape, dtype):
-        input1 = np.random.uniform(min_val, max_val, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def cpu_op_exec(self, input1):
-        output = torch.det(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        input1 = input1.to("npu")
-        output = torch.det(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_det_float32(self, device):
-        npu_input1 = self.generate_data(-9.313225746154785e-10, 9.313225746154785e-10, (1, 1, 64, 64), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_det_float16(self, device):
-        npu_input1 = self.generate_data(0, 0, (2, 2, 32, 32), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1.float()).astype(np.float16)
-        npu_output = self.npu_op_exec(npu_input1.float()).astype(np.float16)
-        print(cpu_output,npu_output,'123')
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_big_scale_float32(self, device):
-        npu_input1 = self.generate_data(0, 10, (32, 32), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestDet, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:7")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+# pylint: disable=unused-variable, unused-argument
+
+class TestDet(TestCase):
+    def generate_data(self, min_val, max_val, shape, dtype):
+        input1 = np.random.uniform(min_val, max_val, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def cpu_op_exec(self, input1):
+        output = torch.det(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        input1 = input1.to("npu")
+        output = torch.det(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_det_float32(self, device):
+        npu_input1 = self.generate_data(-9.313225746154785e-10, 9.313225746154785e-10, (1, 1, 64, 64), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_det_float16(self, device):
+        npu_input1 = self.generate_data(0, 0, (2, 2, 32, 32), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1.float()).astype(np.float16)
+        npu_output = self.npu_op_exec(npu_input1.float()).astype(np.float16)
+        print(cpu_output,npu_output,'123')
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_big_scale_float32(self, device):
+        npu_input1 = self.generate_data(0, 10, (32, 32), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestDet, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:7")
+    run_tests()
diff --git a/test/test_npu/test_dynamic_ops/test_dynamic_embedding.py b/test/test_npu/test_dynamic_ops/test_dynamic_embedding.py
index c29994544378437ed39a02a631dcaa96ccbb0641..fda65867de3eb146b97af8a10077ae998fa6b859 100644
--- a/test/test_npu/test_dynamic_ops/test_dynamic_embedding.py
+++ b/test/test_npu/test_dynamic_ops/test_dynamic_embedding.py
@@ -1,80 +1,80 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-import time
-import os
-import copy
-# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
-
-class EmbeddingFuncNet(torch.nn.Module):
-    def __init__(self):
-        super(EmbeddingFuncNet, self).__init__()
-
-    def forward(self, indices, weight):
-        out = torch.nn.functional.embedding(indices, weight)
-        return out
-
-class EmbeddingNet(torch.nn.Module):
-    def __init__(self):
-        super(EmbeddingNet, self).__init__()
-
-    def forward(self, indices, embed):
-        out =embed(indices)
-        return out
-
-class TestShape(TestCase):
-    def generate_weight(self, x, y):
-        rand_data = np.random.randn(x,y).astype(np.float32)
-        cpu_out = torch.from_numpy(rand_data)
-        npu_out = torch.from_numpy(rand_data).npu()
-        return cpu_out.to(torch.float), npu_out.to(torch.float)
-
-    def generate_indices(self, shape, min, max):
-        rand_data = np.random.randint(min, max, shape)
-        cpu_out = torch.from_numpy(rand_data)
-        npu_out = torch.from_numpy(rand_data).npu()
-        return cpu_out.to(torch.long), npu_out.to(torch.long)
-
-    def test_dynamic_threads_support_op(self, device):
-        shape_list1 = [[40, 32], [40, 1024], [40000, 1024], [33712, 1024]]
-        shape_list2 = [[40], [40,3125], [64, 7, 128]]
-        shape_format = [
-            [i, j] for i in shape_list1 for j in shape_list2
-        ]
-        net_func = EmbeddingFuncNet()
-        net = EmbeddingNet()
-        for item in shape_format:
-            weight_cpu, weight_npu = self.generate_weight(item[0][0], item[0][1])
-            indices_cpu, indices_npu = self.generate_indices(item[1], 1, item[0][0])
-            cpu_out = net_func(indices_cpu, weight_cpu)
-            npu_out = net_func(indices_npu, weight_npu)
-            npu_output = npu_out.to("cpu")
-            self.assertRtolEqual(cpu_out.numpy(), npu_output.numpy())
-
-            embed_cpu = torch.nn.Embedding(item[0][0], item[0][1])
-            embed_npu = copy.deepcopy(embed_cpu).npu()
-            cpu_out = net(indices_cpu, embed_cpu)
-            npu_out = net(indices_npu, embed_npu)
-            npu_output = npu_out.to("cpu")
-            self.assertRtolEqual(cpu_out.detach().numpy(), npu_output.detach().numpy())
-
-
-instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+import time
+import os
+import copy
+# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
+
+class EmbeddingFuncNet(torch.nn.Module):
+    def __init__(self):
+        super(EmbeddingFuncNet, self).__init__()
+
+    def forward(self, indices, weight):
+        out = torch.nn.functional.embedding(indices, weight)
+        return out
+
+class EmbeddingNet(torch.nn.Module):
+    def __init__(self):
+        super(EmbeddingNet, self).__init__()
+
+    def forward(self, indices, embed):
+        out =embed(indices)
+        return out
+
+class TestShape(TestCase):
+    def generate_weight(self, x, y):
+        rand_data = np.random.randn(x,y).astype(np.float32)
+        cpu_out = torch.from_numpy(rand_data)
+        npu_out = torch.from_numpy(rand_data).npu()
+        return cpu_out.to(torch.float), npu_out.to(torch.float)
+
+    def generate_indices(self, shape, min, max):
+        rand_data = np.random.randint(min, max, shape)
+        cpu_out = torch.from_numpy(rand_data)
+        npu_out = torch.from_numpy(rand_data).npu()
+        return cpu_out.to(torch.long), npu_out.to(torch.long)
+
+    def test_dynamic_threads_support_op(self, device):
+        shape_list1 = [[40, 32], [40, 1024], [40000, 1024], [33712, 1024]]
+        shape_list2 = [[40], [40,3125], [64, 7, 128]]
+        shape_format = [
+            [i, j] for i in shape_list1 for j in shape_list2
+        ]
+        net_func = EmbeddingFuncNet()
+        net = EmbeddingNet()
+        for item in shape_format:
+            weight_cpu, weight_npu = self.generate_weight(item[0][0], item[0][1])
+            indices_cpu, indices_npu = self.generate_indices(item[1], 1, item[0][0])
+            cpu_out = net_func(indices_cpu, weight_cpu)
+            npu_out = net_func(indices_npu, weight_npu)
+            npu_output = npu_out.to("cpu")
+            self.assertRtolEqual(cpu_out.numpy(), npu_output.numpy())
+
+            embed_cpu = torch.nn.Embedding(item[0][0], item[0][1])
+            embed_npu = copy.deepcopy(embed_cpu).npu()
+            cpu_out = net(indices_cpu, embed_cpu)
+            npu_out = net(indices_npu, embed_npu)
+            npu_output = npu_out.to("cpu")
+            self.assertRtolEqual(cpu_out.detach().numpy(), npu_output.detach().numpy())
+
+
+instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_dynamic_ops/test_network_all.py b/test/test_npu/test_dynamic_ops/test_network_all.py
index 152b297d18cebdb8e90483d8ca3a7ff0c3148f28..3cb4dc0a23dd89bc5af6c19a071e7c923794b394 100644
--- a/test/test_npu/test_dynamic_ops/test_network_all.py
+++ b/test/test_npu/test_dynamic_ops/test_network_all.py
@@ -1,81 +1,81 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-import time
-import os
-import copy
-# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
-
-
-class AllNet(torch.nn.Module):
-    def __init__(self):
-        super(AllNet, self).__init__()
-
-    def forward(self, x, axis):
-        if x.device == torch.device("cpu") and x.dtype == torch.float16:
-            x = x.to(torch.float32)
-        out = torch.all(x, axis)
-        if x.device == torch.device("cpu") and x.dtype == torch.float16:
-            out = out.to(torch.float16)
-        return out
-
-
-class TestShape(TestCase):
-    def create_random_shape_tensor(self, item, min_value, max_value):
-        npu_format = item[0]
-        dtype = item[1]
-        dim = item[2]
-        shape = np.random.randint(1, 10, dim)
-        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
-        cpu_input = torch.from_numpy(input_tensor)
-        npu_input = torch.from_numpy(input_tensor).npu()
-        """
-        if npu_format not in (-1, 0):
-            npu_input = npu_input.npu_format_cast(npu_format)
-        """
-        return cpu_input, npu_input
-
-    def get_random_axis(self, cpu_tensor):
-        shape = list(cpu_tensor.shape)
-        axis = np.random.randint(0, len(shape))
-        return axis
-
-    def test_dynamic_threads_support_op(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [np.bool_]
-        dim_list = [1, 2, 3, 4]
-        net = AllNet()
-        net_npu = copy.deepcopy(net).to("npu")
-        items = [
-            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
-        ]
-        for item in items:
-            if item[0] == 29 and item[2] == 1:
-                continue
-            for _ in range(100):
-                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
-                axis = self.get_random_axis(cpu_tensor)
-                cpu_output = net(cpu_tensor, axis)
-                npu_output = net_npu(npu_tensor, axis)
-                self.assertRtolEqual(cpu_output.to(npu_output.dtype).numpy(), npu_output.cpu().numpy())
-
-
-instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+import time
+import os
+import copy
+# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
+
+
+class AllNet(torch.nn.Module):
+    def __init__(self):
+        super(AllNet, self).__init__()
+
+    def forward(self, x, axis):
+        if x.device == torch.device("cpu") and x.dtype == torch.float16:
+            x = x.to(torch.float32)
+        out = torch.all(x, axis)
+        if x.device == torch.device("cpu") and x.dtype == torch.float16:
+            out = out.to(torch.float16)
+        return out
+
+
+class TestShape(TestCase):
+    def create_random_shape_tensor(self, item, min_value, max_value):
+        npu_format = item[0]
+        dtype = item[1]
+        dim = item[2]
+        shape = np.random.randint(1, 10, dim)
+        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
+        cpu_input = torch.from_numpy(input_tensor)
+        npu_input = torch.from_numpy(input_tensor).npu()
+        """
+        if npu_format not in (-1, 0):
+            npu_input = npu_input.npu_format_cast(npu_format)
+        """
+        return cpu_input, npu_input
+
+    def get_random_axis(self, cpu_tensor):
+        shape = list(cpu_tensor.shape)
+        axis = np.random.randint(0, len(shape))
+        return axis
+
+    def test_dynamic_threads_support_op(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [np.bool_]
+        dim_list = [1, 2, 3, 4]
+        net = AllNet()
+        net_npu = copy.deepcopy(net).to("npu")
+        items = [
+            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
+        ]
+        for item in items:
+            if item[0] == 29 and item[2] == 1:
+                continue
+            for _ in range(100):
+                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
+                axis = self.get_random_axis(cpu_tensor)
+                cpu_output = net(cpu_tensor, axis)
+                npu_output = net_npu(npu_tensor, axis)
+                self.assertRtolEqual(cpu_output.to(npu_output.dtype).numpy(), npu_output.cpu().numpy())
+
+
+instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_dynamic_ops/test_network_broadcast.py b/test/test_npu/test_dynamic_ops/test_network_broadcast.py
index 6c19132920be6ab4a9f90744d7963bc89ef32148..2b1776b780779ddd5a2b78bbb5e13a7d2de6e100 100644
--- a/test/test_npu/test_dynamic_ops/test_network_broadcast.py
+++ b/test/test_npu/test_dynamic_ops/test_network_broadcast.py
@@ -1,77 +1,77 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-import time
-import os
-import copy
-# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
-
-
-class BroadcastToNet(torch.nn.Module):
-    def __init__(self):
-        super(BroadcastToNet, self).__init__()
-
-    def forward(self, x, size):
-        out = x.npu_broadcast(size)
-        return out
-
-
-class TestShape(TestCase):
-    def create_random_shape_tensor(self, item, min_value, max_value):
-        npu_format = item[0]
-        dtype = item[1]
-        dim = item[2]
-        shape = np.random.randint(1, 10, dim)
-        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
-        cpu_input = torch.from_numpy(input_tensor)
-        npu_input = torch.from_numpy(input_tensor).npu()
-        if npu_format not in (-1, 0):
-            npu_input = npu_input.npu_format_cast(npu_format)
-        return cpu_input, npu_input
-
-    def get_broad_size(self, cpu_tensor):
-        shape = list(cpu_tensor.shape)
-        addim = np.random.randint(0, 10, 2)
-        sizes = list(addim) + shape
-        return list(sizes)
-
-    def test_dynamic_threads_support_op(self, device):
-        format_list = [0]
-        dtype_list = [np.float32, np.float16]
-        dim_list = [4]
-        net = BroadcastToNet()
-        net_npu = copy.deepcopy(net).to("npu")
-        items = [
-            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
-        ]
-        for item in items:
-            if item[0] == 29 and item[2] == 1:
-                continue
-            for _ in range(100):
-                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
-                sizes = self.get_broad_size(cpu_tensor)
-                output = net_npu(npu_tensor, sizes)
-                size1 = np.array(output.size(), dtype=np.int32)
-                size2 = np.array(sizes, dtype=np.int32)
-                self.assertRtolEqual(size1, size2)
-
-
-instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+import time
+import os
+import copy
+# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
+
+
+class BroadcastToNet(torch.nn.Module):
+    def __init__(self):
+        super(BroadcastToNet, self).__init__()
+
+    def forward(self, x, size):
+        out = x.npu_broadcast(size)
+        return out
+
+
+class TestShape(TestCase):
+    def create_random_shape_tensor(self, item, min_value, max_value):
+        npu_format = item[0]
+        dtype = item[1]
+        dim = item[2]
+        shape = np.random.randint(1, 10, dim)
+        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
+        cpu_input = torch.from_numpy(input_tensor)
+        npu_input = torch.from_numpy(input_tensor).npu()
+        if npu_format not in (-1, 0):
+            npu_input = npu_input.npu_format_cast(npu_format)
+        return cpu_input, npu_input
+
+    def get_broad_size(self, cpu_tensor):
+        shape = list(cpu_tensor.shape)
+        addim = np.random.randint(0, 10, 2)
+        sizes = list(addim) + shape
+        return list(sizes)
+
+    def test_dynamic_threads_support_op(self, device):
+        format_list = [0]
+        dtype_list = [np.float32, np.float16]
+        dim_list = [4]
+        net = BroadcastToNet()
+        net_npu = copy.deepcopy(net).to("npu")
+        items = [
+            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
+        ]
+        for item in items:
+            if item[0] == 29 and item[2] == 1:
+                continue
+            for _ in range(100):
+                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
+                sizes = self.get_broad_size(cpu_tensor)
+                output = net_npu(npu_tensor, sizes)
+                size1 = np.array(output.size(), dtype=np.int32)
+                size2 = np.array(sizes, dtype=np.int32)
+                self.assertRtolEqual(size1, size2)
+
+
+instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_dynamic_ops/test_network_dynamic_shape.py b/test/test_npu/test_dynamic_ops/test_network_dynamic_shape.py
index eb1be0b528c59ed9e9fcbca18a1e83c3963c80a5..45d915c590875dd997f7cb426d371bba6e486872 100644
--- a/test/test_npu/test_dynamic_ops/test_network_dynamic_shape.py
+++ b/test/test_npu/test_dynamic_ops/test_network_dynamic_shape.py
@@ -1,193 +1,193 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-import time
-import os
-
-class NetFirst(torch.nn.Module):
-    def __init__(self):
-        super(NetFirst, self).__init__()
-        self.cont = 0
-
-    def forward(self):
-        self.cont += 1
-        return self.cont
-        
-class NetStatic(torch.nn.Module):
-    def __init__(self):
-        super(NetStatic, self).__init__()
-
-    def forward(self, x):
-        out = torch.mul(x, x)  
-        return out
-        
-class NetDynamic(torch.nn.Module):
-    def __init__(self):
-        super(NetDynamic, self).__init__()
-
-    def forward(self, x):
-        out1 = torch.mul(x, x)
-        out2 = torch.relu(out1)
-        out3 = torch.neg(out2)
-        out4 = torch.floor_divide(x, out2)
-        out5 = torch.div(out4, out3)
-        out6 = torch.mul(out5, 0.8)
-        return out6
-        
-class NetUnsupport(torch.nn.Module):
-    def __init__(self):
-        super(NetUnsupport, self).__init__()
-
-    def forward(self, x):
-        out1 = torch.sin(x)
-        out2 = torch.sinh(out1)
-        out3 = torch.selu(out2)
-        return out3
-        
-class NetMixOp(torch.nn.Module):
-    def __init__(self):
-        super(NetMixOp, self).__init__()
-
-    def forward(self, x):
-        out0 = torch.floor_divide(x, x)
-        out1 = torch.selu(out0)
-        out2 = torch.sub(out1, x)
-        out3 = torch.div(out2, x)
-        out4 = torch.sin(out3)
-        out5 = torch.mul(out4, x)
-        out6 = torch.sinh(out5)
-        out7 = torch.neg(out6)
-        out8 = torch.relu(out7)     
-        return out8
-        
-
-class TestShape(TestCase):
-    def create_random_shape_tensor(self, item, minValue, maxValue):
-        format = item[0]
-        dtype = item[1]
-        dim = item[2]
-        shape = np.random.randint(1, 100, dim)
-        input = np.random.uniform(minValue, maxValue, shape).astype(dtype)
-        cpu_input = torch.from_numpy(input)
-        npu_input = torch.from_numpy(input).to("npu:0")
-        if format not in (-1, 0):
-            npu_input = npu_input.npu_format_cast(format)
-        return cpu_input, npu_input
-        
-    def test_dynamic_first_step(self, device):
-        net = NetFirst()
-        net = net.to("npu")      
-        for step in range(100):
-            cpu_output = net()    
-            npu_output = net()
-            assert cpu_output == npu_output - 1
-        
-    def test_dynamic_static_shape(self, device):
-        net = NetStatic()
-        net = net.to("npu")
-        item = [np.float32, 3, (10, 255, 5, 5)]   
-        for step in range(100):
-            cpu_tensor, npu_tensor = create_common_tensor(item, -100, 100)
-            cpu_output = net(cpu_tensor)
-            cpu_output = cpu_output.numpy()
-            npu_output = net(npu_tensor)
-            npu_output = npu_output.to("cpu")
-            npu_output = npu_output.numpy()
-            self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test_dynamic_op(self, device):
-        net = NetDynamic()
-        net = net.to("npu")
-        shape_format = [
-            [3, np.float32, 1],
-            [3, np.float32, 2],
-            [3, np.float32, 4]  
-        ]       
-        for step in range(100):     
-            for item in shape_format:
-                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
-                cpu_output = net(cpu_tensor)
-                cpu_output = cpu_output.numpy()
-                npu_output = net(npu_tensor)
-                npu_output = npu_output.to("cpu")
-                npu_output = npu_output.numpy()
-                self.assertRtolEqual(cpu_output, npu_output)
-          
-    def test_dyamic_unspport_op(self, device):
-        net = NetUnsupport()
-        net = net.to("npu")
-        item = [3, np.float32, 4]     
-        for step in range(100):
-            cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
-            cpu_output = net(cpu_tensor)
-            cpu_output = cpu_output.numpy()
-            npu_output = net(npu_tensor)
-            npu_output = npu_output.to("cpu")
-            npu_output = npu_output.numpy()
-            self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test_dynamic_mix_op(self, device):
-        net = NetMixOp()
-        net = net.to("npu")
-        item = [3, np.float32, 4]       
-        for step in range(100):
-            cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
-            cpu_output = net(cpu_tensor)
-            cpu_output = cpu_output.numpy()
-            npu_output = net(npu_tensor)
-            npu_output = npu_output.to("cpu")
-            npu_output = npu_output.numpy()
-            self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test_dynamic_all_random_mix_op(self, device):
-        net = NetMixOp()
-        net = net.to("npu")
-        format_list = [0, 3, 29]
-        dtype_list = [np.float32]
-        dim_list = [1, 2, 3, 4]
-        items = [
-            [i, j, k, 10] for i in format_list for j in dtype_list for k in dim_list
-        ]
-        for step in range(100):
-            for item in items:
-                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
-                cpu_output = net(cpu_tensor)
-                cpu_output = cpu_output.numpy()
-                npu_output = net(npu_tensor)
-                npu_output = npu_output.to("cpu")
-                npu_output = npu_output.numpy()
-                self.assertRtolEqual(cpu_output, npu_output)
-   
-    def test_dynamic_exit(self, device):
-        net = NetMixOp()
-        net = net.to("npu")
-        item = [3, np.float32, 4]
-        for step in range(2):
-            cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
-            cpu_output = net(cpu_tensor)
-            cpu_output = cpu_output.numpy()
-            npu_output = net(npu_tensor)
-            npu_output = npu_output.to("cpu")
-            npu_output = npu_output.numpy()
-            self.assertRtolEqual(cpu_output, npu_output)
-         
-    
-instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()  
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+import time
+import os
+
+class NetFirst(torch.nn.Module):
+    def __init__(self):
+        super(NetFirst, self).__init__()
+        self.cont = 0
+
+    def forward(self):
+        self.cont += 1
+        return self.cont
+        
+class NetStatic(torch.nn.Module):
+    def __init__(self):
+        super(NetStatic, self).__init__()
+
+    def forward(self, x):
+        out = torch.mul(x, x)  
+        return out
+        
+class NetDynamic(torch.nn.Module):
+    def __init__(self):
+        super(NetDynamic, self).__init__()
+
+    def forward(self, x):
+        out1 = torch.mul(x, x)
+        out2 = torch.relu(out1)
+        out3 = torch.neg(out2)
+        out4 = torch.floor_divide(x, out2)
+        out5 = torch.div(out4, out3)
+        out6 = torch.mul(out5, 0.8)
+        return out6
+        
+class NetUnsupport(torch.nn.Module):
+    def __init__(self):
+        super(NetUnsupport, self).__init__()
+
+    def forward(self, x):
+        out1 = torch.sin(x)
+        out2 = torch.sinh(out1)
+        out3 = torch.selu(out2)
+        return out3
+        
+class NetMixOp(torch.nn.Module):
+    def __init__(self):
+        super(NetMixOp, self).__init__()
+
+    def forward(self, x):
+        out0 = torch.floor_divide(x, x)
+        out1 = torch.selu(out0)
+        out2 = torch.sub(out1, x)
+        out3 = torch.div(out2, x)
+        out4 = torch.sin(out3)
+        out5 = torch.mul(out4, x)
+        out6 = torch.sinh(out5)
+        out7 = torch.neg(out6)
+        out8 = torch.relu(out7)     
+        return out8
+        
+
+class TestShape(TestCase):
+    def create_random_shape_tensor(self, item, minValue, maxValue):
+        format = item[0]
+        dtype = item[1]
+        dim = item[2]
+        shape = np.random.randint(1, 100, dim)
+        input = np.random.uniform(minValue, maxValue, shape).astype(dtype)
+        cpu_input = torch.from_numpy(input)
+        npu_input = torch.from_numpy(input).to("npu:0")
+        if format not in (-1, 0):
+            npu_input = npu_input.npu_format_cast(format)
+        return cpu_input, npu_input
+        
+    def test_dynamic_first_step(self, device):
+        net = NetFirst()
+        net = net.to("npu")      
+        for step in range(100):
+            cpu_output = net()    
+            npu_output = net()
+            assert cpu_output == npu_output - 1
+        
+    def test_dynamic_static_shape(self, device):
+        net = NetStatic()
+        net = net.to("npu")
+        item = [np.float32, 3, (10, 255, 5, 5)]   
+        for step in range(100):
+            cpu_tensor, npu_tensor = create_common_tensor(item, -100, 100)
+            cpu_output = net(cpu_tensor)
+            cpu_output = cpu_output.numpy()
+            npu_output = net(npu_tensor)
+            npu_output = npu_output.to("cpu")
+            npu_output = npu_output.numpy()
+            self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test_dynamic_op(self, device):
+        net = NetDynamic()
+        net = net.to("npu")
+        shape_format = [
+            [3, np.float32, 1],
+            [3, np.float32, 2],
+            [3, np.float32, 4]  
+        ]       
+        for step in range(100):     
+            for item in shape_format:
+                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
+                cpu_output = net(cpu_tensor)
+                cpu_output = cpu_output.numpy()
+                npu_output = net(npu_tensor)
+                npu_output = npu_output.to("cpu")
+                npu_output = npu_output.numpy()
+                self.assertRtolEqual(cpu_output, npu_output)
+          
+    def test_dyamic_unspport_op(self, device):
+        net = NetUnsupport()
+        net = net.to("npu")
+        item = [3, np.float32, 4]     
+        for step in range(100):
+            cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
+            cpu_output = net(cpu_tensor)
+            cpu_output = cpu_output.numpy()
+            npu_output = net(npu_tensor)
+            npu_output = npu_output.to("cpu")
+            npu_output = npu_output.numpy()
+            self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test_dynamic_mix_op(self, device):
+        net = NetMixOp()
+        net = net.to("npu")
+        item = [3, np.float32, 4]       
+        for step in range(100):
+            cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
+            cpu_output = net(cpu_tensor)
+            cpu_output = cpu_output.numpy()
+            npu_output = net(npu_tensor)
+            npu_output = npu_output.to("cpu")
+            npu_output = npu_output.numpy()
+            self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test_dynamic_all_random_mix_op(self, device):
+        net = NetMixOp()
+        net = net.to("npu")
+        format_list = [0, 3, 29]
+        dtype_list = [np.float32]
+        dim_list = [1, 2, 3, 4]
+        items = [
+            [i, j, k, 10] for i in format_list for j in dtype_list for k in dim_list
+        ]
+        for step in range(100):
+            for item in items:
+                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
+                cpu_output = net(cpu_tensor)
+                cpu_output = cpu_output.numpy()
+                npu_output = net(npu_tensor)
+                npu_output = npu_output.to("cpu")
+                npu_output = npu_output.numpy()
+                self.assertRtolEqual(cpu_output, npu_output)
+   
+    def test_dynamic_exit(self, device):
+        net = NetMixOp()
+        net = net.to("npu")
+        item = [3, np.float32, 4]
+        for step in range(2):
+            cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -100, 100)
+            cpu_output = net(cpu_tensor)
+            cpu_output = cpu_output.numpy()
+            npu_output = net(npu_tensor)
+            npu_output = npu_output.to("cpu")
+            npu_output = npu_output.numpy()
+            self.assertRtolEqual(cpu_output, npu_output)
+         
+    
+instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()  
diff --git a/test/test_npu/test_dynamic_ops/test_network_gatherv2.py b/test/test_npu/test_dynamic_ops/test_network_gatherv2.py
index cbbbf407fcaa58c085109e84949a85049be3fe92..176fa41622a8df44954e6233e78a6765750b2ce5 100644
--- a/test/test_npu/test_dynamic_ops/test_network_gatherv2.py
+++ b/test/test_npu/test_dynamic_ops/test_network_gatherv2.py
@@ -1,84 +1,84 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-import time
-import os
-import copy
-# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
-
-
-class GatherV2Net(torch.nn.Module):
-    def __init__(self):
-        super(GatherV2Net, self).__init__()
-
-    def forward(self, x, dim, idx):
-        if x.device == torch.device("cpu") and x.dtype == torch.float16:
-            x = x.to(torch.float32)
-        out = torch.index_select(x, dim, idx)
-        if x.device == torch.device("cpu") and x.dtype == torch.float16:
-            out = out.to(torch.float16)
-        return out
-
-
-class TestShape(TestCase):
-    def create_random_shape_tensor(self, item, min_value, max_value):
-        npu_format = item[0]
-        dtype = item[1]
-        dim = item[2]
-        shape = np.random.randint(1, 10, dim)
-        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
-        cpu_input = torch.from_numpy(input_tensor)
-        npu_input = torch.from_numpy(input_tensor).npu()
-        if npu_format not in (-1, 0):
-            npu_input = npu_input.npu_format_cast(npu_format)
-        return cpu_input, npu_input
-
-    def get_random_dim_index(self, cpu_tensor):
-        shape = list(cpu_tensor.shape)
-        dim = np.random.randint(0, len(shape))
-        the_size_of_dim = shape[dim]
-        indices = np.random.randint(0, the_size_of_dim, np.random.randint(the_size_of_dim), dtype=np.int64)
-        indices = torch.from_numpy(indices)
-        return dim, indices
-
-    def test_dynamic_threads_support_op(self, device):
-        format_list = [0]
-        dtype_list = [np.float32, np.float16]
-        dim_list = [3,4]
-        net = GatherV2Net()
-        net_npu = copy.deepcopy(net).to("npu")
-        items = [
-            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
-        ]
-        for item in items:
-            if item[0] == 29 and item[2] == 1:
-                continue
-            if item[0] in [0,3,29] and item[2] == 5:
-                continue
-            for _ in range(100):
-                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
-                dim, indices = self.get_random_dim_index(cpu_tensor)
-                cpu_output = net(cpu_tensor, dim, indices)
-                npu_output = net_npu(npu_tensor, dim, indices.to("npu"))
-                self.assertRtolEqual(cpu_output.to(npu_output.dtype).numpy(), npu_output.cpu().numpy())
-
-
-instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+import time
+import os
+import copy
+# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
+
+
+class GatherV2Net(torch.nn.Module):
+    def __init__(self):
+        super(GatherV2Net, self).__init__()
+
+    def forward(self, x, dim, idx):
+        if x.device == torch.device("cpu") and x.dtype == torch.float16:
+            x = x.to(torch.float32)
+        out = torch.index_select(x, dim, idx)
+        if x.device == torch.device("cpu") and x.dtype == torch.float16:
+            out = out.to(torch.float16)
+        return out
+
+
+class TestShape(TestCase):
+    def create_random_shape_tensor(self, item, min_value, max_value):
+        npu_format = item[0]
+        dtype = item[1]
+        dim = item[2]
+        shape = np.random.randint(1, 10, dim)
+        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
+        cpu_input = torch.from_numpy(input_tensor)
+        npu_input = torch.from_numpy(input_tensor).npu()
+        if npu_format not in (-1, 0):
+            npu_input = npu_input.npu_format_cast(npu_format)
+        return cpu_input, npu_input
+
+    def get_random_dim_index(self, cpu_tensor):
+        shape = list(cpu_tensor.shape)
+        dim = np.random.randint(0, len(shape))
+        the_size_of_dim = shape[dim]
+        indices = np.random.randint(0, the_size_of_dim, np.random.randint(the_size_of_dim), dtype=np.int64)
+        indices = torch.from_numpy(indices)
+        return dim, indices
+
+    def test_dynamic_threads_support_op(self, device):
+        format_list = [0]
+        dtype_list = [np.float32, np.float16]
+        dim_list = [3,4]
+        net = GatherV2Net()
+        net_npu = copy.deepcopy(net).to("npu")
+        items = [
+            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
+        ]
+        for item in items:
+            if item[0] == 29 and item[2] == 1:
+                continue
+            if item[0] in [0,3,29] and item[2] == 5:
+                continue
+            for _ in range(100):
+                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
+                dim, indices = self.get_random_dim_index(cpu_tensor)
+                cpu_output = net(cpu_tensor, dim, indices)
+                npu_output = net_npu(npu_tensor, dim, indices.to("npu"))
+                self.assertRtolEqual(cpu_output.to(npu_output.dtype).numpy(), npu_output.cpu().numpy())
+
+
+instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_dynamic_ops/test_network_repeat.py b/test/test_npu/test_dynamic_ops/test_network_repeat.py
index b135370e3c1425c121b2642c35ed0e4fcef6c07b..b60f9d8a5058b9a157a3b4329a9a6d2c2db72cad 100644
--- a/test/test_npu/test_dynamic_ops/test_network_repeat.py
+++ b/test/test_npu/test_dynamic_ops/test_network_repeat.py
@@ -1,83 +1,83 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-import time
-import os
-import copy
-# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
-
-
-class RepeatNet(torch.nn.Module):
-    def __init__(self):
-        super(RepeatNet, self).__init__()
-
-    def forward(self, x, size):
-        if x.device == torch.device("cpu") and x.dtype == torch.float16:
-            x = x.to(torch.float32)
-        out = x.repeat(size)
-        if x.device == torch.device("cpu") and x.dtype == torch.float16:
-            out = out.to(torch.float16)
-        return out
-
-
-
-class TestShape(TestCase):
-    def create_random_shape_tensor(self, item, min_value, max_value):
-        npu_format = item[0]
-        dtype = item[1]
-        dim = item[2]
-        shape = np.random.randint(1, 10, dim)
-        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
-        cpu_input = torch.from_numpy(input_tensor)
-        npu_input = torch.from_numpy(input_tensor).npu()
-        if npu_format not in (-1, 0):
-            npu_input = npu_input.npu_format_cast(npu_format)
-        return cpu_input, npu_input
-
-    def get_random_size(self, cpu_tensor):
-        shape = list(cpu_tensor.shape)
-        if np.random.rand()<0.4:
-            sizes = np.random.randint(2, 5,len(shape)+int(np.random.randint(1, 3)))
-        else:
-            sizes = np.random.randint(2, 5, len(shape))
-        return list(sizes)
-
-    def test_dynamic_threads_support_op(self, device):
-        format_list = [0]
-        dtype_list = [np.float32, np.float16]
-        dim_list = [3, 4]
-        net = RepeatNet()
-        net_npu = copy.deepcopy(net).to("npu")
-        items = [
-            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
-        ]
-        for item in items:
-            if item[0] == 29 and item[2] == 1:
-                continue
-            for _ in range(10):
-                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
-                sizes = self.get_random_size(cpu_tensor)
-                cpu_output = net(cpu_tensor, sizes)
-                npu_output = net_npu(npu_tensor, sizes)
-                self.assertRtolEqual(cpu_output.to(npu_output.dtype).numpy(), npu_output.cpu().numpy())
-
-
-instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+import time
+import os
+import copy
+# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
+
+
+class RepeatNet(torch.nn.Module):
+    def __init__(self):
+        super(RepeatNet, self).__init__()
+
+    def forward(self, x, size):
+        if x.device == torch.device("cpu") and x.dtype == torch.float16:
+            x = x.to(torch.float32)
+        out = x.repeat(size)
+        if x.device == torch.device("cpu") and x.dtype == torch.float16:
+            out = out.to(torch.float16)
+        return out
+
+
+
+class TestShape(TestCase):
+    def create_random_shape_tensor(self, item, min_value, max_value):
+        npu_format = item[0]
+        dtype = item[1]
+        dim = item[2]
+        shape = np.random.randint(1, 10, dim)
+        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
+        cpu_input = torch.from_numpy(input_tensor)
+        npu_input = torch.from_numpy(input_tensor).npu()
+        if npu_format not in (-1, 0):
+            npu_input = npu_input.npu_format_cast(npu_format)
+        return cpu_input, npu_input
+
+    def get_random_size(self, cpu_tensor):
+        shape = list(cpu_tensor.shape)
+        if np.random.rand()<0.4:
+            sizes = np.random.randint(2, 5,len(shape)+int(np.random.randint(1, 3)))
+        else:
+            sizes = np.random.randint(2, 5, len(shape))
+        return list(sizes)
+
+    def test_dynamic_threads_support_op(self, device):
+        format_list = [0]
+        dtype_list = [np.float32, np.float16]
+        dim_list = [3, 4]
+        net = RepeatNet()
+        net_npu = copy.deepcopy(net).to("npu")
+        items = [
+            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
+        ]
+        for item in items:
+            if item[0] == 29 and item[2] == 1:
+                continue
+            for _ in range(10):
+                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
+                sizes = self.get_random_size(cpu_tensor)
+                cpu_output = net(cpu_tensor, sizes)
+                npu_output = net_npu(npu_tensor, sizes)
+                self.assertRtolEqual(cpu_output.to(npu_output.dtype).numpy(), npu_output.cpu().numpy())
+
+
+instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_dynamic_ops/test_network_topK.py b/test/test_npu/test_dynamic_ops/test_network_topK.py
index 79ad2f3b5f0c7941d48458e7c646afde9cfc5de6..614cd3592cab9988b1c55b8257050204a89a2d80 100644
--- a/test/test_npu/test_dynamic_ops/test_network_topK.py
+++ b/test/test_npu/test_dynamic_ops/test_network_topK.py
@@ -1,76 +1,76 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-import time
-import os
-import copy
-# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
-
-
-class TopkNet(torch.nn.Module):
-    def __init__(self):
-        super(TopkNet, self).__init__()
-
-    def forward(self, x, k):
-        if x.device == torch.device("cpu") and x.dtype == torch.float16:
-            x = x.to(torch.float32)
-        out = torch.topk(x, k)
-        if x.device == torch.device("cpu") and x.dtype == torch.float16:
-            out = out.to(torch.float16)
-        return out
-
-
-class TestShape(TestCase):
-    def create_random_shape_tensor(self, item, min_value, max_value):
-        npu_format = item[0]
-        dtype = item[1]
-        dim = item[2]
-        shape = np.random.randint(5, 10, dim)
-        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
-        cpu_input = torch.from_numpy(input_tensor)
-        npu_input = torch.from_numpy(input_tensor).npu()
-        if npu_format not in (-1, 0):
-            npu_input = npu_input.npu_format_cast(npu_format)
-        return cpu_input, npu_input
-
-    def test_dynamic_threads_support_op(self, device):
-        format_list = [0]
-        dtype_list = [np.float16]
-        dim_list = [3, 4]
-        net = TopkNet()
-        net_npu = copy.deepcopy(net).to("npu")
-        items = [
-            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
-        ]
-        for item in items:
-            if item[0] == 29 and item[2] == 1:
-                continue
-            for _ in range(100):
-                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
-                k = 5
-                cpu_output,cpu_indice = net(cpu_tensor, k)
-                npu_output,npu_indice = net_npu(npu_tensor, k)
-                self.assertRtolEqual(cpu_output.to(npu_output.dtype).numpy(), npu_output.cpu().numpy())
-                self.assertRtolEqual(cpu_indice.to(torch.int32).numpy(), npu_indice.to(torch.int32).cpu().numpy())
-               
-
-
-instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+import time
+import os
+import copy
+# Need export DYNAMIC_COMPILE_ENABLE=1 and export EXPERIMENTAL_DYNAMIC_PARTITION=1
+
+
+class TopkNet(torch.nn.Module):
+    def __init__(self):
+        super(TopkNet, self).__init__()
+
+    def forward(self, x, k):
+        if x.device == torch.device("cpu") and x.dtype == torch.float16:
+            x = x.to(torch.float32)
+        out = torch.topk(x, k)
+        if x.device == torch.device("cpu") and x.dtype == torch.float16:
+            out = out.to(torch.float16)
+        return out
+
+
+class TestShape(TestCase):
+    def create_random_shape_tensor(self, item, min_value, max_value):
+        npu_format = item[0]
+        dtype = item[1]
+        dim = item[2]
+        shape = np.random.randint(5, 10, dim)
+        input_tensor = np.random.uniform(min_value, max_value, shape).astype(dtype)
+        cpu_input = torch.from_numpy(input_tensor)
+        npu_input = torch.from_numpy(input_tensor).npu()
+        if npu_format not in (-1, 0):
+            npu_input = npu_input.npu_format_cast(npu_format)
+        return cpu_input, npu_input
+
+    def test_dynamic_threads_support_op(self, device):
+        format_list = [0]
+        dtype_list = [np.float16]
+        dim_list = [3, 4]
+        net = TopkNet()
+        net_npu = copy.deepcopy(net).to("npu")
+        items = [
+            [i, j, k] for i in format_list for j in dtype_list for k in dim_list
+        ]
+        for item in items:
+            if item[0] == 29 and item[2] == 1:
+                continue
+            for _ in range(100):
+                cpu_tensor, npu_tensor = self.create_random_shape_tensor(item, -10, 10)
+                k = 5
+                cpu_output,cpu_indice = net(cpu_tensor, k)
+                npu_output,npu_indice = net_npu(npu_tensor, k)
+                self.assertRtolEqual(cpu_output.to(npu_output.dtype).numpy(), npu_output.cpu().numpy())
+                self.assertRtolEqual(cpu_indice.to(torch.int32).numpy(), npu_indice.to(torch.int32).cpu().numpy())
+               
+
+
+instantiate_device_type_tests(TestShape, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_fill_.py b/test/test_npu/test_fill_.py
index 4c609713b99fcc8f51262618d0b3619fcc90ed53..9dbb022f092f21c2b41c1f5631eb59eeb82ddfa1 100644
--- a/test/test_npu/test_fill_.py
+++ b/test/test_npu/test_fill_.py
@@ -1,129 +1,129 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestFill(TestCase):
-
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-
-        return npu_input1, npu_input2
-
-    def generate_single_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.fill_(input1, input2).numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.fill_(input1, input2)
-        output = output.to("cpu").numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input1, input2):
-        input1 = input1.to("npu")
-        output = torch.fill_(input1, input2)
-        output = output.to("cpu").numpy()
-        return output
-
-
-    def test_fill_scalar_int32(self, device):
-        npu_input1, _ = self.generate_data(0, 100, (2, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, 1)
-        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_fill_scalar_float16(self, device):
-        npu_input1, _ = self.generate_data(0, 100, (2, 3), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, 1)
-        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_fill_scalar_float32(self, device):
-        npu_input1, _ = self.generate_data(0, 100, (2, 3), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, 1)
-        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-    def test_fill_common_shape_format(self, device):
-        shape_format = [
-            [np.float32, -1, (4, 3)],
-            [np.int32, -1, (2, 3)],
-            [np.int32, -1, (4, 3, 1)],
-            [np.float16,-1,(65535, 1)],
-            [np.float16, -1, (1, 8192)],
-            [np.float16, -1, (1, 16384)],
-            [np.float16, -1, (1, 32768)],
-            [np.float16, -1, ( 1, 131072)],
-            [np.float16, -1, (1, 196608)],
-            [np.float16, -1, (1, 262144)],
-            [np.float16, -1, (1, 393216)],
-            [np.float16, -1, (1, 524288)],
-            [np.float16, -1, (1, 655360)],
-            [np.float16, -1, (1, 786432)],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, 1)
-            npu_output = self.npu_op_exec_scalar(npu_input1, 1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_fill_float32_data_range(self, device):
-        data_range = [
-            [-1.1754943508e-38, -1.1754943508e-38],
-            [-3402823500.0, 3402823500.0],
-            [-0.000030517578125, 0.000030517578125],
-            [3402823500, 3402800000],
-            [-9.313225746154785e-10, 9.313225746154785e-10],
-            [-3402823500.0, -3402823500.0],
-            [-3402823500.0, 3402823500.0],
-            [-9.313225746154785e-10, 9.313225746154785e-10],
-            [-3402823500.0,-3402823500.0],
-            [-0.000000000000000000000000000000000000011754943508, 0.000000000000000000000000000000000000011754943508],
-            [0.000000000000000000000000000000000000011754943508, 0.000000000000000000000000000000000000011754943508],
-            [-0.000000000000000000000000000000000000011754943508, -0.000000000000000000000000000000000000011754943508],
-            [-0.000000000000000000000000000000000000011754943508, 0.000000000000000000000000000000000000011754943508]
-        ]
-        for item in data_range:
-            cpu_input1, npu_input1 = create_common_tensor([np.float32, - 1, (1, 31, 149, 2)], item[0], item[1])
-            cpu_output = self.cpu_op_exec(cpu_input1, 1)
-            npu_output = self.npu_op_exec_scalar(npu_input1, 1)
-            self.assertRtolEqual(cpu_output, npu_output)
-            print("float32 run")
-
-instantiate_device_type_tests(TestFill, globals(), except_for='cpu')
-if __name__ == '__main__':
-    torch.npu.set_device("npu:7")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestFill(TestCase):
+
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+
+        return npu_input1, npu_input2
+
+    def generate_single_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.fill_(input1, input2).numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.fill_(input1, input2)
+        output = output.to("cpu").numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        input1 = input1.to("npu")
+        output = torch.fill_(input1, input2)
+        output = output.to("cpu").numpy()
+        return output
+
+
+    def test_fill_scalar_int32(self, device):
+        npu_input1, _ = self.generate_data(0, 100, (2, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, 1)
+        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_fill_scalar_float16(self, device):
+        npu_input1, _ = self.generate_data(0, 100, (2, 3), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, 1)
+        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_fill_scalar_float32(self, device):
+        npu_input1, _ = self.generate_data(0, 100, (2, 3), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, 1)
+        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+    def test_fill_common_shape_format(self, device):
+        shape_format = [
+            [np.float32, -1, (4, 3)],
+            [np.int32, -1, (2, 3)],
+            [np.int32, -1, (4, 3, 1)],
+            [np.float16,-1,(65535, 1)],
+            [np.float16, -1, (1, 8192)],
+            [np.float16, -1, (1, 16384)],
+            [np.float16, -1, (1, 32768)],
+            [np.float16, -1, ( 1, 131072)],
+            [np.float16, -1, (1, 196608)],
+            [np.float16, -1, (1, 262144)],
+            [np.float16, -1, (1, 393216)],
+            [np.float16, -1, (1, 524288)],
+            [np.float16, -1, (1, 655360)],
+            [np.float16, -1, (1, 786432)],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, 1)
+            npu_output = self.npu_op_exec_scalar(npu_input1, 1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_fill_float32_data_range(self, device):
+        data_range = [
+            [-1.1754943508e-38, -1.1754943508e-38],
+            [-3402823500.0, 3402823500.0],
+            [-0.000030517578125, 0.000030517578125],
+            [3402823500, 3402800000],
+            [-9.313225746154785e-10, 9.313225746154785e-10],
+            [-3402823500.0, -3402823500.0],
+            [-3402823500.0, 3402823500.0],
+            [-9.313225746154785e-10, 9.313225746154785e-10],
+            [-3402823500.0,-3402823500.0],
+            [-0.000000000000000000000000000000000000011754943508, 0.000000000000000000000000000000000000011754943508],
+            [0.000000000000000000000000000000000000011754943508, 0.000000000000000000000000000000000000011754943508],
+            [-0.000000000000000000000000000000000000011754943508, -0.000000000000000000000000000000000000011754943508],
+            [-0.000000000000000000000000000000000000011754943508, 0.000000000000000000000000000000000000011754943508]
+        ]
+        for item in data_range:
+            cpu_input1, npu_input1 = create_common_tensor([np.float32, - 1, (1, 31, 149, 2)], item[0], item[1])
+            cpu_output = self.cpu_op_exec(cpu_input1, 1)
+            npu_output = self.npu_op_exec_scalar(npu_input1, 1)
+            self.assertRtolEqual(cpu_output, npu_output)
+            print("float32 run")
+
+instantiate_device_type_tests(TestFill, globals(), except_for='cpu')
+if __name__ == '__main__':
+    torch.npu.set_device("npu:7")
+    run_tests()
diff --git a/test/test_npu/test_fmod.py b/test/test_npu/test_fmod.py
index 5683a74794c8f1b4c17aeb4d38d1ed41dc95f827..9732571e0264e7d9c31a7860ae641ede7c7b656d 100644
--- a/test/test_npu/test_fmod.py
+++ b/test/test_npu/test_fmod.py
@@ -1,120 +1,120 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestFmod(TestCase):
-
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-
-        return npu_input1, npu_input2
-
-    def generate_single_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def generate_scalar(self, min_d, max_d):
-        scalar = np.random.uniform(min_d, max_d)
-        return scalar
-
-    def generate_int_scalar(self, min_d, max_d):
-        scalar = np.random.randint(min_d, max_d)
-        return scalar
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.fmod(input1, input2)
-        # output = torch.fmod(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.fmod(input1, input2)
-        # output = torch.fmod(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_tensor_need_to_npu(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.fmod(input1, input2)
-        # output = torch.fmod(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input1, input2):
-        input1 = input1.to("npu")
-        # output = input1 + input2
-        output = torch.fmod(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, input3):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = input3.to("npu")
-        torch.fmod(input1, input2, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-            
-    def test_fmod_scalar_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 3), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, 1)
-        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_fmod_uncontiguous_float32_scalar(self, device):
-        def cpu_uncontiguous_op_exec_scalar(input1, input2):
-            input1 = input1.as_strided([2, 2], [1, 2], 1)
-            output = torch.fmod(input1, input2)
-            output = output.numpy()
-            return output
-
-        def npu_uncontiguous_op_exec_scalar(input1, input2):
-            input1 = input1.to("cpu")
-            input1 = input1.as_strided([2, 2], [1, 2], 1)
-            output = torch.fmod(input1, input2)
-            output = output.to("cpu")
-            output = output.numpy()
-            return output
-
-        npu_input1, npu_input2 = self.generate_data(0, 100, (4, 3), np.float32)
-        cpu_input1 = copy.deepcopy(npu_input1)
-        cpu_output = cpu_uncontiguous_op_exec_scalar(cpu_input1, 2)
-        npu_output = npu_uncontiguous_op_exec_scalar(npu_input1, 2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestFmod, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:7")
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestFmod(TestCase):
+
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+
+        return npu_input1, npu_input2
+
+    def generate_single_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def generate_scalar(self, min_d, max_d):
+        scalar = np.random.uniform(min_d, max_d)
+        return scalar
+
+    def generate_int_scalar(self, min_d, max_d):
+        scalar = np.random.randint(min_d, max_d)
+        return scalar
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.fmod(input1, input2)
+        # output = torch.fmod(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.fmod(input1, input2)
+        # output = torch.fmod(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_tensor_need_to_npu(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.fmod(input1, input2)
+        # output = torch.fmod(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        input1 = input1.to("npu")
+        # output = input1 + input2
+        output = torch.fmod(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, input3):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = input3.to("npu")
+        torch.fmod(input1, input2, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+            
+    def test_fmod_scalar_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 3), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, 1)
+        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_fmod_uncontiguous_float32_scalar(self, device):
+        def cpu_uncontiguous_op_exec_scalar(input1, input2):
+            input1 = input1.as_strided([2, 2], [1, 2], 1)
+            output = torch.fmod(input1, input2)
+            output = output.numpy()
+            return output
+
+        def npu_uncontiguous_op_exec_scalar(input1, input2):
+            input1 = input1.to("cpu")
+            input1 = input1.as_strided([2, 2], [1, 2], 1)
+            output = torch.fmod(input1, input2)
+            output = output.to("cpu")
+            output = output.numpy()
+            return output
+
+        npu_input1, npu_input2 = self.generate_data(0, 100, (4, 3), np.float32)
+        cpu_input1 = copy.deepcopy(npu_input1)
+        cpu_output = cpu_uncontiguous_op_exec_scalar(cpu_input1, 2)
+        npu_output = npu_uncontiguous_op_exec_scalar(npu_input1, 2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestFmod, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:7")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_hardtanh_backward.py b/test/test_npu/test_hardtanh_backward.py
index 1e36d1876fa8cddaaea97451f094fe8d28b8784b..fc5e1b7c288299b6bcc421f99a88e0aa31020718 100644
--- a/test/test_npu/test_hardtanh_backward.py
+++ b/test/test_npu/test_hardtanh_backward.py
@@ -1,65 +1,65 @@
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestHardtanhBackward(TestCase):
-
-    def cpu_op_exec(self, input_x, min_val, max_val):
-        input_x.requires_grad_(True)
-        m = torch.nn.Hardtanh(min_val, max_val)
-        output = m(input_x)
-        w = torch.ones_like(output)
-        output.backward(w)
-        out = input_x.grad
-        return out
-
-    def npu_op_exec(self, input_x, min_val, max_val):
-        input_x.requires_grad_(True)
-        m = torch.nn.Hardtanh(min_val, max_val)
-        output = m(input_x)
-        w = torch.ones_like(output)
-        w = w.to("npu")
-        output.backward(w)
-        out = input_x.grad.to('cpu')
-        return out
-
-    def test_hardtanh_backwardfloat32(self, device):
-        shape_format = [
-            [[np.float32, 0, (10, 10)], -1, 1], [[np.float32, 0, (5, 6, 7)], -1, 1], 
-            [[np.float32, -1, (6, 6, 6)], -1, 3], [[np.float32, 3, (8, 6, 4)], -2, 2], 
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], -2, 2)
-            cpu_output = self.cpu_op_exec(cpu_input, item[1], item[2])
-            npu_output = self.npu_op_exec(npu_input, item[1], item[2])
-            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
-
-    def test_hardtanh_backwardfloat16(self, device):
-        shape_format = [
-            [[np.float16, 0, (10, 10, 10)], -1, 1], [[np.float16, 0, (7, 7, 7)], -1, 1], 
-            [[np.float16, -1, (6, 6, 6, 6)], -3, 1], [[np.float16, 3, (10, 10, 10, 10)], -1, 3],
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], -2, 2)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input, item[1], item[2])
-            npu_output = self.npu_op_exec(npu_input, item[1], item[2])
-            cpu_output = cpu_output.to(torch.float16)
-            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
-
-instantiate_device_type_tests(TestHardtanhBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestHardtanhBackward(TestCase):
+
+    def cpu_op_exec(self, input_x, min_val, max_val):
+        input_x.requires_grad_(True)
+        m = torch.nn.Hardtanh(min_val, max_val)
+        output = m(input_x)
+        w = torch.ones_like(output)
+        output.backward(w)
+        out = input_x.grad
+        return out
+
+    def npu_op_exec(self, input_x, min_val, max_val):
+        input_x.requires_grad_(True)
+        m = torch.nn.Hardtanh(min_val, max_val)
+        output = m(input_x)
+        w = torch.ones_like(output)
+        w = w.to("npu")
+        output.backward(w)
+        out = input_x.grad.to('cpu')
+        return out
+
+    def test_hardtanh_backwardfloat32(self, device):
+        shape_format = [
+            [[np.float32, 0, (10, 10)], -1, 1], [[np.float32, 0, (5, 6, 7)], -1, 1], 
+            [[np.float32, -1, (6, 6, 6)], -1, 3], [[np.float32, 3, (8, 6, 4)], -2, 2], 
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], -2, 2)
+            cpu_output = self.cpu_op_exec(cpu_input, item[1], item[2])
+            npu_output = self.npu_op_exec(npu_input, item[1], item[2])
+            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+
+    def test_hardtanh_backwardfloat16(self, device):
+        shape_format = [
+            [[np.float16, 0, (10, 10, 10)], -1, 1], [[np.float16, 0, (7, 7, 7)], -1, 1], 
+            [[np.float16, -1, (6, 6, 6, 6)], -3, 1], [[np.float16, 3, (10, 10, 10, 10)], -1, 3],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], -2, 2)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input, item[1], item[2])
+            npu_output = self.npu_op_exec(npu_input, item[1], item[2])
+            cpu_output = cpu_output.to(torch.float16)
+            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+
+instantiate_device_type_tests(TestHardtanhBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    torch.npu.set_device("npu:5")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_isfinite.py b/test/test_npu/test_isfinite.py
index 8e948c199e11855ff36f2fd3d702f522b9e239b4..18a2debae0060e5a2a7bef6f077e7e204d9ec08f 100644
--- a/test/test_npu/test_isfinite.py
+++ b/test/test_npu/test_isfinite.py
@@ -1,70 +1,70 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION.
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestIsfinite(TestCase):
-
-    def generate_data(self, minValue, maxValue, shape, dtype):
-        input1 = np.random.uniform(minValue, maxValue, shape).astype(dtype)
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        return npu_input1
-
-
-    def cpu_op_exec(self, input1):
-        output = torch.isfinite(input1);
-        output = output.numpy()
-        return output
-
-
-    def npu_op_exec(self, input1):
-        input1 = input1.to("npu")
-        output = torch.isfinite(input1);
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-
-    def test_isfinite_common_shape_format(self, device):
-        shape_format = [
-            [[np.bool, -1, (4, 3, 1)]],
-            [[np.int32, -1, (4, 3, 1)]],
-            [[np.int8, -1, (2, 3)]],
-            [[np.int16, -1, (2, 3)]],
-            [[np.int64, -1, (2, 3)]],
-            [[np.float32, -1, (4, 3, 1)]],
-            [[np.float64, -1, (4, 3, 1)]],
-            [[np.uint8, -1, (4, 3, 1)]]
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestIsfinite, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:3")
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestIsfinite(TestCase):
+
+    def generate_data(self, minValue, maxValue, shape, dtype):
+        input1 = np.random.uniform(minValue, maxValue, shape).astype(dtype)
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        return npu_input1
+
+
+    def cpu_op_exec(self, input1):
+        output = torch.isfinite(input1);
+        output = output.numpy()
+        return output
+
+
+    def npu_op_exec(self, input1):
+        input1 = input1.to("npu")
+        output = torch.isfinite(input1);
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+
+    def test_isfinite_common_shape_format(self, device):
+        shape_format = [
+            [[np.bool, -1, (4, 3, 1)]],
+            [[np.int32, -1, (4, 3, 1)]],
+            [[np.int8, -1, (2, 3)]],
+            [[np.int16, -1, (2, 3)]],
+            [[np.int64, -1, (2, 3)]],
+            [[np.float32, -1, (4, 3, 1)]],
+            [[np.float64, -1, (4, 3, 1)]],
+            [[np.uint8, -1, (4, 3, 1)]]
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestIsfinite, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:3")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_le.py b/test/test_npu/test_le.py
index 293d23b63f92e93e1ba78ec531c2aade385e4ca0..2eb4be77fe1ddd484897582acb4f4036fe1ff293 100644
--- a/test/test_npu/test_le.py
+++ b/test/test_npu/test_le.py
@@ -1,106 +1,106 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestLe(TestCase):
-
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-
-        return npu_input1, npu_input2
-
-    def generate_single_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.le(input1, input2)  
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.le(input1, input2) 
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input1, input2):
-        input1 = input1.to("npu")
-        output = torch.le(input1,input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-
-    def test_le_float16(self, device):
-        def cpu_op_exec_fp16(input1, input2):   
-            input1 = input1.to(torch.float32)
-            input2 = input2.to(torch.float32)
-            output = torch.le(input1, input2)
-            output = output.numpy()
-            return output
-        npu_input1, npu_input2 = self.generate_data(0, 100, (5, 3), np.float16)
-        cpu_output = cpu_op_exec_fp16(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-    def test_le_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (4, 3), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-
-    def test_le_float32_broadcast(self, device):
-        npu_input1 = self.generate_single_data(0, 100, (4, 3, 1), np.float32)
-        npu_input2 = self.generate_single_data(0, 100, (4, 1, 5), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-    def test_less_scalar_float32(self, device):
-        npu_input1, _= self.generate_data(0, 100, (2,3), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, 1)
-        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-    def test_le_int32(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestLe, globals(), except_for='cpu')
-if __name__ == '__main__':
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestLe(TestCase):
+
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+
+        return npu_input1, npu_input2
+
+    def generate_single_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.le(input1, input2)  
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.le(input1, input2) 
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        input1 = input1.to("npu")
+        output = torch.le(input1,input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+
+    def test_le_float16(self, device):
+        def cpu_op_exec_fp16(input1, input2):   
+            input1 = input1.to(torch.float32)
+            input2 = input2.to(torch.float32)
+            output = torch.le(input1, input2)
+            output = output.numpy()
+            return output
+        npu_input1, npu_input2 = self.generate_data(0, 100, (5, 3), np.float16)
+        cpu_output = cpu_op_exec_fp16(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+    def test_le_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (4, 3), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+
+    def test_le_float32_broadcast(self, device):
+        npu_input1 = self.generate_single_data(0, 100, (4, 3, 1), np.float32)
+        npu_input2 = self.generate_single_data(0, 100, (4, 1, 5), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+    def test_less_scalar_float32(self, device):
+        npu_input1, _= self.generate_data(0, 100, (2,3), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, 1)
+        npu_output = self.npu_op_exec_scalar(npu_input1, 1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+    def test_le_int32(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestLe, globals(), except_for='cpu')
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_npu/test_log2.py b/test/test_npu/test_log2.py
index 6eb03210b6689527445364d7760b003fcf65206c..e6e34271e56cfe0f29536ff10bdfa514276cef8d 100644
--- a/test/test_npu/test_log2.py
+++ b/test/test_npu/test_log2.py
@@ -1,101 +1,101 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestLog2(TestCase):
-    
-    def cpu_op_exec(self,input1):
-        output = torch.log2(input1)
-        output = output.numpy()
-        return output
-        
-    def npu_op_exec(self,input1):
-        output = torch.log2(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-        
-    def cpu_op_exec_(self,input1):
-        output = torch.log2_(input1)
-        output = input1.numpy()
-        return output
-            
-    def npu_op_exec_(self,input1):
-        output = torch.log2_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def cpu_op_exec_out(self,input1,cpu_out):
-        output = torch.log2(input1, out = cpu_out)
-        output = cpu_out.numpy()
-        return output
-        
-    def npu_op_exec_out(self,input1,npu_out):
-        output = torch.log2(input1, out = npu_out)
-        output = npu_out.to("cpu")
-        output = output.numpy()
-        return output
-        
-    def test_log2_float32_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32, -1, (1)]], 
-                [[np.float32, -1, (4, 23)]],
-                [[np.float32, -1, (2, 3)]],
-                [[np.float32, -1, (12, 23)]]
-        ]
-        for item in shape_format:            
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-            
-    def test_log2_float321_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32, -1, (3)]], 
-                [[np.float32, -1, (4, 3)]],
-                [[np.float32, -1, (12, 32)]],
-                [[np.float32, -1, (22, 38)]]
-        ]
-        for item in shape_format:       
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_exec_(cpu_input1)
-            npu_output = self.npu_op_exec_(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-	
-    def test_log2_out_float32_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32, -1, (4)]], 
-                [[np.float32, -1, (4, 1, 5)]],
-                [[np.float32, -1, (2, 3, 8)]],
-                [[np.float32, -1, (2, 13, 56)]]
-        ]
-        for item in shape_format:          
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_out, npu_out = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_exec_out(cpu_input1,cpu_out)
-            npu_output = self.npu_op_exec_out(npu_input1,npu_out)
-            self.assertRtolEqual(cpu_output, npu_output)
-  
-instantiate_device_type_tests(TestLog2, globals(), except_for="cpu")
-
-if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestLog2(TestCase):
+    
+    def cpu_op_exec(self,input1):
+        output = torch.log2(input1)
+        output = output.numpy()
+        return output
+        
+    def npu_op_exec(self,input1):
+        output = torch.log2(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+        
+    def cpu_op_exec_(self,input1):
+        output = torch.log2_(input1)
+        output = input1.numpy()
+        return output
+            
+    def npu_op_exec_(self,input1):
+        output = torch.log2_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def cpu_op_exec_out(self,input1,cpu_out):
+        output = torch.log2(input1, out = cpu_out)
+        output = cpu_out.numpy()
+        return output
+        
+    def npu_op_exec_out(self,input1,npu_out):
+        output = torch.log2(input1, out = npu_out)
+        output = npu_out.to("cpu")
+        output = output.numpy()
+        return output
+        
+    def test_log2_float32_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32, -1, (1)]], 
+                [[np.float32, -1, (4, 23)]],
+                [[np.float32, -1, (2, 3)]],
+                [[np.float32, -1, (12, 23)]]
+        ]
+        for item in shape_format:            
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+            
+    def test_log2_float321_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32, -1, (3)]], 
+                [[np.float32, -1, (4, 3)]],
+                [[np.float32, -1, (12, 32)]],
+                [[np.float32, -1, (22, 38)]]
+        ]
+        for item in shape_format:       
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec_(cpu_input1)
+            npu_output = self.npu_op_exec_(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+	
+    def test_log2_out_float32_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32, -1, (4)]], 
+                [[np.float32, -1, (4, 1, 5)]],
+                [[np.float32, -1, (2, 3, 8)]],
+                [[np.float32, -1, (2, 13, 56)]]
+        ]
+        for item in shape_format:          
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_out, npu_out = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec_out(cpu_input1,cpu_out)
+            npu_output = self.npu_op_exec_out(npu_input1,npu_out)
+            self.assertRtolEqual(cpu_output, npu_output)
+  
+instantiate_device_type_tests(TestLog2, globals(), except_for="cpu")
+
+if __name__ == "__main__":
+    torch.npu.set_device("npu:6")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_logdet.py b/test/test_npu/test_logdet.py
index 39a0757fae6de337d790fc12e506113388387701..a8b3e203cc45b5196be1f5ee93b3d3abc54327ee 100644
--- a/test/test_npu/test_logdet.py
+++ b/test/test_npu/test_logdet.py
@@ -1,186 +1,186 @@
-
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLogDet(TestCase):
-
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-
-        return npu_input1, npu_input2
-
-    def generate_single_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def generate_three_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input3 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-        npu_input3 = torch.from_numpy(input3)
-
-        return npu_input1, npu_input2, npu_input3
-
-    def generate_scalar(self, min_d, max_d):
-        scalar = np.random.uniform(min_d, max_d)
-        return scalar
-
-    def generate_int_scalar(self, min_d, max_d):
-        scalar = np.random.randint(min_d, max_d)
-        return scalar
-
-    def cpu_op_exec(self, input1):
-        output = torch.logdet(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.logdet(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_tensor_need_to_npu(self, input1):
-        input1 = input1.to("npu")
-        output = torch.logdet(input1)
-
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def register_tensor(self, item,min_val,max_val):
-        res = []
-        cpu_input, npu_input = create_common_tensor(item, min_val , max_val)
-        det_result = torch.det(cpu_input)
-        for i in range(len(det_result)):
-            if det_result[i] > 0:
-                res.append(cpu_input[i])
-        res = torch.stack(res)
-        return res, len(res)
-
-    def register_tensor_fp16(self, item,min_val,max_val):
-        res = []
-        cpu_input, npu_input = create_common_tensor(item,  min_val , max_val)
-        cpu_input_tmp = cpu_input.to(torch.float32)
-        det_result = torch.det(cpu_input_tmp)
-        for i in range(len(det_result)):
-            if det_result[i] > 0:
-                res.append(cpu_input[i])
-        res = torch.stack(res)
-        return res, len(res)
-
-    def create_det_tensor(self, input_tensor):
-        cpu_input = input_tensor
-        npu_input = input_tensor.to("npu")
-        return cpu_input, npu_input
-
-    def test_logdet_common_shape_format(self, device):
-        shape_format = [
-            [[np.float32, -1, (6, 2, 2)], 100, 200],
-            [[np.float32, -1, (24, 5, 5)], 100, 200],
-            [[np.float32, -1, (14, 5, 5)], 21, 22],
-            [[np.float32, -1, (74,4,4)], 21205, 22225],
-            [[np.float32, -1, (58,4,4)], -30,30],
-            [[np.float32, -1, (30,16,16)], -30,30],
-            [[np.float32, -1, (58, 4, 4)], 0.3219780311757745 , 92],
-            [[np.float32, -1, (32, 16, 16)], 0.4820305734500543 , 28],
-            [[np.float32, -1, (28, 8, 8)], 0.8563874665918477 , 98],
-            [[np.float32, -1, (42, 6, 6)], 0.0694198357720135 , 50],
-            [[np.float32, -1, (12, 10, 10)], 0.3316939248453338 , 17],
-            [[np.float32, -1, (6, 10, 10)], 0.6447298684351989 , 95],
-            [[np.float32, -1, (6, 9, 9)],0.8723538084975545 , 85],
-            [[np.float32, -1, (10, 5, 5)], 0.8283759153463854 , 71],
-            [[np.float32, -1, (10, 1, 1)], 0.24718684227306953 , 1],
-            [[np.float32, -1, (6,1,1)], 0.0694198357720135, 0.24718684227306953],
-            [[np.float32, -1, (8, 10, 10)], 0.7866457165672994 , 5],
-            [[np.float32, -1, (6, 14, 14)], 0.9956475043306917 , 28],
-            [[np.float32, -1, (6, 7, 7)],0.3793216987112159 , 39],
-            [[np.float32, -1, (14, 10, 10)], 0.769565434387681 , 9],
-            [[np.float32, -1, (16, 10, 10)], 0.8039978883789274 , 22],
-            [[np.float32, -1, (30, 3, 3)], 0.03133650248813469 , 37],
-            [[np.float32, -1, (4, 1, 1)], 0.853775978441379 , 34 ],
-            [[np.float32, -1, (18, 6, 6)], 0.503285855595573 , 35],
-            [[np.float32, -1, (6, 3, 3)], 1, 10],
-        ]
-        for item in shape_format:
-            input_shape = item[0][2]
-            res, tmp_shape0 = self.register_tensor(item[0],item[1],item[2])
-            cpu_input1, npu_input1 = self.create_det_tensor(res)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            input_shape = list(input_shape)
-            input_shape[0] = tmp_shape0
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_logdet_float16_shape_format(self, device):
-        def cpu_op_exec_fp16(input1):
-            input1 = input1.to(torch.float32)
-            output = torch.logdet(input1)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        shape_format = [
-            [[np.float16, -1, (9, 5, 5)],-2,2],
-            [[np.float16, -1, (60,4,4)],-10,12],
-            [[np.float16, -1, (12,5,5)], 5,10],
-            [[np.float16, -1, (14, 5, 5)], 0.9283381566708346 , 10],
-            [[np.float16, -1, (71, 2, 2)], 0.6234465730020081 , 13],
-            [[np.float16, -1, (10, 5, 5)], 0.7440899332166594 , 1],
-            [[np.float16, -1, (13, 5, 5)], 0.9790231845699171 , 9],
-            [[np.float16, -1, (10, 7, 7)], 0.7852605507867441 , 8],
-            [[np.float16, -1, (18, 2, 2)], 0.8758750778305631 , 9],
-            [[np.float16, -1, (10, 6, 6)], 0.7570129172808612 , 5],
-            [[np.float16, -1, (7, 7, 7)], 0 , 2],
-            [[np.float16, -1, (9, 5, 5)], 1 , 2],
-            [[np.float16, -1, (12, 4, 4)], 0.7349293532899402 , 19],
-            [[np.float16, -1, (15, 8, 8)], 0.9583309378850908 , 3],
-            [[np.float16, -1, (11, 2, 2)],0.3560076034004038 , 25],
-        ]
-
-        for item in shape_format:
-            input_shape = item[0][2]
-            res, tmp_shape0 = self.register_tensor_fp16(item[0],item[1],item[2])
-            cpu_input1,npu_input1 = self.create_det_tensor(res)
-            cpu_output = cpu_op_exec_fp16(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            input_shape = list(input_shape)
-            input_shape[0] = tmp_shape0
-            self.assertRtolEqual(cpu_output, npu_output,prec=1e-3)
-
-
-instantiate_device_type_tests(TestLogDet, globals(), except_for='cpu')
-
-if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
-    run_tests()
+
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLogDet(TestCase):
+
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+
+        return npu_input1, npu_input2
+
+    def generate_single_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def generate_three_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input3 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+        npu_input3 = torch.from_numpy(input3)
+
+        return npu_input1, npu_input2, npu_input3
+
+    def generate_scalar(self, min_d, max_d):
+        scalar = np.random.uniform(min_d, max_d)
+        return scalar
+
+    def generate_int_scalar(self, min_d, max_d):
+        scalar = np.random.randint(min_d, max_d)
+        return scalar
+
+    def cpu_op_exec(self, input1):
+        output = torch.logdet(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.logdet(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_tensor_need_to_npu(self, input1):
+        input1 = input1.to("npu")
+        output = torch.logdet(input1)
+
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def register_tensor(self, item,min_val,max_val):
+        res = []
+        cpu_input, npu_input = create_common_tensor(item, min_val , max_val)
+        det_result = torch.det(cpu_input)
+        for i in range(len(det_result)):
+            if det_result[i] > 0:
+                res.append(cpu_input[i])
+        res = torch.stack(res)
+        return res, len(res)
+
+    def register_tensor_fp16(self, item,min_val,max_val):
+        res = []
+        cpu_input, npu_input = create_common_tensor(item,  min_val , max_val)
+        cpu_input_tmp = cpu_input.to(torch.float32)
+        det_result = torch.det(cpu_input_tmp)
+        for i in range(len(det_result)):
+            if det_result[i] > 0:
+                res.append(cpu_input[i])
+        res = torch.stack(res)
+        return res, len(res)
+
+    def create_det_tensor(self, input_tensor):
+        cpu_input = input_tensor
+        npu_input = input_tensor.to("npu")
+        return cpu_input, npu_input
+
+    def test_logdet_common_shape_format(self, device):
+        shape_format = [
+            [[np.float32, -1, (6, 2, 2)], 100, 200],
+            [[np.float32, -1, (24, 5, 5)], 100, 200],
+            [[np.float32, -1, (14, 5, 5)], 21, 22],
+            [[np.float32, -1, (74,4,4)], 21205, 22225],
+            [[np.float32, -1, (58,4,4)], -30,30],
+            [[np.float32, -1, (30,16,16)], -30,30],
+            [[np.float32, -1, (58, 4, 4)], 0.3219780311757745 , 92],
+            [[np.float32, -1, (32, 16, 16)], 0.4820305734500543 , 28],
+            [[np.float32, -1, (28, 8, 8)], 0.8563874665918477 , 98],
+            [[np.float32, -1, (42, 6, 6)], 0.0694198357720135 , 50],
+            [[np.float32, -1, (12, 10, 10)], 0.3316939248453338 , 17],
+            [[np.float32, -1, (6, 10, 10)], 0.6447298684351989 , 95],
+            [[np.float32, -1, (6, 9, 9)],0.8723538084975545 , 85],
+            [[np.float32, -1, (10, 5, 5)], 0.8283759153463854 , 71],
+            [[np.float32, -1, (10, 1, 1)], 0.24718684227306953 , 1],
+            [[np.float32, -1, (6,1,1)], 0.0694198357720135, 0.24718684227306953],
+            [[np.float32, -1, (8, 10, 10)], 0.7866457165672994 , 5],
+            [[np.float32, -1, (6, 14, 14)], 0.9956475043306917 , 28],
+            [[np.float32, -1, (6, 7, 7)],0.3793216987112159 , 39],
+            [[np.float32, -1, (14, 10, 10)], 0.769565434387681 , 9],
+            [[np.float32, -1, (16, 10, 10)], 0.8039978883789274 , 22],
+            [[np.float32, -1, (30, 3, 3)], 0.03133650248813469 , 37],
+            [[np.float32, -1, (4, 1, 1)], 0.853775978441379 , 34 ],
+            [[np.float32, -1, (18, 6, 6)], 0.503285855595573 , 35],
+            [[np.float32, -1, (6, 3, 3)], 1, 10],
+        ]
+        for item in shape_format:
+            input_shape = item[0][2]
+            res, tmp_shape0 = self.register_tensor(item[0],item[1],item[2])
+            cpu_input1, npu_input1 = self.create_det_tensor(res)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            input_shape = list(input_shape)
+            input_shape[0] = tmp_shape0
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_logdet_float16_shape_format(self, device):
+        def cpu_op_exec_fp16(input1):
+            input1 = input1.to(torch.float32)
+            output = torch.logdet(input1)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        shape_format = [
+            [[np.float16, -1, (9, 5, 5)],-2,2],
+            [[np.float16, -1, (60,4,4)],-10,12],
+            [[np.float16, -1, (12,5,5)], 5,10],
+            [[np.float16, -1, (14, 5, 5)], 0.9283381566708346 , 10],
+            [[np.float16, -1, (71, 2, 2)], 0.6234465730020081 , 13],
+            [[np.float16, -1, (10, 5, 5)], 0.7440899332166594 , 1],
+            [[np.float16, -1, (13, 5, 5)], 0.9790231845699171 , 9],
+            [[np.float16, -1, (10, 7, 7)], 0.7852605507867441 , 8],
+            [[np.float16, -1, (18, 2, 2)], 0.8758750778305631 , 9],
+            [[np.float16, -1, (10, 6, 6)], 0.7570129172808612 , 5],
+            [[np.float16, -1, (7, 7, 7)], 0 , 2],
+            [[np.float16, -1, (9, 5, 5)], 1 , 2],
+            [[np.float16, -1, (12, 4, 4)], 0.7349293532899402 , 19],
+            [[np.float16, -1, (15, 8, 8)], 0.9583309378850908 , 3],
+            [[np.float16, -1, (11, 2, 2)],0.3560076034004038 , 25],
+        ]
+
+        for item in shape_format:
+            input_shape = item[0][2]
+            res, tmp_shape0 = self.register_tensor_fp16(item[0],item[1],item[2])
+            cpu_input1,npu_input1 = self.create_det_tensor(res)
+            cpu_output = cpu_op_exec_fp16(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            input_shape = list(input_shape)
+            input_shape[0] = tmp_shape0
+            self.assertRtolEqual(cpu_output, npu_output,prec=1e-3)
+
+
+instantiate_device_type_tests(TestLogDet, globals(), except_for='cpu')
+
+if __name__ == "__main__":
+    torch.npu.set_device("npu:6")
+    run_tests()
diff --git a/test/test_npu/test_logical_and.py b/test/test_npu/test_logical_and.py
index 743b2e2594b7d1b6e887857006831f8f0347067f..2964665c31bb22250f4f307c5b27cfdd310a9466 100644
--- a/test/test_npu/test_logical_and.py
+++ b/test/test_npu/test_logical_and.py
@@ -1,176 +1,176 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLogicalAnd(TestCase):
-
-    def generate_single_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        #modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-
-        return npu_input1, npu_input2
-    
-    def generate_three_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input3 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        #modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-        npu_input3 = torch.from_numpy(input3)
-        
-        return npu_input1, npu_input2, npu_input3
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.logical_and(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.logical_and(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_out(self, input1, input2, input3):
-        torch.logical_and(input1, input2, out=input3)
-        output = input3.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, input3):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = input3.to("npu")
-        torch.logical_and(input1, input2, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output 
-
-    def cpu_op_exec_(self, input1, input2):
-        output = torch.Tensor.logical_and_(input1, input2)
-        output = output.numpy()
-        return output
- 
-    def npu_op_exec_(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.Tensor.logical_and_(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_logical_and_int8(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.int8)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_logical_and_uint8(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.uint8)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_logical_and_int32(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_logical_and_bool(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_logical_and_float16(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_logical_and_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 5, (2, 5), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_logical_or_float32_broadcast(self, device):
-        npu_input1 = self.generate_single_data(0, 2, (4, 3, 1), np.float32)
-        npu_input2 = self.generate_single_data(0, 2, (4, 1, 5), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_logical_and_inplace_uint8(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.uint8)
-        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_logical_and_inplace_int8(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.int8)
-        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-      
-    def test_logical_and_inplace_int32(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.int32)
-        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_logical_and_inplace_bool(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool)
-        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test_logical_and_inplace_float16(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.float16)
-        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_logical_and_inplace_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 5, (2, 5), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)    
-
-instantiate_device_type_tests(TestLogicalAnd, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:0")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLogicalAnd(TestCase):
+
+    def generate_single_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        #modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+
+        return npu_input1, npu_input2
+    
+    def generate_three_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input3 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        #modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+        npu_input3 = torch.from_numpy(input3)
+        
+        return npu_input1, npu_input2, npu_input3
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.logical_and(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.logical_and(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_out(self, input1, input2, input3):
+        torch.logical_and(input1, input2, out=input3)
+        output = input3.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, input3):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = input3.to("npu")
+        torch.logical_and(input1, input2, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output 
+
+    def cpu_op_exec_(self, input1, input2):
+        output = torch.Tensor.logical_and_(input1, input2)
+        output = output.numpy()
+        return output
+ 
+    def npu_op_exec_(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.Tensor.logical_and_(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_logical_and_int8(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.int8)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_logical_and_uint8(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.uint8)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_logical_and_int32(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_logical_and_bool(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_logical_and_float16(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_logical_and_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 5, (2, 5), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_logical_or_float32_broadcast(self, device):
+        npu_input1 = self.generate_single_data(0, 2, (4, 3, 1), np.float32)
+        npu_input2 = self.generate_single_data(0, 2, (4, 1, 5), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_logical_and_inplace_uint8(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.uint8)
+        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_logical_and_inplace_int8(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.int8)
+        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+      
+    def test_logical_and_inplace_int32(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.int32)
+        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_logical_and_inplace_bool(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool)
+        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test_logical_and_inplace_float16(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 5), np.float16)
+        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_logical_and_inplace_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 5, (2, 5), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)    
+
+instantiate_device_type_tests(TestLogicalAnd, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:0")
+    run_tests()
diff --git a/test/test_npu/test_masked_scale.py b/test/test_npu/test_masked_scale.py
index 85c26cc1307fb4d2cbea647faa40d2ad1f79dece..386f9136fa625ce03d8fa9f4f4cd69637762fd78 100644
--- a/test/test_npu/test_masked_scale.py
+++ b/test/test_npu/test_masked_scale.py
@@ -1,79 +1,79 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestMaskedScale(TestCase):
-    def generate_data(self, dtype, shape, min_d, max_d):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        cpu_input = input1
-        npu_input = torch.from_numpy(input1).to("npu")
-        return cpu_input, npu_input
-
-    def generate_mask(self, shape):
-        mask = torch.empty(shape,dtype=torch.int8).random_(2)
-        cpu_mask = mask.numpy()
-        return cpu_mask, mask
-
-    def dynamic_generate_data(self, data_type):
-        format_list = []
-        shape_range = [2,5]
-        min_value_range = [-100, 0, 1]
-        max_value_range = [100, 1, 1000]
-        for shape in shape_range:
-            for min_v, max_v in zip(min_value_range, max_value_range):
-                shape_v = [np.random.randint(1, 50) for _ in range(shape)]
-                format_list.append(
-                    [data_type, shape_v, min_v, max_v]
-                )
-        return format_list
-
-    def numpy_op_exec_masked_scale(self, input1, mask, value):
-        res = input1 * mask * value
-        return res
-
-    def npu_op_exec_masked_scale(self, input1, mask, value):
-        input1 = input1.npu()
-        mask = mask.npu()
-        value = torch.tensor(value)
-        res = torch._masked_scale(input1, mask, value)
-        res = res.to("cpu")
-        res = res.detach().numpy()
-        return res
-
-    def test_masked_scale_format_fp16(self,device):
-        self._test_masked_scale_format(device, np.float16)
-
-    def test_masked_scale_format_fp32(self,device):
-        self._test_masked_scale_format(device, np.float32)
-
-    def _test_masked_scale_format(self, device, dtype):
-        format_list = self.dynamic_generate_data(dtype)
-        for item in format_list:
-            cpu_input, npu_input = self.generate_data(*item)
-            cpu_mask, npu_mask = self.generate_mask(item[1])
-            scale = np.random.uniform(0,1)
-            cpu_output = self.numpy_op_exec_masked_scale(cpu_input,cpu_mask,scale)
-            npu_output = self.npu_op_exec_masked_scale(npu_input,npu_mask,scale)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestMaskedScale, globals(), except_for='cpu')     
-if __name__ == '__main__': 
-    torch.npu.set_device("npu:0")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestMaskedScale(TestCase):
+    def generate_data(self, dtype, shape, min_d, max_d):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        cpu_input = input1
+        npu_input = torch.from_numpy(input1).to("npu")
+        return cpu_input, npu_input
+
+    def generate_mask(self, shape):
+        mask = torch.empty(shape,dtype=torch.int8).random_(2)
+        cpu_mask = mask.numpy()
+        return cpu_mask, mask
+
+    def dynamic_generate_data(self, data_type):
+        format_list = []
+        shape_range = [2,5]
+        min_value_range = [-100, 0, 1]
+        max_value_range = [100, 1, 1000]
+        for shape in shape_range:
+            for min_v, max_v in zip(min_value_range, max_value_range):
+                shape_v = [np.random.randint(1, 50) for _ in range(shape)]
+                format_list.append(
+                    [data_type, shape_v, min_v, max_v]
+                )
+        return format_list
+
+    def numpy_op_exec_masked_scale(self, input1, mask, value):
+        res = input1 * mask * value
+        return res
+
+    def npu_op_exec_masked_scale(self, input1, mask, value):
+        input1 = input1.npu()
+        mask = mask.npu()
+        value = torch.tensor(value)
+        res = torch._masked_scale(input1, mask, value)
+        res = res.to("cpu")
+        res = res.detach().numpy()
+        return res
+
+    def test_masked_scale_format_fp16(self,device):
+        self._test_masked_scale_format(device, np.float16)
+
+    def test_masked_scale_format_fp32(self,device):
+        self._test_masked_scale_format(device, np.float32)
+
+    def _test_masked_scale_format(self, device, dtype):
+        format_list = self.dynamic_generate_data(dtype)
+        for item in format_list:
+            cpu_input, npu_input = self.generate_data(*item)
+            cpu_mask, npu_mask = self.generate_mask(item[1])
+            scale = np.random.uniform(0,1)
+            cpu_output = self.numpy_op_exec_masked_scale(cpu_input,cpu_mask,scale)
+            npu_output = self.npu_op_exec_masked_scale(npu_input,npu_mask,scale)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestMaskedScale, globals(), except_for='cpu')     
+if __name__ == '__main__': 
+    torch.npu.set_device("npu:0")
+    run_tests()
diff --git a/test/test_npu/test_miopen_depthwise_convolution.py b/test/test_npu/test_miopen_depthwise_convolution.py
index 1436b7f90767e227a0c32f0f310a409d519ef71a..fdf6ac202f6b1b02f725b4ab14681c288bdf3e18 100644
--- a/test/test_npu/test_miopen_depthwise_convolution.py
+++ b/test/test_npu/test_miopen_depthwise_convolution.py
@@ -1,197 +1,197 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestMiopenDepthwiseConvolution(TestCase):
-
-
-    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True):
-        input1 = input
-        weight1 = weight
-
-        bias1 = False
-        if bias != None:
-            bias1 = True
-
-        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias1, groups=in_channels)
-        m1.weight.data = weight1
-
-        cpuOutput = m1(input1)
-        tmp = torch.ones_like(cpuOutput)
-
-        return cpuOutput
-
-    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True):
-        input1 = input
-        weight1 = weight
-
-
-        bias1 = False
-        if bias != None:
-            bias1 = True
-
-        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias1, groups=in_channels)
-        m1.weight.data = weight1
-        m1 = m1.to("npu")
-        npuOutput = m1(input1)
-        npuOutput = npuOutput.to("cpu")
-        tmp = torch.ones_like(npuOutput)
-        return npuOutput
-            
-    def test_miopen_depthwise_convolution_input_range1(self, device):
-        shape_format = [  # input, weight, padding, stride, dilation, bias
-           [[np.float16, 3, [4, 3, 5, 5]], [np.float16, 0, [3, 1, 2, 2]], 0, 1, 1, None],
-        ]
-
-        for item in shape_format:
-
-            input_cpu, input_npu = create_common_tensor(item[0],-65504.0,65504.0)
-            input_cpu1, input_npu1 = create_common_tensor(item[0],-0.000030517578125,0.000030517578125)
-            input_cpu2, input_npu2 = create_common_tensor(item[0],-3402823500.0,3402823500.0)
-            input_cpu3, input_npu3 = create_common_tensor(item[0],-0.001953125,0.001953125)
-            
-            if input_cpu.dtype == torch.float16:
-                input_cpu = input_cpu.to(torch.float32)
-            weight_cpu, weight_npu = create_common_tensor(item[1], 0,1 )
-            if weight_cpu.dtype == torch.float16:
-                weight_cpu = weight_cpu.to(torch.float32)
-
-            if input_cpu1.dtype == torch.float16:
-                input_cpu1 = input_cpu1.to(torch.float32)
-            weight_cpu1, weight_npu1 = create_common_tensor(item[1], 0,1 )
-            if weight_cpu1.dtype == torch.float16:
-                weight_cpu1 = weight_cpu1.to(torch.float32)
-            
-            if input_cpu2.dtype == torch.float16:
-                input_cpu2 = input_cpu2.to(torch.float32)
-            weight_cpu2, weight_npu2 = create_common_tensor(item[1], 0,1 )
-            if weight_cpu2.dtype == torch.float16:
-                weight_cpu2 = weight_cpu2.to(torch.float32)
-
-            if input_cpu3.dtype == torch.float16:
-                input_cpu3 = input_cpu3.to(torch.float32)
-            weight_cpu3, weight_npu3 = create_common_tensor(item[1], 0,1 )
-            if weight_cpu3.dtype == torch.float16:
-                weight_cpu3= weight_cpu3.to(torch.float32)
-
-
-            kernel_size = (item[1][2][2], item[1][2][3])
-            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            weight_npu = weight_npu.to("cpu")
-            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            cpu_output = cpu_output.to(npu_output.dtype)
-
-
-            cpu_output1 = self.op_exec_cpu(input_cpu1, weight_cpu1, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            weight_npu1 = weight_npu1.to("cpu")
-            npu_output1 = self.op_exec_npu(input_npu1, weight_npu1, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            cpu_output1 = cpu_output1.to(npu_output1.dtype)
-
-
-            cpu_output2 = self.op_exec_cpu(input_cpu2, weight_cpu2, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            weight_npu2 = weight_npu2.to("cpu")
-            npu_output2 = self.op_exec_npu(input_npu2, weight_npu2, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            cpu_output2 = cpu_output2.to(npu_output2.dtype)
-
-
-            cpu_output3 = self.op_exec_cpu(input_cpu3, weight_cpu3, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            weight_npu3 = weight_npu3.to("cpu")
-            npu_output3 = self.op_exec_npu(input_npu3, weight_npu3, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            cpu_output3 = cpu_output3.to(npu_output3.dtype)
-
-
-            print("===========cpu_output============")
-            print(cpu_output)
-            print("===========cpu_output1============")
-            print(cpu_output1)
-            print("===========cpu_output2============")
-            print(cpu_output2)
-            print("===========cpu_output3============")
-            print(cpu_output3)
-
-            print("===========npu_output============")
-            print(npu_output)
-            print("===========npu_output1============")
-            print(npu_output1)
-            print("===========npu_output2============")
-            print(npu_output2)
-            print("===========npu_output3============")
-            print(npu_output3)
-
-
-            print("===========cpu_input&&npu_input==================")
-            print(input_cpu)
-            
-            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
-
-    def test_miopen_depthwise_convolution_shape_format(self, device):
-        shape_format = [  # input, weight, padding, stride, dilation, bias
-            [[np.float16, 3, [256, 32, 112, 112]], [np.float16, 0, [32, 1, 3, 3]], 0, 1, 1, None],
-            [[np.float16, 0, [1024, 116, 28, 28]], [np.float16, 0, [116, 1, 3, 3]], 1, [2, 2], 1, None],
-            [[np.float16, 0, [1024, 232, 14, 14]], [np.float16, 0, [232, 1, 3, 3]], 1, [2, 2], 1, None],
-            [[np.float16, 3, [1024, 232, 7, 7]], [np.float16, 0, [232, 1, 3, 3]], 1, 1, 1, None],
-            [[np.float16, 3, [1024, 24, 56, 56]], [np.float16, 0, [24, 1, 3, 3]], 1, [2, 2], 1, None],
-            [[np.float16, 3, [1024, 116, 28, 28]], [np.float16, 0, [116, 1, 3, 3]], 1, [2, 2], 1, None],
-            [[np.float16, 3, [1024, 232, 14, 14]], [np.float16, 0, [232, 1, 3, 3]], 1, [2, 2], 1, None],
-        ]
-
-        for item in shape_format:
-
-            input_cpu, input_npu = create_common_tensor(item[0],-65504.0,65504.0)
-            if input_cpu.dtype == torch.float16:
-                input_cpu = input_cpu.to(torch.float32)
-            weight_cpu, weight_npu = create_common_tensor(item[1], 0,1 )
-            if weight_cpu.dtype == torch.float16:
-                weight_cpu = weight_cpu.to(torch.float32)
-            kernel_size = (item[1][2][2], item[1][2][3])
-            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            weight_npu = weight_npu.to("cpu")
-            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            cpu_output = cpu_output.to(npu_output.dtype)
-
-            print("===========cpu_output============")
-            print(cpu_output)
-
-            print("===========npu_output============")
-            print(npu_output)
-
-            print("===========cpu_input&&npu_input==================")
-            print(input_cpu)
-            
-            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
-
-
-
-instantiate_device_type_tests(TestMiopenDepthwiseConvolution, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestMiopenDepthwiseConvolution(TestCase):
+
+
+    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True):
+        input1 = input
+        weight1 = weight
+
+        bias1 = False
+        if bias != None:
+            bias1 = True
+
+        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias1, groups=in_channels)
+        m1.weight.data = weight1
+
+        cpuOutput = m1(input1)
+        tmp = torch.ones_like(cpuOutput)
+
+        return cpuOutput
+
+    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True):
+        input1 = input
+        weight1 = weight
+
+
+        bias1 = False
+        if bias != None:
+            bias1 = True
+
+        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias1, groups=in_channels)
+        m1.weight.data = weight1
+        m1 = m1.to("npu")
+        npuOutput = m1(input1)
+        npuOutput = npuOutput.to("cpu")
+        tmp = torch.ones_like(npuOutput)
+        return npuOutput
+            
+    def test_miopen_depthwise_convolution_input_range1(self, device):
+        shape_format = [  # input, weight, padding, stride, dilation, bias
+           [[np.float16, 3, [4, 3, 5, 5]], [np.float16, 0, [3, 1, 2, 2]], 0, 1, 1, None],
+        ]
+
+        for item in shape_format:
+
+            input_cpu, input_npu = create_common_tensor(item[0],-65504.0,65504.0)
+            input_cpu1, input_npu1 = create_common_tensor(item[0],-0.000030517578125,0.000030517578125)
+            input_cpu2, input_npu2 = create_common_tensor(item[0],-3402823500.0,3402823500.0)
+            input_cpu3, input_npu3 = create_common_tensor(item[0],-0.001953125,0.001953125)
+            
+            if input_cpu.dtype == torch.float16:
+                input_cpu = input_cpu.to(torch.float32)
+            weight_cpu, weight_npu = create_common_tensor(item[1], 0,1 )
+            if weight_cpu.dtype == torch.float16:
+                weight_cpu = weight_cpu.to(torch.float32)
+
+            if input_cpu1.dtype == torch.float16:
+                input_cpu1 = input_cpu1.to(torch.float32)
+            weight_cpu1, weight_npu1 = create_common_tensor(item[1], 0,1 )
+            if weight_cpu1.dtype == torch.float16:
+                weight_cpu1 = weight_cpu1.to(torch.float32)
+            
+            if input_cpu2.dtype == torch.float16:
+                input_cpu2 = input_cpu2.to(torch.float32)
+            weight_cpu2, weight_npu2 = create_common_tensor(item[1], 0,1 )
+            if weight_cpu2.dtype == torch.float16:
+                weight_cpu2 = weight_cpu2.to(torch.float32)
+
+            if input_cpu3.dtype == torch.float16:
+                input_cpu3 = input_cpu3.to(torch.float32)
+            weight_cpu3, weight_npu3 = create_common_tensor(item[1], 0,1 )
+            if weight_cpu3.dtype == torch.float16:
+                weight_cpu3= weight_cpu3.to(torch.float32)
+
+
+            kernel_size = (item[1][2][2], item[1][2][3])
+            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            weight_npu = weight_npu.to("cpu")
+            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            cpu_output = cpu_output.to(npu_output.dtype)
+
+
+            cpu_output1 = self.op_exec_cpu(input_cpu1, weight_cpu1, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            weight_npu1 = weight_npu1.to("cpu")
+            npu_output1 = self.op_exec_npu(input_npu1, weight_npu1, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            cpu_output1 = cpu_output1.to(npu_output1.dtype)
+
+
+            cpu_output2 = self.op_exec_cpu(input_cpu2, weight_cpu2, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            weight_npu2 = weight_npu2.to("cpu")
+            npu_output2 = self.op_exec_npu(input_npu2, weight_npu2, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            cpu_output2 = cpu_output2.to(npu_output2.dtype)
+
+
+            cpu_output3 = self.op_exec_cpu(input_cpu3, weight_cpu3, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            weight_npu3 = weight_npu3.to("cpu")
+            npu_output3 = self.op_exec_npu(input_npu3, weight_npu3, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            cpu_output3 = cpu_output3.to(npu_output3.dtype)
+
+
+            print("===========cpu_output============")
+            print(cpu_output)
+            print("===========cpu_output1============")
+            print(cpu_output1)
+            print("===========cpu_output2============")
+            print(cpu_output2)
+            print("===========cpu_output3============")
+            print(cpu_output3)
+
+            print("===========npu_output============")
+            print(npu_output)
+            print("===========npu_output1============")
+            print(npu_output1)
+            print("===========npu_output2============")
+            print(npu_output2)
+            print("===========npu_output3============")
+            print(npu_output3)
+
+
+            print("===========cpu_input&&npu_input==================")
+            print(input_cpu)
+            
+            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
+
+    def test_miopen_depthwise_convolution_shape_format(self, device):
+        shape_format = [  # input, weight, padding, stride, dilation, bias
+            [[np.float16, 3, [256, 32, 112, 112]], [np.float16, 0, [32, 1, 3, 3]], 0, 1, 1, None],
+            [[np.float16, 0, [1024, 116, 28, 28]], [np.float16, 0, [116, 1, 3, 3]], 1, [2, 2], 1, None],
+            [[np.float16, 0, [1024, 232, 14, 14]], [np.float16, 0, [232, 1, 3, 3]], 1, [2, 2], 1, None],
+            [[np.float16, 3, [1024, 232, 7, 7]], [np.float16, 0, [232, 1, 3, 3]], 1, 1, 1, None],
+            [[np.float16, 3, [1024, 24, 56, 56]], [np.float16, 0, [24, 1, 3, 3]], 1, [2, 2], 1, None],
+            [[np.float16, 3, [1024, 116, 28, 28]], [np.float16, 0, [116, 1, 3, 3]], 1, [2, 2], 1, None],
+            [[np.float16, 3, [1024, 232, 14, 14]], [np.float16, 0, [232, 1, 3, 3]], 1, [2, 2], 1, None],
+        ]
+
+        for item in shape_format:
+
+            input_cpu, input_npu = create_common_tensor(item[0],-65504.0,65504.0)
+            if input_cpu.dtype == torch.float16:
+                input_cpu = input_cpu.to(torch.float32)
+            weight_cpu, weight_npu = create_common_tensor(item[1], 0,1 )
+            if weight_cpu.dtype == torch.float16:
+                weight_cpu = weight_cpu.to(torch.float32)
+            kernel_size = (item[1][2][2], item[1][2][3])
+            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            weight_npu = weight_npu.to("cpu")
+            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            cpu_output = cpu_output.to(npu_output.dtype)
+
+            print("===========cpu_output============")
+            print(cpu_output)
+
+            print("===========npu_output============")
+            print(npu_output)
+
+            print("===========cpu_input&&npu_input==================")
+            print(input_cpu)
+            
+            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
+
+
+
+instantiate_device_type_tests(TestMiopenDepthwiseConvolution, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_miopen_depthwise_convolution_backward.py b/test/test_npu/test_miopen_depthwise_convolution_backward.py
index b0857b23055a4a4e3ba52c6dbe92e161c43a2557..9d557c2e3427d1b8cd613ae175e90b2528c70a4b 100644
--- a/test/test_npu/test_miopen_depthwise_convolution_backward.py
+++ b/test/test_npu/test_miopen_depthwise_convolution_backward.py
@@ -1,113 +1,113 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestMiopenDepthwiseConvolutionBackward(TestCase):
-    weight_grad = []
-    input_grad = []
-
-    def getWeightGrad(self, grad):
-        self.weight_grad.append(grad.to("cpu"))
-
-    def getInputGrad(self, grad):
-        self.input_grad.append(grad.to("cpu"))
-
-    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True):
-        input1 = input
-        weight1 = weight
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-
-        bias1 = False
-        if bias != None:
-            bias1 = True
-
-        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias1, groups=in_channels)
-        m1.weight.data = weight1
-        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        cpuOutput = m1(input1)
-        tmp = torch.ones_like(cpuOutput)
-        cpuOutput.backward(tmp)
-
-        return cpuOutput
-
-    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True):
-        input1 = input
-        weight1 = weight
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-
-        bias1 = False
-        if bias != None:
-            bias1 = True
-
-        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias1, groups=in_channels)
-        m1.weight.data = weight1
-        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        m1 = m1.to("npu")
-        npuOutput = m1(input1)
-        npuOutput = npuOutput.to("cpu")
-        tmp = torch.ones_like(npuOutput)
-        npuOutput.backward(tmp)
-
-        return npuOutput
-            
-    def test_miopen_depthwise_convolution_backward_shape_format(self, device):
-        shape_format = [  # input, weight, padding, stride, dilation, bias
-                 [[np.float16, 3, [20, 3, 112, 112]], [np.float16, 0, [3, 1, 3, 3]], 0, 1, 1, None],
-        ]
-
-        for item in shape_format:
-            self.weight_grad.clear()
-            self.input_grad.clear()
-            input_cpu, input_npu = create_common_tensor(item[0], 0, 10)
-            if input_cpu.dtype == torch.float16:
-                input_cpu = input_cpu.to(torch.float32)
-            weight_cpu, weight_npu = create_common_tensor(item[1], 0, 10)
-            if weight_cpu.dtype == torch.float16:
-                weight_cpu = weight_cpu.to(torch.float32)
-            kernel_size = (item[1][2][2], item[1][2][3])
-            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            weight_npu = weight_npu.to("cpu")
-            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            cpu_output = cpu_output.to(npu_output.dtype)
-
-            print("===========cpu_output============")
-            print(cpu_output)
-
-            print("===========npu_output============")
-            print(npu_output)
-            
-            self.input_grad[0] = self.input_grad[0].to(self.input_grad[1].dtype)
-            self.weight_grad[0] = self.weight_grad[0].to(self.weight_grad[1].dtype)
-            
-            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
-            self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy())
-            self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy())
-
-
-instantiate_device_type_tests(TestMiopenDepthwiseConvolutionBackward, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestMiopenDepthwiseConvolutionBackward(TestCase):
+    weight_grad = []
+    input_grad = []
+
+    def getWeightGrad(self, grad):
+        self.weight_grad.append(grad.to("cpu"))
+
+    def getInputGrad(self, grad):
+        self.input_grad.append(grad.to("cpu"))
+
+    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True):
+        input1 = input
+        weight1 = weight
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+
+        bias1 = False
+        if bias != None:
+            bias1 = True
+
+        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias1, groups=in_channels)
+        m1.weight.data = weight1
+        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        cpuOutput = m1(input1)
+        tmp = torch.ones_like(cpuOutput)
+        cpuOutput.backward(tmp)
+
+        return cpuOutput
+
+    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True):
+        input1 = input
+        weight1 = weight
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+
+        bias1 = False
+        if bias != None:
+            bias1 = True
+
+        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias1, groups=in_channels)
+        m1.weight.data = weight1
+        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        m1 = m1.to("npu")
+        npuOutput = m1(input1)
+        npuOutput = npuOutput.to("cpu")
+        tmp = torch.ones_like(npuOutput)
+        npuOutput.backward(tmp)
+
+        return npuOutput
+            
+    def test_miopen_depthwise_convolution_backward_shape_format(self, device):
+        shape_format = [  # input, weight, padding, stride, dilation, bias
+                 [[np.float16, 3, [20, 3, 112, 112]], [np.float16, 0, [3, 1, 3, 3]], 0, 1, 1, None],
+        ]
+
+        for item in shape_format:
+            self.weight_grad.clear()
+            self.input_grad.clear()
+            input_cpu, input_npu = create_common_tensor(item[0], 0, 10)
+            if input_cpu.dtype == torch.float16:
+                input_cpu = input_cpu.to(torch.float32)
+            weight_cpu, weight_npu = create_common_tensor(item[1], 0, 10)
+            if weight_cpu.dtype == torch.float16:
+                weight_cpu = weight_cpu.to(torch.float32)
+            kernel_size = (item[1][2][2], item[1][2][3])
+            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size, 
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            weight_npu = weight_npu.to("cpu")
+            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            cpu_output = cpu_output.to(npu_output.dtype)
+
+            print("===========cpu_output============")
+            print(cpu_output)
+
+            print("===========npu_output============")
+            print(npu_output)
+            
+            self.input_grad[0] = self.input_grad[0].to(self.input_grad[1].dtype)
+            self.weight_grad[0] = self.weight_grad[0].to(self.weight_grad[1].dtype)
+            
+            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
+            self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy())
+            self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy())
+
+
+instantiate_device_type_tests(TestMiopenDepthwiseConvolutionBackward, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_ne.py b/test/test_npu/test_ne.py
index 2e1cd67fa9c46a1e222aea71efa3b999151f6255..548c12ebef33cfc358eabd351fc05b36d27f489c 100644
--- a/test/test_npu/test_ne.py
+++ b/test/test_npu/test_ne.py
@@ -1,160 +1,160 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# coding: utf-8
-
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestNe(TestCase):
-
-    def cpu_op_exec_scalar(self, input1, other):
-        output = torch.ne(input1, other)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar(self,input1, other):
-        output = torch.ne(input1, other)
-        output1 = output.to("cpu")
-        output2 = output1.numpy()
-        return output2
-
-    def cpu_op_exec(self, input1, other):
-        output = torch.ne(input1, other)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self,input1, other):
-        output = torch.ne(input1, other)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_(self,input1, other):
-        torch.ne_(input1,other)
-        output = input1.numpy()
-        return output
-
-    def npu_op_exec_(self,input1, other):
-        torch.ne_(input1, other)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar_(self,input1, other):
-        torch.ne_(input1,other)
-        output = input1.numpy()
-        return output
-
-    def npu_op_exec_scalar_(self,input1, other):
-        torch.ne_(input1, other)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar_out(self,input1,other, out):
-        torch.ne(input1,other, out=out)
-        output = out.numpy()
-        return output
-
-    def npu_op_exec_scalar_out(self,input1, other, out):
-        torch.ne(input1, other, out=out)
-        output = out.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_out(self,input1,other, out):
-        torch.ne(input1,other, out=out)
-        output = out.numpy()
-        return output
-
-    def npu_op_exec_out(self,input1, other, out):
-        torch.ne(input1, other, out=out)
-        output = out.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_ne_scalar_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32,0 , (2,4, 3)], 3],
-                [[np.float32, 3, (2, 3)], 2],
-                [[np.float32, 0, (3, 2)], 8],
-                [[np.int8, 0 , (4, 3)],3],
-                [[np.uint8, -1, (2,4, 3)],3],
-                [[np.int32, 0, (2, 6)],6]
-                ]
-        for item in shape_format:            
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input1, item[1])
-            npu_output = self.npu_op_exec_scalar(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)       
-
-    def test_ne_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32,0 , (2, 4, 3)], [np.float32,0 , (2, 4, 3)]],
-                [[np.float32, 3, (2, 3)], [np.float32, 3, (2, 3)]],
-                [[np.float32, 0, (3, 2)], [np.float32, 0, (3, 2)]],
-                [[np.int8, 0 , (4, 3)], [np.int8, 0 , (4, 3)]],
-                [[np.uint8, -1, (2,4, 3)], [np.uint8, -1, (2,4, 3)]],
-                [[np.int32, 0, (2, 6)], [np.int32, 0, (2, 6)]],
-                ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 10)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_ne_scalar_out_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32,0 , (2, 4, 3)], 2, [np.bool, 0 , (2, 4, 3)]],
-                [[np.float32, 3, (2, 3)],    3, [np.bool, -1, (2, 3)]],
-                [[np.float32, 0, (3, 2)],    4, [np.bool, 0, (3, 2)]],
-
-                [[np.int8, 0 , (4, 3)],      5, [np.bool, 0 , (4, 3)]],
-                [[np.uint8, -1, (2,4, 3)],   6, [np.bool, -1, (2,4, 3)]],
-                [[np.int32, 0, (2, 6)],      7, [np.bool, 0, (2, 6)]]
-                ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
-            cpu_out, npu_out = create_common_tensor(item[2], 1, 10)
-            cpu_output = self.cpu_op_exec_scalar_out(cpu_input1, item[1], cpu_out)
-            npu_output = self.npu_op_exec_scalar_out(npu_input1, item[1], npu_out)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_ne_out_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32,0 , (2, 4, 3)], [np.float32,0 , (2, 4, 3)], [np.bool, 0 , (2, 4, 3)]],
-                [[np.float32, 3, (2, 3)],    [np.float32, 3, (2, 3)],    [np.bool, -1, (2, 3)]],
-                [[np.float32, 0, (3, 2)],    [np.float32, 0, (3, 2)],    [np.bool, 0, (3, 2)]],
-
-                [[np.int8, 0 , (4, 3)],      [np.int8, 0 , (4, 3)],      [np.bool, 0 , (4, 3)]],
-                [[np.uint8, -1, (2,4, 3)],   [np.uint8, -1, (2,4, 3)],   [np.bool, -1, (2,4, 3)]],
-                [[np.int32, 0, (2, 6)],      [np.int32, 0, (2, 6)],      [np.bool, 0, (2, 6)]]
-                ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 10)
-            cpu_out, npu_out = create_common_tensor(item[2], 1, 10)
-            cpu_output = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_out)
-            npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_out)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-instantiate_device_type_tests(TestNe, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestNe(TestCase):
+
+    def cpu_op_exec_scalar(self, input1, other):
+        output = torch.ne(input1, other)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar(self,input1, other):
+        output = torch.ne(input1, other)
+        output1 = output.to("cpu")
+        output2 = output1.numpy()
+        return output2
+
+    def cpu_op_exec(self, input1, other):
+        output = torch.ne(input1, other)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self,input1, other):
+        output = torch.ne(input1, other)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_(self,input1, other):
+        torch.ne_(input1,other)
+        output = input1.numpy()
+        return output
+
+    def npu_op_exec_(self,input1, other):
+        torch.ne_(input1, other)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar_(self,input1, other):
+        torch.ne_(input1,other)
+        output = input1.numpy()
+        return output
+
+    def npu_op_exec_scalar_(self,input1, other):
+        torch.ne_(input1, other)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar_out(self,input1,other, out):
+        torch.ne(input1,other, out=out)
+        output = out.numpy()
+        return output
+
+    def npu_op_exec_scalar_out(self,input1, other, out):
+        torch.ne(input1, other, out=out)
+        output = out.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_out(self,input1,other, out):
+        torch.ne(input1,other, out=out)
+        output = out.numpy()
+        return output
+
+    def npu_op_exec_out(self,input1, other, out):
+        torch.ne(input1, other, out=out)
+        output = out.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_ne_scalar_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32,0 , (2,4, 3)], 3],
+                [[np.float32, 3, (2, 3)], 2],
+                [[np.float32, 0, (3, 2)], 8],
+                [[np.int8, 0 , (4, 3)],3],
+                [[np.uint8, -1, (2,4, 3)],3],
+                [[np.int32, 0, (2, 6)],6]
+                ]
+        for item in shape_format:            
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input1, item[1])
+            npu_output = self.npu_op_exec_scalar(npu_input1, item[1])
+            self.assertRtolEqual(cpu_output, npu_output)       
+
+    def test_ne_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32,0 , (2, 4, 3)], [np.float32,0 , (2, 4, 3)]],
+                [[np.float32, 3, (2, 3)], [np.float32, 3, (2, 3)]],
+                [[np.float32, 0, (3, 2)], [np.float32, 0, (3, 2)]],
+                [[np.int8, 0 , (4, 3)], [np.int8, 0 , (4, 3)]],
+                [[np.uint8, -1, (2,4, 3)], [np.uint8, -1, (2,4, 3)]],
+                [[np.int32, 0, (2, 6)], [np.int32, 0, (2, 6)]],
+                ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 10)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_ne_scalar_out_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32,0 , (2, 4, 3)], 2, [np.bool, 0 , (2, 4, 3)]],
+                [[np.float32, 3, (2, 3)],    3, [np.bool, -1, (2, 3)]],
+                [[np.float32, 0, (3, 2)],    4, [np.bool, 0, (3, 2)]],
+
+                [[np.int8, 0 , (4, 3)],      5, [np.bool, 0 , (4, 3)]],
+                [[np.uint8, -1, (2,4, 3)],   6, [np.bool, -1, (2,4, 3)]],
+                [[np.int32, 0, (2, 6)],      7, [np.bool, 0, (2, 6)]]
+                ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
+            cpu_out, npu_out = create_common_tensor(item[2], 1, 10)
+            cpu_output = self.cpu_op_exec_scalar_out(cpu_input1, item[1], cpu_out)
+            npu_output = self.npu_op_exec_scalar_out(npu_input1, item[1], npu_out)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_ne_out_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32,0 , (2, 4, 3)], [np.float32,0 , (2, 4, 3)], [np.bool, 0 , (2, 4, 3)]],
+                [[np.float32, 3, (2, 3)],    [np.float32, 3, (2, 3)],    [np.bool, -1, (2, 3)]],
+                [[np.float32, 0, (3, 2)],    [np.float32, 0, (3, 2)],    [np.bool, 0, (3, 2)]],
+
+                [[np.int8, 0 , (4, 3)],      [np.int8, 0 , (4, 3)],      [np.bool, 0 , (4, 3)]],
+                [[np.uint8, -1, (2,4, 3)],   [np.uint8, -1, (2,4, 3)],   [np.bool, -1, (2,4, 3)]],
+                [[np.int32, 0, (2, 6)],      [np.int32, 0, (2, 6)],      [np.bool, 0, (2, 6)]]
+                ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 10)
+            cpu_out, npu_out = create_common_tensor(item[2], 1, 10)
+            cpu_output = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_out)
+            npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_out)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+instantiate_device_type_tests(TestNe, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_neg.py b/test/test_npu/test_neg.py
index 7a192768f0a57a50b9b264b11040c91cc790ea9a..6c98aa1ed59faf3c4073e12d938617e6e0f109a3 100644
--- a/test/test_npu/test_neg.py
+++ b/test/test_npu/test_neg.py
@@ -1,213 +1,213 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestNeg(TestCase):
-    def generate_single_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def cpu_op_exec(self, input1):
-        output = torch.neg(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        input1 = input1.to("npu")
-        output = torch.neg(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_neg_float16_1(self, device):
-        def cpu_op_exec_fp16(input1):
-            input1 = input1.to(torch.float32)
-            output = torch.neg(input1)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        npu_input1 = self.generate_single_data(-2, 2, ((65535, 1, 1, 1)), np.float16)
-        cpu_output = cpu_op_exec_fp16(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float16_2(self, device):
-        def cpu_op_exec_fp16(input1):
-            input1 = input1.to(torch.float32)
-            output = torch.neg(input1)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        npu_input1 = self.generate_single_data(-2, 2, ((1, 1, 1, 8192)), np.float16)
-        cpu_output = cpu_op_exec_fp16(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float16_3(self, device):
-        def cpu_op_exec_fp16(input1):
-            input1 = input1.to(torch.float32)
-            output = torch.neg(input1)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        npu_input1 = self.generate_single_data(-2, 2, ((1, 1, 1, 65535)), np.float16)
-        cpu_output = cpu_op_exec_fp16(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float16_4(self, device):
-        def cpu_op_exec_fp16(input1):
-            input1 = input1.to(torch.float32)
-            output = torch.neg(input1)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        npu_input1 = self.generate_single_data(-2, 2, ((1, 1, 1, 524288)), np.float16)
-        cpu_output = cpu_op_exec_fp16(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float16_5(self, device):
-        def cpu_op_exec_fp16(input1):
-            input1 = input1.to(torch.float32)
-            output = torch.neg(input1)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        npu_input1 = self.generate_single_data(-2, 2, ((1, 1, 1, 786432)), np.float16)
-        cpu_output = cpu_op_exec_fp16(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_1(self, device):
-        npu_input1 = self.generate_single_data(-1.1754943508e-38, -1.1754943508e-38, ((1, 31, 149, 2)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_2(self, device):
-        npu_input1 = self.generate_single_data(-3402823500.0, 3402823500.0, ((1, 32, 31, 1)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_neg_float32_3(self, device):
-        npu_input1 = self.generate_single_data(-0.000030517578125, 0.000030517578125, ((2, 32, 149, 31)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_4(self, device):
-        npu_input1 = self.generate_single_data(3402823500, 3402800000, ((128)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_5(self, device):
-        npu_input1 = self.generate_single_data(-9.313225746154785e-10, 9.313225746154785e-10, ((184965, 1)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_6(self, device):
-        npu_input1 = self.generate_single_data(-3402823500.0, -3402823500.0, ((1, 31, 149, 2)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_7(self, device):
-        npu_input1 = self.generate_single_data(-3402823500.0, 3402823500.0, ((1, 31, 149, 2)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_8(self, device):
-        npu_input1 = self.generate_single_data(-9.313225746154785e-10, 9.313225746154785e-10, ((1, 31, 149, 2)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_9(self, device):
-        npu_input1 = self.generate_single_data(-3402823500.0, -3402823500.0, ((1, 31, 149, 2)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_10(self, device):
-        npu_input1 = self.generate_single_data(-0.000000000000000000000000000000000000011754943508,
-                                          0.000000000000000000000000000000000000011754943508, ((2, 31, 149, 2)),
-                                          np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_11(self, device):
-        npu_input1 = self.generate_single_data(0.000000000000000000000000000000000000011754943508,
-                                          0.000000000000000000000000000000000000011754943508, ((4, 31, 149, 2)),
-                                          np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_12(self, device):
-        npu_input1 = self.generate_single_data(-0.000000000000000000000000000000000000011754943508,
-                                          -0.000000000000000000000000000000000000011754943508, ((2048, 31, 1, 2)),
-                                          np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_float32_13(self, device):
-        npu_input1 = self.generate_single_data(-0.000000000000000000000000000000000000011754943508,
-                                          0.000000000000000000000000000000000000011754943508, ((8, 7, 149)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_int32_1(self, device):
-        npu_input1 = self.generate_single_data(0, 100, (2, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_neg_int32_2(self, device):
-        npu_input1 = self.generate_single_data(2147483647, 2147483648, (2, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_neg_int32_3(self, device):
-        npu_input1 = self.generate_single_data(-2147483648, -2147483647, (2, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestNeg, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestNeg(TestCase):
+    def generate_single_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def cpu_op_exec(self, input1):
+        output = torch.neg(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        input1 = input1.to("npu")
+        output = torch.neg(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_neg_float16_1(self, device):
+        def cpu_op_exec_fp16(input1):
+            input1 = input1.to(torch.float32)
+            output = torch.neg(input1)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        npu_input1 = self.generate_single_data(-2, 2, ((65535, 1, 1, 1)), np.float16)
+        cpu_output = cpu_op_exec_fp16(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float16_2(self, device):
+        def cpu_op_exec_fp16(input1):
+            input1 = input1.to(torch.float32)
+            output = torch.neg(input1)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        npu_input1 = self.generate_single_data(-2, 2, ((1, 1, 1, 8192)), np.float16)
+        cpu_output = cpu_op_exec_fp16(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float16_3(self, device):
+        def cpu_op_exec_fp16(input1):
+            input1 = input1.to(torch.float32)
+            output = torch.neg(input1)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        npu_input1 = self.generate_single_data(-2, 2, ((1, 1, 1, 65535)), np.float16)
+        cpu_output = cpu_op_exec_fp16(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float16_4(self, device):
+        def cpu_op_exec_fp16(input1):
+            input1 = input1.to(torch.float32)
+            output = torch.neg(input1)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        npu_input1 = self.generate_single_data(-2, 2, ((1, 1, 1, 524288)), np.float16)
+        cpu_output = cpu_op_exec_fp16(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float16_5(self, device):
+        def cpu_op_exec_fp16(input1):
+            input1 = input1.to(torch.float32)
+            output = torch.neg(input1)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        npu_input1 = self.generate_single_data(-2, 2, ((1, 1, 1, 786432)), np.float16)
+        cpu_output = cpu_op_exec_fp16(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_1(self, device):
+        npu_input1 = self.generate_single_data(-1.1754943508e-38, -1.1754943508e-38, ((1, 31, 149, 2)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_2(self, device):
+        npu_input1 = self.generate_single_data(-3402823500.0, 3402823500.0, ((1, 32, 31, 1)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_neg_float32_3(self, device):
+        npu_input1 = self.generate_single_data(-0.000030517578125, 0.000030517578125, ((2, 32, 149, 31)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_4(self, device):
+        npu_input1 = self.generate_single_data(3402823500, 3402800000, ((128)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_5(self, device):
+        npu_input1 = self.generate_single_data(-9.313225746154785e-10, 9.313225746154785e-10, ((184965, 1)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_6(self, device):
+        npu_input1 = self.generate_single_data(-3402823500.0, -3402823500.0, ((1, 31, 149, 2)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_7(self, device):
+        npu_input1 = self.generate_single_data(-3402823500.0, 3402823500.0, ((1, 31, 149, 2)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_8(self, device):
+        npu_input1 = self.generate_single_data(-9.313225746154785e-10, 9.313225746154785e-10, ((1, 31, 149, 2)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_9(self, device):
+        npu_input1 = self.generate_single_data(-3402823500.0, -3402823500.0, ((1, 31, 149, 2)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_10(self, device):
+        npu_input1 = self.generate_single_data(-0.000000000000000000000000000000000000011754943508,
+                                          0.000000000000000000000000000000000000011754943508, ((2, 31, 149, 2)),
+                                          np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_11(self, device):
+        npu_input1 = self.generate_single_data(0.000000000000000000000000000000000000011754943508,
+                                          0.000000000000000000000000000000000000011754943508, ((4, 31, 149, 2)),
+                                          np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_12(self, device):
+        npu_input1 = self.generate_single_data(-0.000000000000000000000000000000000000011754943508,
+                                          -0.000000000000000000000000000000000000011754943508, ((2048, 31, 1, 2)),
+                                          np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_float32_13(self, device):
+        npu_input1 = self.generate_single_data(-0.000000000000000000000000000000000000011754943508,
+                                          0.000000000000000000000000000000000000011754943508, ((8, 7, 149)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_int32_1(self, device):
+        npu_input1 = self.generate_single_data(0, 100, (2, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_neg_int32_2(self, device):
+        npu_input1 = self.generate_single_data(2147483647, 2147483648, (2, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_neg_int32_3(self, device):
+        npu_input1 = self.generate_single_data(-2147483648, -2147483647, (2, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestNeg, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:5")
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test___ior__.py b/test/test_npu/test_network_ops/test___ior__.py
index c28a0bd8e0eebc166ed817d421c681843417f3d4..20ce8bca8a7c6139ae6afa8335a3aadde9255f3e 100644
--- a/test/test_npu/test_network_ops/test___ior__.py
+++ b/test/test_npu/test_network_ops/test___ior__.py
@@ -1,226 +1,226 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestIor(TestCase):
-    #pylint: disable=unused-argument
-    def generate_bool_data(self, shape):
-        input1 = np.random.uniform(0, 1, shape).astype(np.float32)
-        input1 = input1 < 0.5
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-    
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-
-        return npu_input1, npu_input2
-
-    def generate_single_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-        return npu_input1
-
-    def generate_int_scalar(self, min_d, max_d):
-        scalar = np.random.randint(min_d, max_d)
-        return scalar
-
-    def cpu_op_exec(self, input1, input2):
-        output = input1.__ior__(input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = input1.__ior__(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input1, input2):
-        input1 = input1.to("npu")
-        output = input1.__ior__(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test___ior___bool(self, device):
-        npu_input1 = self.generate_bool_data((1, 31, 149, 2))
-        npu_input2 = self.generate_bool_data((1, 31, 149, 2))
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___bool_scalar(self, device):
-        npu_input1 = self.generate_bool_data((1, 31, 149, 2))
-        npu_input2 = False
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec_scalar(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___uint8(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 255, (1, 31, 149, 2), np.uint8)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int8(self, device):
-        npu_input1, npu_input2 = self.generate_data(-128, 127, (1, 31, 149, 2), np.int8)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int32_001(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, -2147483648, (1, 31, 149, 2), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int32_002(self, device):
-        npu_input1, npu_input2 = self.generate_data(2147483647, 2147483647, (128), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int32_003(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (184965, 1), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int32_004(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1, 31, 149, 2), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int32_005(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (2, 31, 149, 2), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int32_006(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (4, 31, 149, 2), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int32_007(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (2048, 31, 1, 2), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int32_008(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (8, 7, 149), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test___ior___int32_009(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (65535,1,1,1), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int32_010(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,8192), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int32_011(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,16384), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int32_012(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,32768), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int32_013(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,65535), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int32_014(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,131072), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test___ior___int32_015(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,196608), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int32_016(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,262144), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int32_017(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,393216), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int32_018(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,524288), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-      
-    def test___ior___int32_019(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,655360), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int32_020(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,786432), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test___ior___int_scalar(self, device):
-        npu_input1 = self.generate_single_data(-2147483648, 2147483647, (1,31,149,2), np.int32)
-        npu_input2 = self.generate_int_scalar(-2147483648, 2147483647)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec_scalar(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestIor, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestIor(TestCase):
+    #pylint: disable=unused-argument
+    def generate_bool_data(self, shape):
+        input1 = np.random.uniform(0, 1, shape).astype(np.float32)
+        input1 = input1 < 0.5
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+    
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+
+        return npu_input1, npu_input2
+
+    def generate_single_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+        return npu_input1
+
+    def generate_int_scalar(self, min_d, max_d):
+        scalar = np.random.randint(min_d, max_d)
+        return scalar
+
+    def cpu_op_exec(self, input1, input2):
+        output = input1.__ior__(input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = input1.__ior__(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        input1 = input1.to("npu")
+        output = input1.__ior__(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test___ior___bool(self, device):
+        npu_input1 = self.generate_bool_data((1, 31, 149, 2))
+        npu_input2 = self.generate_bool_data((1, 31, 149, 2))
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___bool_scalar(self, device):
+        npu_input1 = self.generate_bool_data((1, 31, 149, 2))
+        npu_input2 = False
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec_scalar(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___uint8(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 255, (1, 31, 149, 2), np.uint8)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int8(self, device):
+        npu_input1, npu_input2 = self.generate_data(-128, 127, (1, 31, 149, 2), np.int8)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int32_001(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, -2147483648, (1, 31, 149, 2), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int32_002(self, device):
+        npu_input1, npu_input2 = self.generate_data(2147483647, 2147483647, (128), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int32_003(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (184965, 1), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int32_004(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1, 31, 149, 2), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int32_005(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (2, 31, 149, 2), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int32_006(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (4, 31, 149, 2), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int32_007(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (2048, 31, 1, 2), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int32_008(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (8, 7, 149), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test___ior___int32_009(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (65535,1,1,1), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int32_010(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,8192), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int32_011(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,16384), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int32_012(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,32768), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int32_013(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,65535), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int32_014(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,131072), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test___ior___int32_015(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,196608), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int32_016(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,262144), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int32_017(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,393216), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int32_018(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,524288), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+      
+    def test___ior___int32_019(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,655360), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int32_020(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2147483648, 2147483647, (1,1,1,786432), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test___ior___int_scalar(self, device):
+        npu_input1 = self.generate_single_data(-2147483648, 2147483647, (1,31,149,2), np.int32)
+        npu_input2 = self.generate_int_scalar(-2147483648, 2147483647)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec_scalar(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestIor, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_abs.py b/test/test_npu/test_network_ops/test_abs.py
old mode 100644
new mode 100755
index 163c3fe304d4e2710bfa19cc614912c2ed84e126..607a7ea4e58454da14d8bb6667dfff0ffb56dd95
--- a/test/test_npu/test_network_ops/test_abs.py
+++ b/test/test_npu/test_network_ops/test_abs.py
@@ -1,64 +1,64 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestAbs(TestCase):
-    def cpu_op_exec(self, input):
-        output = torch.abs(input)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input):
-        output = torch.abs(input)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_abs_shape_format_fp16(self, device):
-        format_list = [0, 3]
-        shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            # print(item)
-            cpu_input, npu_input = create_common_tensor(item, -10, 10)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_abs_shape_format_fp32(self, device):
-        format_list = [0, 3]
-        shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            # print(item)
-            cpu_input, npu_input = create_common_tensor(item, -10, 10)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestAbs, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestAbs(TestCase):
+    def cpu_op_exec(self, input):
+        output = torch.abs(input)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input):
+        output = torch.abs(input)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_abs_shape_format_fp16(self, device):
+        format_list = [0, 3]
+        shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            # print(item)
+            cpu_input, npu_input = create_common_tensor(item, -10, 10)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_abs_shape_format_fp32(self, device):
+        format_list = [0, 3]
+        shape_list = [[5], [5, 10], [1, 3, 2], [52, 15, 15, 20]]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            # print(item)
+            cpu_input, npu_input = create_common_tensor(item, -10, 10)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestAbs, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py
index 662cae2af3231941d335f5aa27f24af512b06a66..16e66b88372aff6c6824c1760d88bb5a77da3016 100644
--- a/test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py
+++ b/test/test_npu/test_network_ops/test_adaptive_avg_pool1d.py
@@ -1,63 +1,63 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestAdaptiveAvgPool1d(TestCase):
-    def cpu_op_exec(self, input, output_size):
-        m = nn.AdaptiveAvgPool1d(output_size)
-        output= m(input)
-        return output.numpy()
-
-    def npu_op_exec(self, input, output_size):
-        m = nn.AdaptiveAvgPool1d(output_size).npu()
-        output = m(input)
-        return output.cpu().numpy()
-    
-    def test_AdaptiveAvgPool1d_shape_format_fp16(self, device):
-        shape_format = [
-                [np.float16, 0, (64, 10, 16)],
-                [np.float16, -1, (256, 2048, 8)],
-                [np.float16, 3, (32, 16, 16)]
-        ]
-        output_list = [(4), (3)]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 10)
-            for output_size in output_list:
-                cpu_output = self.cpu_op_exec(cpu_input, output_size)
-                npu_output = self.npu_op_exec(npu_input, output_size)
-                self.assertRtolEqual(cpu_output, npu_output, prec16=0.002)
-
-    def test_AdaptiveAvgPool1d_shape_format_fp32(self, device):
-        shape_format = [
-                [np.float32, 0, (64, 10, 16)],
-                [np.float32, -1, (256, 2048, 8)],
-                [np.float32, 3, (32, 16, 16)]
-        ]
-        output_list = [(4), (3), (1)]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 10)
-            for output_size in output_list:
-                cpu_output = self.cpu_op_exec(cpu_input, output_size)
-                npu_output = self.npu_op_exec(npu_input, output_size)
-                self.assertRtolEqual(cpu_output, npu_output, 0.001)
-
-instantiate_device_type_tests(TestAdaptiveAvgPool1d, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestAdaptiveAvgPool1d(TestCase):
+    def cpu_op_exec(self, input, output_size):
+        m = nn.AdaptiveAvgPool1d(output_size)
+        output= m(input)
+        return output.numpy()
+
+    def npu_op_exec(self, input, output_size):
+        m = nn.AdaptiveAvgPool1d(output_size).npu()
+        output = m(input)
+        return output.cpu().numpy()
+    
+    def test_AdaptiveAvgPool1d_shape_format_fp16(self, device):
+        shape_format = [
+                [np.float16, 0, (64, 10, 16)],
+                [np.float16, -1, (256, 2048, 8)],
+                [np.float16, 3, (32, 16, 16)]
+        ]
+        output_list = [(4), (3)]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 10)
+            for output_size in output_list:
+                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                npu_output = self.npu_op_exec(npu_input, output_size)
+                self.assertRtolEqual(cpu_output, npu_output, prec16=0.002)
+
+    def test_AdaptiveAvgPool1d_shape_format_fp32(self, device):
+        shape_format = [
+                [np.float32, 0, (64, 10, 16)],
+                [np.float32, -1, (256, 2048, 8)],
+                [np.float32, 3, (32, 16, 16)]
+        ]
+        output_list = [(4), (3), (1)]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 10)
+            for output_size in output_list:
+                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                npu_output = self.npu_op_exec(npu_input, output_size)
+                self.assertRtolEqual(cpu_output, npu_output, 0.001)
+
+instantiate_device_type_tests(TestAdaptiveAvgPool1d, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py
index 859cccf3cbb1470177b87472a518fe4d0c06f870..27250968d7bc731b48b9d24d72ac4aedf05e9e83 100644
--- a/test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py
+++ b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d.py
@@ -1,69 +1,69 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestAdaptiveAvgPool3d(TestCase):
-    def cpu_op_exec(self, input, output_size):
-        m = nn.AdaptiveAvgPool3d(output_size)
-        output= m(input)
-        return output.numpy()
-
-    def npu_op_exec(self, input, output_size):
-        m = nn.AdaptiveAvgPool3d(output_size)
-        output= m(input).cpu()
-        return output.numpy()
-    
-    def test_AdaptiveAvgPool3d_shape_format_fp16(self, device):
-        shape_format = [
-                [np.float16, -1, (64, 10, 16, 32)],
-                [np.float16, -1, (4, 16, 8, 4, 2)],
-                [np.float16, -1, (2, 16, 4, 32)],
-                [np.float16, -1, (4, 16, 8, 4, 16)]
-        ]
-        # output_list = [(4, 2, 4), (2, 2, 2), (2, 4, 4), (4, 4, 2)]
-        output_list = [(1, 1, 1)]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 10)
-            cpu_input = cpu_input.to(torch.float32)
-            for output_size in output_list:
-                cpu_output = self.cpu_op_exec(cpu_input, output_size)
-                npu_output = self.npu_op_exec(npu_input, output_size)
-                cpu_output = cpu_output.astype(npu_output.dtype)
-                self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_AdaptiveAvgPool3d_shape_format_fp32(self, device):
-        shape_format = [
-                [np.float32, -1, (64, 10, 16, 32)],
-                [np.float32, -1, (4, 2, 2, 4, 316)],
-                [np.float32, -1, (2, 16, 4, 32)],
-                [np.float32, -1, (4, 16, 8, 4, 16)]
-        ]
-        # output_list = [(4, 2, 4), (2, 2, 2), (2, 4, 4), (4, 4, 2)]
-        output_list = [(1, 1, 1)]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 10)
-            for output_size in output_list:
-                cpu_output = self.cpu_op_exec(cpu_input, output_size)
-                npu_output = self.npu_op_exec(npu_input, output_size)
-                self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestAdaptiveAvgPool3d, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestAdaptiveAvgPool3d(TestCase):
+    def cpu_op_exec(self, input, output_size):
+        m = nn.AdaptiveAvgPool3d(output_size)
+        output= m(input)
+        return output.numpy()
+
+    def npu_op_exec(self, input, output_size):
+        m = nn.AdaptiveAvgPool3d(output_size)
+        output= m(input).cpu()
+        return output.numpy()
+    
+    def test_AdaptiveAvgPool3d_shape_format_fp16(self, device):
+        shape_format = [
+                [np.float16, -1, (64, 10, 16, 32)],
+                [np.float16, -1, (4, 16, 8, 4, 2)],
+                [np.float16, -1, (2, 16, 4, 32)],
+                [np.float16, -1, (4, 16, 8, 4, 16)]
+        ]
+        # output_list = [(4, 2, 4), (2, 2, 2), (2, 4, 4), (4, 4, 2)]
+        output_list = [(1, 1, 1)]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 10)
+            cpu_input = cpu_input.to(torch.float32)
+            for output_size in output_list:
+                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                npu_output = self.npu_op_exec(npu_input, output_size)
+                cpu_output = cpu_output.astype(npu_output.dtype)
+                self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_AdaptiveAvgPool3d_shape_format_fp32(self, device):
+        shape_format = [
+                [np.float32, -1, (64, 10, 16, 32)],
+                [np.float32, -1, (4, 2, 2, 4, 316)],
+                [np.float32, -1, (2, 16, 4, 32)],
+                [np.float32, -1, (4, 16, 8, 4, 16)]
+        ]
+        # output_list = [(4, 2, 4), (2, 2, 2), (2, 4, 4), (4, 4, 2)]
+        output_list = [(1, 1, 1)]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 10)
+            for output_size in output_list:
+                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                npu_output = self.npu_op_exec(npu_input, output_size)
+                self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestAdaptiveAvgPool3d, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py
index c3dc9a48430dbc337faa1ac4895b7563883584e2..01d1ce2a115c1f8c42fe67ad360b3897d93ef434 100644
--- a/test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py
+++ b/test/test_npu/test_network_ops/test_adaptive_avg_pool3d_backward.py
@@ -1,66 +1,66 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-from torch.nn import functional as F
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestAdaptiveAvgPool3dBackward(TestCase):
-
-    def cpu_op_exec(self, input_x, output_size):
-        input_x.requires_grad_(True)
-        m = torch.nn.AdaptiveAvgPool3d(output_size)
-        output = m(input_x)
-        ones = torch.ones_like(output)
-        output.backward(ones)
-        out = input_x.grad
-        return out.numpy()
-
-    def npu_op_exec(self, input_x,  output_size):
-        input_x.requires_grad_(True)
-        m = torch.nn.AdaptiveAvgPool3d( output_size)
-        output = m(input_x)
-        ones = torch.ones_like(output)
-        output.backward(ones)
-        out = input_x.grad.cpu()
-        return out.numpy()
-
-    def test_adaptiveAvgPool3d_backward(self, device):
-        dtype_list = [np.float16, np.float32]
-        format_list = [-1]
-        shape_list = [
-            [2, 3, 7, 7],
-            [1, 2, 3, 6, 6],
-            [6, 5, 8, 10],
-            [2, 5, 6, 8, 9]
-        ]
-        shape_format = [
-            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
-        ]
-        output_sizes = [[1, 1, 1]]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 10)
-            for output_size in output_sizes:
-                cpu_output = self.cpu_op_exec(cpu_input, output_size)
-                npu_output = self.npu_op_exec(npu_input, output_size)
-
-                self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestAdaptiveAvgPool3dBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+from torch.nn import functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestAdaptiveAvgPool3dBackward(TestCase):
+
+    def cpu_op_exec(self, input_x, output_size):
+        input_x.requires_grad_(True)
+        m = torch.nn.AdaptiveAvgPool3d(output_size)
+        output = m(input_x)
+        ones = torch.ones_like(output)
+        output.backward(ones)
+        out = input_x.grad
+        return out.numpy()
+
+    def npu_op_exec(self, input_x,  output_size):
+        input_x.requires_grad_(True)
+        m = torch.nn.AdaptiveAvgPool3d( output_size)
+        output = m(input_x)
+        ones = torch.ones_like(output)
+        output.backward(ones)
+        out = input_x.grad.cpu()
+        return out.numpy()
+
+    def test_adaptiveAvgPool3d_backward(self, device):
+        dtype_list = [np.float16, np.float32]
+        format_list = [-1]
+        shape_list = [
+            [2, 3, 7, 7],
+            [1, 2, 3, 6, 6],
+            [6, 5, 8, 10],
+            [2, 5, 6, 8, 9]
+        ]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        output_sizes = [[1, 1, 1]]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 10)
+            for output_size in output_sizes:
+                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                npu_output = self.npu_op_exec(npu_input, output_size)
+
+                self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestAdaptiveAvgPool3dBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_adaptive_max_pool2d.py b/test/test_npu/test_network_ops/test_adaptive_max_pool2d.py
index 877f50c11c26fb787a491cae9fcfc7b2957db0a9..fbd6bf65e7a961b55b2dd7487b14a0f5509e869f 100644
--- a/test/test_npu/test_network_ops/test_adaptive_max_pool2d.py
+++ b/test/test_npu/test_network_ops/test_adaptive_max_pool2d.py
@@ -1,53 +1,53 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestAdaptiveMaxPool2d(TestCase):
-    def cpu_op_exec(self, input, output_size):
-        m = nn.AdaptiveMaxPool2d(output_size)
-        output = m(input)
-        return output.numpy()
-
-    def npu_op_exec(self, input, output_size):
-        m = nn.AdaptiveMaxPool2d(output_size).npu()
-        output = m(input)
-        return output.cpu().numpy()
-
-    def test_adaptiveMaxPool2d_shape_format_fp32_6(self, device):
-        format_list = [-1]
-        # (1, 8, 9) IndexError
-        shape_list = [(1, 5, 9, 9)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        output_list = [(3, 3)]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            for output_size in output_list:
-                cpu_output = self.cpu_op_exec(cpu_input, output_size)
-                npu_output = self.npu_op_exec(npu_input, output_size)
-
-                self.assertRtolEqual(cpu_output, npu_output, 0.0004)
-
-
-instantiate_device_type_tests(TestAdaptiveMaxPool2d, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestAdaptiveMaxPool2d(TestCase):
+    def cpu_op_exec(self, input, output_size):
+        m = nn.AdaptiveMaxPool2d(output_size)
+        output = m(input)
+        return output.numpy()
+
+    def npu_op_exec(self, input, output_size):
+        m = nn.AdaptiveMaxPool2d(output_size).npu()
+        output = m(input)
+        return output.cpu().numpy()
+
+    def test_adaptiveMaxPool2d_shape_format_fp32_6(self, device):
+        format_list = [-1]
+        # (1, 8, 9) IndexError
+        shape_list = [(1, 5, 9, 9)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        output_list = [(3, 3)]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            for output_size in output_list:
+                cpu_output = self.cpu_op_exec(cpu_input, output_size)
+                npu_output = self.npu_op_exec(npu_input, output_size)
+
+                self.assertRtolEqual(cpu_output, npu_output, 0.0004)
+
+
+instantiate_device_type_tests(TestAdaptiveMaxPool2d, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_add.py b/test/test_npu/test_network_ops/test_add.py
old mode 100644
new mode 100755
index af1963946098a7f5a66cc43b722ab421e6db7ee5..34235b05e7cb5f8b60b880c917dc0fbef6b40a19
--- a/test/test_npu/test_network_ops/test_add.py
+++ b/test/test_npu/test_network_ops/test_add.py
@@ -1,398 +1,398 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestAdd(TestCase):
-    def cpu_op_out_exec(self, input1, input2, output):
-        torch.add(input1, input2, alpha = 1, out = output)
-        output = output.numpy()
-        return output
-
-    def npu_op_out_exec_new(self, input1, input2, output):
-        torch.add(input1, input2, alpha = 1, out = output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.add(input1, input2, alpha = 1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_new(self, input1, input2):
-        output = torch.add(input1, input2, alpha = 1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_alpha(self, input1, input2):
-        output = torch.add(input1, input2, alpha = 3)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_new_alpha(self, input1, input2):
-        output = torch.add(input1, input2, alpha = 3)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_scalar_exec(self, input1, scalar):
-        output = torch.add(input1, scalar, alpha = 1)
-        output = output.numpy()
-        return output
-
-    def npu_op_scalar_exec_new(self, input1, scalar):
-        output = torch.add(input1, scalar, alpha = 1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_scalar_exec_alpha(self, input1, scalar):
-        output = torch.add(input1, scalar, alpha = 3)
-        output = output.numpy()
-        return output
-
-    def npu_op_scalar_exec_new_alpha(self, input1, scalar):
-        output = torch.add(input1, scalar, alpha = 3)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def add_scalar_result(self, shape_format):
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_scalar_exec(cpu_input, item[1])
-            npu_output = self.npu_op_exec_new(npu_input, item[1])
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def add_scalar_alpha_result(self, shape_format):
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_scalar_exec_alpha(cpu_input, item[1])
-            npu_output = self.npu_op_scalar_exec_new_alpha(npu_input, item[1])
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def add_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-                
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec_new(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def add_out_result(self, shape_format):
-        for item in shape_format:
-            cpuout = torch.randn(3)
-            npuout = torch.randn(3).to("npu")
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-                
-            cpu_output = self.cpu_op_out_exec(cpu_input1, cpu_input2,cpuout)
-            npu_output = self.npu_op_out_exec_new(npu_input1, npu_input2, npuout)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def add_alpha_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-                
-            cpu_output = self.cpu_op_exec_alpha(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec_new_alpha(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_add_scalar_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float16, i, [18]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_result(shape_format)
-    
-    def test_add_scalar_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float32, i, [18]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_result(shape_format)
-        
-    def test_add_scalar_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float16, i, [5, 256]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_result(shape_format)
-    
-    def test_add_scalar_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float32, i, [5, 256]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_result(shape_format)
-        
-    def test_add_scalar_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float16, i, [32, 3, 3]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_result(shape_format)
-    
-    def test_add_scalar_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float32, i, [32, 3, 3]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_result(shape_format)
-        
-    def test_add_scalar_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float16, i, [64, 112, 7, 7]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_result(shape_format)
-    
-    def test_add_scalar_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float32, i, [64, 112, 7, 7]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_result(shape_format)
-          
-    def test_add_scalar_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float16, i, [18]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_alpha_result(shape_format)
-    
-    def test_add_scalar_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float32, i, [18]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_alpha_result(shape_format)
-        
-    def test_add_scalar_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float16, i, [5, 256]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_alpha_result(shape_format)
-    
-    def test_add_scalar_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float32, i, [5, 256]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_alpha_result(shape_format)
-        
-    def test_add_scalar_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float16, i, [32, 3, 3]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_alpha_result(shape_format)
-    
-    def test_add_scalar_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float32, i, [32, 3, 3]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_alpha_result(shape_format)
-        
-    def test_add_scalar_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float16, i, [64, 112, 7, 7]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_alpha_result(shape_format)
-    
-    def test_add_scalar_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        scalar_list = [0,1]
-        shape_format = [
-            [[np.float32, i, [64, 112, 7, 7]], k]  for i in format_list for k in scalar_list            
-        ]        
-        self.add_scalar_alpha_result(shape_format)
-
-    def test_add_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [64]]  for i in format_list
-        ]        
-        self.add_result(shape_format)
-    
-    def test_add_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [64]]  for i in format_list 
-        ]        
-        self.add_result(shape_format)
-        
-    def test_add_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [5, 256]]  for i in format_list
-        ]        
-        self.add_result(shape_format)
-    
-    def test_add_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float32, i, [5, 256]]  for i in format_list 
-        ]        
-        self.add_result(shape_format)
-        
-    def test_add_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [32, 3, 3]]  for i in format_list
-        ]        
-        self.add_result(shape_format)
-    
-    def test_add_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float32, i, [32, 3, 3]]  for i in format_list 
-        ]        
-        self.add_result(shape_format)
-        
-    def test_add_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [64, 112, 7, 7]]  for i in format_list
-        ]        
-        self.add_result(shape_format)
-    
-    def test_add_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float32, i, [64, 112, 7, 7]]  for i in format_list 
-        ]        
-        self.add_result(shape_format)
-
-    def test_add_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [64]]  for i in format_list
-        ]        
-        self.add_alpha_result(shape_format)
-    
-    def test_add_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [64]]  for i in format_list 
-        ]        
-        self.add_alpha_result(shape_format)
-        
-    def test_add_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [5, 256]]  for i in format_list
-        ]        
-        self.add_alpha_result(shape_format)
-    
-    def test_add_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float32, i, [5, 256]]  for i in format_list 
-        ]        
-        self.add_alpha_result(shape_format)
-        
-    def test_add_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [32, 3, 3]]  for i in format_list
-        ]        
-        self.add_alpha_result(shape_format)
-    
-    def test_add_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float32, i, [32, 3, 3]]  for i in format_list 
-        ]        
-        self.add_alpha_result(shape_format)
-        
-    def test_add_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [64, 112, 7, 7]]  for i in format_list
-        ]        
-        self.add_alpha_result(shape_format)
-    
-    def test_add_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float32, i, [64, 112, 7, 7]]  for i in format_list 
-        ]        
-        self.add_alpha_result(shape_format)
-
-    def test_add_mix_dtype(self, device):
-        cpu_input1, npu_input1 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100)
-        cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        cpu_output = torch.add(cpu_input1, cpu_input2)
-        npu_output = torch.add(npu_input1, npu_input2)
-        npu_output = npu_output.to("cpu")
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestAdd, globals(), except_for="cpu")
-
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestAdd(TestCase):
+    def cpu_op_out_exec(self, input1, input2, output):
+        torch.add(input1, input2, alpha = 1, out = output)
+        output = output.numpy()
+        return output
+
+    def npu_op_out_exec_new(self, input1, input2, output):
+        torch.add(input1, input2, alpha = 1, out = output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.add(input1, input2, alpha = 1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_new(self, input1, input2):
+        output = torch.add(input1, input2, alpha = 1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_alpha(self, input1, input2):
+        output = torch.add(input1, input2, alpha = 3)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_new_alpha(self, input1, input2):
+        output = torch.add(input1, input2, alpha = 3)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_scalar_exec(self, input1, scalar):
+        output = torch.add(input1, scalar, alpha = 1)
+        output = output.numpy()
+        return output
+
+    def npu_op_scalar_exec_new(self, input1, scalar):
+        output = torch.add(input1, scalar, alpha = 1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_scalar_exec_alpha(self, input1, scalar):
+        output = torch.add(input1, scalar, alpha = 3)
+        output = output.numpy()
+        return output
+
+    def npu_op_scalar_exec_new_alpha(self, input1, scalar):
+        output = torch.add(input1, scalar, alpha = 3)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def add_scalar_result(self, shape_format):
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_scalar_exec(cpu_input, item[1])
+            npu_output = self.npu_op_exec_new(npu_input, item[1])
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def add_scalar_alpha_result(self, shape_format):
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_scalar_exec_alpha(cpu_input, item[1])
+            npu_output = self.npu_op_scalar_exec_new_alpha(npu_input, item[1])
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def add_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+                
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec_new(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def add_out_result(self, shape_format):
+        for item in shape_format:
+            cpuout = torch.randn(3)
+            npuout = torch.randn(3).to("npu")
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+                
+            cpu_output = self.cpu_op_out_exec(cpu_input1, cpu_input2,cpuout)
+            npu_output = self.npu_op_out_exec_new(npu_input1, npu_input2, npuout)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def add_alpha_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+                
+            cpu_output = self.cpu_op_exec_alpha(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec_new_alpha(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_add_scalar_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float16, i, [18]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_result(shape_format)
+    
+    def test_add_scalar_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float32, i, [18]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_result(shape_format)
+        
+    def test_add_scalar_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float16, i, [5, 256]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_result(shape_format)
+    
+    def test_add_scalar_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float32, i, [5, 256]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_result(shape_format)
+        
+    def test_add_scalar_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float16, i, [32, 3, 3]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_result(shape_format)
+    
+    def test_add_scalar_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float32, i, [32, 3, 3]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_result(shape_format)
+        
+    def test_add_scalar_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float16, i, [64, 112, 7, 7]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_result(shape_format)
+    
+    def test_add_scalar_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float32, i, [64, 112, 7, 7]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_result(shape_format)
+          
+    def test_add_scalar_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float16, i, [18]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_alpha_result(shape_format)
+    
+    def test_add_scalar_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float32, i, [18]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_alpha_result(shape_format)
+        
+    def test_add_scalar_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float16, i, [5, 256]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_alpha_result(shape_format)
+    
+    def test_add_scalar_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float32, i, [5, 256]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_alpha_result(shape_format)
+        
+    def test_add_scalar_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float16, i, [32, 3, 3]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_alpha_result(shape_format)
+    
+    def test_add_scalar_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float32, i, [32, 3, 3]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_alpha_result(shape_format)
+        
+    def test_add_scalar_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float16, i, [64, 112, 7, 7]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_alpha_result(shape_format)
+    
+    def test_add_scalar_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        scalar_list = [0,1]
+        shape_format = [
+            [[np.float32, i, [64, 112, 7, 7]], k]  for i in format_list for k in scalar_list            
+        ]        
+        self.add_scalar_alpha_result(shape_format)
+
+    def test_add_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [64]]  for i in format_list
+        ]        
+        self.add_result(shape_format)
+    
+    def test_add_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [64]]  for i in format_list 
+        ]        
+        self.add_result(shape_format)
+        
+    def test_add_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [5, 256]]  for i in format_list
+        ]        
+        self.add_result(shape_format)
+    
+    def test_add_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float32, i, [5, 256]]  for i in format_list 
+        ]        
+        self.add_result(shape_format)
+        
+    def test_add_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [32, 3, 3]]  for i in format_list
+        ]        
+        self.add_result(shape_format)
+    
+    def test_add_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float32, i, [32, 3, 3]]  for i in format_list 
+        ]        
+        self.add_result(shape_format)
+        
+    def test_add_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [64, 112, 7, 7]]  for i in format_list
+        ]        
+        self.add_result(shape_format)
+    
+    def test_add_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float32, i, [64, 112, 7, 7]]  for i in format_list 
+        ]        
+        self.add_result(shape_format)
+
+    def test_add_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [64]]  for i in format_list
+        ]        
+        self.add_alpha_result(shape_format)
+    
+    def test_add_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [64]]  for i in format_list 
+        ]        
+        self.add_alpha_result(shape_format)
+        
+    def test_add_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [5, 256]]  for i in format_list
+        ]        
+        self.add_alpha_result(shape_format)
+    
+    def test_add_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float32, i, [5, 256]]  for i in format_list 
+        ]        
+        self.add_alpha_result(shape_format)
+        
+    def test_add_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [32, 3, 3]]  for i in format_list
+        ]        
+        self.add_alpha_result(shape_format)
+    
+    def test_add_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float32, i, [32, 3, 3]]  for i in format_list 
+        ]        
+        self.add_alpha_result(shape_format)
+        
+    def test_add_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [64, 112, 7, 7]]  for i in format_list
+        ]        
+        self.add_alpha_result(shape_format)
+    
+    def test_add_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float32, i, [64, 112, 7, 7]]  for i in format_list 
+        ]        
+        self.add_alpha_result(shape_format)
+
+    def test_add_mix_dtype(self, device):
+        cpu_input1, npu_input1 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100)
+        cpu_input2, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output = torch.add(cpu_input1, cpu_input2)
+        npu_output = torch.add(npu_input1, npu_input2)
+        npu_output = npu_output.to("cpu")
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestAdd, globals(), except_for="cpu")
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_addmm.py b/test/test_npu/test_network_ops/test_addmm.py
old mode 100644
new mode 100755
diff --git a/test/test_npu/test_network_ops/test_all.py b/test/test_npu/test_network_ops/test_all.py
old mode 100644
new mode 100755
index 12cf8284caf24bf24f2ae090be190166117c009b..71373287f9cc9ce231f14f5eabe9288f10288711
--- a/test/test_npu/test_network_ops/test_all.py
+++ b/test/test_npu/test_network_ops/test_all.py
@@ -1,88 +1,88 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestAll(TestCase):
-    def create_bool_tensor(self, shape, minValue, maxValue):
-        input1 = np.random.uniform(minValue, maxValue, shape)
-        input1 = input1 > 0.5
-        cpu_input = torch.from_numpy(input1)
-        npu_input = torch.from_numpy(input1).to("npu")
-        return cpu_input, npu_input
-
-    def cpu_op_exec(self, input):
-        output = input.all()
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input):
-        output = input.all()
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_all_shape_format(self, device):
-        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024], [2, 0, 2]]
-        for item in shape_list:
-            cpu_input, npu_input = self.create_bool_tensor(item, 0, 1)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(
-                cpu_output.astype(
-                    np.int32), npu_output.astype(
-                    np.int32))
-
-    def cpu_op_exec1(self, input, dim):
-        output = input.all(dim=dim)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec1(self, input, dim):
-        output = input.all(dim=dim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def npu_op_out_exec1(self, input, dim):
-        shape = list(input.shape)
-        output0 = torch.randn(shape) > 0
-        output1 = torch.randn(shape.pop()) > 0
-        output0 = output0.npu()
-        output1 = output1.npu()
-        torch.all(input, dim=dim, keepdim = False, out = output0)
-        torch.all(input, dim=dim, keepdim = False, out = output1)
-        output0 = output0.to("cpu").numpy()
-        output1 = output1.to("cpu").numpy()
-        return output0, output1
-
-    def test_alld_shape_format(self, device):
-        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
-        for item in shape_list:
-            cpu_input, npu_input = self.create_bool_tensor(item, 0, 1)
-            cpu_output = self.cpu_op_exec1(cpu_input, 0)
-            npu_output = self.npu_op_exec1(npu_input, 0)
-            npu_out0, npu_out1 = self.npu_op_out_exec1(npu_input, 0)
-            self.assertRtolEqual(cpu_output.astype(np.int32), npu_output.astype(np.int32))
-            self.assertRtolEqual(cpu_output.astype(np.int32), npu_out0.astype(np.int32))
-            self.assertRtolEqual(cpu_output.astype(np.int32), npu_out1.astype(np.int32))
-
-
-instantiate_device_type_tests(TestAll, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestAll(TestCase):
+    def create_bool_tensor(self, shape, minValue, maxValue):
+        input1 = np.random.uniform(minValue, maxValue, shape)
+        input1 = input1 > 0.5
+        cpu_input = torch.from_numpy(input1)
+        npu_input = torch.from_numpy(input1).to("npu")
+        return cpu_input, npu_input
+
+    def cpu_op_exec(self, input):
+        output = input.all()
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input):
+        output = input.all()
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_all_shape_format(self, device):
+        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024], [2, 0, 2]]
+        for item in shape_list:
+            cpu_input, npu_input = self.create_bool_tensor(item, 0, 1)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(
+                cpu_output.astype(
+                    np.int32), npu_output.astype(
+                    np.int32))
+
+    def cpu_op_exec1(self, input, dim):
+        output = input.all(dim=dim)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec1(self, input, dim):
+        output = input.all(dim=dim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def npu_op_out_exec1(self, input, dim):
+        shape = list(input.shape)
+        output0 = torch.randn(shape) > 0
+        output1 = torch.randn(shape.pop()) > 0
+        output0 = output0.npu()
+        output1 = output1.npu()
+        torch.all(input, dim=dim, keepdim = False, out = output0)
+        torch.all(input, dim=dim, keepdim = False, out = output1)
+        output0 = output0.to("cpu").numpy()
+        output1 = output1.to("cpu").numpy()
+        return output0, output1
+
+    def test_alld_shape_format(self, device):
+        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
+        for item in shape_list:
+            cpu_input, npu_input = self.create_bool_tensor(item, 0, 1)
+            cpu_output = self.cpu_op_exec1(cpu_input, 0)
+            npu_output = self.npu_op_exec1(npu_input, 0)
+            npu_out0, npu_out1 = self.npu_op_out_exec1(npu_input, 0)
+            self.assertRtolEqual(cpu_output.astype(np.int32), npu_output.astype(np.int32))
+            self.assertRtolEqual(cpu_output.astype(np.int32), npu_out0.astype(np.int32))
+            self.assertRtolEqual(cpu_output.astype(np.int32), npu_out1.astype(np.int32))
+
+
+instantiate_device_type_tests(TestAll, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_amp_non_finite_check_and_unscale_.py b/test/test_npu/test_network_ops/test_amp_non_finite_check_and_unscale_.py
index 7660cb245f36afec495ce9ce25dda06a886ab3ce..b34bbe53b65a71fb068129c0ad3941b901bf5968 100644
--- a/test/test_npu/test_network_ops/test_amp_non_finite_check_and_unscale_.py
+++ b/test/test_npu/test_network_ops/test_amp_non_finite_check_and_unscale_.py
@@ -1,80 +1,80 @@
-#  Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#  Licensed under the BSD 3-Clause License  (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#  https://opensource.org/licenses/BSD-3-Clause
-#
-#  Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class Test_AmpNonFiniteCheckAndUnscale_(TestCase):
-    def generate_data(self, min_d, max_d, shape, dtype, input3):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input1 = torch.from_numpy(input1)
-        input2 = np.array([0.0]).astype(dtype)
-        input2 = torch.from_numpy(input2)
-        input3 = np.array([input3]).astype(dtype)
-        input3 = torch.from_numpy(input3)
-        return input1, input2, input3
-
-    def cpu_op_exec(self, input1, input2, input3):
-        input1 = input1.numpy()
-        input2 = input2.numpy()
-        input3 = input3.numpy()
-        input1 = np.multiply(input1, input3)
-        return input1
-
-    def npu_op_exec(self, input1, input2, input3):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        input3 = input3.to("npu")
-        torch._amp_non_finite_check_and_unscale_(input1,input2,input3)
-        input1 = input1.to("cpu")
-        input1 = input1.numpy()
-        return input1
-
-    def test_AmpNonFiniteCheckAndUnscale_float32_case1(self, device):
-        input1, input2, input3 = self.generate_data(0, 100, (4, 3), np.float32, 1.5)
-        cpu_output = self.cpu_op_exec(input1, input2, input3)
-        npu_output = self.npu_op_exec(input1, input2, input3)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_AmpNonFiniteCheckAndUnscale_float32_case2(self, device):
-        input1, input2, input3 = self.generate_data(0, 100, (2, 5, 6), np.float32, 3.7)
-        cpu_output = self.cpu_op_exec(input1, input2, input3)
-        npu_output = self.npu_op_exec(input1, input2, input3)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_AmpNonFiniteCheckAndUnscale_float16_case1(self, device):
-        input1, input2, input3 = self.generate_data(0, 100, (5, 7), np.float16, 1.9)
-        input1 = input1.to(torch.float32)
-        input2 = input2.to(torch.float32)
-        input3 = input3.to(torch.float32)
-        cpu_output = self.cpu_op_exec(input1, input2, input3)
-        npu_output = self.npu_op_exec(input1, input2, input3)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_AmpNonFiniteCheckAndUnscale_float16_case2(self, device):
-        input1, input2, input3 = self.generate_data(0, 100, (2, 8, 1), np.float16, 3.2)
-        input1 = input1.to(torch.float32)
-        input2 = input2.to(torch.float32)
-        input3 = input3.to(torch.float32)
-        cpu_output = self.cpu_op_exec(input1, input2, input3)
-        npu_output = self.npu_op_exec(input1, input2, input3)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(Test_AmpNonFiniteCheckAndUnscale_, globals(), except_for='cpu')
-if __name__ == '__main__':
+#  Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#  Licensed under the BSD 3-Clause License  (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#  https://opensource.org/licenses/BSD-3-Clause
+#
+#  Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class Test_AmpNonFiniteCheckAndUnscale_(TestCase):
+    def generate_data(self, min_d, max_d, shape, dtype, input3):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input1 = torch.from_numpy(input1)
+        input2 = np.array([0.0]).astype(dtype)
+        input2 = torch.from_numpy(input2)
+        input3 = np.array([input3]).astype(dtype)
+        input3 = torch.from_numpy(input3)
+        return input1, input2, input3
+
+    def cpu_op_exec(self, input1, input2, input3):
+        input1 = input1.numpy()
+        input2 = input2.numpy()
+        input3 = input3.numpy()
+        input1 = np.multiply(input1, input3)
+        return input1
+
+    def npu_op_exec(self, input1, input2, input3):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input3 = input3.to("npu")
+        torch._amp_non_finite_check_and_unscale_(input1,input2,input3)
+        input1 = input1.to("cpu")
+        input1 = input1.numpy()
+        return input1
+
+    def test_AmpNonFiniteCheckAndUnscale_float32_case1(self, device):
+        input1, input2, input3 = self.generate_data(0, 100, (4, 3), np.float32, 1.5)
+        cpu_output = self.cpu_op_exec(input1, input2, input3)
+        npu_output = self.npu_op_exec(input1, input2, input3)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_AmpNonFiniteCheckAndUnscale_float32_case2(self, device):
+        input1, input2, input3 = self.generate_data(0, 100, (2, 5, 6), np.float32, 3.7)
+        cpu_output = self.cpu_op_exec(input1, input2, input3)
+        npu_output = self.npu_op_exec(input1, input2, input3)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_AmpNonFiniteCheckAndUnscale_float16_case1(self, device):
+        input1, input2, input3 = self.generate_data(0, 100, (5, 7), np.float16, 1.9)
+        input1 = input1.to(torch.float32)
+        input2 = input2.to(torch.float32)
+        input3 = input3.to(torch.float32)
+        cpu_output = self.cpu_op_exec(input1, input2, input3)
+        npu_output = self.npu_op_exec(input1, input2, input3)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_AmpNonFiniteCheckAndUnscale_float16_case2(self, device):
+        input1, input2, input3 = self.generate_data(0, 100, (2, 8, 1), np.float16, 3.2)
+        input1 = input1.to(torch.float32)
+        input2 = input2.to(torch.float32)
+        input3 = input3.to(torch.float32)
+        cpu_output = self.cpu_op_exec(input1, input2, input3)
+        npu_output = self.npu_op_exec(input1, input2, input3)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(Test_AmpNonFiniteCheckAndUnscale_, globals(), except_for='cpu')
+if __name__ == '__main__':
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_anchor_response_flags.py b/test/test_npu/test_network_ops/test_anchor_response_flags.py
index 06c2e33a105939c9d851c03a43bc32049dd2f763..ea0bc819e8af685a2583287195a3aada2a1c94fa 100644
--- a/test/test_npu/test_network_ops/test_anchor_response_flags.py
+++ b/test/test_npu/test_network_ops/test_anchor_response_flags.py
@@ -1,60 +1,60 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestAnchorResponseFlags(TestCase):
-    def cpu_op_exec(self, gt_bboxes, featmap_size, strides, num_base_anchors):
-        feat_h, feat_w = featmap_size
-        gt_bboxes_cx = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5)
-        gt_bboxes_cy = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5)
-        gt_bboxes_grid_x = torch.floor(gt_bboxes_cx / strides[0]).int()
-        gt_bboxes_grid_y = torch.floor(gt_bboxes_cy / strides[1]).int()
-        gt_bboxes_grid_idx = gt_bboxes_grid_y * feat_w + gt_bboxes_grid_x
-        responsible_grid = torch.zeros(feat_h * feat_w, dtype=torch.uint8)
-        gt_bboxes_grid_idx = gt_bboxes_grid_idx.long()
-        responsible_grid[gt_bboxes_grid_idx] = 1
-        responsible_grid = responsible_grid[:, None].expand(
-            responsible_grid.size(0), num_base_anchors).contiguous().view(-1)
-        return responsible_grid.numpy()
-
-    def npu_op_exec(self, input_npu, featmap_size, strides, num_base_anchors):
-        out = torch.npu_anchor_response_flags(input_npu, featmap_size, strides, num_base_anchors)
-        out = out.to("cpu")
-        return out.detach().numpy()
-        
-
-    def test_anchor_response_flags(self, device):
-        shape_format = [
-            [[np.float32, -1, [100, 4]], [60, 60], [2, 2], 9],
-            [[np.float16, -1, [200, 4]], [10, 10], [32, 32], 3],
-            [[np.float16, -1, [500, 4]], [32, 32], [16, 16], 5]
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input, *item[1:])
-            npu_output = self.npu_op_exec(npu_input, *item[1:])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestAnchorResponseFlags, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestAnchorResponseFlags(TestCase):
+    def cpu_op_exec(self, gt_bboxes, featmap_size, strides, num_base_anchors):
+        feat_h, feat_w = featmap_size
+        gt_bboxes_cx = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5)
+        gt_bboxes_cy = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5)
+        gt_bboxes_grid_x = torch.floor(gt_bboxes_cx / strides[0]).int()
+        gt_bboxes_grid_y = torch.floor(gt_bboxes_cy / strides[1]).int()
+        gt_bboxes_grid_idx = gt_bboxes_grid_y * feat_w + gt_bboxes_grid_x
+        responsible_grid = torch.zeros(feat_h * feat_w, dtype=torch.uint8)
+        gt_bboxes_grid_idx = gt_bboxes_grid_idx.long()
+        responsible_grid[gt_bboxes_grid_idx] = 1
+        responsible_grid = responsible_grid[:, None].expand(
+            responsible_grid.size(0), num_base_anchors).contiguous().view(-1)
+        return responsible_grid.numpy()
+
+    def npu_op_exec(self, input_npu, featmap_size, strides, num_base_anchors):
+        out = torch.npu_anchor_response_flags(input_npu, featmap_size, strides, num_base_anchors)
+        out = out.to("cpu")
+        return out.detach().numpy()
+        
+
+    def test_anchor_response_flags(self, device):
+        shape_format = [
+            [[np.float32, -1, [100, 4]], [60, 60], [2, 2], 9],
+            [[np.float16, -1, [200, 4]], [10, 10], [32, 32], 3],
+            [[np.float16, -1, [500, 4]], [32, 32], [16, 16], 5]
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input, *item[1:])
+            npu_output = self.npu_op_exec(npu_input, *item[1:])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestAnchorResponseFlags, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_any.py b/test/test_npu/test_network_ops/test_any.py
old mode 100644
new mode 100755
index ded18ccb5cfb579b74e17bf14560732f12a0b649..2bde503e0641f48fc1221c8a179359dd14ebd061
--- a/test/test_npu/test_network_ops/test_any.py
+++ b/test/test_npu/test_network_ops/test_any.py
@@ -1,94 +1,94 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestAny(TestCase):
-    def create_bool_tensor(self, shape, minValue, maxValue):
-        input1 = np.random.uniform(minValue, maxValue, shape)
-        cpu_input = torch.from_numpy(input1) > 0.5
-        npu_input = (torch.from_numpy(input1) > 0.5).to("npu")
-        return cpu_input, npu_input
-
-    def cpu_op_exec(self, input):
-        output = input.any()
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input):
-        output = input.any()
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_any_shape_format(self, device):
-        shape_list = [[],
-                      [1024], 
-                      [32, 1024], 
-                      [32, 8, 1024], 
-                      [128, 32, 8, 1024]]
-
-        for item in shape_list:
-            cpu_input, npu_input = self.create_bool_tensor(item, 0, 1)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(
-                cpu_output.astype(np.int32), 
-                npu_output.astype(np.int32))
-
-    def cpu_op_exec1(self, input, dim, keepdim):
-        output = input.any(dim=dim, keepdim=keepdim)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec1(self, input, dim, keepdim):
-        output = input.any(dim=dim, keepdim=keepdim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def npu_op_out_exec1(self, input, dim, keepdim):
-        shape = list(input.shape)
-        output0 = torch.randn(shape)>0
-        output1 = torch.randn(shape.pop())>0
-        output0 = output0.npu()
-        output1 = output1.npu()
-        torch.any(input, dim=dim, keepdim=keepdim, out=output0)
-        torch.any(input, dim=dim, keepdim=keepdim, out=output1)
-        output0 = output0.to("cpu").numpy()
-        output1 = output1.to("cpu").numpy()
-        return output0, output1
-
-    def test_anyd_shape_format(self, device):
-        shape_list = [[ [1024],             0, False],
-                      [ [32, 1024],         1, False],
-                      [ [32, 8, 1024],      2, True ],
-                      [ [128, 32, 8, 1024], 3, True ]]
-
-        for item in shape_list:
-            cpu_input, npu_input = self.create_bool_tensor(item[0], 0, 1)
-            cpu_output = self.cpu_op_exec1(cpu_input, item[1], item[2])
-            npu_output = self.npu_op_exec1(npu_input, item[1], item[2])
-            npu_out0, npu_out1 = self.npu_op_out_exec1(npu_input, item[1], item[2])
-            self.assertRtolEqual(cpu_output.astype(np.int32),npu_output.astype(np.int32))
-            self.assertRtolEqual(cpu_output.astype(np.int32),npu_out0.astype(np.int32))
-            self.assertRtolEqual(cpu_output.astype(np.int32),npu_out1.astype(np.int32))
-
-instantiate_device_type_tests(TestAny, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestAny(TestCase):
+    def create_bool_tensor(self, shape, minValue, maxValue):
+        input1 = np.random.uniform(minValue, maxValue, shape)
+        cpu_input = torch.from_numpy(input1) > 0.5
+        npu_input = (torch.from_numpy(input1) > 0.5).to("npu")
+        return cpu_input, npu_input
+
+    def cpu_op_exec(self, input):
+        output = input.any()
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input):
+        output = input.any()
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_any_shape_format(self, device):
+        shape_list = [[],
+                      [1024], 
+                      [32, 1024], 
+                      [32, 8, 1024], 
+                      [128, 32, 8, 1024]]
+
+        for item in shape_list:
+            cpu_input, npu_input = self.create_bool_tensor(item, 0, 1)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(
+                cpu_output.astype(np.int32), 
+                npu_output.astype(np.int32))
+
+    def cpu_op_exec1(self, input, dim, keepdim):
+        output = input.any(dim=dim, keepdim=keepdim)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec1(self, input, dim, keepdim):
+        output = input.any(dim=dim, keepdim=keepdim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def npu_op_out_exec1(self, input, dim, keepdim):
+        shape = list(input.shape)
+        output0 = torch.randn(shape)>0
+        output1 = torch.randn(shape.pop())>0
+        output0 = output0.npu()
+        output1 = output1.npu()
+        torch.any(input, dim=dim, keepdim=keepdim, out=output0)
+        torch.any(input, dim=dim, keepdim=keepdim, out=output1)
+        output0 = output0.to("cpu").numpy()
+        output1 = output1.to("cpu").numpy()
+        return output0, output1
+
+    def test_anyd_shape_format(self, device):
+        shape_list = [[ [1024],             0, False],
+                      [ [32, 1024],         1, False],
+                      [ [32, 8, 1024],      2, True ],
+                      [ [128, 32, 8, 1024], 3, True ]]
+
+        for item in shape_list:
+            cpu_input, npu_input = self.create_bool_tensor(item[0], 0, 1)
+            cpu_output = self.cpu_op_exec1(cpu_input, item[1], item[2])
+            npu_output = self.npu_op_exec1(npu_input, item[1], item[2])
+            npu_out0, npu_out1 = self.npu_op_out_exec1(npu_input, item[1], item[2])
+            self.assertRtolEqual(cpu_output.astype(np.int32),npu_output.astype(np.int32))
+            self.assertRtolEqual(cpu_output.astype(np.int32),npu_out0.astype(np.int32))
+            self.assertRtolEqual(cpu_output.astype(np.int32),npu_out1.astype(np.int32))
+
+instantiate_device_type_tests(TestAny, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_arange.py b/test/test_npu/test_network_ops/test_arange.py
old mode 100644
new mode 100755
diff --git a/test/test_npu/test_network_ops/test_argmax.py b/test/test_npu/test_network_ops/test_argmax.py
old mode 100644
new mode 100755
index 0b00aba0f60c1c00c6c084ffe327409680254398..70b08c7a0f95922ad3c720836400c83f135c6cb9
--- a/test/test_npu/test_network_ops/test_argmax.py
+++ b/test/test_npu/test_network_ops/test_argmax.py
@@ -1,101 +1,101 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestArgmax(TestCase):
-    def cpu_op_exec(self, input):
-        output = torch.argmax(input)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input):
-        output = torch.argmax(input)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_argmax_shape_format_fp16(self, device):
-        format_list = [0]
-        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -10, 10)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_argmax_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -10, 10)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def cpu_op_exec1(self, input, dim):
-        output = torch.argmax(input, dim)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec1(self, input, dim):
-        output = torch.argmax(input, dim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_argmaxd_shape_format_fp16(self, device):
-        format_list = [0]
-        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -10, 10)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec1(cpu_input, -1)
-            npu_output = self.npu_op_exec1(npu_input, -1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_argmaxd_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -10, 10)
-            cpu_output = self.cpu_op_exec1(cpu_input, -1)
-            npu_output = self.npu_op_exec1(npu_input, -1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestArgmax, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestArgmax(TestCase):
+    def cpu_op_exec(self, input):
+        output = torch.argmax(input)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input):
+        output = torch.argmax(input)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_argmax_shape_format_fp16(self, device):
+        format_list = [0]
+        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -10, 10)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_argmax_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -10, 10)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def cpu_op_exec1(self, input, dim):
+        output = torch.argmax(input, dim)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec1(self, input, dim):
+        output = torch.argmax(input, dim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_argmaxd_shape_format_fp16(self, device):
+        format_list = [0]
+        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -10, 10)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec1(cpu_input, -1)
+            npu_output = self.npu_op_exec1(npu_input, -1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_argmaxd_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -10, 10)
+            cpu_output = self.cpu_op_exec1(cpu_input, -1)
+            npu_output = self.npu_op_exec1(npu_input, -1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestArgmax, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_argsort.py b/test/test_npu/test_network_ops/test_argsort.py
index be90fd3c8e9d458e30257fb62e43853489e51091..73fc5ecc4e364ac24e88a1e6e77e6484feef9fb2 100644
--- a/test/test_npu/test_network_ops/test_argsort.py
+++ b/test/test_npu/test_network_ops/test_argsort.py
@@ -1,80 +1,80 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestArgSort(TestCase):
-    def cpu_op_exec(self, input1, dim, descending):
-        output = torch.argsort(input1, dim=dim, descending=descending)
-        return output.numpy()
-
-    def npu_op_exec(self, input1, dim, descending):
-        output = torch.argsort(input1, dim=dim, descending=descending)
-
-        return output.cpu().numpy()
-    
-    def cpu_default_op_exec(self, input1):
-        output = torch.argsort(input1)
-        return output.numpy()
-
-    def npu_default_op_exec(self, input1):
-        output = torch.argsort(input1)
-        return output.cpu().numpy()
-
-    def test_sort_shape_format_fp32(self, device):
-        shape_format = [
-                [[np.float32, 0, (8, 4, 3, 9)], 2, False],
-                [[np.float32, 0, (2, 3)]],
-                [[np.float32, 0, (1, 7)], 0, True],
-                [[np.float32, 0, (1, 5, 6)], 1, False],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            if len(item) > 1:
-                cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2])
-                npu_output = self.npu_op_exec(npu_input1, item[1], item[2])
-            else:
-                cpu_output = self.cpu_default_op_exec(cpu_input1)
-                npu_output = self.npu_default_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sort_shape_format_fp16(self, device):
-        shape_format = [
-                [[np.float16, 0, (8, 4, 3, 9)], 2, False],
-                [[np.float16, 0, (2, 3)]],
-                [[np.float16, 0, (1, 7)], 0, True],
-                [[np.float16, 0, (1, 5, 6)], 1, False],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            if len(item) > 1:
-                cpu_output = self.cpu_op_exec(cpu_input1.to(torch.float32), item[1], item[2])
-                npu_output = self.npu_op_exec(npu_input1, item[1], item[2])
-            else:
-                cpu_output = self.cpu_default_op_exec(cpu_input1.to(torch.float32))
-                npu_output = self.npu_default_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestArgSort, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestArgSort(TestCase):
+    def cpu_op_exec(self, input1, dim, descending):
+        output = torch.argsort(input1, dim=dim, descending=descending)
+        return output.numpy()
+
+    def npu_op_exec(self, input1, dim, descending):
+        output = torch.argsort(input1, dim=dim, descending=descending)
+
+        return output.cpu().numpy()
+    
+    def cpu_default_op_exec(self, input1):
+        output = torch.argsort(input1)
+        return output.numpy()
+
+    def npu_default_op_exec(self, input1):
+        output = torch.argsort(input1)
+        return output.cpu().numpy()
+
+    def test_sort_shape_format_fp32(self, device):
+        shape_format = [
+                [[np.float32, 0, (8, 4, 3, 9)], 2, False],
+                [[np.float32, 0, (2, 3)]],
+                [[np.float32, 0, (1, 7)], 0, True],
+                [[np.float32, 0, (1, 5, 6)], 1, False],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            if len(item) > 1:
+                cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2])
+                npu_output = self.npu_op_exec(npu_input1, item[1], item[2])
+            else:
+                cpu_output = self.cpu_default_op_exec(cpu_input1)
+                npu_output = self.npu_default_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sort_shape_format_fp16(self, device):
+        shape_format = [
+                [[np.float16, 0, (8, 4, 3, 9)], 2, False],
+                [[np.float16, 0, (2, 3)]],
+                [[np.float16, 0, (1, 7)], 0, True],
+                [[np.float16, 0, (1, 5, 6)], 1, False],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            if len(item) > 1:
+                cpu_output = self.cpu_op_exec(cpu_input1.to(torch.float32), item[1], item[2])
+                npu_output = self.npu_op_exec(npu_input1, item[1], item[2])
+            else:
+                cpu_output = self.cpu_default_op_exec(cpu_input1.to(torch.float32))
+                npu_output = self.npu_default_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestArgSort, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_as_strided.py b/test/test_npu/test_network_ops/test_as_strided.py
index f8da429613197b599b907640daaea11f4d474ecb..5ffeaace5115d0a606aa81a9f0ff9468b764902d 100644
--- a/test/test_npu/test_network_ops/test_as_strided.py
+++ b/test/test_npu/test_network_ops/test_as_strided.py
@@ -1,53 +1,53 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestAsStrided(TestCase):
-    def cpu_op_exec(self, input1, size, stride, storage_offset):
-        output = torch.as_strided(input1, size, stride, storage_offset)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self,input1, size, stride, storage_offset):
-        output = torch.as_strided(input1, size, stride, storage_offset)
-        output = output.cpu().numpy()
-        return output
-
-    def test_as_strided(self, device):
-        shape_format = [
-                [[np.float32, 0, [3, 3]], (2, 2), (1, 2), 0],
-                [[np.float16, 0, [13, 23]], (10, 15), (1, 2), 1],
-                [[np.int32, 0, [5, 5]], (3, 3), (1, 2), 1],
-        ]
-        
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2], item[3])
-            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestAsStrided, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestAsStrided(TestCase):
+    def cpu_op_exec(self, input1, size, stride, storage_offset):
+        output = torch.as_strided(input1, size, stride, storage_offset)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self,input1, size, stride, storage_offset):
+        output = torch.as_strided(input1, size, stride, storage_offset)
+        output = output.cpu().numpy()
+        return output
+
+    def test_as_strided(self, device):
+        shape_format = [
+                [[np.float32, 0, [3, 3]], (2, 2), (1, 2), 0],
+                [[np.float16, 0, [13, 23]], (10, 15), (1, 2), 1],
+                [[np.int32, 0, [5, 5]], (3, 3), (1, 2), 1],
+        ]
+        
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2], item[3])
+            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestAsStrided, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_atan.py b/test/test_npu/test_network_ops/test_atan.py
index 32292bea7bbb09df0d5a97d077451a5ef8a1a698..65451c008735c2c543086f4ed21459fb43114f32 100644
--- a/test/test_npu/test_network_ops/test_atan.py
+++ b/test/test_npu/test_network_ops/test_atan.py
@@ -1,49 +1,49 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
- 
-class TestAtan(TestCase):
-    def cpu_op_exec(self, input):
-        output = torch.atan(input) 
-        return output
- 
-    def npu_op_exec(self, input):
-        output = torch.atan(input) 
-        output = output.to("cpu") 
-        return output  
-        
-    def test_atan_shape_format(self, device):
-        shape_format = [
-                [[np.float32, 0, 1]],
-                [[np.float32, 0, (64, 10)]],
-                [[np.float32, 3, (256, 2048, 7, 7)]],
-                [[np.float32, 4, (32, 1, 3, 3)]],
-                [[np.float32, 29, (10, 128)]]
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], -1, 1)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestAtan, globals(), except_for="cpu") 
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+ 
+class TestAtan(TestCase):
+    def cpu_op_exec(self, input):
+        output = torch.atan(input) 
+        return output
+ 
+    def npu_op_exec(self, input):
+        output = torch.atan(input) 
+        output = output.to("cpu") 
+        return output  
+        
+    def test_atan_shape_format(self, device):
+        shape_format = [
+                [[np.float32, 0, 1]],
+                [[np.float32, 0, (64, 10)]],
+                [[np.float32, 3, (256, 2048, 7, 7)]],
+                [[np.float32, 4, (32, 1, 3, 3)]],
+                [[np.float32, 29, (10, 128)]]
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], -1, 1)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestAtan, globals(), except_for="cpu") 
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_avg_pool2d.py b/test/test_npu/test_network_ops/test_avg_pool2d.py
index 6042069f8de3840e11a894f355da879a2319098a..ff218cd5dc7d0ba00e81081f76031195da8df5ca 100644
--- a/test/test_npu/test_network_ops/test_avg_pool2d.py
+++ b/test/test_npu/test_network_ops/test_avg_pool2d.py
@@ -1,63 +1,63 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestAvgPool2d(TestCase):
-    def cpu_op_exec(self, input, ceil_mode):
-        m = nn.AvgPool2d(3, stride=(6, 5), padding=0, ceil_mode=ceil_mode)
-        output = m(input)
-        output = output.detach().numpy()
-        return output
-
-    def npu_op_exec(self, input, ceil_mode):
-        m = nn.AvgPool2d(3, stride=(6, 5), padding=0, ceil_mode=ceil_mode).npu()
-        output = m(input)
-        output = output.to("cpu")
-        output = output.detach().numpy()
-        return output
-
-    def test_avg_pool2d_backward_shape_format_fp16(self, device):
-        shape_format = [
-            [[np.float16, 0, (1, 3, 147, 147)], True],
-            [[np.float16, 0, (1, 3, 147, 147)], True]
-        ]
-        
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input.float(), item[1]).astype(np.float16)
-            npu_output = self.npu_op_exec(npu_input, item[1])
-            self.assertRtolEqual(cpu_output, npu_output, prec16=0.002)
-
-    def test_avg_pool2d_backward_shape_format_fp32(self, device):
-        shape_format = [
-            [[np.float32, 0, (1, 3, 147, 147)], True],
-            [[np.float32, 0, (1, 3, 147, 147)], True]
-        ]
-
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input, item[1])
-            npu_output = self.npu_op_exec(npu_input, item[1])
-            self.assertRtolEqual(cpu_output, npu_output, 0.0009)
-
-instantiate_device_type_tests(TestAvgPool2d, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestAvgPool2d(TestCase):
+    def cpu_op_exec(self, input, ceil_mode):
+        m = nn.AvgPool2d(3, stride=(6, 5), padding=0, ceil_mode=ceil_mode)
+        output = m(input)
+        output = output.detach().numpy()
+        return output
+
+    def npu_op_exec(self, input, ceil_mode):
+        m = nn.AvgPool2d(3, stride=(6, 5), padding=0, ceil_mode=ceil_mode).npu()
+        output = m(input)
+        output = output.to("cpu")
+        output = output.detach().numpy()
+        return output
+
+    def test_avg_pool2d_backward_shape_format_fp16(self, device):
+        shape_format = [
+            [[np.float16, 0, (1, 3, 147, 147)], True],
+            [[np.float16, 0, (1, 3, 147, 147)], True]
+        ]
+        
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input.float(), item[1]).astype(np.float16)
+            npu_output = self.npu_op_exec(npu_input, item[1])
+            self.assertRtolEqual(cpu_output, npu_output, prec16=0.002)
+
+    def test_avg_pool2d_backward_shape_format_fp32(self, device):
+        shape_format = [
+            [[np.float32, 0, (1, 3, 147, 147)], True],
+            [[np.float32, 0, (1, 3, 147, 147)], True]
+        ]
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, item[1])
+            npu_output = self.npu_op_exec(npu_input, item[1])
+            self.assertRtolEqual(cpu_output, npu_output, 0.0009)
+
+instantiate_device_type_tests(TestAvgPool2d, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_avg_pool2d_backward.py b/test/test_npu/test_network_ops/test_avg_pool2d_backward.py
old mode 100644
new mode 100755
diff --git a/test/test_npu/test_network_ops/test_avg_pool3d.py b/test/test_npu/test_network_ops/test_avg_pool3d.py
index cd33a713467660c06b8ad36e458f038296be54b9..fba81c961671794b9ee63c8851f57e1c29a7ac7c 100644
--- a/test/test_npu/test_network_ops/test_avg_pool3d.py
+++ b/test/test_npu/test_network_ops/test_avg_pool3d.py
@@ -1,76 +1,76 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION.
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestAvgPool3D(TestCase):
-
-    def cpu_op_exec(self, kernel_size, stride, input1):
-        m = torch.nn.AvgPool3d(kernel_size, stride)
-        output_data = m(input1)
-        return output_data
-
-    def cpu_op_exec_fp16(self, kernel_size, stride, input1):
-        m = torch.nn.AvgPool3d(kernel_size, stride)
-        output_data = m(input1.float())
-        return output_data.half()
-
-    def npu_op_exec(self, kernel_size, stride, input1):
-        m = torch.nn.AvgPool3d(kernel_size, stride).npu()
-        output_data = m(input1)
-        return output_data
-
-    def test_avg_pool_3d_fp32(self, device):
-        # shape_format:[[dtype, (input_shape)], kernel_size, stride]
-        shape_format = [
-                        [[np.float32, -1, (20, 16, 50, 44, 31)], (3, 2, 2), (2, 1, 2)],
-                        [[np.float32, -1, (2, 1, 4, 4, 4)], 3, 2],
-                        [[np.float32, -1, (2, 1, 4, 4, 4)], 2, 2],
-                        [[np.float32, -1, (2, 4 , 4, 4)], 2, 2]
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor( item[0], 1, 100 )
-            npu_output = self.npu_op_exec(item[1], item[2], npu_input1)
-            cpu_output = self.cpu_op_exec(item[1], item[2], cpu_input1)
-            self.assertRtolEqual(cpu_output, npu_output.cpu(), 1.e-3)
-
-    def test_avg_pool_3d_fp16(self, device):
-        # shape_format:[[dtype, (input_shape)], kernel_size, stride]
-        shape_format = [
-                        [[np.float16, -1, (20, 16, 50, 44, 31)], (3, 2, 2), (2, 1, 2)],
-                        [[np.float16, -1, (2, 1, 4, 4, 4)], 3, 2],
-                        [[np.float16, -1, (2, 1, 4, 4, 4)], 2, 2],
-                        [[np.float16, -1, (2, 4 , 4, 4)], 2, 2]
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor( item[0], 1, 100 )
-            npu_output = self.npu_op_exec(item[1], item[2], npu_input1)
-            cpu_output = self.cpu_op_exec_fp16(item[1], item[2], cpu_input1)
-            self.assertRtolEqual(cpu_output, npu_output.cpu())
-
-instantiate_device_type_tests(TestAvgPool3D, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
-
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestAvgPool3D(TestCase):
+
+    def cpu_op_exec(self, kernel_size, stride, input1):
+        m = torch.nn.AvgPool3d(kernel_size, stride)
+        output_data = m(input1)
+        return output_data
+
+    def cpu_op_exec_fp16(self, kernel_size, stride, input1):
+        m = torch.nn.AvgPool3d(kernel_size, stride)
+        output_data = m(input1.float())
+        return output_data.half()
+
+    def npu_op_exec(self, kernel_size, stride, input1):
+        m = torch.nn.AvgPool3d(kernel_size, stride).npu()
+        output_data = m(input1)
+        return output_data
+
+    def test_avg_pool_3d_fp32(self, device):
+        # shape_format:[[dtype, (input_shape)], kernel_size, stride]
+        shape_format = [
+                        [[np.float32, -1, (20, 16, 50, 44, 31)], (3, 2, 2), (2, 1, 2)],
+                        [[np.float32, -1, (2, 1, 4, 4, 4)], 3, 2],
+                        [[np.float32, -1, (2, 1, 4, 4, 4)], 2, 2],
+                        [[np.float32, -1, (2, 4 , 4, 4)], 2, 2]
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor( item[0], 1, 100 )
+            npu_output = self.npu_op_exec(item[1], item[2], npu_input1)
+            cpu_output = self.cpu_op_exec(item[1], item[2], cpu_input1)
+            self.assertRtolEqual(cpu_output, npu_output.cpu(), 1.e-3)
+
+    def test_avg_pool_3d_fp16(self, device):
+        # shape_format:[[dtype, (input_shape)], kernel_size, stride]
+        shape_format = [
+                        [[np.float16, -1, (20, 16, 50, 44, 31)], (3, 2, 2), (2, 1, 2)],
+                        [[np.float16, -1, (2, 1, 4, 4, 4)], 3, 2],
+                        [[np.float16, -1, (2, 1, 4, 4, 4)], 2, 2],
+                        [[np.float16, -1, (2, 4 , 4, 4)], 2, 2]
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor( item[0], 1, 100 )
+            npu_output = self.npu_op_exec(item[1], item[2], npu_input1)
+            cpu_output = self.cpu_op_exec_fp16(item[1], item[2], cpu_input1)
+            self.assertRtolEqual(cpu_output, npu_output.cpu())
+
+instantiate_device_type_tests(TestAvgPool3D, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_npu/test_network_ops/test_batchnorm_backward_eval.py b/test/test_npu/test_network_ops/test_batchnorm_backward_eval.py
index 69b82acc880b9e2368567438f6e05df5bab3b217..81120f2ef927c0ffa28337bcb038d5ec035e653f 100644
--- a/test/test_npu/test_network_ops/test_batchnorm_backward_eval.py
+++ b/test/test_npu/test_network_ops/test_batchnorm_backward_eval.py
@@ -1,83 +1,83 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-import torch.nn as nn
-
-class Model(nn.Module):
-    def __init__(self, in_channels):
-        super(Model, self).__init__()
-        self.op1 = nn.Conv2d(in_channels, in_channels, 1)
-        self.op2 = nn.BatchNorm2d(in_channels)
-        self.op2.running_mean = torch.tensor([i/1000 for i in range(in_channels)])
-        self.op2.running_var = torch.tensor([i/1000 for i in range(in_channels)])
-        self.op3 = nn.Conv2d(in_channels, in_channels, 1)
-
-    def forward(self, x):
-        self.op2.eval()
-        x = self.op1(x)
-        x = self.op2(x)
-        x = self.op3(x)
-        return x
-
-class TestBn2dEval(TestCase):
-    def test_batchnorm_backward_eval(self, device):
-        model = Model(in_channels=256)
-        cpu_tensor = torch.randn(32,256,14,14)
-        npu_tensor = cpu_tensor.npu()
-        cpu_tensor.requires_grad = True
-        npu_tensor.requires_grad = True
-
-        for i in range(1):
-            out = model(cpu_tensor)
-            loss = out.sum()
-            loss.backward()
-            cpuout = out
-            cpu_grad_list = []
-            for name, module in model.named_parameters():
-                cpu_grad_list.append(module.grad)
-                module.grad = None
-
-            model = model.npu()
-            out = model(npu_tensor)
-            loss = out.sum()
-            loss.backward()
-            npuout = out
-            npu_grad_list = []
-            for name, module in model.named_parameters():
-                npu_grad_list.append(module.grad.cpu())
-
-            #print(cpu_tensor.grad, npu_tensor.grad)
-            cpu_grad = cpu_tensor.grad
-            npu_grad = npu_tensor.grad            
-            # TODO(ascend): Insufficient precision
-            #精度未满足 self.assertRtolEqual(cpu_grad.numpy(), npu_grad.cpu().numpy())
-            self.assertRtolEqual(cpu_grad.numpy(), npu_grad.cpu().numpy(), 0.01)
-
-            for cpu_grad, npu_grad in zip(cpu_grad_list, npu_grad_list):
-                #print(cpu_grad, npu_grad)
-                # TODO(ascend): Insufficient precision
-                #精度未满足 self.assertRtolEqual(cpu_grad.numpy(), npu_grad.numpy())
-                self.assertRtolEqual(cpu_grad.numpy(), npu_grad.numpy(), 0.1)
-
-instantiate_device_type_tests(TestBn2dEval, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+import torch.nn as nn
+
+class Model(nn.Module):
+    def __init__(self, in_channels):
+        super(Model, self).__init__()
+        self.op1 = nn.Conv2d(in_channels, in_channels, 1)
+        self.op2 = nn.BatchNorm2d(in_channels)
+        self.op2.running_mean = torch.tensor([i/1000 for i in range(in_channels)])
+        self.op2.running_var = torch.tensor([i/1000 for i in range(in_channels)])
+        self.op3 = nn.Conv2d(in_channels, in_channels, 1)
+
+    def forward(self, x):
+        self.op2.eval()
+        x = self.op1(x)
+        x = self.op2(x)
+        x = self.op3(x)
+        return x
+
+class TestBn2dEval(TestCase):
+    def test_batchnorm_backward_eval(self, device):
+        model = Model(in_channels=256)
+        cpu_tensor = torch.randn(32,256,14,14)
+        npu_tensor = cpu_tensor.npu()
+        cpu_tensor.requires_grad = True
+        npu_tensor.requires_grad = True
+
+        for i in range(1):
+            out = model(cpu_tensor)
+            loss = out.sum()
+            loss.backward()
+            cpuout = out
+            cpu_grad_list = []
+            for name, module in model.named_parameters():
+                cpu_grad_list.append(module.grad)
+                module.grad = None
+
+            model = model.npu()
+            out = model(npu_tensor)
+            loss = out.sum()
+            loss.backward()
+            npuout = out
+            npu_grad_list = []
+            for name, module in model.named_parameters():
+                npu_grad_list.append(module.grad.cpu())
+
+            #print(cpu_tensor.grad, npu_tensor.grad)
+            cpu_grad = cpu_tensor.grad
+            npu_grad = npu_tensor.grad            
+            # TODO(ascend): Insufficient precision
+            #精度未满足 self.assertRtolEqual(cpu_grad.numpy(), npu_grad.cpu().numpy())
+            self.assertRtolEqual(cpu_grad.numpy(), npu_grad.cpu().numpy(), 0.01)
+
+            for cpu_grad, npu_grad in zip(cpu_grad_list, npu_grad_list):
+                #print(cpu_grad, npu_grad)
+                # TODO(ascend): Insufficient precision
+                #精度未满足 self.assertRtolEqual(cpu_grad.numpy(), npu_grad.numpy())
+                self.assertRtolEqual(cpu_grad.numpy(), npu_grad.numpy(), 0.1)
+
+instantiate_device_type_tests(TestBn2dEval, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_bernoulli.py b/test/test_npu/test_network_ops/test_bernoulli.py
index b161b2696a93e299d41cb792038db6e3a2326961..05c70b03ba3a331369320d57b9aba6839610334c 100644
--- a/test/test_npu/test_network_ops/test_bernoulli.py
+++ b/test/test_npu/test_network_ops/test_bernoulli.py
@@ -1,116 +1,116 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestBernoulli(TestCase):
-    def cpu_op_exec(self, input):
-        output = torch.bernoulli(input)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input):
-        output = torch.bernoulli(input)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_tensor_exec(self, input, p):
-        output = input.bernoulli_(p)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_tensor_exec(self, input, p):
-        output = input.bernoulli_(p)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_float_exec(self, input):
-        output = input.bernoulli_(0.5)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_float_exec(self, input):
-        output = input.bernoulli_(0.5)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_bernoulli_float32(self, device):
-        format_list = [0, 3]
-        shape_list = [(2, 3, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 1)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            print(cpu_output, npu_output)
-            #self.assertEqual(cpu_output, npu_output)
-            #生成随机值，无法对比cpu值
-
-    def test_bernoulli_float16(self, device):
-        format_list = [0, 3]
-        shape_list = [(2, 3, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 1)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            cpu_output = cpu_output.astype(np.float16)
-            print(cpu_output, npu_output)
-            #self.assertEqual(cpu_output, npu_output)
-
-    def test_bernoulli_tensor_p(self, device):
-        format_list = [0, 3]
-        shape_list = [(2, 3, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 1)
-            cpu_input_p, npu_input_p = create_common_tensor(item, 0, 1)
-            cpu_output = self.cpu_op_inplace_tensor_exec(cpu_input, cpu_input_p)
-            npu_output = self.npu_op_inplace_tensor_exec(npu_input, npu_input_p)
-            print(cpu_output, npu_output)
-            #self.assertEqual(cpu_output, npu_output)
-
-    def test_bernoulli_float_p(self, device):
-        format_list = [0, 3]
-        shape_list = [(2, 3, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 1)
-            cpu_output = self.cpu_op_inplace_float_exec(cpu_input)
-            npu_output = self.npu_op_inplace_float_exec(npu_input)
-            print(cpu_output, npu_output)
-            #self.assertEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestBernoulli, globals(), except_for="cpu")
-if __name__ == '__main__':
-    run_tests()
-
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestBernoulli(TestCase):
+    def cpu_op_exec(self, input):
+        output = torch.bernoulli(input)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input):
+        output = torch.bernoulli(input)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_tensor_exec(self, input, p):
+        output = input.bernoulli_(p)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_tensor_exec(self, input, p):
+        output = input.bernoulli_(p)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_float_exec(self, input):
+        output = input.bernoulli_(0.5)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_float_exec(self, input):
+        output = input.bernoulli_(0.5)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_bernoulli_float32(self, device):
+        format_list = [0, 3]
+        shape_list = [(2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 1)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            print(cpu_output, npu_output)
+            #self.assertEqual(cpu_output, npu_output)
+            #生成随机值，无法对比cpu值
+
+    def test_bernoulli_float16(self, device):
+        format_list = [0, 3]
+        shape_list = [(2, 3, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 1)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            cpu_output = cpu_output.astype(np.float16)
+            print(cpu_output, npu_output)
+            #self.assertEqual(cpu_output, npu_output)
+
+    def test_bernoulli_tensor_p(self, device):
+        format_list = [0, 3]
+        shape_list = [(2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 1)
+            cpu_input_p, npu_input_p = create_common_tensor(item, 0, 1)
+            cpu_output = self.cpu_op_inplace_tensor_exec(cpu_input, cpu_input_p)
+            npu_output = self.npu_op_inplace_tensor_exec(npu_input, npu_input_p)
+            print(cpu_output, npu_output)
+            #self.assertEqual(cpu_output, npu_output)
+
+    def test_bernoulli_float_p(self, device):
+        format_list = [0, 3]
+        shape_list = [(2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 1)
+            cpu_output = self.cpu_op_inplace_float_exec(cpu_input)
+            npu_output = self.npu_op_inplace_float_exec(npu_input)
+            print(cpu_output, npu_output)
+            #self.assertEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestBernoulli, globals(), except_for="cpu")
+if __name__ == '__main__':
+    run_tests()
+
diff --git a/test/test_npu/test_bilinear.py b/test/test_npu/test_network_ops/test_bilinear.py
similarity index 74%
rename from test/test_npu/test_bilinear.py
rename to test/test_npu/test_network_ops/test_bilinear.py
index dbb919e5a7466f0848326adbb34125a04fc0b34e..4bfdb837f9827f178042539e0e161167a0a81859 100644
--- a/test/test_npu/test_bilinear.py
+++ b/test/test_npu/test_network_ops/test_bilinear.py
@@ -33,7 +33,7 @@ class test_bilinear(TestCase):
         outputs = outputs.cpu().detach().numpy()
         return outputs
 
-    def test_add_common_shape_format1(self, device):
+    def test_bilinear_common_shape_format1(self, device):
         shape_format = [  
                   [[np.float32, -1, (10,30)], [np.float32, -1, (10, 40)], [np.float32, -1, (5, 30, 40)],
                     [np.float32, -1, (5,)]],
@@ -43,12 +43,12 @@ class test_bilinear(TestCase):
                   [[np.float32, -1, (10, 30, 40, 30)], [np.float32, -1, (10, 30, 40, 30)], 
                     [np.float32, -1, (30, 30, 30)],
                       [np.float32, -1, (30,)]],
-                  [[np.float32, -1, (100,3)], [np.float32, -1, (1000, 4)], [np.float32, -1, (5, 3, 4)],
+                  [[np.float32, -1, (100,3)], [np.float32, -1, (100, 4)], [np.float32, -1, (5, 3, 4)],
                     [np.float32, -1, (5,)]],
                   [[np.float16, -1, (2, 1, 1, 1)], [np.float16, -1, (2, 1, 1, 1)], [np.float16, -1, (5, 1, 1)],
                     [np.float16, -1, (5,)]],
                   [[np.float16, -1, (2, 50)], [np.float16, -1, (2, 50)], [np.float16, -1, (5, 50, 50)],
-                    [np.float16, -1, (2, 4)]],
+                    [np.float16, -1, (5)]],
                   [[np.float16, -1, (2, 3)], [np.float16, -1, (2, 4)], [np.float16, -1, (2, 3, 4)],],
                   [[np.float16, -1, (2, 3)], [np.float16, -1, (2, 4)], [np.float16, -1, (4, 3, 4)],
                   [np.float16, -1, (4,)]],
@@ -61,11 +61,19 @@ class test_bilinear(TestCase):
             if len(item)>3:
               cpu_input4, npu_input4 = create_common_tensor(item[3], 0, 1)
               bias = [cpu_input4, npu_input4]
-            cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0])
+            if cpu_input1.dtype == torch.float16:
+              if bias[0] != None:
+                cpu_outputs = self.cpu_op_exec(
+                  cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0].float()).astype(np.float16)
+              else:
+                cpu_outputs = self.cpu_op_exec(
+                  cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0]).astype(np.float16)
+            else:
+              cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0])
             npu_outputs = self.npu_op_exec(npu_input1, npu_input2, npu_input3, bias[1])
             self.assertRtolEqual(cpu_outputs, npu_outputs)
     
-    def test_add_common_shape_format2(self, device):
+    def test_bilinear_common_shape_format2(self, device):
         shape_format = [  
                   [[np.int32, -1, (10,30)], [np.int32, -1, (10, 40)], [np.int32, -1, (5, 30, 40)],
                     [np.int32, -1, (5,)]],
@@ -87,7 +95,7 @@ class test_bilinear(TestCase):
             npu_outputs = self.npu_op_exec(npu_input1, npu_input2, npu_input3, bias[1])
             self.assertRtolEqual(cpu_outputs, npu_outputs)
        
-    def test_add_common_shape_format3(self, device):
+    def test_bilinear_common_shape_format3(self, device):
         shape_format = [  
                 [[np.float32, 0, (10,30)], [np.float32, 0, (10, 40)], [np.float32, 0, (5, 30, 40)],
                   [np.float32, 0, (5,)]],
@@ -97,12 +105,12 @@ class test_bilinear(TestCase):
                 [[np.float32, 0, (10, 30, 40, 30)], [np.float32, 0, (10, 30, 40, 30)], 
                   [np.float32, 0, (30, 30, 30)],
                     [np.float32, 0, (30,)]],
-                [[np.float32, 0, (100,3)], [np.float32, 0, (1000, 4)], [np.float32, 0, (5, 3, 4)],
+                [[np.float32, 0, (100,3)], [np.float32, 0, (100, 4)], [np.float32, 0, (5, 3, 4)],
                   [np.float32, 0, (5,)]],
                 [[np.float16, 0, (2, 1, 1, 1)], [np.float16, 0, (2, 1, 1, 1)], [np.float16, 0, (5, 1, 1)],
                   [np.float16, 0, (5,)]],
                 [[np.float16, 0, (2, 50)], [np.float16, 0, (2, 50)], [np.float16, 0, (5, 50, 50)],
-                  [np.float16, 0, (2, 4)]],
+                  [np.float16, 0, (5)]],
                 [[np.float16, 0, (2, 3)], [np.float16, 0, (2, 4)], [np.float16, 0, (2, 3, 4)],],
                 [[np.float16, 0, (2, 3)], [np.float16, 0, (2, 4)], [np.float16, 0, (4, 3, 4)],
                 [np.float16, 0, (4,)]],
@@ -115,11 +123,19 @@ class test_bilinear(TestCase):
           if len(item)>3:
             cpu_input4, npu_input4 = create_common_tensor(item[3], 0, 1)
             bias = [cpu_input4, npu_input4]
-          cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0])
+          if cpu_input1.dtype == torch.float16:
+            if bias[0] != None:
+              cpu_outputs = self.cpu_op_exec(
+                cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0].float()).astype(np.float16)
+            else:
+              cpu_outputs = self.cpu_op_exec(
+                cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0]).astype(np.float16)
+          else:
+            cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0])
           npu_outputs = self.npu_op_exec(npu_input1, npu_input2, npu_input3, bias[1])
           self.assertRtolEqual(cpu_outputs, npu_outputs)
 
-    def test_add_common_shape_format4(self, device):
+    def test_bilinear_common_shape_format4(self, device):
         shape_format = [  
                 [[np.float32, 3, (10,30)], [np.float32, 3, (10, 40)], [np.float32, 3, (5, 30, 40)],
                   [np.float32, 3, (5,)]],
@@ -129,15 +145,15 @@ class test_bilinear(TestCase):
                 [[np.float32, 3, (10, 30, 40, 30)], [np.float32, 3, (10, 30, 40, 30)], 
                   [np.float32, 3, (30, 30, 30)],
                     [np.float32, 3, (30,)]],
-                [[np.float32, 29, (100,3)], [np.float32, 29, (1000, 4)], [np.float32, 29, (5, 3, 4)],
-                  [np.float32, 29, (5,)]],
-                [[np.float16, 29, (2, 1, 1, 1)], [np.float16, 29, (2, 1, 1, 1)], [np.float16, 29, (5, 1, 1)],
-                  [np.float16, 29, (5,)]],
-                [[np.float16, 29, (2, 50)], [np.float16, 29, (2, 50)], [np.float16, 29, (5, 50, 50)],
-                  [np.float16, 29, (2, 4)]],
-                [[np.float16, 29, (2, 3)], [np.float16, 29, (2, 4)], [np.float16, 29, (2, 3, 4)],],
-                [[np.float16, 29, (2, 3)], [np.float16, 29, (2, 4)], [np.float16, 29, (4, 3, 4)],
-                [np.float16, 29, (4,)]],
+                [[np.float32, 2, (100,3)], [np.float32, 2, (100, 4)], [np.float32, 2, (5, 3, 4)],
+                  [np.float32, 2, (5,)]],
+                [[np.float16, 2, (2, 1, 1, 1)], [np.float16, 2, (2, 1, 1, 1)], [np.float16, 2, (5, 1, 1)],
+                  [np.float16, 2, (5,)]],
+                [[np.float16, 2, (2, 50)], [np.float16, 2, (2, 50)], [np.float16, 2, (5, 50, 50)],
+                  [np.float16, 2, (5)]],
+                [[np.float16, 2, (2, 3)], [np.float16, 2, (2, 4)], [np.float16, 2, (2, 3, 4)],],
+                [[np.float16, 2, (2, 3)], [np.float16, 2, (2, 4)], [np.float16, 2, (4, 3, 4)],
+                [np.float16, 2, (4,)]],
               ]
         for item in shape_format:
           bias = [None, None]
@@ -147,11 +163,18 @@ class test_bilinear(TestCase):
           if len(item)>3:
             cpu_input4, npu_input4 = create_common_tensor(item[3], 0, 1)
             bias = [cpu_input4, npu_input4]
-          cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0])
+          if cpu_input1.dtype == torch.float16:
+            if bias[0] != None:
+              cpu_outputs = self.cpu_op_exec(
+                cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0].float()).astype(np.float16)
+            else:
+              cpu_outputs = self.cpu_op_exec(
+                cpu_input1.float(), cpu_input2.float(), cpu_input3.float(), bias[0]).astype(np.float16)
+          else:
+            cpu_outputs = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3, bias[0])
           npu_outputs = self.npu_op_exec(npu_input1, npu_input2, npu_input3, bias[1])
           self.assertRtolEqual(cpu_outputs, npu_outputs)
           
 instantiate_device_type_tests(test_bilinear, globals(), except_for='cpu')
 if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
     run_tests()
diff --git a/test/test_npu/test_network_ops/test_binary_cross_entropy_with_logits_backward.py b/test/test_npu/test_network_ops/test_binary_cross_entropy_with_logits_backward.py
old mode 100644
new mode 100755
diff --git a/test/test_npu/test_network_ops/test_bitwise_not.py b/test/test_npu/test_network_ops/test_bitwise_not.py
index a10801063b1b47b54932dac4080ed43f9c79a6a5..0f6baa7c47c16a8b1e44e75613a34df79d7223b1 100644
--- a/test/test_npu/test_network_ops/test_bitwise_not.py
+++ b/test/test_npu/test_network_ops/test_bitwise_not.py
@@ -1,104 +1,104 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class Test_Bitwise_Not(TestCase):
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def generate_bool_data(self, shape):
-        input1 = np.random.randint(0, 2, shape).astype(np.bool_)
-        npu_input1 = torch.from_numpy(input1)
-        return npu_input1
-
-    def cpu_op_exec(self, input1):
-        output = torch.bitwise_not(input1)
-        if output.dtype not in [torch.int32, torch.int8, torch.bool]: 
-            output = output.to(torch.int32)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        input1 = input1.to("npu")
-        output = torch.bitwise_not(input1)
-        output = output.to("cpu")
-        if output.dtype not in [torch.int32, torch.int8, torch.bool]: 
-            output = output.to(torch.int32)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        torch.bitwise_not(input1, out = input2)
-        output = input2.to("cpu")
-        if output.dtype not in [torch.int32, torch.int8, torch.bool]: 
-            output = output.to(torch.int32)
-        output = output.numpy()
-        return output
-
-    def test_bitwise_not_bool(self, device):
-        npu_input1 = self.generate_bool_data((2, 3))
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_not_int16(self, device):
-        npu_input1 = self.generate_data(0, 2342, (2, 3), np.int16)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_not_int32(self, device):
-        npu_input1 = self.generate_data(0, 34222, (2, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test_bitwise_not_int64(self, device):
-        npu_input1 = self.generate_data(0, 355553, (2, 3), np.int64)
-        cpu_output = self.cpu_op_exec(npu_input1)
-        npu_output = self.npu_op_exec(npu_input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_not_out(self, device):
-        shape_format = [
-            [[0, 2342, [2, 3], np.int16], [0, 2342, [10, 20], np.int16]],
-            [[0, 34222, [2, 3], np.int32], [0, 34222, [10, 20], np.int32]],
-            [[0, 355553, [2, 3], np.int64], [0, 355553, [1, 1], np.int64]],
-            ]
-        for item in shape_format:
-            npu_input1 = self.generate_data(item[0][0], item[0][1], item[0][2], item[0][3])
-            npu_input2 = self.generate_data(item[1][0], item[1][1], item[1][2], item[1][3])
-            cpu_output = self.cpu_op_exec(npu_input1)
-            npu_output1 = self.npu_op_exec_out(npu_input1, npu_input1)
-            npu_output2 = self.npu_op_exec_out(npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output, npu_output1)
-            self.assertRtolEqual(cpu_output, npu_output1)
-
-
-instantiate_device_type_tests(Test_Bitwise_Not, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class Test_Bitwise_Not(TestCase):
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def generate_bool_data(self, shape):
+        input1 = np.random.randint(0, 2, shape).astype(np.bool_)
+        npu_input1 = torch.from_numpy(input1)
+        return npu_input1
+
+    def cpu_op_exec(self, input1):
+        output = torch.bitwise_not(input1)
+        if output.dtype not in [torch.int32, torch.int8, torch.bool]: 
+            output = output.to(torch.int32)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        input1 = input1.to("npu")
+        output = torch.bitwise_not(input1)
+        output = output.to("cpu")
+        if output.dtype not in [torch.int32, torch.int8, torch.bool]: 
+            output = output.to(torch.int32)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        torch.bitwise_not(input1, out = input2)
+        output = input2.to("cpu")
+        if output.dtype not in [torch.int32, torch.int8, torch.bool]: 
+            output = output.to(torch.int32)
+        output = output.numpy()
+        return output
+
+    def test_bitwise_not_bool(self, device):
+        npu_input1 = self.generate_bool_data((2, 3))
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_not_int16(self, device):
+        npu_input1 = self.generate_data(0, 2342, (2, 3), np.int16)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_not_int32(self, device):
+        npu_input1 = self.generate_data(0, 34222, (2, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test_bitwise_not_int64(self, device):
+        npu_input1 = self.generate_data(0, 355553, (2, 3), np.int64)
+        cpu_output = self.cpu_op_exec(npu_input1)
+        npu_output = self.npu_op_exec(npu_input1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_not_out(self, device):
+        shape_format = [
+            [[0, 2342, [2, 3], np.int16], [0, 2342, [10, 20], np.int16]],
+            [[0, 34222, [2, 3], np.int32], [0, 34222, [10, 20], np.int32]],
+            [[0, 355553, [2, 3], np.int64], [0, 355553, [1, 1], np.int64]],
+            ]
+        for item in shape_format:
+            npu_input1 = self.generate_data(item[0][0], item[0][1], item[0][2], item[0][3])
+            npu_input2 = self.generate_data(item[1][0], item[1][1], item[1][2], item[1][3])
+            cpu_output = self.cpu_op_exec(npu_input1)
+            npu_output1 = self.npu_op_exec_out(npu_input1, npu_input1)
+            npu_output2 = self.npu_op_exec_out(npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output, npu_output1)
+            self.assertRtolEqual(cpu_output, npu_output1)
+
+
+instantiate_device_type_tests(Test_Bitwise_Not, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_bitwise_xor.py b/test/test_npu/test_network_ops/test_bitwise_xor.py
index a23603b7b843cb11a99579993e200710f7f70f18..6d90d7d44b6d7c0379b40d98b2f749f811947456 100644
--- a/test/test_npu/test_network_ops/test_bitwise_xor.py
+++ b/test/test_npu/test_network_ops/test_bitwise_xor.py
@@ -1,202 +1,202 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-import random
-
-class TestBitwiseXor(TestCase):
-    def generate_data(self, min, max, shape_x, shape_y, dtype):
-        input1 = np.random.randint(min, max, shape_x).astype(dtype)
-        input2 = np.random.randint(min, max, shape_y).astype(dtype)
-
-        #can't convert np.uint16 to pytoch tensor, so convert np.uint16 to np.int32 first
-        if input1.dtype == np.uint16:
-            input1 = input1.astype(np.int32)
-            input2 = input2.astype(np.int32)
-
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-        return npu_input1, npu_input2
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.bitwise_xor(input1, input2)
-        if output.dtype not in [torch.int32, torch.bool]:
-            output = output.to(torch.int32)
-        output = output.numpy()
-        return output
-
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.bitwise_xor(input1, input2)
-        output = output.to("cpu")
-        if output.dtype not in [torch.int32, torch.bool]:
-            output = output.to(torch.int32)
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar(self, input1, scalar): 
-        output = torch.bitwise_xor(input1, scalar) 
-        if output.dtype not in [torch.int32, torch.bool]:
-            output = output.to(torch.int32)
-        output = output.numpy() 
-        return output
-
-    def npu_op_exec_scalar(self, input1, input2):
-        input1 = input1.to("npu")
-        output = torch.bitwise_xor(input1, input2)
-        output = output.to("cpu")
-        if output.dtype not in [torch.int32, torch.bool]:
-            output = output.to(torch.int32)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar_out(self, input1, scalar, output): 
-        input1 = input1.to("npu") 
-        output = output.to("npu") 
-        output = torch.bitwise_xor(input1, scalar, out = output) 
-        output = output.to("cpu")
-        if output.dtype not in [torch.int32, torch.bool]:
-            output = output.to(torch.int32)
-        output = output.numpy() 
-        return output 
-
-    def npu_op_exec_out(self, input1, input2, input3): 
-        input1 = input1.to("npu") 
-        input2 = input2.to("npu") 
-        output = input3.to("npu") 
-        torch.bitwise_xor(input1, input2, out=output) 
-        output = output.to("cpu")
-        if output.dtype not in [torch.int32, torch.bool]:
-            output = output.to(torch.int32)
-        output = output.numpy() 
-        return output 
-
-    def bitwise_xor_tensor_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[1], 0, 100)
-            cpu_output_out = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_bitwise_xor_tensor_out(self, device):
-        shape_format = [
-            [[np.int16, 0, [128, 3, 224, 224]], [np.int16, 0, [3, 3, 3]]],
-            [[np.int16, 0, [128, 116, 14, 14]], [np.int16, 0, [128, 116, 14, 14]]],
-            [[np.int32, 0, [256, 128, 7, 7]],   [np.int32, 0, [128, 256, 3, 3]]],
-            [[np.int32, 0, [2, 3, 3, 3]],       [np.int32, 0, [3, 1, 3]]],
-            [[np.int32, 0, [128, 232, 7, 7]],   [np.int32, 0, [128, 232, 7, 7]]],
-        ]
-        self.bitwise_xor_tensor_out_result(shape_format)
-
-    def bitwise_xor_scalar_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            scalar = np.random.randint(1, 5)
-            cpu_output_out = self.cpu_op_exec_scalar(cpu_input1, scalar)
-            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_bitwise_xor_scalar_out(self, device):
-        shape_format = [
-            [[np.int16, 0, [16, 3, 1111, 1212]], [np.int16, 0, [3, 3, 3]]],
-            [[np.int16, 0, [128, 116, 14, 14]], [np.int16, 0, [128, 116, 14, 14]]],
-            [[np.int32, 0, [1313, 3, 3, 3]], [np.int32, 0, [3, 1, 3]]],
-            [[np.int32, 0, [128, 232, 7, 7]], [np.int32, 0, [128, 232, 7, 7]]],
-        ]
-        self.bitwise_xor_scalar_out_result(shape_format)
-
-    def test_bitwise_xor_int16_3d(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), (3, 3, 3), np.int16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        cpu_output = cpu_output.astype(np.float32)
-        npu_output = npu_output.astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_xor_int16_1_1(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), (1, 1), np.int16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        cpu_output = cpu_output.astype(np.float32)
-        npu_output = npu_output.astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_xor_int16_1(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), 1, np.int16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        cpu_output = cpu_output.astype(np.float32)
-        npu_output = npu_output.astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_xor_int16(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), (), np.int16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        cpu_output = cpu_output.astype(np.float32)
-        npu_output = npu_output.astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_xor_int32(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 2, (1, 3), (1, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, True)
-        npu_output = self.npu_op_exec_scalar(npu_input1, True)
-        cpu_output = cpu_output.astype(np.float32)
-        npu_output = npu_output.astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_xor_bool(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 2, (1, 3), (1, 3), np.bool)
-        cpu_output = self.cpu_op_exec(npu_input1, True)
-        npu_output = self.npu_op_exec_scalar(npu_input1, True)
-        cpu_output = cpu_output.astype(np.float32)
-        npu_output = npu_output.astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_xor_uint16(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), (3, 3, 3), np.uint16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        cpu_output = cpu_output.astype(np.float32)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        npu_output = npu_output.astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_bitwise_xor_mix_dtype(self, device):
-        npu_input1, npu_input3 = self.generate_data(0, 100, (3, 3, 3), (), np.uint16)
-        npu_input2, npu_input4 = self.generate_data(0, 100, (3, 3, 3), (), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2) 
-        npu_output = self.npu_op_exec(npu_input1, npu_input2) 
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-
-instantiate_device_type_tests(TestBitwiseXor, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
-
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+import random
+
+class TestBitwiseXor(TestCase):
+    def generate_data(self, min, max, shape_x, shape_y, dtype):
+        input1 = np.random.randint(min, max, shape_x).astype(dtype)
+        input2 = np.random.randint(min, max, shape_y).astype(dtype)
+
+        #can't convert np.uint16 to pytoch tensor, so convert np.uint16 to np.int32 first
+        if input1.dtype == np.uint16:
+            input1 = input1.astype(np.int32)
+            input2 = input2.astype(np.int32)
+
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+        return npu_input1, npu_input2
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.bitwise_xor(input1, input2)
+        if output.dtype not in [torch.int32, torch.bool]:
+            output = output.to(torch.int32)
+        output = output.numpy()
+        return output
+
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.bitwise_xor(input1, input2)
+        output = output.to("cpu")
+        if output.dtype not in [torch.int32, torch.bool]:
+            output = output.to(torch.int32)
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar(self, input1, scalar): 
+        output = torch.bitwise_xor(input1, scalar) 
+        if output.dtype not in [torch.int32, torch.bool]:
+            output = output.to(torch.int32)
+        output = output.numpy() 
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        input1 = input1.to("npu")
+        output = torch.bitwise_xor(input1, input2)
+        output = output.to("cpu")
+        if output.dtype not in [torch.int32, torch.bool]:
+            output = output.to(torch.int32)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar_out(self, input1, scalar, output): 
+        input1 = input1.to("npu") 
+        output = output.to("npu") 
+        output = torch.bitwise_xor(input1, scalar, out = output) 
+        output = output.to("cpu")
+        if output.dtype not in [torch.int32, torch.bool]:
+            output = output.to(torch.int32)
+        output = output.numpy() 
+        return output 
+
+    def npu_op_exec_out(self, input1, input2, input3): 
+        input1 = input1.to("npu") 
+        input2 = input2.to("npu") 
+        output = input3.to("npu") 
+        torch.bitwise_xor(input1, input2, out=output) 
+        output = output.to("cpu")
+        if output.dtype not in [torch.int32, torch.bool]:
+            output = output.to(torch.int32)
+        output = output.numpy() 
+        return output 
+
+    def bitwise_xor_tensor_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[1], 0, 100)
+            cpu_output_out = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_bitwise_xor_tensor_out(self, device):
+        shape_format = [
+            [[np.int16, 0, [128, 3, 224, 224]], [np.int16, 0, [3, 3, 3]]],
+            [[np.int16, 0, [128, 116, 14, 14]], [np.int16, 0, [128, 116, 14, 14]]],
+            [[np.int32, 0, [256, 128, 7, 7]],   [np.int32, 0, [128, 256, 3, 3]]],
+            [[np.int32, 0, [2, 3, 3, 3]],       [np.int32, 0, [3, 1, 3]]],
+            [[np.int32, 0, [128, 232, 7, 7]],   [np.int32, 0, [128, 232, 7, 7]]],
+        ]
+        self.bitwise_xor_tensor_out_result(shape_format)
+
+    def bitwise_xor_scalar_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            scalar = np.random.randint(1, 5)
+            cpu_output_out = self.cpu_op_exec_scalar(cpu_input1, scalar)
+            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_bitwise_xor_scalar_out(self, device):
+        shape_format = [
+            [[np.int16, 0, [16, 3, 1111, 1212]], [np.int16, 0, [3, 3, 3]]],
+            [[np.int16, 0, [128, 116, 14, 14]], [np.int16, 0, [128, 116, 14, 14]]],
+            [[np.int32, 0, [1313, 3, 3, 3]], [np.int32, 0, [3, 1, 3]]],
+            [[np.int32, 0, [128, 232, 7, 7]], [np.int32, 0, [128, 232, 7, 7]]],
+        ]
+        self.bitwise_xor_scalar_out_result(shape_format)
+
+    def test_bitwise_xor_int16_3d(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), (3, 3, 3), np.int16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        cpu_output = cpu_output.astype(np.float32)
+        npu_output = npu_output.astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_xor_int16_1_1(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), (1, 1), np.int16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        cpu_output = cpu_output.astype(np.float32)
+        npu_output = npu_output.astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_xor_int16_1(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), 1, np.int16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        cpu_output = cpu_output.astype(np.float32)
+        npu_output = npu_output.astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_xor_int16(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), (), np.int16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        cpu_output = cpu_output.astype(np.float32)
+        npu_output = npu_output.astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_xor_int32(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 2, (1, 3), (1, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, True)
+        npu_output = self.npu_op_exec_scalar(npu_input1, True)
+        cpu_output = cpu_output.astype(np.float32)
+        npu_output = npu_output.astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_xor_bool(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 2, (1, 3), (1, 3), np.bool)
+        cpu_output = self.cpu_op_exec(npu_input1, True)
+        npu_output = self.npu_op_exec_scalar(npu_input1, True)
+        cpu_output = cpu_output.astype(np.float32)
+        npu_output = npu_output.astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_xor_uint16(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3, 3), (3, 3, 3), np.uint16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        cpu_output = cpu_output.astype(np.float32)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        npu_output = npu_output.astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_bitwise_xor_mix_dtype(self, device):
+        npu_input1, npu_input3 = self.generate_data(0, 100, (3, 3, 3), (), np.uint16)
+        npu_input2, npu_input4 = self.generate_data(0, 100, (3, 3, 3), (), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2) 
+        npu_output = self.npu_op_exec(npu_input1, npu_input2) 
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+
+instantiate_device_type_tests(TestBitwiseXor, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_npu/test_network_ops/test_bmm.py b/test/test_npu/test_network_ops/test_bmm.py
old mode 100644
new mode 100755
index 68eccbf83b39104d43b4ded2535476b2d198fd24..62cd0be0ef9ff88fe987e7034a9f19924832649f
--- a/test/test_npu/test_network_ops/test_bmm.py
+++ b/test/test_npu/test_network_ops/test_bmm.py
@@ -1,72 +1,72 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestBatchMatMul(TestCase):
-  def cpu_op_exec(self, input1, input2):
-      output = torch.bmm(input1, input2)
-      output = output.numpy()
-      return output
-
-  def npu_op_exec(self, input1, input2):
-      output = torch.bmm(input1, input2)
-      output = output.to("cpu")
-      output = output.numpy()
-      return output
-
-  def bmm_auto_list_exec(self, shape):
-      for item in shape:
-          cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10)
-          cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 10)
-          if cpu_input1.dtype == torch.float16:
-              cpu_input1 = cpu_input1.to(torch.float32)
-          if cpu_input2.dtype == torch.float16:
-              cpu_input2 = cpu_input2.to(torch.float32)
-          cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-          npu_output = self.npu_op_exec(npu_input1, npu_input2)
-          cpu_output = cpu_output.astype(npu_output.dtype)
-          self.assertRtolEqual(cpu_output, npu_output)
-
-  def test_batchmatmul_shape_format_fp16_3d(self, device):
-      format_list = [0, 3, 29]
-      shape_list = [(1, 3, 2)]
-      shape_format1 = [[np.float16, i, j]
-                        for i in format_list for j in shape_list]
-      format_list = [0, 3, 29]
-      shape_list = [(1, 2, 3)]
-      shape_format2 = [[np.float16, i, j]
-                        for i in format_list for j in shape_list]
-      shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
-      self.bmm_auto_list_exec(shape_format)
-
-  def test_batchmatmul_shape_format_fp32_3d(self, device):
-      format_list = [0, 3, 29]
-      shape_list = [(1, 3, 2)]
-      shape_format1 = [[np.float32, i, j]
-                        for i in format_list for j in shape_list]
-      format_list = [0, 3, 29]
-      shape_list = [(1, 2, 3)]
-      shape_format2 = [[np.float32, i, j]
-                        for i in format_list for j in shape_list]
-      shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
-      self.bmm_auto_list_exec(shape_format)
-
-instantiate_device_type_tests(TestBatchMatMul, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestBatchMatMul(TestCase):
+  def cpu_op_exec(self, input1, input2):
+      output = torch.bmm(input1, input2)
+      output = output.numpy()
+      return output
+
+  def npu_op_exec(self, input1, input2):
+      output = torch.bmm(input1, input2)
+      output = output.to("cpu")
+      output = output.numpy()
+      return output
+
+  def bmm_auto_list_exec(self, shape):
+      for item in shape:
+          cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10)
+          cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 10)
+          if cpu_input1.dtype == torch.float16:
+              cpu_input1 = cpu_input1.to(torch.float32)
+          if cpu_input2.dtype == torch.float16:
+              cpu_input2 = cpu_input2.to(torch.float32)
+          cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+          npu_output = self.npu_op_exec(npu_input1, npu_input2)
+          cpu_output = cpu_output.astype(npu_output.dtype)
+          self.assertRtolEqual(cpu_output, npu_output)
+
+  def test_batchmatmul_shape_format_fp16_3d(self, device):
+      format_list = [0, 3, 29]
+      shape_list = [(1, 3, 2)]
+      shape_format1 = [[np.float16, i, j]
+                        for i in format_list for j in shape_list]
+      format_list = [0, 3, 29]
+      shape_list = [(1, 2, 3)]
+      shape_format2 = [[np.float16, i, j]
+                        for i in format_list for j in shape_list]
+      shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
+      self.bmm_auto_list_exec(shape_format)
+
+  def test_batchmatmul_shape_format_fp32_3d(self, device):
+      format_list = [0, 3, 29]
+      shape_list = [(1, 3, 2)]
+      shape_format1 = [[np.float32, i, j]
+                        for i in format_list for j in shape_list]
+      format_list = [0, 3, 29]
+      shape_list = [(1, 2, 3)]
+      shape_format2 = [[np.float32, i, j]
+                        for i in format_list for j in shape_list]
+      shape_format = [[i, j] for i in shape_format1 for j in shape_format2]
+      self.bmm_auto_list_exec(shape_format)
+
+instantiate_device_type_tests(TestBatchMatMul, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_broadcastToD.py b/test/test_npu/test_network_ops/test_broadcastToD.py
old mode 100644
new mode 100755
index 3a7ff68abd1c5954077359894ed5dc4e3820d1f1..7145d4e1a89a0eb0c67811e7a4f8d0028bab00f2
--- a/test/test_npu/test_network_ops/test_broadcastToD.py
+++ b/test/test_npu/test_network_ops/test_broadcastToD.py
@@ -1,40 +1,40 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-
-class TestBroadCastToD(TestCase):
-    @dtypes(torch.float, torch.float16, torch.int32, torch.int8, torch.uint8, torch.bool)
-    def test_broadcast(self, device, dtype):
-        shapes = [
-                    [[1], [5]],
-                    [[ 1, 2], [3, 2]],
-                    [[1, 2, 1], [1, 2, 3]],
-                ]
-        for item in shapes:
-            input1 = torch.randn(item[0]).to(dtype).npu()
-            output = input1.npu_broadcast(item[1])
-            size1 = np.array(output.size(), dtype=np.int32)
-            size2 = np.array(item[1], dtype=np.int32)
-            self.assertRtolEqual(size1, size2)
-
-
-instantiate_device_type_tests(TestBroadCastToD, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+
+class TestBroadCastToD(TestCase):
+    @dtypes(torch.float, torch.float16, torch.int32, torch.int8, torch.uint8, torch.bool)
+    def test_broadcast(self, device, dtype):
+        shapes = [
+                    [[1], [5]],
+                    [[ 1, 2], [3, 2]],
+                    [[1, 2, 1], [1, 2, 3]],
+                ]
+        for item in shapes:
+            input1 = torch.randn(item[0]).to(dtype).npu()
+            output = input1.npu_broadcast(item[1])
+            size1 = np.array(output.size(), dtype=np.int32)
+            size2 = np.array(item[1], dtype=np.int32)
+            self.assertRtolEqual(size1, size2)
+
+
+instantiate_device_type_tests(TestBroadCastToD, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_broadcast_tensors.py b/test/test_npu/test_network_ops/test_broadcast_tensors.py
index dd24481b3fa8e7175ab6dac1f8a17b1da25859d7..b83d1c5417ce81f8a37241d73771bb25ddd3f027 100644
--- a/test/test_npu/test_network_ops/test_broadcast_tensors.py
+++ b/test/test_npu/test_network_ops/test_broadcast_tensors.py
@@ -1,54 +1,54 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION.
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestBroadCastTensors(TestCase):
-
-    def cpu_op_exec(self, input1, input2):
-        output1, output2 = torch.broadcast_tensors(input1,input2);
-        return output1.numpy(), output2.numpy()
-
-    def npu_op_exec(self, input1, input2):
-        input1 =input1.npu()
-        input2 =input2.npu()
-        output1, output2 = torch.broadcast_tensors(input1,input2);
-        return output1.cpu().numpy(), output2.cpu().numpy()
-
-    def test_broadcast_tensors_common_shape_format(self, device):
-        shape_format = [
-            [[1, 3], (2, 1), torch.float32],
-            [[1, 9], (5, 1), torch.float32],
-            [[3, 1], (1, 3), torch.float32],
-        ]
-        for item in shape_format:
-            cpu_input1 =  torch.randn(item[0], dtype=item[2])
-            cpu_input2 =  torch.randn(item[1], dtype=item[2])
-            cpu_output1, cpu_output2 = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output1, npu_output2 = self.npu_op_exec(cpu_input1, cpu_input2)
-            self.assertRtolEqual(cpu_output1, npu_output1)
-            self.assertRtolEqual(cpu_output2, npu_output2)
-
-
-instantiate_device_type_tests(TestBroadCastTensors, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestBroadCastTensors(TestCase):
+
+    def cpu_op_exec(self, input1, input2):
+        output1, output2 = torch.broadcast_tensors(input1,input2);
+        return output1.numpy(), output2.numpy()
+
+    def npu_op_exec(self, input1, input2):
+        input1 =input1.npu()
+        input2 =input2.npu()
+        output1, output2 = torch.broadcast_tensors(input1,input2);
+        return output1.cpu().numpy(), output2.cpu().numpy()
+
+    def test_broadcast_tensors_common_shape_format(self, device):
+        shape_format = [
+            [[1, 3], (2, 1), torch.float32],
+            [[1, 9], (5, 1), torch.float32],
+            [[3, 1], (1, 3), torch.float32],
+        ]
+        for item in shape_format:
+            cpu_input1 =  torch.randn(item[0], dtype=item[2])
+            cpu_input2 =  torch.randn(item[1], dtype=item[2])
+            cpu_output1, cpu_output2 = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output1, npu_output2 = self.npu_op_exec(cpu_input1, cpu_input2)
+            self.assertRtolEqual(cpu_output1, npu_output1)
+            self.assertRtolEqual(cpu_output2, npu_output2)
+
+
+instantiate_device_type_tests(TestBroadCastTensors, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_cat.py b/test/test_npu/test_network_ops/test_cat.py
old mode 100644
new mode 100755
index 200ec9d8aa506ba2a5e96c8387c2e620b8b12c52..5e3acfbe8f9c44c3a33025b14f21ebcf144df660
--- a/test/test_npu/test_network_ops/test_cat.py
+++ b/test/test_npu/test_network_ops/test_cat.py
@@ -1,158 +1,158 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestCat(TestCase):
-    def cpu_op_exec(self, input1, input2, n):
-        output = torch.cat(input1 + input2, n)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2, n):
-        output = torch.cat(input1 + input2, n)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_cat_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [(256, 32, 56)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
-            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_cat_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [(256, 32, 56)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
-            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_cat_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [(256, 32, 56, 56)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
-            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_cat_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [(256, 32, 56, 56)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
-            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_cat_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [(56, 56)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
-            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_cat_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [(56, 56)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
-            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_cat_null_tensor(self, device):
-        x1 = torch.randn(15, 2, 1, 1)
-        x2 = torch.randn(0, 2, 1, 1)
-        x3 = torch.randn(0, 2, 3, 1)
-        y1_cpu = torch.cat([x1, x2], dim=0)
-        y2_cpu = torch.cat([x2, x3], dim=2)
-        y3_cpu = torch.cat([x2, x2, x2], dim=1)
-        x1 = x1.npu()
-        x2 = x2.npu()
-        x3 = x3.npu()
-        y1_npu = torch.cat([x1, x2], dim=0)
-        y2_npu = torch.cat([x2, x3], dim=2)
-        y3_npu = torch.cat([x2, x2, x2], dim=1)
-        self.assertRtolEqual(y1_cpu, y1_npu.cpu())
-        self.assertRtolEqual(y2_cpu, y2_npu.cpu())
-        self.assertRtolEqual(y3_cpu, y3_npu.cpu())
-
-
-instantiate_device_type_tests(TestCat, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestCat(TestCase):
+    def cpu_op_exec(self, input1, input2, n):
+        output = torch.cat(input1 + input2, n)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2, n):
+        output = torch.cat(input1 + input2, n)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_cat_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(256, 32, 56)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
+            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_cat_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(256, 32, 56)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
+            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_cat_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(256, 32, 56, 56)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
+            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_cat_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(256, 32, 56, 56)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
+            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_cat_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(56, 56)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
+            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_cat_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(56, 56)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec([cpu_input1], [cpu_input2], 1)
+            npu_output = self.npu_op_exec([npu_input1], [npu_input2], 1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_cat_null_tensor(self, device):
+        x1 = torch.randn(15, 2, 1, 1)
+        x2 = torch.randn(0, 2, 1, 1)
+        x3 = torch.randn(0, 2, 3, 1)
+        y1_cpu = torch.cat([x1, x2], dim=0)
+        y2_cpu = torch.cat([x2, x3], dim=2)
+        y3_cpu = torch.cat([x2, x2, x2], dim=1)
+        x1 = x1.npu()
+        x2 = x2.npu()
+        x3 = x3.npu()
+        y1_npu = torch.cat([x1, x2], dim=0)
+        y2_npu = torch.cat([x2, x3], dim=2)
+        y3_npu = torch.cat([x2, x2, x2], dim=1)
+        self.assertRtolEqual(y1_cpu, y1_npu.cpu())
+        self.assertRtolEqual(y2_cpu, y2_npu.cpu())
+        self.assertRtolEqual(y3_cpu, y3_npu.cpu())
+
+
+instantiate_device_type_tests(TestCat, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_clamp.py b/test/test_npu/test_network_ops/test_clamp.py
old mode 100644
new mode 100755
index 8de1bbc7d2754c7f4a965d9b2061a16b5f0c763e..3c61dbd4dffaccbf16f8e8392d9da138376dfedc
--- a/test/test_npu/test_network_ops/test_clamp.py
+++ b/test/test_npu/test_network_ops/test_clamp.py
@@ -1,199 +1,199 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestClamp(TestCase):
-    def generate_data(self, data):
-        input1 = np.random.uniform(data[0], data[1], data[2]).astype(data[3])
-
-        #modify from numpy.ndarray to torch.tensor
-        input1 = torch.from_numpy(input1)
-        
-        return input1
-
-    def npu_op_exec(self, input1, min_val, max_val):
-        input1 = input1.to("npu")
-        output = torch.clamp(input1, min_val, max_val)
-        output = output.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def cpu_op_exec(self, input1, min_val, max_val):
-        output = torch.clamp(input1, min_val,max_val)
-        output = output.numpy()
-
-        return output
-
-    def cpu_op_exec_float16(self, input1, min_val, max_val):
-        input1 = input1.to(torch.float32)
-        output = torch.clamp(input1, min_val, max_val).to(torch.float16)
-        output = output.numpy()
-
-        return output
-
-    def npu_inp_op_exec(self, input1, min_val, max_val):
-        input1 = input1.to("npu")
-        output = torch.clamp_(input1, min_val, max_val)
-        output = input1.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_op_exec(self, input1, min_val, max_val):
-        output = torch.clamp_(input1, min_val, max_val)
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_op_exec_float16(self, input1, min_val, max_val):
-        input1 = input1.to(torch.float32)
-        output = torch.clamp_(input1, min_val, max_val).to(torch.float16)
-        output = output.numpy()
-
-        return output
-
-    def npu_op_exec_out(self, input1, min_val, max_val, input2):
-        input1 = input1.to("npu")
-        output = input2.to("npu")
-        torch.clamp(input1, min_val, max_val, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def npu_inp_uncon_op_exec(self, input1, min_val, max_val):
-        input1 = input1.to("npu")
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.clamp_(input1, min_val, max_val)
-        output = input1.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_uncon_op_exec(self, input1, min_val, max_val):
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.clamp(input1, min_val, max_val)
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_uncon_op_exec_float16(self, input1, min_val, max_val):
-        input1 = input1.to(torch.float32).as_strided([2, 2], [1, 2], 2)
-        output = torch.clamp(input1, min_val, max_val).to(torch.float16)
-        output = output.numpy()
-
-        return output
-
-    def test_clamp_common(self, device):
-        shape_format = [
-                [1, 100, (4, 3), np.float32],
-                [1, 100, (4, 3), np.int32],
-        ]
-        for item in shape_format:
-            input1 = self.generate_data(item)
-
-            cpu_output = self.cpu_op_exec(input1, 40, 60)
-            npu_output = self.npu_op_exec(input1, 40, 60)
-
-            cpu_inp_output = self.cpu_inp_op_exec(input1, 40, 60)
-            npu_inp_output = self.npu_inp_op_exec(input1, 40, 60)
-
-            input2 = self.generate_data(item)
-            npu_out_output = self.npu_op_exec_out(input1, 40, 60, input2)
-
-            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec(input1, 40, 60)
-            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 40, 60)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
-
-    def test_clamp_common_out(self, device):
-        shape_format = [
-                [[1, 100, (4, 3), np.float32], [1, 100, (10, 10), np.float32]],
-                [[1, 100, (4, 3), np.float32], [1, 100, (1, 1), np.float32]],
-                [[1, 100, (4, 3), np.int32], [1, 100, (10, 10), np.int32]],
-                [[1, 100, (4, 3), np.int32], [1, 100, (1, 1), np.int32]]
-                ]
-        for item in shape_format:
-            print(item)
-            input1 = self.generate_data(item[0])
-            cpu_output = self.cpu_op_exec(input1, 40, 60)
-
-            input2 = self.generate_data(item[0])
-            npu_out_output = self.npu_op_exec_out(input1, 40, 60, input2)
-            input3 = self.generate_data(item[1])
-            npu_out_output1 = self.npu_op_exec_out(input1, 40, 60, input3)
-
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_output, npu_out_output1)
-
-    def test_clamp_float16(self, device):
-        shape_format = [
-                [1, 100, (4, 3), np.float16],
-        ]
-        for item in shape_format:
-            input1 = self.generate_data(item)
-
-            cpu_output = self.cpu_op_exec_float16(input1, 40, 60)
-            npu_output = self.npu_op_exec(input1, 40, 60)
-
-            cpu_inp_output = self.cpu_inp_op_exec_float16(input1, 40, 60)
-            npu_inp_output = self.npu_inp_op_exec(input1, 40, 60)
-
-            input2 = self.generate_data(item)
-            npu_out_output = self.npu_op_exec_out(input1, 40, 60, input2)
-
-            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec_float16(input1, 40, 60)
-            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 40, 60)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
-
-    def test_clamp_float16_out(self, device):
-        shape_format = [
-                [[1, 100, (4, 3), np.float16], [1, 100, (10, 10), np.float16]],
-                [[1, 100, (4, 3), np.float16], [1, 100, (1, 1), np.float16]],
-        ]
-        for item in shape_format:
-            print(item)
-            input1 = self.generate_data(item[0])
-            cpu_output = self.cpu_op_exec_float16(input1, 40, 60)
-
-            input2 = self.generate_data(item[0])
-            npu_out_output = self.npu_op_exec_out(input1, 40, 60, input2)
-            input3 = self.generate_data(item[1])
-            npu_out_output1 = self.npu_op_exec_out(input1, 40, 60, input3)
-
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_output, npu_out_output1)
-
-
-instantiate_device_type_tests(TestClamp, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestClamp(TestCase):
+    def generate_data(self, data):
+        input1 = np.random.uniform(data[0], data[1], data[2]).astype(data[3])
+
+        #modify from numpy.ndarray to torch.tensor
+        input1 = torch.from_numpy(input1)
+        
+        return input1
+
+    def npu_op_exec(self, input1, min_val, max_val):
+        input1 = input1.to("npu")
+        output = torch.clamp(input1, min_val, max_val)
+        output = output.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def cpu_op_exec(self, input1, min_val, max_val):
+        output = torch.clamp(input1, min_val,max_val)
+        output = output.numpy()
+
+        return output
+
+    def cpu_op_exec_float16(self, input1, min_val, max_val):
+        input1 = input1.to(torch.float32)
+        output = torch.clamp(input1, min_val, max_val).to(torch.float16)
+        output = output.numpy()
+
+        return output
+
+    def npu_inp_op_exec(self, input1, min_val, max_val):
+        input1 = input1.to("npu")
+        output = torch.clamp_(input1, min_val, max_val)
+        output = input1.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_op_exec(self, input1, min_val, max_val):
+        output = torch.clamp_(input1, min_val, max_val)
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_op_exec_float16(self, input1, min_val, max_val):
+        input1 = input1.to(torch.float32)
+        output = torch.clamp_(input1, min_val, max_val).to(torch.float16)
+        output = output.numpy()
+
+        return output
+
+    def npu_op_exec_out(self, input1, min_val, max_val, input2):
+        input1 = input1.to("npu")
+        output = input2.to("npu")
+        torch.clamp(input1, min_val, max_val, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def npu_inp_uncon_op_exec(self, input1, min_val, max_val):
+        input1 = input1.to("npu")
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.clamp_(input1, min_val, max_val)
+        output = input1.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_uncon_op_exec(self, input1, min_val, max_val):
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.clamp(input1, min_val, max_val)
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_uncon_op_exec_float16(self, input1, min_val, max_val):
+        input1 = input1.to(torch.float32).as_strided([2, 2], [1, 2], 2)
+        output = torch.clamp(input1, min_val, max_val).to(torch.float16)
+        output = output.numpy()
+
+        return output
+
+    def test_clamp_common(self, device):
+        shape_format = [
+                [1, 100, (4, 3), np.float32],
+                [1, 100, (4, 3), np.int32],
+        ]
+        for item in shape_format:
+            input1 = self.generate_data(item)
+
+            cpu_output = self.cpu_op_exec(input1, 40, 60)
+            npu_output = self.npu_op_exec(input1, 40, 60)
+
+            cpu_inp_output = self.cpu_inp_op_exec(input1, 40, 60)
+            npu_inp_output = self.npu_inp_op_exec(input1, 40, 60)
+
+            input2 = self.generate_data(item)
+            npu_out_output = self.npu_op_exec_out(input1, 40, 60, input2)
+
+            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec(input1, 40, 60)
+            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 40, 60)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
+
+    def test_clamp_common_out(self, device):
+        shape_format = [
+                [[1, 100, (4, 3), np.float32], [1, 100, (10, 10), np.float32]],
+                [[1, 100, (4, 3), np.float32], [1, 100, (1, 1), np.float32]],
+                [[1, 100, (4, 3), np.int32], [1, 100, (10, 10), np.int32]],
+                [[1, 100, (4, 3), np.int32], [1, 100, (1, 1), np.int32]]
+                ]
+        for item in shape_format:
+            print(item)
+            input1 = self.generate_data(item[0])
+            cpu_output = self.cpu_op_exec(input1, 40, 60)
+
+            input2 = self.generate_data(item[0])
+            npu_out_output = self.npu_op_exec_out(input1, 40, 60, input2)
+            input3 = self.generate_data(item[1])
+            npu_out_output1 = self.npu_op_exec_out(input1, 40, 60, input3)
+
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_output, npu_out_output1)
+
+    def test_clamp_float16(self, device):
+        shape_format = [
+                [1, 100, (4, 3), np.float16],
+        ]
+        for item in shape_format:
+            input1 = self.generate_data(item)
+
+            cpu_output = self.cpu_op_exec_float16(input1, 40, 60)
+            npu_output = self.npu_op_exec(input1, 40, 60)
+
+            cpu_inp_output = self.cpu_inp_op_exec_float16(input1, 40, 60)
+            npu_inp_output = self.npu_inp_op_exec(input1, 40, 60)
+
+            input2 = self.generate_data(item)
+            npu_out_output = self.npu_op_exec_out(input1, 40, 60, input2)
+
+            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec_float16(input1, 40, 60)
+            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 40, 60)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
+
+    def test_clamp_float16_out(self, device):
+        shape_format = [
+                [[1, 100, (4, 3), np.float16], [1, 100, (10, 10), np.float16]],
+                [[1, 100, (4, 3), np.float16], [1, 100, (1, 1), np.float16]],
+        ]
+        for item in shape_format:
+            print(item)
+            input1 = self.generate_data(item[0])
+            cpu_output = self.cpu_op_exec_float16(input1, 40, 60)
+
+            input2 = self.generate_data(item[0])
+            npu_out_output = self.npu_op_exec_out(input1, 40, 60, input2)
+            input3 = self.generate_data(item[1])
+            npu_out_output1 = self.npu_op_exec_out(input1, 40, 60, input3)
+
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_output, npu_out_output1)
+
+
+instantiate_device_type_tests(TestClamp, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_clamp_max.py b/test/test_npu/test_network_ops/test_clamp_max.py
index aeaf3cf9d73bb7ced33f92cf930962d8f2acaff4..66148ae023173fe1a566f0c614f12c15cac49d65 100644
--- a/test/test_npu/test_network_ops/test_clamp_max.py
+++ b/test/test_npu/test_network_ops/test_clamp_max.py
@@ -1,161 +1,161 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestClampMax(TestCase):
-    def generate_data(self, data):
-        input1 = np.random.uniform(data[0], data[1], data[2]).astype(data[3])
-
-        #modify from numpy.ndarray to torch.tensor
-        input1 = torch.from_numpy(input1)
-        
-        return input1
-
-    def npu_op_exec(self, input1, max_val):
-        input1 = input1.to("npu")
-        output = torch.clamp_max(input1, max_val)
-        output = output.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def cpu_op_exec(self, input1, max_val):
-        output = torch.clamp_max(input1, max_val)
-        output = output.numpy()
-
-        return output
-
-    def cpu_op_exec_float16(self, input1, max_val):
-        input1 = input1.to(torch.float32)
-        output = torch.clamp_max(input1, max_val).to(torch.float16)
-        output = output.numpy()
-
-        return output
-
-    def npu_inp_op_exec(self, input1, max_val):
-        input1 = input1.to("npu")
-        output = torch.clamp_max_(input1, max_val)
-        output = input1.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_op_exec(self, input1, max_val):
-        output = torch.clamp_max_(input1, max_val)
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_op_exec_float16(self, input1, max_val):
-        input1 = input1.to(torch.float32)
-        output = torch.clamp_max_(input1, max_val).to(torch.float16)
-        output = output.numpy()
-
-        return output
-
-    def npu_op_exec_out(self, input1, max_val, input2):
-        input1 = input1.to("npu")
-        output = input2.to("npu")
-        torch.clamp_max(input1, max_val, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def npu_inp_uncon_op_exec(self, input1, max_val):
-        input1 = input1.to("npu")
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.clamp_max_(input1, max_val)
-        output = input1.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_uncon_op_exec(self, input1, max_val):
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.clamp_max(input1, max_val)
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_uncon_op_exec_float16(self, input1, max_val):
-        input1 = input1.to(torch.float32).as_strided([2, 2], [1, 2], 2)
-        output = torch.clamp_max(input1, max_val).to(torch.float16)
-        output = output.numpy()
-
-        return output
-
-    def test_clamp_max_common(self, device):
-        shape_format = [
-                [1, 100, (4, 3), np.float32],
-                [1, 100, (4, 3), np.int32],
-        ]
-        for item in shape_format:
-            input1 = self.generate_data(item)
-
-            cpu_output = self.cpu_op_exec(input1, 50)
-            npu_output = self.npu_op_exec(input1, 50)
-
-            cpu_inp_output = self.cpu_inp_op_exec(input1, 50)
-            npu_inp_output = self.npu_inp_op_exec(input1, 50)
-
-            input2 = self.generate_data(item)
-            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
-
-            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec(input1, 50)
-            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 50)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
-
-    def test_clamp_max_float16(self, device):
-        shape_format = [
-                [1, 100, (4, 3), np.float16],
-        ]
-        for item in shape_format:
-            input1 = self.generate_data(item)
-
-            cpu_output = self.cpu_op_exec_float16(input1, 50)
-            npu_output = self.npu_op_exec(input1, 50)
-
-            cpu_inp_output = self.cpu_inp_op_exec_float16(input1, 50)
-            npu_inp_output = self.npu_inp_op_exec(input1, 50)
-
-            input2 = self.generate_data(item)
-            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
-
-            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec_float16(input1, 50)
-            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 50)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
-
-
-instantiate_device_type_tests(TestClampMax, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestClampMax(TestCase):
+    def generate_data(self, data):
+        input1 = np.random.uniform(data[0], data[1], data[2]).astype(data[3])
+
+        #modify from numpy.ndarray to torch.tensor
+        input1 = torch.from_numpy(input1)
+        
+        return input1
+
+    def npu_op_exec(self, input1, max_val):
+        input1 = input1.to("npu")
+        output = torch.clamp_max(input1, max_val)
+        output = output.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def cpu_op_exec(self, input1, max_val):
+        output = torch.clamp_max(input1, max_val)
+        output = output.numpy()
+
+        return output
+
+    def cpu_op_exec_float16(self, input1, max_val):
+        input1 = input1.to(torch.float32)
+        output = torch.clamp_max(input1, max_val).to(torch.float16)
+        output = output.numpy()
+
+        return output
+
+    def npu_inp_op_exec(self, input1, max_val):
+        input1 = input1.to("npu")
+        output = torch.clamp_max_(input1, max_val)
+        output = input1.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_op_exec(self, input1, max_val):
+        output = torch.clamp_max_(input1, max_val)
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_op_exec_float16(self, input1, max_val):
+        input1 = input1.to(torch.float32)
+        output = torch.clamp_max_(input1, max_val).to(torch.float16)
+        output = output.numpy()
+
+        return output
+
+    def npu_op_exec_out(self, input1, max_val, input2):
+        input1 = input1.to("npu")
+        output = input2.to("npu")
+        torch.clamp_max(input1, max_val, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def npu_inp_uncon_op_exec(self, input1, max_val):
+        input1 = input1.to("npu")
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.clamp_max_(input1, max_val)
+        output = input1.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_uncon_op_exec(self, input1, max_val):
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.clamp_max(input1, max_val)
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_uncon_op_exec_float16(self, input1, max_val):
+        input1 = input1.to(torch.float32).as_strided([2, 2], [1, 2], 2)
+        output = torch.clamp_max(input1, max_val).to(torch.float16)
+        output = output.numpy()
+
+        return output
+
+    def test_clamp_max_common(self, device):
+        shape_format = [
+                [1, 100, (4, 3), np.float32],
+                [1, 100, (4, 3), np.int32],
+        ]
+        for item in shape_format:
+            input1 = self.generate_data(item)
+
+            cpu_output = self.cpu_op_exec(input1, 50)
+            npu_output = self.npu_op_exec(input1, 50)
+
+            cpu_inp_output = self.cpu_inp_op_exec(input1, 50)
+            npu_inp_output = self.npu_inp_op_exec(input1, 50)
+
+            input2 = self.generate_data(item)
+            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
+
+            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec(input1, 50)
+            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 50)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
+
+    def test_clamp_max_float16(self, device):
+        shape_format = [
+                [1, 100, (4, 3), np.float16],
+        ]
+        for item in shape_format:
+            input1 = self.generate_data(item)
+
+            cpu_output = self.cpu_op_exec_float16(input1, 50)
+            npu_output = self.npu_op_exec(input1, 50)
+
+            cpu_inp_output = self.cpu_inp_op_exec_float16(input1, 50)
+            npu_inp_output = self.npu_inp_op_exec(input1, 50)
+
+            input2 = self.generate_data(item)
+            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
+
+            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec_float16(input1, 50)
+            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 50)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
+
+
+instantiate_device_type_tests(TestClampMax, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_clamp_min.py b/test/test_npu/test_network_ops/test_clamp_min.py
index 04f2370f0837f700f86611f2812a3895fc7bfaa0..d8f176897bfbd5c2f257a5d04fecd70d423a78c0 100644
--- a/test/test_npu/test_network_ops/test_clamp_min.py
+++ b/test/test_npu/test_network_ops/test_clamp_min.py
@@ -1,196 +1,196 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestClampMin(TestCase):
-    def generate_data(self, data):
-        input1 = np.random.uniform(data[0], data[1], data[2]).astype(data[3])
-
-        #modify from numpy.ndarray to torch.tensor
-        input1 = torch.from_numpy(input1)
-        
-        return input1
-
-    def npu_op_exec(self, input1, min_val):
-        input1 = input1.to("npu")
-        output = torch.clamp_min(input1, min_val)
-        output = output.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def cpu_op_exec(self, input1, min_val):
-        output = torch.clamp_min(input1, min_val)
-        output = output.numpy()
-
-        return output
-
-    def cpu_op_exec_float16(self, input1, min_val):
-        input1 = input1.to(torch.float32)
-        output = torch.clamp_min(input1, min_val).to(torch.float16)
-        output = output.numpy()
-
-        return output
-
-    def npu_inp_op_exec(self, input1, min_val):
-        input1 = input1.to("npu")
-        output = torch.clamp_min_(input1, min_val)
-        output = input1.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_op_exec(self, input1, min_val):
-        output = torch.clamp_min_(input1, min_val)
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_op_exec_float16(self, input1, min_val):
-        input1 = input1.to(torch.float32)
-        output = torch.clamp_min_(input1, min_val).to(torch.float16)
-        output = output.numpy()
-
-        return output
-
-    def npu_op_exec_out(self, input1, min_val, input2):
-        input1 = input1.to("npu")
-        output = input2.to("npu")
-        torch.clamp_min(input1, min_val, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def npu_inp_uncon_op_exec(self, input1, min_val):
-        input1 = input1.to("npu")
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.clamp_min_(input1, min_val)
-        output = input1.to("cpu")
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_uncon_op_exec(self, input1, min_val):
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.clamp_min(input1, min_val)
-        output = output.numpy()
-
-        return output
-
-    def cpu_inp_uncon_op_exec_float16(self, input1, min_val):
-        input1 = input1.to(torch.float32).as_strided([2, 2], [1, 2], 2)
-        output = torch.clamp_min(input1, min_val).to(torch.float16)
-        output = output.numpy()
-
-        return output
-
-    def test_clamp_min_common(self, device):
-        shape_format = [
-                [1, 100, (4, 3), np.float32],
-                [1, 100, (4, 3), np.int32],
-        ]
-        for item in shape_format:
-            input1 = self.generate_data(item)
-
-            cpu_output = self.cpu_op_exec(input1, 50)
-            npu_output = self.npu_op_exec(input1, 50)
-
-            cpu_inp_output = self.cpu_inp_op_exec(input1, 50)
-            npu_inp_output = self.npu_inp_op_exec(input1, 50)
-
-            input2 = self.generate_data(item)
-            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
-
-            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec(input1, 50)
-            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 50)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
-
-    def test_clamp_min_common_out(self, device):
-        shape_format = [
-                [[1, 100, (4, 3), np.float32], [1, 100, (10, 10), np.float32]],
-                [[1, 100, (4, 3), np.float32], [1, 100, (1, 1), np.float32]],
-                [[1, 100, (4, 3), np.int32], [1, 100, (10, 10), np.int32]],
-                [[1, 100, (4, 3), np.int32], [1, 100, (1, 1), np.int32]]
-        ]
-        for item in shape_format:
-            print(item)
-            input1 = self.generate_data(item[0])
-            cpu_output = self.cpu_op_exec(input1, 50)
-            input2 = self.generate_data(item[0])
-            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
-            input3 = self.generate_data(item[1])
-            npu_out_output1 = self.npu_op_exec_out(input1, 50, input3)
-
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_output, npu_out_output1)
-
-    def test_clamp_min_float16(self, device):
-        shape_format = [
-                [1, 100, (4, 3), np.float16],
-        ]
-        for item in shape_format:
-            input1 = self.generate_data(item)
-
-            cpu_output = self.cpu_op_exec_float16(input1, 50)
-            npu_output = self.npu_op_exec(input1, 50)
-
-            cpu_inp_output = self.cpu_inp_op_exec_float16(input1, 50)
-            npu_inp_output = self.npu_inp_op_exec(input1, 50)
-
-            input2 = self.generate_data(item)
-            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
-
-            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec_float16(input1, 50)
-            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 50)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
-
-    def test_clamp_min_float16_out(self, device):
-        shape_format = [
-                [[1, 100, (4, 3), np.float16], [1, 100, (10, 10), np.float16]],
-                [[1, 100, (4, 3), np.float16], [1, 100, (1, 1), np.float16]],
-        ]
-        for item in shape_format:
-            print(item)
-            input1 = self.generate_data(item[0])
-            cpu_output = self.cpu_op_exec_float16(input1, 50)
-            input2 = self.generate_data(item[0])
-            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
-            input3 = self.generate_data(item[1])
-            npu_out_output1 = self.npu_op_exec_out(input1, 50, input3)
-
-            self.assertRtolEqual(cpu_output, npu_out_output)
-            self.assertRtolEqual(cpu_output, npu_out_output1)
-
-instantiate_device_type_tests(TestClampMin, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestClampMin(TestCase):
+    def generate_data(self, data):
+        input1 = np.random.uniform(data[0], data[1], data[2]).astype(data[3])
+
+        #modify from numpy.ndarray to torch.tensor
+        input1 = torch.from_numpy(input1)
+        
+        return input1
+
+    def npu_op_exec(self, input1, min_val):
+        input1 = input1.to("npu")
+        output = torch.clamp_min(input1, min_val)
+        output = output.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def cpu_op_exec(self, input1, min_val):
+        output = torch.clamp_min(input1, min_val)
+        output = output.numpy()
+
+        return output
+
+    def cpu_op_exec_float16(self, input1, min_val):
+        input1 = input1.to(torch.float32)
+        output = torch.clamp_min(input1, min_val).to(torch.float16)
+        output = output.numpy()
+
+        return output
+
+    def npu_inp_op_exec(self, input1, min_val):
+        input1 = input1.to("npu")
+        output = torch.clamp_min_(input1, min_val)
+        output = input1.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_op_exec(self, input1, min_val):
+        output = torch.clamp_min_(input1, min_val)
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_op_exec_float16(self, input1, min_val):
+        input1 = input1.to(torch.float32)
+        output = torch.clamp_min_(input1, min_val).to(torch.float16)
+        output = output.numpy()
+
+        return output
+
+    def npu_op_exec_out(self, input1, min_val, input2):
+        input1 = input1.to("npu")
+        output = input2.to("npu")
+        torch.clamp_min(input1, min_val, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def npu_inp_uncon_op_exec(self, input1, min_val):
+        input1 = input1.to("npu")
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.clamp_min_(input1, min_val)
+        output = input1.to("cpu")
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_uncon_op_exec(self, input1, min_val):
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.clamp_min(input1, min_val)
+        output = output.numpy()
+
+        return output
+
+    def cpu_inp_uncon_op_exec_float16(self, input1, min_val):
+        input1 = input1.to(torch.float32).as_strided([2, 2], [1, 2], 2)
+        output = torch.clamp_min(input1, min_val).to(torch.float16)
+        output = output.numpy()
+
+        return output
+
+    def test_clamp_min_common(self, device):
+        shape_format = [
+                [1, 100, (4, 3), np.float32],
+                [1, 100, (4, 3), np.int32],
+        ]
+        for item in shape_format:
+            input1 = self.generate_data(item)
+
+            cpu_output = self.cpu_op_exec(input1, 50)
+            npu_output = self.npu_op_exec(input1, 50)
+
+            cpu_inp_output = self.cpu_inp_op_exec(input1, 50)
+            npu_inp_output = self.npu_inp_op_exec(input1, 50)
+
+            input2 = self.generate_data(item)
+            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
+
+            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec(input1, 50)
+            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 50)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
+
+    def test_clamp_min_common_out(self, device):
+        shape_format = [
+                [[1, 100, (4, 3), np.float32], [1, 100, (10, 10), np.float32]],
+                [[1, 100, (4, 3), np.float32], [1, 100, (1, 1), np.float32]],
+                [[1, 100, (4, 3), np.int32], [1, 100, (10, 10), np.int32]],
+                [[1, 100, (4, 3), np.int32], [1, 100, (1, 1), np.int32]]
+        ]
+        for item in shape_format:
+            print(item)
+            input1 = self.generate_data(item[0])
+            cpu_output = self.cpu_op_exec(input1, 50)
+            input2 = self.generate_data(item[0])
+            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
+            input3 = self.generate_data(item[1])
+            npu_out_output1 = self.npu_op_exec_out(input1, 50, input3)
+
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_output, npu_out_output1)
+
+    def test_clamp_min_float16(self, device):
+        shape_format = [
+                [1, 100, (4, 3), np.float16],
+        ]
+        for item in shape_format:
+            input1 = self.generate_data(item)
+
+            cpu_output = self.cpu_op_exec_float16(input1, 50)
+            npu_output = self.npu_op_exec(input1, 50)
+
+            cpu_inp_output = self.cpu_inp_op_exec_float16(input1, 50)
+            npu_inp_output = self.npu_inp_op_exec(input1, 50)
+
+            input2 = self.generate_data(item)
+            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
+
+            cpu_inp_uncon_output = self.cpu_inp_uncon_op_exec_float16(input1, 50)
+            npu_inp_uncon_output = self.npu_inp_uncon_op_exec(input1, 50)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_inp_uncon_output, npu_inp_uncon_output)
+
+    def test_clamp_min_float16_out(self, device):
+        shape_format = [
+                [[1, 100, (4, 3), np.float16], [1, 100, (10, 10), np.float16]],
+                [[1, 100, (4, 3), np.float16], [1, 100, (1, 1), np.float16]],
+        ]
+        for item in shape_format:
+            print(item)
+            input1 = self.generate_data(item[0])
+            cpu_output = self.cpu_op_exec_float16(input1, 50)
+            input2 = self.generate_data(item[0])
+            npu_out_output = self.npu_op_exec_out(input1, 50, input2)
+            input3 = self.generate_data(item[1])
+            npu_out_output1 = self.npu_op_exec_out(input1, 50, input3)
+
+            self.assertRtolEqual(cpu_output, npu_out_output)
+            self.assertRtolEqual(cpu_output, npu_out_output1)
+
+instantiate_device_type_tests(TestClampMin, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_confusion_transpose.py b/test/test_npu/test_network_ops/test_confusion_transpose.py
index 7702496c95fe5441001121c1cbe272cd0ecce53e..50f91e881ddf37b0ff7e835483508830eaf2d5af 100644
--- a/test/test_npu/test_network_ops/test_confusion_transpose.py
+++ b/test/test_npu/test_network_ops/test_confusion_transpose.py
@@ -1,56 +1,56 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestConfusionTransposeD(TestCase):
-    def npu_op_exec(self, input1, shape, perm, transpose_first):
-        output = torch.npu_confusion_transpose(input1, perm, shape, transpose_first)
-        output = output.cpu().numpy()
-        return output
-
-    def cpu_op_exec(self, input1, shape, perm, transpose_first):
-        if transpose_first:
-            output = input1.permute(*perm).contiguous().view(shape)
-        else:
-            output = input1.view(shape).permute(*perm)
-        output = output.numpy()
-        return output
-
-    def test_confusion_transpose(self, device):
-        shape_format = [
-            [[np.float32, 0, [1, 576, 2560]],[1, 576, 32, 80], (0, 2, 1, 3), False],
-            [[np.float32, 0, [1, 32, 576, 80]],[1, 576, 2560], (0, 2, 1, 3), True],
-            [[np.float16, 0, [1, 576, 2560]], [1, 576, 32, 80], (0, 2, 1, 3), False],
-            [[np.float16, 0, [1, 32, 576, 80]], [1, 576, 2560], (0, 2, 1, 3), True],
-            [[np.int, 0, [1, 576, 2560]], [1, 576, 32, 80], (0, 2, 1, 3), False],
-            [[np.int, 0, [1, 32, 576, 80]], [1, 576, 2560], (0, 2, 1, 3), True],
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input, item[1], item[2], item[3])
-            npu_output = self.npu_op_exec(npu_input, item[1], item[2], item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestConfusionTransposeD, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestConfusionTransposeD(TestCase):
+    def npu_op_exec(self, input1, shape, perm, transpose_first):
+        output = torch.npu_confusion_transpose(input1, perm, shape, transpose_first)
+        output = output.cpu().numpy()
+        return output
+
+    def cpu_op_exec(self, input1, shape, perm, transpose_first):
+        if transpose_first:
+            output = input1.permute(*perm).contiguous().view(shape)
+        else:
+            output = input1.view(shape).permute(*perm)
+        output = output.numpy()
+        return output
+
+    def test_confusion_transpose(self, device):
+        shape_format = [
+            [[np.float32, 0, [1, 576, 2560]],[1, 576, 32, 80], (0, 2, 1, 3), False],
+            [[np.float32, 0, [1, 32, 576, 80]],[1, 576, 2560], (0, 2, 1, 3), True],
+            [[np.float16, 0, [1, 576, 2560]], [1, 576, 32, 80], (0, 2, 1, 3), False],
+            [[np.float16, 0, [1, 32, 576, 80]], [1, 576, 2560], (0, 2, 1, 3), True],
+            [[np.int, 0, [1, 576, 2560]], [1, 576, 32, 80], (0, 2, 1, 3), False],
+            [[np.int, 0, [1, 32, 576, 80]], [1, 576, 2560], (0, 2, 1, 3), True],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, item[1], item[2], item[3])
+            npu_output = self.npu_op_exec(npu_input, item[1], item[2], item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestConfusionTransposeD, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_confusion_transpose_backward.py b/test/test_npu/test_network_ops/test_confusion_transpose_backward.py
index debfc54ae8cddc984ade037ca457a6cafe80a638..37c65cbe631ff7101d3a3818fb11daac07bd931b 100644
--- a/test/test_npu/test_network_ops/test_confusion_transpose_backward.py
+++ b/test/test_npu/test_network_ops/test_confusion_transpose_backward.py
@@ -1,59 +1,59 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION.
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestConfusionTransposeDBackward(TestCase):
-    def npu_op_exec(self, input1, shape, perm, transpose_first):
-        input1.requires_grad_()
-        output = torch.npu_confusion_transpose(input1, perm, shape, transpose_first)
-        output.backward(torch.ones_like(output))
-        output1 = output.detach().cpu().numpy()
-        output2 = input1.grad.cpu().numpy()
-        return output1, output2
-
-    def cpu_op_exec(self, input1, shape, perm, transpose_first):
-        input1.requires_grad_()
-        if transpose_first:
-            output = input1.permute(*perm).contiguous().view(shape)
-        else:
-            output = input1.view(shape).permute(*perm)
-        output.backward(torch.ones_like(output))
-        output1 = output.detach().numpy()
-        output2 = input1.grad.numpy()
-        return output1, output2
-
-    def test_confusion_transpose_backward(self, device):
-        shape_format = [
-            [[np.float32, 0, [1, 576, 2560]],[1, 576, 32, 80], (0, 2, 1, 3), False],
-            [[np.float32, 0, [1, 32, 576, 80]],[1, 576, 2560], (0, 2, 1, 3), True],
-            [[np.float16, 0, [1, 576, 2560]], [1, 576, 32, 80], (0, 2, 1, 3), False],
-            [[np.float16, 0, [1, 32, 576, 80]], [1, 576, 2560], (0, 2, 1, 3), True],
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output1, cpu_output2 = self.cpu_op_exec(cpu_input, item[1], item[2], item[3])
-            npu_output1, npu_output2 = self.npu_op_exec(npu_input, item[1], item[2], item[3])
-            self.assertRtolEqual(cpu_output1, npu_output1)
-            self.assertRtolEqual(cpu_output2, npu_output2)
-
-instantiate_device_type_tests(TestConfusionTransposeDBackward, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestConfusionTransposeDBackward(TestCase):
+    def npu_op_exec(self, input1, shape, perm, transpose_first):
+        input1.requires_grad_()
+        output = torch.npu_confusion_transpose(input1, perm, shape, transpose_first)
+        output.backward(torch.ones_like(output))
+        output1 = output.detach().cpu().numpy()
+        output2 = input1.grad.cpu().numpy()
+        return output1, output2
+
+    def cpu_op_exec(self, input1, shape, perm, transpose_first):
+        input1.requires_grad_()
+        if transpose_first:
+            output = input1.permute(*perm).contiguous().view(shape)
+        else:
+            output = input1.view(shape).permute(*perm)
+        output.backward(torch.ones_like(output))
+        output1 = output.detach().numpy()
+        output2 = input1.grad.numpy()
+        return output1, output2
+
+    def test_confusion_transpose_backward(self, device):
+        shape_format = [
+            [[np.float32, 0, [1, 576, 2560]],[1, 576, 32, 80], (0, 2, 1, 3), False],
+            [[np.float32, 0, [1, 32, 576, 80]],[1, 576, 2560], (0, 2, 1, 3), True],
+            [[np.float16, 0, [1, 576, 2560]], [1, 576, 32, 80], (0, 2, 1, 3), False],
+            [[np.float16, 0, [1, 32, 576, 80]], [1, 576, 2560], (0, 2, 1, 3), True],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output1, cpu_output2 = self.cpu_op_exec(cpu_input, item[1], item[2], item[3])
+            npu_output1, npu_output2 = self.npu_op_exec(npu_input, item[1], item[2], item[3])
+            self.assertRtolEqual(cpu_output1, npu_output1)
+            self.assertRtolEqual(cpu_output2, npu_output2)
+
+instantiate_device_type_tests(TestConfusionTransposeDBackward, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_conv1d.py b/test/test_npu/test_network_ops/test_conv1d.py
index 6b3fe0a04cd5e11337f4633832f009e27721581d..4d7d8d90797deacfa9c6754aa9af921e0952215f 100644
--- a/test/test_npu/test_network_ops/test_conv1d.py
+++ b/test/test_npu/test_network_ops/test_conv1d.py
@@ -1,86 +1,86 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-import torch.nn.functional as F
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestConv1d(TestCase):
-    def cpu_op_exec(self, input1, weight, stride, pad):
-        input1.requires_grad = True
-        weight.requires_grad = True
-        out = F.conv1d(input1, weight, stride=stride, padding=pad)
-        out.backward(torch.ones_like(out))
-        input_grad = input1.grad
-        weight_grad = weight.grad
-        out = out.detach()
-        return out, input_grad, weight_grad
-    
-    def npu_op_exec(self, input1, weight, stride, pad):
-        input1.requires_grad = True
-        weight.requires_grad = True
-        out = F.conv1d(input1, weight, stride=stride, padding=pad)
-        out.backward(torch.ones_like(out))
-        input_grad = input1.grad.cpu()
-        weight_grad = weight.grad.cpu()
-        out = out.cpu().detach()
-        return out, input_grad, weight_grad
-
-    def test_conv1d_shape_format_fp16(self, device):
-        shape_format = [
-            [[np.float16, 0, [4, 1, 166400]], [np.float16, 0, [514, 1, 400]], 400, 0]
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 1)
-            cpu_weight, npu_weight = create_common_tensor(item[1], 0, 1)
-            stride = item[2]
-            padding = item[3]
-            
-            cpu_output, cpu_input_grad, cpu_weight_grad = self.cpu_op_exec(cpu_input.float(), cpu_weight.float(), stride, padding)
-            cpu_output = cpu_output.half()
-            cpu_input_grad = cpu_input_grad.half()
-            cpu_weight_grad = cpu_weight_grad.half()
-            npu_output, npu_input_grad, npu_weight_grad = self.npu_op_exec(npu_input, npu_weight, stride, padding)
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_input_grad, npu_input_grad)
-            self.assertRtolEqual(cpu_weight_grad, npu_weight_grad)
-
-    def test_conv1d_shape_format_fp32(self, device):
-        shape_format = [
-            [[np.float32, 0, [4, 1, 166400]], [np.float32, 0, [514, 1, 400]], 400, 0]
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 1)
-            cpu_weight, npu_weight = create_common_tensor(item[1], 0, 1)
-            stride = item[2]
-            padding = item[3]
-            
-            cpu_output, cpu_input_grad, cpu_weight_grad = self.cpu_op_exec(cpu_input, cpu_weight, stride, padding)
-            npu_output, npu_input_grad, npu_weight_grad = self.npu_op_exec(npu_input, npu_weight, stride, padding)
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_input_grad, npu_input_grad)
-            self.assertRtolEqual(cpu_weight_grad, npu_weight_grad)
-
-
-instantiate_device_type_tests(TestConv1d, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestConv1d(TestCase):
+    def cpu_op_exec(self, input1, weight, stride, pad):
+        input1.requires_grad = True
+        weight.requires_grad = True
+        out = F.conv1d(input1, weight, stride=stride, padding=pad)
+        out.backward(torch.ones_like(out))
+        input_grad = input1.grad
+        weight_grad = weight.grad
+        out = out.detach()
+        return out, input_grad, weight_grad
+    
+    def npu_op_exec(self, input1, weight, stride, pad):
+        input1.requires_grad = True
+        weight.requires_grad = True
+        out = F.conv1d(input1, weight, stride=stride, padding=pad)
+        out.backward(torch.ones_like(out))
+        input_grad = input1.grad.cpu()
+        weight_grad = weight.grad.cpu()
+        out = out.cpu().detach()
+        return out, input_grad, weight_grad
+
+    def test_conv1d_shape_format_fp16(self, device):
+        shape_format = [
+            [[np.float16, 0, [4, 1, 166400]], [np.float16, 0, [514, 1, 400]], 400, 0]
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 1)
+            cpu_weight, npu_weight = create_common_tensor(item[1], 0, 1)
+            stride = item[2]
+            padding = item[3]
+            
+            cpu_output, cpu_input_grad, cpu_weight_grad = self.cpu_op_exec(cpu_input.float(), cpu_weight.float(), stride, padding)
+            cpu_output = cpu_output.half()
+            cpu_input_grad = cpu_input_grad.half()
+            cpu_weight_grad = cpu_weight_grad.half()
+            npu_output, npu_input_grad, npu_weight_grad = self.npu_op_exec(npu_input, npu_weight, stride, padding)
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_input_grad, npu_input_grad)
+            self.assertRtolEqual(cpu_weight_grad, npu_weight_grad)
+
+    def test_conv1d_shape_format_fp32(self, device):
+        shape_format = [
+            [[np.float32, 0, [4, 1, 166400]], [np.float32, 0, [514, 1, 400]], 400, 0]
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 1)
+            cpu_weight, npu_weight = create_common_tensor(item[1], 0, 1)
+            stride = item[2]
+            padding = item[3]
+            
+            cpu_output, cpu_input_grad, cpu_weight_grad = self.cpu_op_exec(cpu_input, cpu_weight, stride, padding)
+            npu_output, npu_input_grad, npu_weight_grad = self.npu_op_exec(npu_input, npu_weight, stride, padding)
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_input_grad, npu_input_grad)
+            self.assertRtolEqual(cpu_weight_grad, npu_weight_grad)
+
+
+instantiate_device_type_tests(TestConv1d, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_conv2d.py b/test/test_npu/test_network_ops/test_conv2d.py
old mode 100644
new mode 100755
index 916e197f5320e310845bcd550c9055a3c3df63df..5a5f88f14d849ee787c2560c8ba413462cd6058d
--- a/test/test_npu/test_network_ops/test_conv2d.py
+++ b/test/test_npu/test_network_ops/test_conv2d.py
@@ -1,210 +1,210 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import numpy as np
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestConv2d(TestCase):
-    weight_grad = []
-    input_grad = []
-
-    def getWeightGrad(self, grad):
-        self.weight_grad.append(grad.to("cpu"))
-
-    def getInputGrad(self, grad):
-        self.input_grad.append(grad.to("cpu"))
-
-    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True, groups=1):
-        input1 = input
-        weight1 = weight
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-
-        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias, groups=groups)
-        m1.weight.data = weight1
-        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        cpuOutput = m1(input1)
-        tmp = torch.ones_like(cpuOutput)
-        cpuOutput.backward(tmp)
-
-        return cpuOutput
-
-    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True, groups=1):
-        input1 = input
-        weight1 = weight
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-
-        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias, groups=groups)
-        m1.weight.data = weight1
-        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        m1 = m1.to("npu")
-        npuOutput = m1(input1)
-        tmp = torch.ones_like(npuOutput)
-        npuOutput.backward(tmp)
-
-        return npuOutput.to("cpu")
-
-    def conv2d_backward_result(self, shape_format):
-        for item in shape_format:
-            self.weight_grad.clear()
-            self.input_grad.clear()
-            input_cpu, input_npu = create_common_tensor(item[0], -1, 1)
-            if input_cpu.dtype == torch.float16:
-                input_cpu = input_cpu.to(torch.float32)
-            weight_cpu, weight_npu = create_common_tensor(item[1], -1, 1)
-            if weight_cpu.dtype == torch.float16:
-                weight_cpu = weight_cpu.to(torch.float32)
-            kernel_size = (item[1][2][2], item[1][2][3])
-            assert item[0][2][1]/item[6] == item[1][2][1], "ilegal parameters: con2d in_channels//groups must equal to weight.size[1]."
-            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5], groups=item[6])
-            weight_npu = weight_npu.to("cpu")
-            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5], groups=item[6])
-
-            npu_output = npu_output.to(torch.float16)
-            cpu_output = cpu_output.to(torch.float16)
-            self.input_grad[0] = self.input_grad[0].to(torch.float16)
-            self.input_grad[1] = self.input_grad[1].to(torch.float16)
-
-            self.weight_grad[0] = self.weight_grad[0].to(self.weight_grad[1].dtype)
-
-            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
-            self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy())
-            self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy())
-
-    def test_conv2d_backward_shape_format_fp16(self, device):
-        shape_format = [  # input, weight, padding, stride, dilation, bias, groups            
-            # shuflenet
-            [[np.float16, 3, [1024, 232, 7, 7]], [np.float16, 4, [232, 232, 1, 1]], 0, 1, 1, None, 1],
-            [[np.float16, 0, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]], 0, 1, 1, None, 1],
-            [[np.float16, 0, [4, 8, 300, 40]], [np.float16, 0, [16, 8, 3, 3]], [2,1], 1, 1, None, 1], 
-            [[np.float16, 0, [4, 64, 150, 10]], [np.float16, 0, [32, 64, 1, 1]], 0, 1, 1, None, 1], 
-            [[np.float16, 0, [4, 128, 75, 10]], [np.float16, 0, [64, 128, 1, 1]], 0, 1, 1, None, 1], 
-            [[np.float16, 0, [4, 256, 75, 5]], [np.float16, 0, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
-            [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 0, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
-            [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 0, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
-            [[np.float16, 3, [4, 256, 75, 5]], [np.float16, 4, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
-            [[np.float16, 3, [4, 384, 75, 1]], [np.float16, 4, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
-            [[np.float16, 3, [4, 384, 1, 75]], [np.float16, 4, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
-            [[np.float16, 0, [4, 256, 75, 5]], [np.float16, 4, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
-            [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 4, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
-            [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 4, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
-            # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w, 预计330支持
-            # [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1],
-            # [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1],
-            # [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1],
-            # [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1],
-        ]
-        self.conv2d_backward_result(shape_format)
-
-    def test_conv2d_backward_shape_format_fp32(self, device):
-        shape_format = [  # input, weight, padding, stride, dilation, bias, groups            
-            # mobilenet
-            [[np.float32, 3, [256, 960, 7, 7]], [np.float32, 0, [320, 960, 1, 1]], 0, 1, 1, None, 1],
-            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [32, 3, 3, 3]], 1, 2, 1, None, 1],
-            [[np.float32, 0, [16, 3, 640, 640]], [np.float32, 4, [64, 3, 7, 7]], 3, 2, 1, None, 1],
-            [[np.float32, 0, [4, 8, 300, 40]], [np.float32, 0, [16, 8, 3, 3]], [2,1], 1, 1, None, 1], 
-            [[np.float32, 0, [4, 64, 150, 10]], [np.float32, 0, [32, 64, 1, 1]], 0, 1, 1, None, 1], 
-            [[np.float32, 0, [4, 128, 75, 10]], [np.float32, 0, [64, 128, 1, 1]], 0, 1, 1, None, 1], 
-            [[np.float32, 0, [4, 256, 75, 5]], [np.float32, 0, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
-            [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
-            [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
-            [[np.float32, 3, [4, 256, 75, 5]], [np.float32, 0, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
-            [[np.float32, 3, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
-            [[np.float32, 3, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
-            [[np.float32, 0, [4, 256, 75, 5]], [np.float32, 4, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
-            [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 4, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
-            [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 4, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
-            # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w, 预计330支持
-            # [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1],
-            # [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1],
-            # [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1],
-            # [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1],
-            ]
-        #conv类算子不支持fp32数据的精度要求
-        #self.conv2d_backward_result(shape_format)
-
-    def test_group_conv2d_backward_shape_format_fp16(self, device):
-        shape_format= [  # input, weight, padding, stride, dilation, bias, groups
-            # KDXF
-            [[np.float16, 0, [4, 64, 75, 10]], [np.float16, 0, [128, 16, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float16, 0, [4, 128, 75, 10]], [np.float16, 0, [64, 32, 1, 1]], 0, 1, 1, None, 4],
-            [[np.float16, 0, [4, 128, 75, 5]], [np.float16, 0, [256, 32, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float16, 0, [4, 256, 75, 1]], [np.float16, 0, [384, 64, 3, 1]], [1,0], 1, 1, None, 4],
-            [[np.float16, 0, [4, 192, 75, 1]], [np.float16, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float16, 0, [4, 128, 75, 1]], [np.float16, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float16, 0, [4, 128, 75, 5]], [np.float16, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float16, 3, [4, 192, 75, 1]], [np.float16, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float16, 3, [4, 128, 75, 1]], [np.float16, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float16, 3, [4, 128, 75, 5]], [np.float16, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float16, 3, [4, 192, 75, 1]], [np.float16, 4, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float16, 3, [4, 128, 75, 1]], [np.float16, 4, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float16, 3, [4, 128, 75, 5]], [np.float16, 4, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float16, 0, [4, 64, 75, 5]], [np.float16, 0, [64, 1, 3, 3]], [2,1], 1, 1, None, 64], 
-            [[np.float16, 0, [4, 64, 75, 1]], [np.float16, 0, [64, 1, 3, 1]], 0, 1, 1, None, 64], 
-            [[np.float16, 0, [4, 64, 1, 75]], [np.float16, 0, [64, 1, 1, 3]], 0, 1, 1, None, 64], 
-            # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w, 预计330支持
-            # [[np.float16, 0, [4, 64, 75, 1]], [np.float16, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4],
-            # [[np.float16, 0, [4, 64, 75, 1]], [np.float16, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4],
-            # [[np.float16, 0, [4, 64, 1, 75]], [np.float16, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4],
-            # [[np.float16, 0, [4, 64, 1, 75]], [np.float16, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4],
-            # 当前不支持in_channel == groups != out_channel
-            # [[np.float32, 0, [4, 64, 75, 5]], [np.float32, 0, [128, 1, 3, 3]], [2,1], 1, 1, None, 64], 
-            # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 1, 3, 1]], 0, 1, 1, None, 64], 
-            # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 1, 1, 3]], 0, 1, 1, None, 64], 
-        ]
-
-    def test_group_conv2d_backward_shape_format_fp32(self, device):
-        shape_format= [  # input, weight, padding, stride, dilation, bias, groups
-            # KDXF
-            [[np.float32, 0, [4, 64, 75, 10]], [np.float32, 0, [128, 16, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float32, 0, [4, 128, 75, 10]], [np.float32, 0, [64, 32, 1, 1]], 0, 1, 1, None, 4],
-            [[np.float32, 0, [4, 128, 75, 5]], [np.float32, 0, [256, 32, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float32, 0, [4, 256, 75, 1]], [np.float32, 0, [384, 64, 3, 1]], [1,0], 1, 1, None, 4],
-            [[np.float32, 0, [4, 192, 75, 1]], [np.float32, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float32, 0, [4, 128, 75, 1]], [np.float32, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float32, 0, [4, 128, 75, 5]], [np.float32, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float32, 3, [4, 192, 75, 1]], [np.float32, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float32, 3, [4, 128, 75, 1]], [np.float32, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float32, 3, [4, 128, 75, 5]], [np.float32, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float32, 3, [4, 192, 75, 1]], [np.float32, 4, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float32, 3, [4, 128, 75, 1]], [np.float32, 4, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
-            [[np.float32, 3, [4, 128, 75, 5]], [np.float32, 4, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
-            [[np.float32, 0, [4, 64, 75, 5]], [np.float32, 0, [64, 1, 3, 3]], [2,1], 1, 1, None, 64], 
-            [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [64, 1, 3, 1]], 0, 1, 1, None, 64], 
-            [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [64, 1, 1, 3]], 0, 1, 1, None, 64], 
-            # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w
-            # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4],
-            # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4],
-            # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4],
-            # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4],
-            # 当前不支持in_channel == groups != out_channel
-            # [[np.float32, 0, [4, 64, 75, 5]], [np.float32, 0, [128, 1, 3, 3]], [2,1], 1, 1, None, 64], 
-            # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 1, 3, 1]], 0, 1, 1, None, 64], 
-            # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 1, 1, 3]], 0, 1, 1, None, 64], 
-        ]
-
-
-instantiate_device_type_tests(TestConv2d, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import numpy as np
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestConv2d(TestCase):
+    weight_grad = []
+    input_grad = []
+
+    def getWeightGrad(self, grad):
+        self.weight_grad.append(grad.to("cpu"))
+
+    def getInputGrad(self, grad):
+        self.input_grad.append(grad.to("cpu"))
+
+    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True, groups=1):
+        input1 = input
+        weight1 = weight
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+
+        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias, groups=groups)
+        m1.weight.data = weight1
+        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        cpuOutput = m1(input1)
+        tmp = torch.ones_like(cpuOutput)
+        cpuOutput.backward(tmp)
+
+        return cpuOutput
+
+    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True, groups=1):
+        input1 = input
+        weight1 = weight
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+
+        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias, groups=groups)
+        m1.weight.data = weight1
+        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        m1 = m1.to("npu")
+        npuOutput = m1(input1)
+        tmp = torch.ones_like(npuOutput)
+        npuOutput.backward(tmp)
+
+        return npuOutput.to("cpu")
+
+    def conv2d_backward_result(self, shape_format):
+        for item in shape_format:
+            self.weight_grad.clear()
+            self.input_grad.clear()
+            input_cpu, input_npu = create_common_tensor(item[0], -1, 1)
+            if input_cpu.dtype == torch.float16:
+                input_cpu = input_cpu.to(torch.float32)
+            weight_cpu, weight_npu = create_common_tensor(item[1], -1, 1)
+            if weight_cpu.dtype == torch.float16:
+                weight_cpu = weight_cpu.to(torch.float32)
+            kernel_size = (item[1][2][2], item[1][2][3])
+            assert item[0][2][1]/item[6] == item[1][2][1], "ilegal parameters: con2d in_channels//groups must equal to weight.size[1]."
+            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5], groups=item[6])
+            weight_npu = weight_npu.to("cpu")
+            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5], groups=item[6])
+
+            npu_output = npu_output.to(torch.float16)
+            cpu_output = cpu_output.to(torch.float16)
+            self.input_grad[0] = self.input_grad[0].to(torch.float16)
+            self.input_grad[1] = self.input_grad[1].to(torch.float16)
+
+            self.weight_grad[0] = self.weight_grad[0].to(self.weight_grad[1].dtype)
+
+            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
+            self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy())
+            self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy())
+
+    def test_conv2d_backward_shape_format_fp16(self, device):
+        shape_format = [  # input, weight, padding, stride, dilation, bias, groups            
+            # shuflenet
+            [[np.float16, 3, [1024, 232, 7, 7]], [np.float16, 4, [232, 232, 1, 1]], 0, 1, 1, None, 1],
+            [[np.float16, 0, [1024, 116, 14, 14]], [np.float16, 4, [116, 116, 1, 1]], 0, 1, 1, None, 1],
+            [[np.float16, 0, [4, 8, 300, 40]], [np.float16, 0, [16, 8, 3, 3]], [2,1], 1, 1, None, 1], 
+            [[np.float16, 0, [4, 64, 150, 10]], [np.float16, 0, [32, 64, 1, 1]], 0, 1, 1, None, 1], 
+            [[np.float16, 0, [4, 128, 75, 10]], [np.float16, 0, [64, 128, 1, 1]], 0, 1, 1, None, 1], 
+            [[np.float16, 0, [4, 256, 75, 5]], [np.float16, 0, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
+            [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 0, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
+            [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 0, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
+            [[np.float16, 3, [4, 256, 75, 5]], [np.float16, 4, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
+            [[np.float16, 3, [4, 384, 75, 1]], [np.float16, 4, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
+            [[np.float16, 3, [4, 384, 1, 75]], [np.float16, 4, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
+            [[np.float16, 0, [4, 256, 75, 5]], [np.float16, 4, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
+            [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 4, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
+            [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 4, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
+            # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w, 预计330支持
+            # [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1],
+            # [[np.float16, 0, [4, 384, 75, 1]], [np.float16, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1],
+            # [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1],
+            # [[np.float16, 0, [4, 384, 1, 75]], [np.float16, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1],
+        ]
+        self.conv2d_backward_result(shape_format)
+
+    def test_conv2d_backward_shape_format_fp32(self, device):
+        shape_format = [  # input, weight, padding, stride, dilation, bias, groups            
+            # mobilenet
+            [[np.float32, 3, [256, 960, 7, 7]], [np.float32, 0, [320, 960, 1, 1]], 0, 1, 1, None, 1],
+            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [32, 3, 3, 3]], 1, 2, 1, None, 1],
+            [[np.float32, 0, [16, 3, 640, 640]], [np.float32, 4, [64, 3, 7, 7]], 3, 2, 1, None, 1],
+            [[np.float32, 0, [4, 8, 300, 40]], [np.float32, 0, [16, 8, 3, 3]], [2,1], 1, 1, None, 1], 
+            [[np.float32, 0, [4, 64, 150, 10]], [np.float32, 0, [32, 64, 1, 1]], 0, 1, 1, None, 1], 
+            [[np.float32, 0, [4, 128, 75, 10]], [np.float32, 0, [64, 128, 1, 1]], 0, 1, 1, None, 1], 
+            [[np.float32, 0, [4, 256, 75, 5]], [np.float32, 0, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
+            [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
+            [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
+            [[np.float32, 3, [4, 256, 75, 5]], [np.float32, 0, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
+            [[np.float32, 3, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
+            [[np.float32, 3, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
+            [[np.float32, 0, [4, 256, 75, 5]], [np.float32, 4, [128, 256, 3, 3]], [2,1], 1, 1, None, 1], 
+            [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 4, [192, 384, 3, 1]], 0, 1, 1, None, 1], 
+            [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 4, [192, 384, 1, 3]], 0, 1, 1, None, 1], 
+            # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w, 预计330支持
+            # [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1],
+            # [[np.float32, 0, [4, 384, 75, 1]], [np.float32, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1],
+            # [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 3, 3]], 0, 1, 1, None, 1],
+            # [[np.float32, 0, [4, 384, 1, 75]], [np.float32, 0, [192, 384, 3, 3]], [1,1], 1, 1, None, 1],
+            ]
+        #conv类算子不支持fp32数据的精度要求
+        #self.conv2d_backward_result(shape_format)
+
+    def test_group_conv2d_backward_shape_format_fp16(self, device):
+        shape_format= [  # input, weight, padding, stride, dilation, bias, groups
+            # KDXF
+            [[np.float16, 0, [4, 64, 75, 10]], [np.float16, 0, [128, 16, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float16, 0, [4, 128, 75, 10]], [np.float16, 0, [64, 32, 1, 1]], 0, 1, 1, None, 4],
+            [[np.float16, 0, [4, 128, 75, 5]], [np.float16, 0, [256, 32, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float16, 0, [4, 256, 75, 1]], [np.float16, 0, [384, 64, 3, 1]], [1,0], 1, 1, None, 4],
+            [[np.float16, 0, [4, 192, 75, 1]], [np.float16, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float16, 0, [4, 128, 75, 1]], [np.float16, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float16, 0, [4, 128, 75, 5]], [np.float16, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float16, 3, [4, 192, 75, 1]], [np.float16, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float16, 3, [4, 128, 75, 1]], [np.float16, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float16, 3, [4, 128, 75, 5]], [np.float16, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float16, 3, [4, 192, 75, 1]], [np.float16, 4, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float16, 3, [4, 128, 75, 1]], [np.float16, 4, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float16, 3, [4, 128, 75, 5]], [np.float16, 4, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float16, 0, [4, 64, 75, 5]], [np.float16, 0, [64, 1, 3, 3]], [2,1], 1, 1, None, 64], 
+            [[np.float16, 0, [4, 64, 75, 1]], [np.float16, 0, [64, 1, 3, 1]], 0, 1, 1, None, 64], 
+            [[np.float16, 0, [4, 64, 1, 75]], [np.float16, 0, [64, 1, 1, 3]], 0, 1, 1, None, 64], 
+            # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w, 预计330支持
+            # [[np.float16, 0, [4, 64, 75, 1]], [np.float16, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4],
+            # [[np.float16, 0, [4, 64, 75, 1]], [np.float16, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4],
+            # [[np.float16, 0, [4, 64, 1, 75]], [np.float16, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4],
+            # [[np.float16, 0, [4, 64, 1, 75]], [np.float16, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4],
+            # 当前不支持in_channel == groups != out_channel
+            # [[np.float32, 0, [4, 64, 75, 5]], [np.float32, 0, [128, 1, 3, 3]], [2,1], 1, 1, None, 64], 
+            # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 1, 3, 1]], 0, 1, 1, None, 64], 
+            # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 1, 1, 3]], 0, 1, 1, None, 64], 
+        ]
+
+    def test_group_conv2d_backward_shape_format_fp32(self, device):
+        shape_format= [  # input, weight, padding, stride, dilation, bias, groups
+            # KDXF
+            [[np.float32, 0, [4, 64, 75, 10]], [np.float32, 0, [128, 16, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float32, 0, [4, 128, 75, 10]], [np.float32, 0, [64, 32, 1, 1]], 0, 1, 1, None, 4],
+            [[np.float32, 0, [4, 128, 75, 5]], [np.float32, 0, [256, 32, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float32, 0, [4, 256, 75, 1]], [np.float32, 0, [384, 64, 3, 1]], [1,0], 1, 1, None, 4],
+            [[np.float32, 0, [4, 192, 75, 1]], [np.float32, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float32, 0, [4, 128, 75, 1]], [np.float32, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float32, 0, [4, 128, 75, 5]], [np.float32, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float32, 3, [4, 192, 75, 1]], [np.float32, 0, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float32, 3, [4, 128, 75, 1]], [np.float32, 0, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float32, 3, [4, 128, 75, 5]], [np.float32, 0, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float32, 3, [4, 192, 75, 1]], [np.float32, 4, [384, 48, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float32, 3, [4, 128, 75, 1]], [np.float32, 4, [128, 32, 3, 1]], [2,0], 1, 1, None, 4],
+            [[np.float32, 3, [4, 128, 75, 5]], [np.float32, 4, [128, 32, 3, 3]], [2,1], 1, 1, None, 4],
+            [[np.float32, 0, [4, 64, 75, 5]], [np.float32, 0, [64, 1, 3, 3]], [2,1], 1, 1, None, 64], 
+            [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [64, 1, 3, 1]], 0, 1, 1, None, 64], 
+            [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [64, 1, 1, 3]], 0, 1, 1, None, 64], 
+            # 当前不支持kernel_size_h >= padding_h*2 + input_h和kernel_size_w >= padding_w*2 + input_w
+            # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4],
+            # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4],
+            # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 16, 3, 3]], 0, 1, 1, None, 4],
+            # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 16, 3, 3]], [1,1], 1, 1, None, 4],
+            # 当前不支持in_channel == groups != out_channel
+            # [[np.float32, 0, [4, 64, 75, 5]], [np.float32, 0, [128, 1, 3, 3]], [2,1], 1, 1, None, 64], 
+            # [[np.float32, 0, [4, 64, 75, 1]], [np.float32, 0, [128, 1, 3, 1]], 0, 1, 1, None, 64], 
+            # [[np.float32, 0, [4, 64, 1, 75]], [np.float32, 0, [128, 1, 1, 3]], 0, 1, 1, None, 64], 
+        ]
+
+
+instantiate_device_type_tests(TestConv2d, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_conv3d.py b/test/test_npu/test_network_ops/test_conv3d.py
index d06b3426e5c26dd9c0e1dc13313f27d53801bd1e..bb5b24bb96b89915bd84cfec2a71dcfddbf1b0bf 100644
--- a/test/test_npu/test_network_ops/test_conv3d.py
+++ b/test/test_npu/test_network_ops/test_conv3d.py
@@ -1,115 +1,115 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import torch
-import numpy as np
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestConv3d(TestCase):
-    weight_grad = []
-    input_grad = []
-
-    def getWeightGrad(self, grad):
-        self.weight_grad.append(grad.to("cpu"))
-
-    def getInputGrad(self, grad):
-        self.input_grad.append(grad.to("cpu"))
-
-    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True, groups=1):
-        
-        input1 = input
-        weight1 = weight
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-
-        m1 = nn.Conv3d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias, groups=groups)
-        m1.weight.data = weight1
-        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        cpuOutput = m1(input1)
-        tmp = torch.ones_like(cpuOutput)
-        cpuOutput.backward(tmp)
-        return cpuOutput
-
-    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=False, groups=1):
-        
-        input1 = input
-        weight1 = weight
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-
-        m1 = nn.Conv3d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias, groups=groups)
-        m1.weight.data = weight1
-        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        m1 = m1.to("npu")
-        npuOutput = m1(input1)
-        tmp = torch.ones_like(npuOutput)
-        npuOutput.backward(tmp)
-
-        return npuOutput.to("cpu")
-
-    def conv3d_backward_result(self, shape_format):
-        for item in shape_format:
-            self.weight_grad.clear()
-            self.input_grad.clear()
-            input_cpu, input_npu = create_common_tensor(item[0], 0, 1)
-            if input_cpu.dtype == torch.float16:
-                input_cpu = input_cpu.to(torch.float32)
-            weight_cpu, weight_npu = create_common_tensor(item[1], 0, 1)
-            if weight_cpu.dtype == torch.float16:
-                weight_cpu = weight_cpu.to(torch.float32)
-            kernel_size = (item[1][2][2], item[1][2][3],item[1][2][4])
-            #assert item[0][2][1]/item[6] == item[1][2][1], "ilegal parameters: con2d in_channels//groups must equal to weight.size[1]."
-            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5], groups=item[6])
-            weight_npu = weight_npu.to("cpu")
-
-            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5], groups=item[6])
-
-            npu_output = npu_output.to(torch.float16)
-            cpu_output = cpu_output.to(torch.float16)
-            self.input_grad[0] = self.input_grad[0].to(torch.float16)
-            self.input_grad[1] = self.input_grad[1].to(torch.float16)
-            self.weight_grad[0] = self.weight_grad[0].to(self.weight_grad[1].dtype)
-            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.cpu().detach().numpy())
-            self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].cpu().numpy())
-            self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].cpu().numpy())
-
-    def test_conv3d_backward_shape_format_fp16(self, device):
-        shape_format = [  # input, weight, padding, stride, dilation, bias, groups                      
-            [[np.float16, 30, [128, 128, 4, 14, 14]], [np.float16, 30, [128, 128, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
-            [[np.float16, 30, [128, 64, 4, 14, 14]], [np.float16, 30, [128, 64, 3, 3, 3]], [1,1,1], [2,2,2], 1, None, 1],
-            [[np.float16, 30, [128, 256, 2, 7, 7]], [np.float16, 30, [256, 256, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
-            [[np.float16, 30, [128, 512, 1, 4, 4]], [np.float16, 30, [512, 512, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
-            [[np.float16, 30, [128, 256, 2, 7, 7]], [np.float16, 30, [512, 256, 1, 1, 1]], 0, [2,2,2], 1, None, 1]
-        ]
-        self.conv3d_backward_result(shape_format)
-        
-    def test_conv3d_backward_shape_format_fp32(self, device):
-        shape_format = [  # input, weight, padding, stride, dilation, bias, groups                      
-            [[np.float32, 30, [128, 128, 4, 14, 14]], [np.float32, 30, [128, 128, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
-            [[np.float32, 30, [128, 64, 4, 14, 14]], [np.float32, 30, [128, 64, 3, 3, 3]], [1,1,1], [2,2,2], 1, None, 1],
-            [[np.float32, 30, [128, 256, 2, 7, 7]], [np.float32, 30, [256, 256, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
-            [[np.float32, 30, [128, 512, 1, 4, 4]], [np.float32, 30, [512, 512, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
-            [[np.float32, 30, [128, 256, 2, 7, 7]], [np.float32, 30, [512, 256, 1, 1, 1]], 0, [2,2,2], 1, None, 1]
-        ]
-        self.conv3d_backward_result(shape_format)
-
-instantiate_device_type_tests(TestConv3d, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import torch
+import numpy as np
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestConv3d(TestCase):
+    weight_grad = []
+    input_grad = []
+
+    def getWeightGrad(self, grad):
+        self.weight_grad.append(grad.to("cpu"))
+
+    def getInputGrad(self, grad):
+        self.input_grad.append(grad.to("cpu"))
+
+    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=True, groups=1):
+        
+        input1 = input
+        weight1 = weight
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+
+        m1 = nn.Conv3d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias, groups=groups)
+        m1.weight.data = weight1
+        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        cpuOutput = m1(input1)
+        tmp = torch.ones_like(cpuOutput)
+        cpuOutput.backward(tmp)
+        return cpuOutput
+
+    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1, bias=False, groups=1):
+        
+        input1 = input
+        weight1 = weight
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+
+        m1 = nn.Conv3d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias, groups=groups)
+        m1.weight.data = weight1
+        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        m1 = m1.to("npu")
+        npuOutput = m1(input1)
+        tmp = torch.ones_like(npuOutput)
+        npuOutput.backward(tmp)
+
+        return npuOutput.to("cpu")
+
+    def conv3d_backward_result(self, shape_format):
+        for item in shape_format:
+            self.weight_grad.clear()
+            self.input_grad.clear()
+            input_cpu, input_npu = create_common_tensor(item[0], 0, 1)
+            if input_cpu.dtype == torch.float16:
+                input_cpu = input_cpu.to(torch.float32)
+            weight_cpu, weight_npu = create_common_tensor(item[1], 0, 1)
+            if weight_cpu.dtype == torch.float16:
+                weight_cpu = weight_cpu.to(torch.float32)
+            kernel_size = (item[1][2][2], item[1][2][3],item[1][2][4])
+            #assert item[0][2][1]/item[6] == item[1][2][1], "ilegal parameters: con2d in_channels//groups must equal to weight.size[1]."
+            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5], groups=item[6])
+            weight_npu = weight_npu.to("cpu")
+
+            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5], groups=item[6])
+
+            npu_output = npu_output.to(torch.float16)
+            cpu_output = cpu_output.to(torch.float16)
+            self.input_grad[0] = self.input_grad[0].to(torch.float16)
+            self.input_grad[1] = self.input_grad[1].to(torch.float16)
+            self.weight_grad[0] = self.weight_grad[0].to(self.weight_grad[1].dtype)
+            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.cpu().detach().numpy())
+            self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].cpu().numpy())
+            self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].cpu().numpy())
+
+    def test_conv3d_backward_shape_format_fp16(self, device):
+        shape_format = [  # input, weight, padding, stride, dilation, bias, groups                      
+            [[np.float16, 30, [128, 128, 4, 14, 14]], [np.float16, 30, [128, 128, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
+            [[np.float16, 30, [128, 64, 4, 14, 14]], [np.float16, 30, [128, 64, 3, 3, 3]], [1,1,1], [2,2,2], 1, None, 1],
+            [[np.float16, 30, [128, 256, 2, 7, 7]], [np.float16, 30, [256, 256, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
+            [[np.float16, 30, [128, 512, 1, 4, 4]], [np.float16, 30, [512, 512, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
+            [[np.float16, 30, [128, 256, 2, 7, 7]], [np.float16, 30, [512, 256, 1, 1, 1]], 0, [2,2,2], 1, None, 1]
+        ]
+        self.conv3d_backward_result(shape_format)
+        
+    def test_conv3d_backward_shape_format_fp32(self, device):
+        shape_format = [  # input, weight, padding, stride, dilation, bias, groups                      
+            [[np.float32, 30, [128, 128, 4, 14, 14]], [np.float32, 30, [128, 128, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
+            [[np.float32, 30, [128, 64, 4, 14, 14]], [np.float32, 30, [128, 64, 3, 3, 3]], [1,1,1], [2,2,2], 1, None, 1],
+            [[np.float32, 30, [128, 256, 2, 7, 7]], [np.float32, 30, [256, 256, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
+            [[np.float32, 30, [128, 512, 1, 4, 4]], [np.float32, 30, [512, 512, 3, 3, 3]], [1,1,1], [1,1,1], 1, None, 1],
+            [[np.float32, 30, [128, 256, 2, 7, 7]], [np.float32, 30, [512, 256, 1, 1, 1]], 0, [2,2,2], 1, None, 1]
+        ]
+        self.conv3d_backward_result(shape_format)
+
+instantiate_device_type_tests(TestConv3d, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_conv_depthwise2d_backward.py b/test/test_npu/test_network_ops/test_conv_depthwise2d_backward.py
old mode 100644
new mode 100755
index 535a8d0c0071be732055d19f842eb9daf5444c73..2ae37272167992347d4d4887530d7f93e5baaba6
--- a/test/test_npu/test_network_ops/test_conv_depthwise2d_backward.py
+++ b/test/test_npu/test_network_ops/test_conv_depthwise2d_backward.py
@@ -1,118 +1,118 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import numpy as np
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestConvDepthwise2d(TestCase):
-    weight_grad = []
-    input_grad = []
-
-    def getWeightGrad(self, grad):
-        self.weight_grad.append(grad.to("cpu"))
-
-    def getInputGrad(self, grad):
-        self.input_grad.append(grad.to("cpu"))
-
-    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1,
-                    bias=True):
-        input1 = input
-        weight1 = weight
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-
-        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias,
-                       groups=in_channels)
-        m1.weight.data = weight1
-        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        cpuOutput = m1(input1)
-        tmp = torch.ones_like(cpuOutput)
-        cpuOutput.backward(tmp)
-
-        return cpuOutput
-
-    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1,
-                    bias=True):
-        input1 = input
-        weight1 = weight
-        input1.requires_grad = True
-        input1.register_hook(lambda grad: self.getInputGrad(grad))
-
-        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias,
-                       groups=in_channels)
-        m1.weight.data = weight1
-        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
-        m1 = m1.to("npu")
-        npuOutput = m1(input1)
-        npuOutput = npuOutput.to("cpu")
-        tmp = torch.ones_like(npuOutput)
-        npuOutput.backward(tmp)
-
-        return npuOutput
-
-    def conv_depthwise2d_backward_result(self, shape_format):
-        for item in shape_format:
-            self.weight_grad.clear()
-            self.input_grad.clear()
-            input_cpu, input_npu = create_common_tensor(item[0], -1, 1)
-            if input_cpu.dtype == torch.float16:
-                input_cpu = input_cpu.to(torch.float32)
-            weight_cpu, weight_npu = create_common_tensor(item[1], -1, 1)
-            if weight_cpu.dtype == torch.float16:
-                weight_cpu = weight_cpu.to(torch.float32)
-            kernel_size = (item[1][2][2], item[1][2][3])
-            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            weight_npu = weight_npu.to("cpu")
-            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
-                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
-            npu_output = npu_output.to(torch.float16)
-            cpu_output = cpu_output.to(torch.float16)
-            self.input_grad[0] = self.input_grad[0].to(torch.float16)
-            self.input_grad[1] = self.input_grad[1].to(torch.float16)
-
-            self.weight_grad[0] = self.weight_grad[0].to(self.weight_grad[1].dtype)
-
-            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
-            self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy())
-            self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy())
-
-    def test_conv_depthwise2d_backward_shape_format_fp16(self, device):
-        shape_format = [  #input , weight, padding, stide, dilation, bias
-            # shuflenet
-            [[np.float16, 0, [1024, 116, 28, 28]], [np.float16, 0, [116, 1, 3, 3]], 1, 2, 1, 0],
-            [[np.float16, 3, [1024, 116, 14, 14]], [np.float16, 0, [116, 1, 3, 3]], 1, 1, 1, 0],
-        ]
-        self.conv_depthwise2d_backward_result(shape_format)
-
-    def test_conv_depthwise2d_backward_shape_format_fp32(self, device):
-        shape_format = [  #input , weight, padding, stide, dilation, bias
-            # mobilenet
-            [[np.float32, 3, [256, 32, 112, 112]], [np.float32, 0, [32, 1, 3, 3]], 1, 1, 1, None],
-            [[np.float32, 3, [256, 96, 112, 112]], [np.float32, 0, [96, 1, 3, 3]], 1, 2, 1, None],
-        ]
-        #conv类算子不支持fp32数据的精度要求
-        #self.conv_depthwise2d_backward_result(shape_format)
-
-
-instantiate_device_type_tests(TestConvDepthwise2d, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import numpy as np
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestConvDepthwise2d(TestCase):
+    weight_grad = []
+    input_grad = []
+
+    def getWeightGrad(self, grad):
+        self.weight_grad.append(grad.to("cpu"))
+
+    def getInputGrad(self, grad):
+        self.input_grad.append(grad.to("cpu"))
+
+    def op_exec_cpu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1,
+                    bias=True):
+        input1 = input
+        weight1 = weight
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+
+        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias,
+                       groups=in_channels)
+        m1.weight.data = weight1
+        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        cpuOutput = m1(input1)
+        tmp = torch.ones_like(cpuOutput)
+        cpuOutput.backward(tmp)
+
+        return cpuOutput
+
+    def op_exec_npu(self, input, weight, in_channels, out_channels, kernel_size, padding=0, stride=1, dilation=1,
+                    bias=True):
+        input1 = input
+        weight1 = weight
+        input1.requires_grad = True
+        input1.register_hook(lambda grad: self.getInputGrad(grad))
+
+        m1 = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias,
+                       groups=in_channels)
+        m1.weight.data = weight1
+        m1.weight.register_hook(lambda grad: self.getWeightGrad(grad))
+        m1 = m1.to("npu")
+        npuOutput = m1(input1)
+        npuOutput = npuOutput.to("cpu")
+        tmp = torch.ones_like(npuOutput)
+        npuOutput.backward(tmp)
+
+        return npuOutput
+
+    def conv_depthwise2d_backward_result(self, shape_format):
+        for item in shape_format:
+            self.weight_grad.clear()
+            self.input_grad.clear()
+            input_cpu, input_npu = create_common_tensor(item[0], -1, 1)
+            if input_cpu.dtype == torch.float16:
+                input_cpu = input_cpu.to(torch.float32)
+            weight_cpu, weight_npu = create_common_tensor(item[1], -1, 1)
+            if weight_cpu.dtype == torch.float16:
+                weight_cpu = weight_cpu.to(torch.float32)
+            kernel_size = (item[1][2][2], item[1][2][3])
+            cpu_output = self.op_exec_cpu(input_cpu, weight_cpu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            weight_npu = weight_npu.to("cpu")
+            npu_output = self.op_exec_npu(input_npu, weight_npu, item[0][2][1], item[1][2][0], kernel_size=kernel_size,
+                                          padding=item[2], stride=item[3], dilation=item[4], bias=item[5])
+            npu_output = npu_output.to(torch.float16)
+            cpu_output = cpu_output.to(torch.float16)
+            self.input_grad[0] = self.input_grad[0].to(torch.float16)
+            self.input_grad[1] = self.input_grad[1].to(torch.float16)
+
+            self.weight_grad[0] = self.weight_grad[0].to(self.weight_grad[1].dtype)
+
+            self.assertRtolEqual(cpu_output.detach().numpy(), npu_output.detach().numpy())
+            self.assertRtolEqual(self.input_grad[0].numpy(), self.input_grad[1].numpy())
+            self.assertRtolEqual(self.weight_grad[0].numpy(), self.weight_grad[1].numpy())
+
+    def test_conv_depthwise2d_backward_shape_format_fp16(self, device):
+        shape_format = [  #input , weight, padding, stide, dilation, bias
+            # shuflenet
+            [[np.float16, 0, [1024, 116, 28, 28]], [np.float16, 0, [116, 1, 3, 3]], 1, 2, 1, 0],
+            [[np.float16, 3, [1024, 116, 14, 14]], [np.float16, 0, [116, 1, 3, 3]], 1, 1, 1, 0],
+        ]
+        self.conv_depthwise2d_backward_result(shape_format)
+
+    def test_conv_depthwise2d_backward_shape_format_fp32(self, device):
+        shape_format = [  #input , weight, padding, stide, dilation, bias
+            # mobilenet
+            [[np.float32, 3, [256, 32, 112, 112]], [np.float32, 0, [32, 1, 3, 3]], 1, 1, 1, None],
+            [[np.float32, 3, [256, 96, 112, 112]], [np.float32, 0, [96, 1, 3, 3]], 1, 2, 1, None],
+        ]
+        #conv类算子不支持fp32数据的精度要求
+        #self.conv_depthwise2d_backward_result(shape_format)
+
+
+instantiate_device_type_tests(TestConvDepthwise2d, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_convolution_backward_weight.py b/test/test_npu/test_network_ops/test_convolution_backward_weight.py
index beaf25c285e64c0f8c1c81f48ab61f10e3a7f369..e8924c141a0dc2bf27148830c2d8a06375f9b8cb 100644
--- a/test/test_npu/test_network_ops/test_convolution_backward_weight.py
+++ b/test/test_npu/test_network_ops/test_convolution_backward_weight.py
@@ -1,109 +1,109 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestCudnnConvolutionBackwardWeight(TestCase):
-    weight_grad = []
-
-    def getWeightGrad(self, grad):
-        self.weight_grad.append(grad.to("cpu"))
-
-    def cpu_op_exec(self, input1, weight, stride, padding, dilation, groups):
-        weight.requires_grad = True
-        res_forward = torch._convolution(input1,
-                                         weight,
-                                         bias=None,
-                                         stride=stride,
-                                         padding=padding,
-                                         dilation=dilation,
-                                         transposed=False,
-                                         output_padding=(0, 0),
-                                         groups=groups,
-                                         benchmark=True,
-                                         deterministic=True,
-                                         cudnn_enabled=True)
-        grads = torch.ones_like(res_forward).float()
-        res_forward.backward(grads, retain_graph=True)
-        res_forward = res_forward.detach().numpy()
-        gradweight = weight.grad
-        return res_forward, gradweight
-
-    def npu_op_exec(self, input1, weight, stride, padding, dilation, groups):
-        weight.requires_grad = True
-        input1 = input1.to("npu")
-        res_forward = torch._convolution(input1,
-                                         weight,
-                                         bias=None,
-                                         stride=stride,
-                                         padding=padding,
-                                         dilation=dilation,
-                                         transposed=False,
-                                         output_padding=(0, 0),
-                                         groups=groups,
-                                         benchmark=True,
-                                         deterministic=True,
-                                         cudnn_enabled=True)
-        grads = torch.ones_like(res_forward).float()
-        grads = grads.to("npu")
-        res_forward.backward(grads, retain_graph=True)
-        res_forward = res_forward.to("cpu")
-        res_forward = res_forward.detach().numpy()
-        gradweight = weight.grad.to("cpu")
-        return res_forward, gradweight
-
-    def test_cudnn_convolution_backward_weight_shape_format(self, device):
-        shape_format = [  # input, weight, stride, padding, dilation, groups
-            [[np.float16, 0, (1, 4, 5, 5)], [np.float16, 0, (4, 4, 3, 3)],
-             (1, 1), (1, 1), (1, 1), 1],
-            [[np.float32, 0, [256, 3, 224, 224]],
-             [np.float32, 0, [32, 3, 3, 3]], [2, 2], [0, 0], [1, 1], 1],
-            [[np.float16, 3, (256, 8, 1, 1)], [np.float16, 3, (8, 8, 1, 1)],
-             (1, 1), (0, 0), (1, 1), 1],
-            [[np.float16, 3, [1024, 232, 7, 7]],
-             [np.float16, 4, [232, 232, 1, 1]], (1, 1), (0, 0), (1, 1), 1],
-            [[np.float32, 0, (1, 4, 5, 5)], [np.float32, 0, (4, 4, 3, 3)],
-             (1, 1), (1, 1), (1, 1), 1]
-        ]
-
-        for item in shape_format:
-            self.weight_grad.clear()
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -2, 2)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], -2, 2)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output, cpu_dweight = self.cpu_op_exec(cpu_input1, cpu_input2, item[2],
-                                          item[3], item[4], item[5])
-            npu_output, npu_dweight = self.npu_op_exec(npu_input1, npu_input2, item[2],
-                                          item[3], item[4], item[5])
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_dweight = cpu_dweight.to(npu_dweight.dtype)
-            self.assertRtolEqual(cpu_output, npu_output, 0.007)
-            self.assertRtolEqual(cpu_dweight, npu_dweight, 0.003)
-
-
-instantiate_device_type_tests(TestCudnnConvolutionBackwardWeight,
-                              globals(),
-                              except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestCudnnConvolutionBackwardWeight(TestCase):
+    weight_grad = []
+
+    def getWeightGrad(self, grad):
+        self.weight_grad.append(grad.to("cpu"))
+
+    def cpu_op_exec(self, input1, weight, stride, padding, dilation, groups):
+        weight.requires_grad = True
+        res_forward = torch._convolution(input1,
+                                         weight,
+                                         bias=None,
+                                         stride=stride,
+                                         padding=padding,
+                                         dilation=dilation,
+                                         transposed=False,
+                                         output_padding=(0, 0),
+                                         groups=groups,
+                                         benchmark=True,
+                                         deterministic=True,
+                                         cudnn_enabled=True)
+        grads = torch.ones_like(res_forward).float()
+        res_forward.backward(grads, retain_graph=True)
+        res_forward = res_forward.detach().numpy()
+        gradweight = weight.grad
+        return res_forward, gradweight
+
+    def npu_op_exec(self, input1, weight, stride, padding, dilation, groups):
+        weight.requires_grad = True
+        input1 = input1.to("npu")
+        res_forward = torch._convolution(input1,
+                                         weight,
+                                         bias=None,
+                                         stride=stride,
+                                         padding=padding,
+                                         dilation=dilation,
+                                         transposed=False,
+                                         output_padding=(0, 0),
+                                         groups=groups,
+                                         benchmark=True,
+                                         deterministic=True,
+                                         cudnn_enabled=True)
+        grads = torch.ones_like(res_forward).float()
+        grads = grads.to("npu")
+        res_forward.backward(grads, retain_graph=True)
+        res_forward = res_forward.to("cpu")
+        res_forward = res_forward.detach().numpy()
+        gradweight = weight.grad.to("cpu")
+        return res_forward, gradweight
+
+    def test_cudnn_convolution_backward_weight_shape_format(self, device):
+        shape_format = [  # input, weight, stride, padding, dilation, groups
+            [[np.float16, 0, (1, 4, 5, 5)], [np.float16, 0, (4, 4, 3, 3)],
+             (1, 1), (1, 1), (1, 1), 1],
+            [[np.float32, 0, [256, 3, 224, 224]],
+             [np.float32, 0, [32, 3, 3, 3]], [2, 2], [0, 0], [1, 1], 1],
+            [[np.float16, 3, (256, 8, 1, 1)], [np.float16, 3, (8, 8, 1, 1)],
+             (1, 1), (0, 0), (1, 1), 1],
+            [[np.float16, 3, [1024, 232, 7, 7]],
+             [np.float16, 4, [232, 232, 1, 1]], (1, 1), (0, 0), (1, 1), 1],
+            [[np.float32, 0, (1, 4, 5, 5)], [np.float32, 0, (4, 4, 3, 3)],
+             (1, 1), (1, 1), (1, 1), 1]
+        ]
+
+        for item in shape_format:
+            self.weight_grad.clear()
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -2, 2)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], -2, 2)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output, cpu_dweight = self.cpu_op_exec(cpu_input1, cpu_input2, item[2],
+                                          item[3], item[4], item[5])
+            npu_output, npu_dweight = self.npu_op_exec(npu_input1, npu_input2, item[2],
+                                          item[3], item[4], item[5])
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_dweight = cpu_dweight.to(npu_dweight.dtype)
+            self.assertRtolEqual(cpu_output, npu_output, 0.007)
+            self.assertRtolEqual(cpu_dweight, npu_dweight, 0.003)
+
+
+instantiate_device_type_tests(TestCudnnConvolutionBackwardWeight,
+                              globals(),
+                              except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_copy_.py b/test/test_npu/test_network_ops/test_copy_.py
index ec4aea8203021b8fb0aee615e3cba7c0caf34a05..8f6a5aee604b2cf8a5c3ecae988bd35ed4899d55 100644
--- a/test/test_npu/test_network_ops/test_copy_.py
+++ b/test/test_npu/test_network_ops/test_copy_.py
@@ -1,56 +1,56 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION.
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestCopy(TestCase):
-
-    def cpu_op_exec(self, input1, input2):
-        output = input1.copy_(input2);
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = input1.copy_(input2);
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_copy__(self, device):
-        format_list = [0]
-        shape_list = [(4, 1), (4, 3, 1)]
-        dtype_list = [np.float32, np.int32, np.float16]
-        shape_format = [
-            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestCopy, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestCopy(TestCase):
+
+    def cpu_op_exec(self, input1, input2):
+        output = input1.copy_(input2);
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = input1.copy_(input2);
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_copy__(self, device):
+        format_list = [0]
+        shape_list = [(4, 1), (4, 3, 1)]
+        dtype_list = [np.float32, np.int32, np.float16]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestCopy, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_ctc_loss.py b/test/test_npu/test_network_ops/test_ctc_loss.py
index 2821539914f52ef9f6f62fbe281d31672460fb3a..725fc42a933ad9799aab3d632cda103e55d6d968 100644
--- a/test/test_npu/test_network_ops/test_ctc_loss.py
+++ b/test/test_npu/test_network_ops/test_ctc_loss.py
@@ -1,93 +1,93 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION.
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestCtcLoss(TestCase):
-    def generate_data(self, item):
-        T = item[0][0]
-        C = item[0][1]
-        N = item[0][2]
-        S = item[0][3]
-        S_min = item[0][4]
-        dtype = item[1]
-        reduction_str = item[2] 
-        blk = item[3]
-
-        log_probs = np.random.uniform(-10, 10, (T, N, C)).astype(dtype)
-        targets = torch.randint(1, C, (N, S), dtype = torch.long)
-        input_lengths = torch.full((N,), T, dtype=torch.long)
-        target_lengths = torch.randint(S_min, S, (N,), dtype=torch.long)
-
-        # modify from numpy.ndarray to torch.tensor
-        log_probs = torch.from_numpy(log_probs)
-        
-        ctc_loss = torch.nn.CTCLoss(blank= blk, zero_infinity=True, reduction=reduction_str)
-
-        return ctc_loss, log_probs, targets, input_lengths, target_lengths
-
-    def cpu_op_exec(self, ctc_loss, log_probs, targets, input_lengths, target_lengths):
-        if log_probs.dtype == torch.float16:
-            log_probs = log_probs.to(torch.float32)
-    
-        neg_log_likelihood = ctc_loss(log_probs.log_softmax(2), targets, input_lengths, target_lengths)
-
-        neg_log_likelihood = neg_log_likelihood.numpy()
-
-        return neg_log_likelihood
-
-    def npu_op_exec(self, ctc_loss, log_probs, targets, input_lengths, target_lengths):
-        log_probs = log_probs.npu()
-        targets = targets.npu()
-        input_lengths = input_lengths.npu()
-        target_lengths = target_lengths.npu()
-        
-        neg_log_likelihood = ctc_loss(log_probs.log_softmax(2), targets, input_lengths, target_lengths)
-                
-        if neg_log_likelihood.dtype == torch.float16:
-            neg_log_likelihood = neg_log_likelihood.to(torch.float32)
-
-        neg_log_likelihood = neg_log_likelihood.cpu().numpy()
-
-        return neg_log_likelihood
-
-    def test_ctc_loss(self, device):
-        sizes_list = [[50, 20, 16, 30, 10], [26, 37, 256, 18, 10]]
-        para_reduction = ["sum", "mean", "none"]
-        dtype = [np.float32, np.float16]        
-        blank = [0, 9]
-        shape_format = [
-            [i, j, k, l] for i in sizes_list for j in dtype for k in para_reduction for l in blank
-        ]        
-
-        for item in shape_format:
-            ctc_loss, log_probs, targets, input_lengths, target_lengths = self.generate_data(item)
-
-            neg_log_likelihood_cpu = self.cpu_op_exec(ctc_loss, log_probs, targets, input_lengths, target_lengths)
-            neg_log_likelihood_npu = self.npu_op_exec(ctc_loss, log_probs, targets, input_lengths, target_lengths)
-            
-            self.assertRtolEqual(neg_log_likelihood_cpu, neg_log_likelihood_npu, 1e-3)
-
-instantiate_device_type_tests(TestCtcLoss, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestCtcLoss(TestCase):
+    def generate_data(self, item):
+        T = item[0][0]
+        C = item[0][1]
+        N = item[0][2]
+        S = item[0][3]
+        S_min = item[0][4]
+        dtype = item[1]
+        reduction_str = item[2] 
+        blk = item[3]
+
+        log_probs = np.random.uniform(-10, 10, (T, N, C)).astype(dtype)
+        targets = torch.randint(1, C, (N, S), dtype = torch.long)
+        input_lengths = torch.full((N,), T, dtype=torch.long)
+        target_lengths = torch.randint(S_min, S, (N,), dtype=torch.long)
+
+        # modify from numpy.ndarray to torch.tensor
+        log_probs = torch.from_numpy(log_probs)
+        
+        ctc_loss = torch.nn.CTCLoss(blank= blk, zero_infinity=True, reduction=reduction_str)
+
+        return ctc_loss, log_probs, targets, input_lengths, target_lengths
+
+    def cpu_op_exec(self, ctc_loss, log_probs, targets, input_lengths, target_lengths):
+        if log_probs.dtype == torch.float16:
+            log_probs = log_probs.to(torch.float32)
+    
+        neg_log_likelihood = ctc_loss(log_probs.log_softmax(2), targets, input_lengths, target_lengths)
+
+        neg_log_likelihood = neg_log_likelihood.numpy()
+
+        return neg_log_likelihood
+
+    def npu_op_exec(self, ctc_loss, log_probs, targets, input_lengths, target_lengths):
+        log_probs = log_probs.npu()
+        targets = targets.npu()
+        input_lengths = input_lengths.npu()
+        target_lengths = target_lengths.npu()
+        
+        neg_log_likelihood = ctc_loss(log_probs.log_softmax(2), targets, input_lengths, target_lengths)
+                
+        if neg_log_likelihood.dtype == torch.float16:
+            neg_log_likelihood = neg_log_likelihood.to(torch.float32)
+
+        neg_log_likelihood = neg_log_likelihood.cpu().numpy()
+
+        return neg_log_likelihood
+
+    def test_ctc_loss(self, device):
+        sizes_list = [[50, 20, 16, 30, 10], [26, 37, 256, 18, 10]]
+        para_reduction = ["sum", "mean", "none"]
+        dtype = [np.float32, np.float16]        
+        blank = [0, 9]
+        shape_format = [
+            [i, j, k, l] for i in sizes_list for j in dtype for k in para_reduction for l in blank
+        ]        
+
+        for item in shape_format:
+            ctc_loss, log_probs, targets, input_lengths, target_lengths = self.generate_data(item)
+
+            neg_log_likelihood_cpu = self.cpu_op_exec(ctc_loss, log_probs, targets, input_lengths, target_lengths)
+            neg_log_likelihood_npu = self.npu_op_exec(ctc_loss, log_probs, targets, input_lengths, target_lengths)
+            
+            self.assertRtolEqual(neg_log_likelihood_cpu, neg_log_likelihood_npu, 1e-3)
+
+instantiate_device_type_tests(TestCtcLoss, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_ctc_loss_backward.py b/test/test_npu/test_network_ops/test_ctc_loss_backward.py
index 0929c060edaed47a0b84623d34196fae7e4839cf..1935b0b7630c8ea7ab7c2974e5a43a9c9d716a0d 100644
--- a/test/test_npu/test_network_ops/test_ctc_loss_backward.py
+++ b/test/test_npu/test_network_ops/test_ctc_loss_backward.py
@@ -1,100 +1,100 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION.
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestCtcLossBackward(TestCase):
-    def generate_data(self, item):
-        T = item[0][0]
-        C = item[0][1]
-        N = item[0][2]
-        S = item[0][3]
-        S_min = item[0][4]
-        dtype = item[1]
-        reduction_str = item[2] 
-
-        log_probs = np.random.uniform(-10, 10, (T, N, C)).astype(dtype)
-        targets = torch.randint(1, C, (N, S), dtype = torch.long)
-        input_lengths = torch.full((N,), T, dtype=torch.long)
-        target_lengths = torch.randint(S_min, S, (N,), dtype=torch.long)
-
-        # modify from numpy.ndarray to torch.tensor
-        log_probs = torch.from_numpy(log_probs)
-        
-        ctc_loss = torch.nn.CTCLoss(zero_infinity=True, reduction=reduction_str)
-
-        return ctc_loss, log_probs, targets, input_lengths, target_lengths
-
-    def cpu_op_exec(self, ctc_loss, log_probs, targets, input_lengths, target_lengths):
-        if log_probs.dtype == torch.float16:
-            log_probs = log_probs.to(torch.float32)
-    
-        log_probs.requires_grad_(True)
-        log_probs.retain_grad()
-
-        neg_log_likelihood = ctc_loss(log_probs.log_softmax(2), targets, input_lengths, target_lengths)
-        neg_log_likelihood.backward()
-        grad = log_probs.grad
-        
-        grad = grad.numpy()
-
-        return grad
-
-    def npu_op_exec(self, ctc_loss, log_probs, targets, input_lengths, target_lengths):
-        log_probs = copy.deepcopy(log_probs).npu()
-        targets = targets.npu()
-        log_probs.requires_grad_(True)
-        log_probs.retain_grad()
-
-        neg_log_likelihood = ctc_loss(log_probs.log_softmax(2), targets, input_lengths.npu(), target_lengths.npu())
-        neg_log_likelihood.backward()
-        grad = log_probs.grad
-        
-        if grad.dtype == torch.float16:
-            grad = grad.to(torch.float32)
-        
-        grad = grad.cpu().numpy()
-
-        return grad
-
-    def test_ctc_loss_backward(self, device):
-        sizes_list = [[50, 20, 16, 30, 10], [26, 37, 2560, 18, 10]]
-        para_reduction = ["sum", "mean"]
-        dtype = [np.float32]  # Insufficient accuracy when use fp16 data
-        shape_format = [
-            [i, j, k] for i in sizes_list for j in dtype for k in para_reduction
-        ]
-
-        for item in shape_format:
-            ctc_loss, log_probs, targets, input_lengths, target_lengths = self.generate_data(item)
-
-            grad_cpu = self.cpu_op_exec(ctc_loss, log_probs, targets, input_lengths, target_lengths)
-            grad_npu = self.npu_op_exec(ctc_loss, log_probs, targets, input_lengths, target_lengths)
-            
-            self.assertRtolEqual(grad_cpu, grad_npu, 1e-3)
-
-
-
-instantiate_device_type_tests(TestCtcLossBackward, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestCtcLossBackward(TestCase):
+    def generate_data(self, item):
+        T = item[0][0]
+        C = item[0][1]
+        N = item[0][2]
+        S = item[0][3]
+        S_min = item[0][4]
+        dtype = item[1]
+        reduction_str = item[2] 
+
+        log_probs = np.random.uniform(-10, 10, (T, N, C)).astype(dtype)
+        targets = torch.randint(1, C, (N, S), dtype = torch.long)
+        input_lengths = torch.full((N,), T, dtype=torch.long)
+        target_lengths = torch.randint(S_min, S, (N,), dtype=torch.long)
+
+        # modify from numpy.ndarray to torch.tensor
+        log_probs = torch.from_numpy(log_probs)
+        
+        ctc_loss = torch.nn.CTCLoss(zero_infinity=True, reduction=reduction_str)
+
+        return ctc_loss, log_probs, targets, input_lengths, target_lengths
+
+    def cpu_op_exec(self, ctc_loss, log_probs, targets, input_lengths, target_lengths):
+        if log_probs.dtype == torch.float16:
+            log_probs = log_probs.to(torch.float32)
+    
+        log_probs.requires_grad_(True)
+        log_probs.retain_grad()
+
+        neg_log_likelihood = ctc_loss(log_probs.log_softmax(2), targets, input_lengths, target_lengths)
+        neg_log_likelihood.backward()
+        grad = log_probs.grad
+        
+        grad = grad.numpy()
+
+        return grad
+
+    def npu_op_exec(self, ctc_loss, log_probs, targets, input_lengths, target_lengths):
+        log_probs = copy.deepcopy(log_probs).npu()
+        targets = targets.npu()
+        log_probs.requires_grad_(True)
+        log_probs.retain_grad()
+
+        neg_log_likelihood = ctc_loss(log_probs.log_softmax(2), targets, input_lengths.npu(), target_lengths.npu())
+        neg_log_likelihood.backward()
+        grad = log_probs.grad
+        
+        if grad.dtype == torch.float16:
+            grad = grad.to(torch.float32)
+        
+        grad = grad.cpu().numpy()
+
+        return grad
+
+    def test_ctc_loss_backward(self, device):
+        sizes_list = [[50, 20, 16, 30, 10], [26, 37, 2560, 18, 10]]
+        para_reduction = ["sum", "mean"]
+        dtype = [np.float32]  # Insufficient accuracy when use fp16 data
+        shape_format = [
+            [i, j, k] for i in sizes_list for j in dtype for k in para_reduction
+        ]
+
+        for item in shape_format:
+            ctc_loss, log_probs, targets, input_lengths, target_lengths = self.generate_data(item)
+
+            grad_cpu = self.cpu_op_exec(ctc_loss, log_probs, targets, input_lengths, target_lengths)
+            grad_npu = self.npu_op_exec(ctc_loss, log_probs, targets, input_lengths, target_lengths)
+            
+            self.assertRtolEqual(grad_cpu, grad_npu, 1e-3)
+
+
+
+instantiate_device_type_tests(TestCtcLossBackward, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_cudnn_is_acceptable.py b/test/test_npu/test_network_ops/test_cudnn_is_acceptable.py
index 3bbc4b872843dd4e1e7eeab9e38fd9b656683872..d32be4ae6d50a8e8df3c5bd3498ef2aca1b55d58 100644
--- a/test/test_npu/test_network_ops/test_cudnn_is_acceptable.py
+++ b/test/test_npu/test_network_ops/test_cudnn_is_acceptable.py
@@ -1,38 +1,38 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
- 
-class TestCudnnIsAcceptable(TestCase):
-    def test_cudnn_is_acceptable_common_shape_format(self, device):        
-        shape_format = [
-                [[np.float16, 0, 1]],
-                [[np.float16, 0, 5]],
-                [[np.float32, 4, 3]],
-                [[np.float32, 29, 4]]
-        ]
-        for shape in shape_format:
-            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
-            cpu_output = np.array([torch.cudnn_is_acceptable(cpu_input)]).astype(np.float32)
-            npu_output = np.array([torch.cudnn_is_acceptable(npu_input)]).astype(np.float32)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestCudnnIsAcceptable, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+ 
+class TestCudnnIsAcceptable(TestCase):
+    def test_cudnn_is_acceptable_common_shape_format(self, device):        
+        shape_format = [
+                [[np.float16, 0, 1]],
+                [[np.float16, 0, 5]],
+                [[np.float32, 4, 3]],
+                [[np.float32, 29, 4]]
+        ]
+        for shape in shape_format:
+            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
+            cpu_output = np.array([torch.cudnn_is_acceptable(cpu_input)]).astype(np.float32)
+            npu_output = np.array([torch.cudnn_is_acceptable(npu_input)]).astype(np.float32)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestCudnnIsAcceptable, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_div.py b/test/test_npu/test_network_ops/test_div.py
old mode 100644
new mode 100755
index 0050359a335d0cccd282743d84a905e8cf975b26..d691dc286b08a1dcfb7a921fc81911198e629ec0
--- a/test/test_npu/test_network_ops/test_div.py
+++ b/test/test_npu/test_network_ops/test_div.py
@@ -1,120 +1,120 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-import unittest
-from util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
-from common_device_type import dtypes, instantiate_device_type_tests
-
-class TestDiv(TestCase):
-    def get_outputs(self, cpu_args, npu_args, dtype):
-        # cpu not support fp16 div
-        cpu_args = [i.float() if dtype==torch.half else i for i in cpu_args]
-        cpu_output = torch.div(cpu_args[0], cpu_args[1]).to(dtype).numpy()
-        npu_output = torch.div(npu_args[0], npu_args[1]).to("cpu").numpy()
-        return cpu_output, npu_output
-    
-    def get_outputs_chk(self, cpu_args, npu_args, dtype):
-        # cpu not support fp16 div
-        cpu_out = torch.randn(6).to(dtype)
-        npu_out = torch.randn(6).to("npu").to(dtype)
-        cpu_args = [i.float() if dtype==torch.half else i for i in cpu_args]
-        torch.div(cpu_args[0], cpu_args[1], out = cpu_out)
-        torch.div(npu_args[0], npu_args[1], out = npu_out)
-        cpu_output = cpu_out.to(dtype).numpy()
-        npu_output = npu_out.to("cpu").numpy()
-        return cpu_output, npu_output
-
-    def test_div_broadcast(self, device):
-        for item in test_2args_broadcast(torch.div):
-            self.assertRtolEqual(item[0], item[1])
-
-    # div not support bool
-    @dtypes(torch.float, torch.half, torch.int)
-    def test_div_dtype(self, device, dtype):
-        cpu_input1, npu_input1 = create_dtype_tensor((2,3,4,5), dtype)
-        # divisor can not be zero
-        cpu_input2, npu_input2 = create_dtype_tensor((2,3,4,5), dtype, no_zero=True)
-        cpu_output, npu_output = self.get_outputs([cpu_input1, cpu_input2], [npu_input1, npu_input2], dtype)
-
-        # div 在int结果为负数时采用截断而不是向下取整的方式取整，所以选用numpy比较
-        if dtype == torch.int:
-            cpu_output = np.floor_divide(cpu_input1.numpy(), cpu_input2.numpy())
-
-        self.assertRtolEqual(cpu_output, npu_output)
-        
-    @unittest.skipIf(UT_FAST_MODE, "Run UT in fast mode")
-    def test_div_shape_format_fp16(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output, npu_output = self.get_outputs([cpu_input1, cpu_input2], [npu_input1, npu_input2], torch.half)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    @unittest.skipIf(UT_FAST_MODE, "Run UT in fast mode")
-    def test_div_shape_format_fp32(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7), (2, 0, 2)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
-            cpu_output, npu_output = self.get_outputs([cpu_input1, cpu_input2], [npu_input1, npu_input2], torch.float)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_div_mix_dtype_1(self, device):
-        npu_input1, npu_input2 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100)
-        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        cpu_output, npu_output = self.get_outputs([npu_input1, npu_input3], [npu_input2, npu_input4], torch.float)
-        self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test_div_mix_dtype_2(self, device):
-        npu_input1, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        npu_input3 = torch.tensor(3).int()
-        cpu_output, npu_output = self.get_outputs([npu_input1, npu_input3], [npu_input2, npu_input3], torch.float)
-        self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_div_scalar_dtype(self, device):
-        cpu_input1, npu_input1 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100)
-        cpu_output = cpu_input1 / 0.5
-        npu_output = npu_input1 / 0.5
-        self.assertRtolEqual(cpu_output, npu_output.cpu())
-        
-    @unittest.skipIf(UT_FAST_MODE, "Run UT in fast mode")
-    def test_div_shape_format_fp32(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
-            cpu_output, npu_output = self.get_outputs_chk([cpu_input1, cpu_input2], [npu_input1, npu_input2], torch.float)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestDiv, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+import unittest
+from util_test import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+from common_device_type import dtypes, instantiate_device_type_tests
+
+class TestDiv(TestCase):
+    def get_outputs(self, cpu_args, npu_args, dtype):
+        # cpu not support fp16 div
+        cpu_args = [i.float() if dtype==torch.half else i for i in cpu_args]
+        cpu_output = torch.div(cpu_args[0], cpu_args[1]).to(dtype).numpy()
+        npu_output = torch.div(npu_args[0], npu_args[1]).to("cpu").numpy()
+        return cpu_output, npu_output
+    
+    def get_outputs_chk(self, cpu_args, npu_args, dtype):
+        # cpu not support fp16 div
+        cpu_out = torch.randn(6).to(dtype)
+        npu_out = torch.randn(6).to("npu").to(dtype)
+        cpu_args = [i.float() if dtype==torch.half else i for i in cpu_args]
+        torch.div(cpu_args[0], cpu_args[1], out = cpu_out)
+        torch.div(npu_args[0], npu_args[1], out = npu_out)
+        cpu_output = cpu_out.to(dtype).numpy()
+        npu_output = npu_out.to("cpu").numpy()
+        return cpu_output, npu_output
+
+    def test_div_broadcast(self, device):
+        for item in test_2args_broadcast(torch.div):
+            self.assertRtolEqual(item[0], item[1])
+
+    # div not support bool
+    @dtypes(torch.float, torch.half, torch.int)
+    def test_div_dtype(self, device, dtype):
+        cpu_input1, npu_input1 = create_dtype_tensor((2,3,4,5), dtype)
+        # divisor can not be zero
+        cpu_input2, npu_input2 = create_dtype_tensor((2,3,4,5), dtype, no_zero=True)
+        cpu_output, npu_output = self.get_outputs([cpu_input1, cpu_input2], [npu_input1, npu_input2], dtype)
+
+        # div 在int结果为负数时采用截断而不是向下取整的方式取整，所以选用numpy比较
+        if dtype == torch.int:
+            cpu_output = np.floor_divide(cpu_input1.numpy(), cpu_input2.numpy())
+
+        self.assertRtolEqual(cpu_output, npu_output)
+        
+    @unittest.skipIf(UT_FAST_MODE, "Run UT in fast mode")
+    def test_div_shape_format_fp16(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output, npu_output = self.get_outputs([cpu_input1, cpu_input2], [npu_input1, npu_input2], torch.half)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    @unittest.skipIf(UT_FAST_MODE, "Run UT in fast mode")
+    def test_div_shape_format_fp32(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7), (2, 0, 2)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            cpu_output, npu_output = self.get_outputs([cpu_input1, cpu_input2], [npu_input1, npu_input2], torch.float)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_div_mix_dtype_1(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100)
+        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output, npu_output = self.get_outputs([npu_input1, npu_input3], [npu_input2, npu_input4], torch.float)
+        self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test_div_mix_dtype_2(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        npu_input3 = torch.tensor(3).int()
+        cpu_output, npu_output = self.get_outputs([npu_input1, npu_input3], [npu_input2, npu_input3], torch.float)
+        self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_div_scalar_dtype(self, device):
+        cpu_input1, npu_input1 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100)
+        cpu_output = cpu_input1 / 0.5
+        npu_output = npu_input1 / 0.5
+        self.assertRtolEqual(cpu_output, npu_output.cpu())
+        
+    @unittest.skipIf(UT_FAST_MODE, "Run UT in fast mode")
+    def test_div_shape_format_fp32(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [1, (64, 10), (32, 3, 3), (256, 2048, 7, 7)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            cpu_output, npu_output = self.get_outputs_chk([cpu_input1, cpu_input2], [npu_input1, npu_input2], torch.float)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestDiv, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_dropout.py b/test/test_npu/test_network_ops/test_dropout.py
old mode 100644
new mode 100755
index 500302a6edbc0f78c26797058326cae2f1dd7b5b..7275d1549f1d9a9f90a0542254518da4d96b8dd0
--- a/test/test_npu/test_network_ops/test_dropout.py
+++ b/test/test_npu/test_network_ops/test_dropout.py
@@ -1,70 +1,70 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestDropOutDoMask(TestCase):
-    def cpu_op_exec(self, input):
-        out = torch.nn.Dropout(0.5)(input)
-        out = out.numpy()
-        return out
-
-    def npu_op_exec(self, input):
-        out = torch.nn.Dropout(0.5)(input)
-        out = out.to("cpu")
-        out = out.numpy()
-        return out
-
-    def dropout_list_exec(self, list):
-        epsilon = 1e-3
-        for item in list:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            # 该算子随机结果的比较方式
-            for a, b in zip(cpu_output.flatten(), npu_output.flatten()):
-                if abs(a) > 0 and abs(b) > 0 and abs(a - b) > epsilon:
-                    print(f'input = {item}, ERROR!')
-                    break
-            else:
-                print(f'input = {item}, Successfully!')
-
-    def test_op_shape_format_fp16(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [1, (256, 1280), (32, 3, 3), (256, 2048, 7, 7)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        self.dropout_list_exec(shape_format)
-
-    def test_op_shape_format_fp32(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [1, (256, 1280), (32, 3, 3), (256, 2048, 7, 7)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        self.dropout_list_exec(shape_format)
-
-instantiate_device_type_tests(TestDropOutDoMask, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestDropOutDoMask(TestCase):
+    def cpu_op_exec(self, input):
+        out = torch.nn.Dropout(0.5)(input)
+        out = out.numpy()
+        return out
+
+    def npu_op_exec(self, input):
+        out = torch.nn.Dropout(0.5)(input)
+        out = out.to("cpu")
+        out = out.numpy()
+        return out
+
+    def dropout_list_exec(self, list):
+        epsilon = 1e-3
+        for item in list:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            # 该算子随机结果的比较方式
+            for a, b in zip(cpu_output.flatten(), npu_output.flatten()):
+                if abs(a) > 0 and abs(b) > 0 and abs(a - b) > epsilon:
+                    print(f'input = {item}, ERROR!')
+                    break
+            else:
+                print(f'input = {item}, Successfully!')
+
+    def test_op_shape_format_fp16(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [1, (256, 1280), (32, 3, 3), (256, 2048, 7, 7)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        self.dropout_list_exec(shape_format)
+
+    def test_op_shape_format_fp32(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [1, (256, 1280), (32, 3, 3), (256, 2048, 7, 7)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        self.dropout_list_exec(shape_format)
+
+instantiate_device_type_tests(TestDropOutDoMask, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_dropoutv2.py b/test/test_npu/test_network_ops/test_dropoutv2.py
index 43953b5e61bc984dd452ced5c7c874565041cd67..66d93ca1f48dceb7d0626054bd0679a095167d6a 100644
--- a/test/test_npu/test_network_ops/test_dropoutv2.py
+++ b/test/test_npu/test_network_ops/test_dropoutv2.py
@@ -1,43 +1,43 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestDropOutV2(TestCase):
-   def _gen_seeds(self, shape):
-       return np.random.uniform(1, 10, size=shape).astype(np.float32)
-
-   def npu_op_exec(self, input1, seed, prob):
-        output, mask, seed = torch.npu_dropoutV2(input1, seed, p = prob)
-        output = output.to("cpu")
-        output = output.numpy()
-        mask   = mask.to("cpu")
-        mask   = mask.numpy()
-        return output, mask
-
-   def test_dropoutV2(self, device):
-        input    = torch.tensor([1.,2.,3.,4.]).npu()
-        seed_shape = (int(32 * 1024 * 12),)
-        seed = self._gen_seeds(seed_shape)
-        seed = torch.from_numpy(seed).to("npu")
-        prob     = 0.3
-        output, mask   = self.npu_op_exec(input, seed, prob) #result is random,only check api can exec success!
-
-instantiate_device_type_tests(TestDropOutV2, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestDropOutV2(TestCase):
+   def _gen_seeds(self, shape):
+       return np.random.uniform(1, 10, size=shape).astype(np.float32)
+
+   def npu_op_exec(self, input1, seed, prob):
+        output, mask, seed = torch.npu_dropoutV2(input1, seed, p = prob)
+        output = output.to("cpu")
+        output = output.numpy()
+        mask   = mask.to("cpu")
+        mask   = mask.numpy()
+        return output, mask
+
+   def test_dropoutV2(self, device):
+        input    = torch.tensor([1.,2.,3.,4.]).npu()
+        seed_shape = (int(32 * 1024 * 12),)
+        seed = self._gen_seeds(seed_shape)
+        seed = torch.from_numpy(seed).to("npu")
+        prob     = 0.3
+        output, mask   = self.npu_op_exec(input, seed, prob) #result is random,only check api can exec success!
+
+instantiate_device_type_tests(TestDropOutV2, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_dropoutv2backward.py b/test/test_npu/test_network_ops/test_dropoutv2backward.py
index c0de31fecb7d5b02b6e09ff5ceaa758cabf3bd57..62361c7cd46e8fa96a7dcd8d1904ce4ef1e23516 100644
--- a/test/test_npu/test_network_ops/test_dropoutv2backward.py
+++ b/test/test_npu/test_network_ops/test_dropoutv2backward.py
@@ -1,50 +1,50 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestDropOutV2(TestCase):
-   def _gen_seeds(self, shape):
-       return np.random.uniform(1, 10, size=shape).astype(np.float32)
-
-   def npu_op_exec(self, input1, seed, prob):
-        input1.requires_grad = True
-        output, mask, seed = torch.npu_dropoutV2(input1, seed, p = prob)
-        output.backward(torch.ones_like(output))
-        output = output.to("cpu")
-        output = output.detach().numpy()
-        mask   = mask.to("cpu")
-        mask   = mask.numpy()
-
-        output_grad = input1.grad
-        output_grad = output_grad.to("cpu")
-        output_grad = output_grad.detach().numpy()
-
-        return output_grad, output, mask
-
-   def test_dropoutV2backward(self, device):
-        input    = torch.tensor([1.,2.,3.,4.]).npu()
-        seed_shape = (int(32 * 1024 * 12),)
-        seed = self._gen_seeds(seed_shape)
-        seed = torch.from_numpy(seed).to("npu")
-        prob     = 0.3
-        output_grad, output, mask   = self.npu_op_exec(input, seed, prob) #result is random,only check api can exec success!
-
-instantiate_device_type_tests(TestDropOutV2, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestDropOutV2(TestCase):
+   def _gen_seeds(self, shape):
+       return np.random.uniform(1, 10, size=shape).astype(np.float32)
+
+   def npu_op_exec(self, input1, seed, prob):
+        input1.requires_grad = True
+        output, mask, seed = torch.npu_dropoutV2(input1, seed, p = prob)
+        output.backward(torch.ones_like(output))
+        output = output.to("cpu")
+        output = output.detach().numpy()
+        mask   = mask.to("cpu")
+        mask   = mask.numpy()
+
+        output_grad = input1.grad
+        output_grad = output_grad.to("cpu")
+        output_grad = output_grad.detach().numpy()
+
+        return output_grad, output, mask
+
+   def test_dropoutV2backward(self, device):
+        input    = torch.tensor([1.,2.,3.,4.]).npu()
+        seed_shape = (int(32 * 1024 * 12),)
+        seed = self._gen_seeds(seed_shape)
+        seed = torch.from_numpy(seed).to("npu")
+        prob     = 0.3
+        output_grad, output, mask   = self.npu_op_exec(input, seed, prob) #result is random,only check api can exec success!
+
+instantiate_device_type_tests(TestDropOutV2, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_embedding_backward.py b/test/test_npu/test_network_ops/test_embedding_backward.py
old mode 100644
new mode 100755
index 4c88fe91554c6539914fe9d85290f45dcc560423..0603e9ff96b753c5e12d5d47121605f58676d3ad
--- a/test/test_npu/test_network_ops/test_embedding_backward.py
+++ b/test/test_npu/test_network_ops/test_embedding_backward.py
@@ -1,66 +1,66 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-import torch.nn.functional as F
-
-
-class TestEmbeddingBackward(TestCase):
-    def cpu_op_exec(self, weight, indices):
-        weight.requires_grad_(True)
-        out = F.embedding(indices, weight, scale_grad_by_freq=True, padding_idx=37)
-        out.backward(torch.ones_like(out))
-        grad_cpu = weight.grad
-        return out.detach().numpy(), grad_cpu.detach().numpy()
-
-    def npu_op_exec(self, weight, indices):
-        weight.requires_grad_(True)
-        out = F.embedding(indices, weight, scale_grad_by_freq=True, padding_idx=37)
-        out.backward(torch.ones_like(out))
-        out_npu = out.to("cpu")
-        grad_npu = weight.grad
-        grad_npu = grad_npu.to("cpu")
-        return out_npu.detach().numpy(), grad_npu.detach().numpy()
-
-    def test_embedding_backward_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list1 = [[40, 32], [40, 1024], [40000, 1024], [33712, 1024]]
-        shape_list2 = [[40], [40], [3125], [64, 7]]
-        shape_format1 = [
-            [np.float32, i, j] for i in format_list for j in shape_list1
-        ]
-        shape_format2 = [
-            [np.int64, i, j] for i in format_list for j in shape_list2
-        ]
-        shape_format = [
-            [i, j] for i in shape_format1 for j in shape_format2
-        ]
-        for item in shape_format:
-            weight_cpu, weight_npu = create_common_tensor(item[0], 1, 1)
-            indices_cpu, indices_npu = create_common_tensor(item[1], 0, min(item[0][2][0:-1]))
-
-            cpu_out, cpu_grad = self.cpu_op_exec(weight_cpu, indices_cpu)
-            npu_out, npu_grad = self.npu_op_exec(weight_npu, indices_npu)
-
-            self.assertRtolEqual(cpu_out, npu_out)
-            self.assertRtolEqual(cpu_grad, npu_grad)
-
-
-instantiate_device_type_tests(TestEmbeddingBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
-
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+import torch.nn.functional as F
+
+
+class TestEmbeddingBackward(TestCase):
+    def cpu_op_exec(self, weight, indices):
+        weight.requires_grad_(True)
+        out = F.embedding(indices, weight, scale_grad_by_freq=True, padding_idx=37)
+        out.backward(torch.ones_like(out))
+        grad_cpu = weight.grad
+        return out.detach().numpy(), grad_cpu.detach().numpy()
+
+    def npu_op_exec(self, weight, indices):
+        weight.requires_grad_(True)
+        out = F.embedding(indices, weight, scale_grad_by_freq=True, padding_idx=37)
+        out.backward(torch.ones_like(out))
+        out_npu = out.to("cpu")
+        grad_npu = weight.grad
+        grad_npu = grad_npu.to("cpu")
+        return out_npu.detach().numpy(), grad_npu.detach().numpy()
+
+    def test_embedding_backward_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list1 = [[40, 32], [40, 1024], [40000, 1024], [33712, 1024]]
+        shape_list2 = [[40], [40], [3125], [64, 7]]
+        shape_format1 = [
+            [np.float32, i, j] for i in format_list for j in shape_list1
+        ]
+        shape_format2 = [
+            [np.int64, i, j] for i in format_list for j in shape_list2
+        ]
+        shape_format = [
+            [i, j] for i in shape_format1 for j in shape_format2
+        ]
+        for item in shape_format:
+            weight_cpu, weight_npu = create_common_tensor(item[0], 1, 1)
+            indices_cpu, indices_npu = create_common_tensor(item[1], 0, min(item[0][2][0:-1]))
+
+            cpu_out, cpu_grad = self.cpu_op_exec(weight_cpu, indices_cpu)
+            npu_out, npu_grad = self.npu_op_exec(weight_npu, indices_npu)
+
+            self.assertRtolEqual(cpu_out, npu_out)
+            self.assertRtolEqual(cpu_grad, npu_grad)
+
+
+instantiate_device_type_tests(TestEmbeddingBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_npu/test_network_ops/test_embedding_bag.py b/test/test_npu/test_network_ops/test_embedding_bag.py
index e6ca989223ae2b90a4402d93ef1ce24fc8454612..4ee9f0b676ebfd5f7f242d9f4f93ca799da9dfa6 100644
--- a/test/test_npu/test_network_ops/test_embedding_bag.py
+++ b/test/test_npu/test_network_ops/test_embedding_bag.py
@@ -1,44 +1,44 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-import torch.nn.functional as F
-
-class TestEmbeddingBag(TestCase):
-    def test_embedding_bag_1d(self, device):
-        cpu_weight = torch.rand(10, 3)
-        cpu_indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
-        cpu_offsets = torch.tensor([0, 4])
-        npu_weight = cpu_weight.npu()
-        npu_indices = cpu_indices.npu()
-        npu_offsets = cpu_offsets.npu()
-        cpu_output = F.embedding_bag(cpu_weight, cpu_indices, cpu_offsets).detach().numpy()
-        npu_output = F.embedding_bag(npu_weight, npu_indices, npu_offsets).cpu().detach().numpy()
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_embedding_bag_2d(self, device):
-        cpu_weight = torch.rand(10, 3)
-        cpu_indices = torch.tensor([[1, 2, 4, 5, 4, 3, 2, 9], [1, 2, 4, 5, 4, 3, 2, 9]])
-        npu_weight = cpu_weight.npu()
-        npu_indices = cpu_indices.npu()
-        cpu_output = F.embedding_bag(cpu_weight, cpu_indices).detach().numpy()
-        npu_output = F.embedding_bag(npu_weight, npu_indices).cpu().detach().numpy()
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestEmbeddingBag, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+import torch.nn.functional as F
+
+class TestEmbeddingBag(TestCase):
+    def test_embedding_bag_1d(self, device):
+        cpu_weight = torch.rand(10, 3)
+        cpu_indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
+        cpu_offsets = torch.tensor([0, 4])
+        npu_weight = cpu_weight.npu()
+        npu_indices = cpu_indices.npu()
+        npu_offsets = cpu_offsets.npu()
+        cpu_output = F.embedding_bag(cpu_weight, cpu_indices, cpu_offsets).detach().numpy()
+        npu_output = F.embedding_bag(npu_weight, npu_indices, npu_offsets).cpu().detach().numpy()
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_embedding_bag_2d(self, device):
+        cpu_weight = torch.rand(10, 3)
+        cpu_indices = torch.tensor([[1, 2, 4, 5, 4, 3, 2, 9], [1, 2, 4, 5, 4, 3, 2, 9]])
+        npu_weight = cpu_weight.npu()
+        npu_indices = cpu_indices.npu()
+        cpu_output = F.embedding_bag(cpu_weight, cpu_indices).detach().numpy()
+        npu_output = F.embedding_bag(npu_weight, npu_indices).cpu().detach().numpy()
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestEmbeddingBag, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_embedding_renorm.py b/test/test_npu/test_network_ops/test_embedding_renorm.py
similarity index 97%
rename from test/test_npu/test_embedding_renorm.py
rename to test/test_npu/test_network_ops/test_embedding_renorm.py
index 51f06efe73e646ebd64fb4c482adc83d12fe406a..3f791824546f14e0e2504e90e4ae38f5d8be9a94 100644
--- a/test/test_npu/test_embedding_renorm.py
+++ b/test/test_npu/test_network_ops/test_embedding_renorm.py
@@ -1,117 +1,115 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestEmbeddingRenorm(TestCase):
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.LongTensor(np.random.uniform(0,shape[0], int(shape[0]/2,)).astype(np.int32))
-        #npu_input2=torch.LongTensor([[0,1,1,0,1],[0,1,1,0,1],[1,0,1,1,2]])
-        return npu_input1, npu_input2
-
-    def cpu_op_exec(self, input1, input2, max_norm, norm_type):
-        stype = input1.dtype
-        if stype == torch.float16:
-            input1 = input1.float()
-        output = torch.embedding_renorm_(input1, input2, max_norm=max_norm, norm_type=norm_type)
-        if stype == torch.float16:
-            output = output.half()
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2, max_norm,norm_type):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.embedding_renorm_(input1, input2, max_norm=max_norm, norm_type=norm_type)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_embedding_renorm_float16_2(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (5, 3), np.float16)
-        cpu_input1 = copy.deepcopy(npu_input1)
-        cpu_input2 = copy.deepcopy(npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.1, 2)
-        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.1, 2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_embedding_renorm_float16_0(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (10, 4),np.float16)
-        cpu_input1 = copy.deepcopy(npu_input1)
-        cpu_input2 = copy.deepcopy(npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.2, 0)
-        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.2, 0)       
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_embedding_renorm_float16_1(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3), np.float16)
-        cpu_input1 = copy.deepcopy(npu_input1)
-        cpu_input2 = copy.deepcopy(npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.5, 1)
-        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.5, 1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_embedding_renorm_float16_10(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (4, 6), np.float16)
-        cpu_input1 = copy.deepcopy(npu_input1)
-        cpu_input2 = copy.deepcopy(npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 1.0, 10)
-        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 1.0, 10)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_embedding_renorm_float32_2(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (5, 3), np.float32)
-        cpu_input1 = copy.deepcopy(npu_input1)
-        cpu_input2 = copy.deepcopy(npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.1, 2)
-        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.1, 2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_embedding_renorm_float32_0(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (10, 4), np.float32)
-        cpu_input1 = copy.deepcopy(npu_input1)
-        cpu_input2 = copy.deepcopy(npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.2, 0)
-        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.2, 0)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_embedding_renorm_float32_1(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3), np.float32)
-        cpu_input1 = copy.deepcopy(npu_input1)
-        cpu_input2 = copy.deepcopy(npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.5, 1)
-        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.5, 1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_embedding_renorm_float32_10(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (4,6), np.float32)
-        cpu_input1 = copy.deepcopy(npu_input1)
-        cpu_input2 = copy.deepcopy(npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 1.0, 10)        
-        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 1.0, 10)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestEmbeddingRenorm, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestEmbeddingRenorm(TestCase):
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.LongTensor(np.random.uniform(0,shape[0], int(shape[0]/2,)).astype(np.int32))
+        
+        return npu_input1, npu_input2
+
+    def cpu_op_exec(self, input1, input2, max_norm, norm_type):
+        stype = input1.dtype
+        if stype == torch.float16:
+            input1 = input1.float()
+        output = torch.embedding_renorm_(input1, input2, max_norm=max_norm, norm_type=norm_type)
+        if stype == torch.float16:
+            output = output.half()
+        return output
+
+    def npu_op_exec(self, input1, input2, max_norm,norm_type):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.embedding_renorm_(input1, input2, max_norm=max_norm, norm_type=norm_type)
+        output = output.to("cpu")
+        return output
+
+    def test_embedding_renorm_float16_2(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (5, 3), np.float16)
+        cpu_input1 = copy.deepcopy(npu_input1)
+        cpu_input2 = copy.deepcopy(npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.1, 2)
+        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.1, 2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_embedding_renorm_float16_0(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (10, 4),np.float16)
+        cpu_input1 = copy.deepcopy(npu_input1)
+        cpu_input2 = copy.deepcopy(npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.2, 0)
+        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.2, 0)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_embedding_renorm_float16_1(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3), np.float16)
+        cpu_input1 = copy.deepcopy(npu_input1)
+        cpu_input2 = copy.deepcopy(npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.5, 1)
+        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.5, 1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_embedding_renorm_float16_10(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (4, 6), np.float16)
+        cpu_input1 = copy.deepcopy(npu_input1)
+        cpu_input2 = copy.deepcopy(npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 1.0, 10)
+        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 1.0, 10)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_embedding_renorm_float32_2(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (5, 3), np.float32)
+        cpu_input1 = copy.deepcopy(npu_input1)
+        cpu_input2 = copy.deepcopy(npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.1, 2)
+        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.1, 2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_embedding_renorm_float32_0(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (10, 4), np.float32)
+        cpu_input1 = copy.deepcopy(npu_input1)
+        cpu_input2 = copy.deepcopy(npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.2, 0)
+        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.2, 0)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_embedding_renorm_float32_1(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (3, 3), np.float32)
+        cpu_input1 = copy.deepcopy(npu_input1)
+        cpu_input2 = copy.deepcopy(npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.5, 1)
+        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 0.5, 1)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_embedding_renorm_float32_10(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (4,6), np.float32)
+        cpu_input1 = copy.deepcopy(npu_input1)
+        cpu_input2 = copy.deepcopy(npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 1.0, 10)        
+        cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, 1.0, 10)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestEmbeddingRenorm, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_embeddingdensebackward.py b/test/test_npu/test_network_ops/test_embeddingdensebackward.py
index c86e4dbfd23fa1cc5c3da0eaab5bbff35c1c30e3..347e242b6212fe3cfbea1e3ea06188b964015480 100644
--- a/test/test_npu/test_network_ops/test_embeddingdensebackward.py
+++ b/test/test_npu/test_network_ops/test_embeddingdensebackward.py
@@ -1,66 +1,66 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-import torch.nn.functional as F
-
-
-class TestEmbeddingDenseBackward(TestCase):
-    def cpu_op_exec(self, weight, indices):
-        weight.requires_grad_(True)
-        out = F.embedding(indices, weight, scale_grad_by_freq=True, padding_idx=37)
-        out.backward(torch.ones_like(out))
-        grad_cpu = weight.grad
-        return out.detach().numpy(), grad_cpu.detach().numpy()
-
-    def npu_op_exec(self, weight, indices):
-        weight.requires_grad_(True)
-        out = F.embedding(indices, weight, scale_grad_by_freq=True, padding_idx=37)
-        out.backward(torch.ones_like(out))
-        out_npu = out.to("cpu")
-        grad_npu = weight.grad
-        grad_npu = grad_npu.to("cpu")
-        return out_npu.detach().numpy(), grad_npu.detach().numpy()
-
-    def test_embedding_dense_backward_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list1 = [[40, 32], [40, 1024], [40000, 1024], [33712, 1024]]
-        shape_list2 = [[40], [40], [3125], [64, 7]]
-        shape_format1 = [
-            [np.float32, i, j] for i in format_list for j in shape_list1
-        ]
-        shape_format2 = [
-            [np.int64, i, j] for i in format_list for j in shape_list2
-        ]
-        shape_format = [
-            [i, j] for i in shape_format1 for j in shape_format2
-        ]
-        for item in shape_format:
-            weight_cpu, weight_npu = create_common_tensor(item[0], 1, 1)
-            indices_cpu, indices_npu = create_common_tensor(item[1], 0, min(item[0][2][0:-1]))
-
-            cpu_out, cpu_grad = self.cpu_op_exec(weight_cpu, indices_cpu)
-            npu_out, npu_grad = self.npu_op_exec(weight_npu, indices_npu)
-
-            self.assertRtolEqual(cpu_out, npu_out)
-            self.assertRtolEqual(cpu_grad, npu_grad)
-
-
-instantiate_device_type_tests(TestEmbeddingDenseBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
-
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+import torch.nn.functional as F
+
+
+class TestEmbeddingDenseBackward(TestCase):
+    def cpu_op_exec(self, weight, indices):
+        weight.requires_grad_(True)
+        out = F.embedding(indices, weight, scale_grad_by_freq=True, padding_idx=37)
+        out.backward(torch.ones_like(out))
+        grad_cpu = weight.grad
+        return out.detach().numpy(), grad_cpu.detach().numpy()
+
+    def npu_op_exec(self, weight, indices):
+        weight.requires_grad_(True)
+        out = F.embedding(indices, weight, scale_grad_by_freq=True, padding_idx=37)
+        out.backward(torch.ones_like(out))
+        out_npu = out.to("cpu")
+        grad_npu = weight.grad
+        grad_npu = grad_npu.to("cpu")
+        return out_npu.detach().numpy(), grad_npu.detach().numpy()
+
+    def test_embedding_dense_backward_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list1 = [[40, 32], [40, 1024], [40000, 1024], [33712, 1024]]
+        shape_list2 = [[40], [40], [3125], [64, 7]]
+        shape_format1 = [
+            [np.float32, i, j] for i in format_list for j in shape_list1
+        ]
+        shape_format2 = [
+            [np.int64, i, j] for i in format_list for j in shape_list2
+        ]
+        shape_format = [
+            [i, j] for i in shape_format1 for j in shape_format2
+        ]
+        for item in shape_format:
+            weight_cpu, weight_npu = create_common_tensor(item[0], 1, 1)
+            indices_cpu, indices_npu = create_common_tensor(item[1], 0, min(item[0][2][0:-1]))
+
+            cpu_out, cpu_grad = self.cpu_op_exec(weight_cpu, indices_cpu)
+            npu_out, npu_grad = self.npu_op_exec(weight_npu, indices_npu)
+
+            self.assertRtolEqual(cpu_out, npu_out)
+            self.assertRtolEqual(cpu_grad, npu_grad)
+
+
+instantiate_device_type_tests(TestEmbeddingDenseBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_npu/test_network_ops/test_empty_strided.py b/test/test_npu/test_network_ops/test_empty_strided.py
index e0fdf0278643528449128282133c92761afcbb69..090b9cc6c19ffe5c949b7a418d5286852f5921fd 100644
--- a/test/test_npu/test_network_ops/test_empty_strided.py
+++ b/test/test_npu/test_network_ops/test_empty_strided.py
@@ -1,44 +1,44 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestEmptyStrided(TestCase):
-    def test_empty_strided(self, device):
-        for shape in [(2, 3, 4), (0, 2, 0)]:
-            # some of these cases are pretty strange, just verifying that if as_strided
-            # allows them then empty_strided can as well.
-            for strides in [(12, 4, 1), (2, 4, 6), (0, 0, 0)]:
-                empty_strided = torch.empty_strided(shape, strides, device=device)
-                # as_strided checks the storage size is big enough to support such a strided tensor;
-                # instead of repeating this calculation, we just use empty_strided which does the same
-                # calculation when setting the storage size.
-                as_strided = torch.empty(empty_strided.storage().size(),
-                                         device=device).as_strided(shape, strides)
-
-                self.assertEqual(empty_strided.shape, as_strided.shape)
-                self.assertEqual(empty_strided.stride(), as_strided.stride())
-
-instantiate_device_type_tests(TestEmptyStrided, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestEmptyStrided(TestCase):
+    def test_empty_strided(self, device):
+        for shape in [(2, 3, 4), (0, 2, 0)]:
+            # some of these cases are pretty strange, just verifying that if as_strided
+            # allows them then empty_strided can as well.
+            for strides in [(12, 4, 1), (2, 4, 6), (0, 0, 0)]:
+                empty_strided = torch.empty_strided(shape, strides, device=device)
+                # as_strided checks the storage size is big enough to support such a strided tensor;
+                # instead of repeating this calculation, we just use empty_strided which does the same
+                # calculation when setting the storage size.
+                as_strided = torch.empty(empty_strided.storage().size(),
+                                         device=device).as_strided(shape, strides)
+
+                self.assertEqual(empty_strided.shape, as_strided.shape)
+                self.assertEqual(empty_strided.stride(), as_strided.stride())
+
+instantiate_device_type_tests(TestEmptyStrided, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_exp.py b/test/test_npu/test_network_ops/test_exp.py
old mode 100644
new mode 100755
index d7450774d5cc73c14ea4490dc82ee61a4a68a3e7..94a800743f67e679f17429d189d5745061fb5844
--- a/test/test_npu/test_network_ops/test_exp.py
+++ b/test/test_npu/test_network_ops/test_exp.py
@@ -1,109 +1,109 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestExp(TestCase):
-    def cpu_op_exec(self, input):
-        output = torch.exp(input)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input):
-        output = torch.exp(input)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_out_exec(self, input, output):
-        torch.exp(input, out = output)
-        output = output.to("cpu").numpy()
-        return output
-
-    def test_exp_shape_format_fp16(self, device):
-        format_list = [0, 3]
-        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -1, 1)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_exp_shape_format_fp32(self, device):
-        format_list = [0, 3]
-        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -1, 1)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            # cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_exp_out_float32_shape_format(self, device):
-        shape_format = [
-            [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]],
-            [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]],
-            [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]],
-            [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]],
-            [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]],
-            [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]],
-            [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]],
-            [[np.float32, 3, [1024]], [np.float32, 3, [1024]]],
-            ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], -1, 1)
-            cpu_output, npu_output = create_common_tensor(item[1], -1, 1)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_out_exec(npu_input, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_exp_out_float16_shape_format(self, device):
-        shape_format = [
-            [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]],
-            [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]],
-            [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]],
-            [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]],
-            [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]],
-            [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]],
-            [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]],
-            [[np.float16, 3, [1024]], [np.float16, 3, [1024]]],
-            ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], -1, 1)
-            cpu_output, npu_output = create_common_tensor(item[1], -1, 1)
-            if item[0][0] == np.float16:
-                cpu_input = cpu_input.to(torch.float32)
-                cpu_output = cpu_output.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_out_exec(npu_input, npu_output)
-            if item[0][0] == np.float16:
-                cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestExp, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestExp(TestCase):
+    def cpu_op_exec(self, input):
+        output = torch.exp(input)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input):
+        output = torch.exp(input)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_out_exec(self, input, output):
+        torch.exp(input, out = output)
+        output = output.to("cpu").numpy()
+        return output
+
+    def test_exp_shape_format_fp16(self, device):
+        format_list = [0, 3]
+        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -1, 1)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_exp_shape_format_fp32(self, device):
+        format_list = [0, 3]
+        shape_list = [[5], [2, 4], [2, 2, 4], [2, 3, 3, 4]]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -1, 1)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            # cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_exp_out_float32_shape_format(self, device):
+        shape_format = [
+            [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]],
+            [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]],
+            [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]],
+            [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]],
+            [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]],
+            [[np.float32, 3, [1024]], [np.float32, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], -1, 1)
+            cpu_output, npu_output = create_common_tensor(item[1], -1, 1)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_out_exec(npu_input, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_exp_out_float16_shape_format(self, device):
+        shape_format = [
+            [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]],
+            [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]],
+            [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]],
+            [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]],
+            [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]],
+            [[np.float16, 3, [1024]], [np.float16, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], -1, 1)
+            cpu_output, npu_output = create_common_tensor(item[1], -1, 1)
+            if item[0][0] == np.float16:
+                cpu_input = cpu_input.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_out_exec(npu_input, npu_output)
+            if item[0][0] == np.float16:
+                cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestExp, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_expand.py b/test/test_npu/test_network_ops/test_expand.py
index 9fbc7f9435afa59d283c8759fc5b3ef8d9708e98..7143ecc0564877a87bbe00710b3161d919004ab3 100644
--- a/test/test_npu/test_network_ops/test_expand.py
+++ b/test/test_npu/test_network_ops/test_expand.py
@@ -1,53 +1,53 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestExpand(TestCase):
-    def cpu_op_exec(self, input1, size):
-        output = input1.expand(size)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self,input1, size):
-        output = input1.expand(size)
-        output = output.cpu().numpy()
-        return output
-
-    def test_expand(self, device):
-        shape_format = [
-                [[np.float32, 0, [1, 3]], (3, 3)],
-                [[np.float16, 0, [5, 1]], (-1, 7)],
-                [[np.int32, 0, [1, 1]], (3, 3)],
-    	]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestExpand, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestExpand(TestCase):
+    def cpu_op_exec(self, input1, size):
+        output = input1.expand(size)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self,input1, size):
+        output = input1.expand(size)
+        output = output.cpu().numpy()
+        return output
+
+    def test_expand(self, device):
+        shape_format = [
+                [[np.float32, 0, [1, 3]], (3, 3)],
+                [[np.float16, 0, [5, 1]], (-1, 7)],
+                [[np.int32, 0, [1, 1]], (3, 3)],
+    	]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
+            npu_output = self.npu_op_exec(npu_input1, item[1])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestExpand, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_fastgelu.py b/test/test_npu/test_network_ops/test_fastgelu.py
index ba7f02b10db6a2e1b6f21c045537eb1242984147..98aa6a90906331487f60b2e0182ea28b89732e32 100644
--- a/test/test_npu/test_network_ops/test_fastgelu.py
+++ b/test/test_npu/test_network_ops/test_fastgelu.py
@@ -1,36 +1,36 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestFastGelu(TestCase):
-   def npu_op_exec(self, input1):
-        output = torch.fast_gelu(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-   def test_fastgelu(self, device):
-        input    = torch.tensor([1.,2.,3.,4.]).npu()
-        exoutput = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956])
-        output   = self.npu_op_exec(input)
-        self.assertRtolEqual(exoutput.numpy(), output) 
-
-instantiate_device_type_tests(TestFastGelu, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestFastGelu(TestCase):
+   def npu_op_exec(self, input1):
+        output = torch.fast_gelu(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+   def test_fastgelu(self, device):
+        input    = torch.tensor([1.,2.,3.,4.]).npu()
+        exoutput = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956])
+        output   = self.npu_op_exec(input)
+        self.assertRtolEqual(exoutput.numpy(), output) 
+
+instantiate_device_type_tests(TestFastGelu, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_fastgelubackward.py b/test/test_npu/test_network_ops/test_fastgelubackward.py
index f6164eb4a01bd961c9a2f022200e6782ff9664e1..c23fb4ecffab54bed3048d4120b818efa220920b 100644
--- a/test/test_npu/test_network_ops/test_fastgelubackward.py
+++ b/test/test_npu/test_network_ops/test_fastgelubackward.py
@@ -1,42 +1,42 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestFastGelu(TestCase):
-   def npu_op_exec(self, input1):
-        input1.requires_grad = True
-        output = torch.fast_gelu(input1)
-        output.backward(torch.ones_like(output))
-        output_grad = input1.grad
-        output_grad = output_grad.to("cpu")
-        output_grad = output_grad.detach().numpy()
-        output = output.cpu().detach().numpy()
-        return output_grad, output
-
-   def test_fastgelu(self, device):
-        input    = torch.tensor([1.,2.,3.,4.]).npu()
-        exoutputgrad = torch.tensor([1.0677795, 1.0738151, 1.0245483, 1.0064018])
-        exoutput = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956])
-        outputgrad, output   = self.npu_op_exec(input)
-        self.assertRtolEqual(exoutputgrad.numpy(), outputgrad) 
-        self.assertRtolEqual(exoutput.numpy(), output) 
-
-instantiate_device_type_tests(TestFastGelu, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestFastGelu(TestCase):
+   def npu_op_exec(self, input1):
+        input1.requires_grad = True
+        output = torch.fast_gelu(input1)
+        output.backward(torch.ones_like(output))
+        output_grad = input1.grad
+        output_grad = output_grad.to("cpu")
+        output_grad = output_grad.detach().numpy()
+        output = output.cpu().detach().numpy()
+        return output_grad, output
+
+   def test_fastgelu(self, device):
+        input    = torch.tensor([1.,2.,3.,4.]).npu()
+        exoutputgrad = torch.tensor([1.0677795, 1.0738151, 1.0245483, 1.0064018])
+        exoutput = torch.tensor([0.8458, 1.9357, 2.9819, 3.9956])
+        outputgrad, output   = self.npu_op_exec(input)
+        self.assertRtolEqual(exoutputgrad.numpy(), outputgrad) 
+        self.assertRtolEqual(exoutput.numpy(), output) 
+
+instantiate_device_type_tests(TestFastGelu, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_fill_.py b/test/test_npu/test_network_ops/test_fill_.py
old mode 100644
new mode 100755
index a9fcb1ef0396c231d88a747e9bf9f0767b13174a..1c3f6630231b3213d1bc0979b9ffb589eaf799fa
--- a/test/test_npu/test_network_ops/test_fill_.py
+++ b/test/test_npu/test_network_ops/test_fill_.py
@@ -1,66 +1,66 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestFill_(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = torch.fill_(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.fill_(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_fills_shape_format_fp16(self, device):
-        format_list = [0, 3]
-        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
-        value_list = [0.8, 1.25, torch.tensor(0.8), torch.tensor(1.25)]
-        shape_format = [
-            [[np.float16, i, j], v] for i in format_list for j in shape_list for v in value_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_fill_shape_format_fp32(self, device):
-        format_list = [0, 3]
-        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
-        value_list = [0.8, 1.25, torch.tensor(0.8), torch.tensor(1.25)]
-        shape_format = [
-            [[np.float32, i, j], v] for i in format_list for j in shape_list for v in value_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestFill_, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestFill_(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.fill_(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.fill_(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_fills_shape_format_fp16(self, device):
+        format_list = [0, 3]
+        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
+        value_list = [0.8, 1.25, torch.tensor(0.8), torch.tensor(1.25)]
+        shape_format = [
+            [[np.float16, i, j], v] for i in format_list for j in shape_list for v in value_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
+            npu_output = self.npu_op_exec(npu_input1, item[1])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_fill_shape_format_fp32(self, device):
+        format_list = [0, 3]
+        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
+        value_list = [0.8, 1.25, torch.tensor(0.8), torch.tensor(1.25)]
+        shape_format = [
+            [[np.float32, i, j], v] for i in format_list for j in shape_list for v in value_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
+            npu_output = self.npu_op_exec(npu_input1, item[1])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestFill_, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_fill_diagonal.py b/test/test_npu/test_network_ops/test_fill_diagonal.py
index 7fb9759f1fd34a1989029fefdb7f0b2d404a7628..f4113422e8df0fe99400ee03cb9665c2cea0500b 100644
--- a/test/test_npu/test_network_ops/test_fill_diagonal.py
+++ b/test/test_npu/test_network_ops/test_fill_diagonal.py
@@ -1,83 +1,83 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestFillDiagonal(TestCase):
-    def npu_op_exec(self, input):
-        input = input.npu()
-        input.fill_diagonal_(1)
-        output = input.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec(self, input):
-        input.fill_diagonal_(1)
-        output = input.numpy()
-        return output
-
-    def npu_op_wrap_exec(self, input):
-        input = input.npu()
-        input.fill_diagonal_(1, wrap=True)
-        output = input.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_wrap_exec(self, input):
-        input.fill_diagonal_(1, wrap=True)
-        output = input.numpy()
-        return output
-
-    def test_fill_diagonal_shape_format_fp32(self, device):
-        format_list = [0, 3]
-        shape_list = ([7, 3], [3, 3, 3])
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input.clone()
-            npu_input1 = npu_input.clone()
-            cpu_output1 = self.cpu_op_exec(cpu_input)
-            npu_output1 = self.npu_op_exec(npu_input)
-            cpu_output2 = self.cpu_op_wrap_exec(cpu_input1)
-            npu_output2 = self.npu_op_wrap_exec(npu_input1)
-            self.assertRtolEqual(cpu_output1, npu_output1)
-            self.assertRtolEqual(cpu_output2, npu_output2)
-
-    def test_fill_diagonal_shape_format_fp16(self, device):
-        format_list = [0, 3]
-        shape_list = ([7, 3], [3, 3, 3])
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input.clone()
-            npu_input1 = npu_input.clone()
-            cpu_output1 = self.cpu_op_exec(cpu_input)
-            npu_output1 = self.npu_op_exec(npu_input)
-            cpu_output2 = self.cpu_op_wrap_exec(cpu_input1)
-            npu_output2 = self.npu_op_wrap_exec(npu_input1)
-            self.assertRtolEqual(cpu_output1, npu_output1)
-            self.assertRtolEqual(cpu_output2, npu_output2)
-
-
-instantiate_device_type_tests(TestFillDiagonal, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestFillDiagonal(TestCase):
+    def npu_op_exec(self, input):
+        input = input.npu()
+        input.fill_diagonal_(1)
+        output = input.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec(self, input):
+        input.fill_diagonal_(1)
+        output = input.numpy()
+        return output
+
+    def npu_op_wrap_exec(self, input):
+        input = input.npu()
+        input.fill_diagonal_(1, wrap=True)
+        output = input.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_wrap_exec(self, input):
+        input.fill_diagonal_(1, wrap=True)
+        output = input.numpy()
+        return output
+
+    def test_fill_diagonal_shape_format_fp32(self, device):
+        format_list = [0, 3]
+        shape_list = ([7, 3], [3, 3, 3])
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input.clone()
+            npu_input1 = npu_input.clone()
+            cpu_output1 = self.cpu_op_exec(cpu_input)
+            npu_output1 = self.npu_op_exec(npu_input)
+            cpu_output2 = self.cpu_op_wrap_exec(cpu_input1)
+            npu_output2 = self.npu_op_wrap_exec(npu_input1)
+            self.assertRtolEqual(cpu_output1, npu_output1)
+            self.assertRtolEqual(cpu_output2, npu_output2)
+
+    def test_fill_diagonal_shape_format_fp16(self, device):
+        format_list = [0, 3]
+        shape_list = ([7, 3], [3, 3, 3])
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input.clone()
+            npu_input1 = npu_input.clone()
+            cpu_output1 = self.cpu_op_exec(cpu_input)
+            npu_output1 = self.npu_op_exec(npu_input)
+            cpu_output2 = self.cpu_op_wrap_exec(cpu_input1)
+            npu_output2 = self.npu_op_wrap_exec(npu_input1)
+            self.assertRtolEqual(cpu_output1, npu_output1)
+            self.assertRtolEqual(cpu_output2, npu_output2)
+
+
+instantiate_device_type_tests(TestFillDiagonal, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_floatstatus.py b/test/test_npu/test_network_ops/test_floatstatus.py
index bc3b73a53023a756500ca085a58595bd341c7938..bf7342e8feb261ea195da1d12962c04d17385f10 100644
--- a/test/test_npu/test_network_ops/test_floatstatus.py
+++ b/test/test_npu/test_network_ops/test_floatstatus.py
@@ -1,36 +1,36 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestFloatStatus(TestCase):
-   def npu_op_exec(self, input1):
-        output = torch.npu_alloc_float_status(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-   def test_floatstatus(self, device):
-        input    = torch.randn([1,2,3]).npu()
-        exoutput = torch.tensor([0., 0., 0., 0., 0., 0., 0., 0.])
-        output   = self.npu_op_exec(input)
-        self.assertRtolEqual(exoutput.numpy(), output) 
-
-instantiate_device_type_tests(TestFloatStatus, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestFloatStatus(TestCase):
+   def npu_op_exec(self, input1):
+        output = torch.npu_alloc_float_status(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+   def test_floatstatus(self, device):
+        input    = torch.randn([1,2,3]).npu()
+        exoutput = torch.tensor([0., 0., 0., 0., 0., 0., 0., 0.])
+        output   = self.npu_op_exec(input)
+        self.assertRtolEqual(exoutput.numpy(), output) 
+
+instantiate_device_type_tests(TestFloatStatus, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_floor.py b/test/test_npu/test_network_ops/test_floor.py
old mode 100644
new mode 100755
index fb2f2985aa7dc97f4c64b7825b50e2f1cf1150ed..99214bb38a962e9c35a71052f19aed60c9758069
--- a/test/test_npu/test_network_ops/test_floor.py
+++ b/test/test_npu/test_network_ops/test_floor.py
@@ -1,157 +1,157 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestFloor(TestCase):
-    def cpu_op_exec(self, input):
-        output = torch.floor(input)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input):
-        output = torch.floor(input)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def cpu_op_inter_exec(self, input):
-        torch.floor_(input)
-        output = input.numpy()
-        return output
-
-    def npu_op_inter_exec(self, input):
-        torch.floor_(input)
-        output = input.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_out_exec(self, input, output):
-        torch.floor(input, out = output)
-        output = output.numpy()
-        return output
-
-    def npu_op_out_exec(self, input, output):
-        torch.floor(input, out = output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_floor_float32_shape_format(self, device):
-        format_list = [0, 3]
-        shape_list = [[256, 1, 1, 1], [1024, 32, 7, 7], [1024, 32, 7], [1024, 32], [1024]]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_floor_inter_float32_shape_format(self, device):
-        format_list = [0, 3]
-        shape_list = [[256, 1, 1, 1], [1024, 32, 7, 7], [1024, 32, 7], [1024, 32], [1024]]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 100)
-            cpu_output = self.cpu_op_inter_exec(cpu_input)
-            npu_output = self.npu_op_inter_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_floor_out_float32_shape_format(self, device):
-        shape_format = [
-            [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]],
-            [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]],
-            [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]],
-            [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]],
-            [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]],
-            [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]],
-            [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]],
-            [[np.float32, 3, [1024]], [np.float32, 3, [1024]]],
-            ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
-            cpu_output, npu_output = create_common_tensor(item[1], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_out_exec(npu_input, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_floor_float16_shape_format(self, device):
-        format_list = [0, 3]
-        shape_list = [[256, 1, 1, 1], [1024, 32, 7, 7], [1024, 32, 7], [1024, 32], [1024]]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 100)
-            if item[0] == np.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            if item[0] == np.float16:
-                cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-    def test_floor_inter_float16_shape_format(self, device):
-        format_list = [0, 3]
-        shape_list = [[256, 1, 1, 1], [1024, 32, 7, 7], [1024, 32, 7], [1024, 32], [1024]]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 100)
-            if item[0] == np.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_inter_exec(cpu_input)
-            npu_output = self.npu_op_inter_exec(npu_input)
-            if item[0] == np.float16:
-                cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_floor_out_float16_shape_format(self, device):
-        shape_format = [
-            [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]],
-            [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]],
-            [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]],
-            [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]],
-            [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]],
-            [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]],
-            [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]],
-            [[np.float16, 3, [1024]], [np.float16, 3, [1024]]],
-            ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
-            cpu_output, npu_output = create_common_tensor(item[1], 1, 100)
-            if item[0][0] == np.float16:
-                cpu_input = cpu_input.to(torch.float32)
-                cpu_output = cpu_output.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_out_exec(npu_input, npu_output)
-            if item[0][0] == np.float16:
-                cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestFloor, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestFloor(TestCase):
+    def cpu_op_exec(self, input):
+        output = torch.floor(input)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input):
+        output = torch.floor(input)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def cpu_op_inter_exec(self, input):
+        torch.floor_(input)
+        output = input.numpy()
+        return output
+
+    def npu_op_inter_exec(self, input):
+        torch.floor_(input)
+        output = input.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_out_exec(self, input, output):
+        torch.floor(input, out = output)
+        output = output.numpy()
+        return output
+
+    def npu_op_out_exec(self, input, output):
+        torch.floor(input, out = output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_floor_float32_shape_format(self, device):
+        format_list = [0, 3]
+        shape_list = [[256, 1, 1, 1], [1024, 32, 7, 7], [1024, 32, 7], [1024, 32], [1024]]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_floor_inter_float32_shape_format(self, device):
+        format_list = [0, 3]
+        shape_list = [[256, 1, 1, 1], [1024, 32, 7, 7], [1024, 32, 7], [1024, 32], [1024]]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 100)
+            cpu_output = self.cpu_op_inter_exec(cpu_input)
+            npu_output = self.npu_op_inter_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_floor_out_float32_shape_format(self, device):
+        shape_format = [
+            [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]],
+            [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]],
+            [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]],
+            [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]],
+            [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]],
+            [[np.float32, 3, [1024]], [np.float32, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
+            cpu_output, npu_output = create_common_tensor(item[1], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_out_exec(npu_input, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_floor_float16_shape_format(self, device):
+        format_list = [0, 3]
+        shape_list = [[256, 1, 1, 1], [1024, 32, 7, 7], [1024, 32, 7], [1024, 32], [1024]]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 100)
+            if item[0] == np.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            if item[0] == np.float16:
+                cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+    def test_floor_inter_float16_shape_format(self, device):
+        format_list = [0, 3]
+        shape_list = [[256, 1, 1, 1], [1024, 32, 7, 7], [1024, 32, 7], [1024, 32], [1024]]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 100)
+            if item[0] == np.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_inter_exec(cpu_input)
+            npu_output = self.npu_op_inter_exec(npu_input)
+            if item[0] == np.float16:
+                cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_floor_out_float16_shape_format(self, device):
+        shape_format = [
+            [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]],
+            [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]],
+            [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]],
+            [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]],
+            [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]],
+            [[np.float16, 3, [1024]], [np.float16, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
+            cpu_output, npu_output = create_common_tensor(item[1], 1, 100)
+            if item[0][0] == np.float16:
+                cpu_input = cpu_input.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_out_exec(npu_input, npu_output)
+            if item[0][0] == np.float16:
+                cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestFloor, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_fmod.py b/test/test_npu/test_network_ops/test_fmod.py
old mode 100644
new mode 100755
index ebe4bc79a9699aa047cfbd5f6629718248cbb0cc..8da4dfb6f84f2c6f6fb00fa326a061c08385e458
--- a/test/test_npu/test_network_ops/test_fmod.py
+++ b/test/test_npu/test_network_ops/test_fmod.py
@@ -1,119 +1,119 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestFmod(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = torch.fmod(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.fmod(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, input3):
-        torch.fmod(input1, input2, out=input3)
-        output = input3.to("cpu")
-        output = output.numpy()
-        return output
-
-    def case_exec_tensor(self, shape):
-        for item in shape:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100)
-            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            self.assertEqual(cpu_output, npu_output)
-            self.assertEqual(npu_output_out, npu_output)
-
-    def case_exec_scalar(self, shape):
-        for item in shape:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            npu_output_out = self.npu_op_exec_out(npu_input1, item[1], npu_input3)
-            self.assertEqual(cpu_output, npu_output)
-            self.assertEqual(npu_output_out, npu_output)
-
-    def case_exec_tensor_fp16(self, shape):
-        for item in shape:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100)
-            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-            self.assertEqual(npu_output_out, npu_output)
-
-    def case_exec_scalar_fp32(self, shape):
-        for item in shape:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            npu_output_out = self.npu_op_exec_out(npu_input1, item[1], npu_input3)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-            self.assertEqual(npu_output_out, npu_output)
-
-    def test_fmod_shape_format_fp32(self, device):
-        format_list = [0, 3]
-        shape_list = [[5, 6], [3, 4, 5]]
-        shape_format_tensor = [[[np.float32, i, j], [np.float32, i, j]]
-                               for i in format_list for j in shape_list]
-        shape_format_scalar_tensor = [
-            [[np.float32, i, j], 5] for i in format_list for j in shape_list
-        ]
-        self.case_exec_tensor(shape_format_tensor)
-        self.case_exec_scalar(shape_format_scalar_tensor)
-
-    def test_fmod_shape_format_fp16(self, device):
-        format_list = [0, 3]
-        shape_list = [[5, 6], [3, 4, 5]]
-        shape_format_tensor = [[[np.float16, i, j], [np.float16, i, j]]
-                               for i in format_list for j in shape_list]
-        shape_format_scalar_tensor = [
-            [[np.float16, i, j], 5] for i in format_list for j in shape_list
-        ]
-        self.case_exec_tensor_fp16(shape_format_tensor)
-        self.case_exec_scalar_fp32(shape_format_scalar_tensor)
-
-    def test_fmod_mix_dtype(self, device):
-        npu_input1, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        npu_input3, npu_input4 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
-        npu_output = self.npu_op_exec(npu_input2, npu_input4)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestFmod, globals(), except_for="cpu")
-
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestFmod(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.fmod(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.fmod(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, input3):
+        torch.fmod(input1, input2, out=input3)
+        output = input3.to("cpu")
+        output = output.numpy()
+        return output
+
+    def case_exec_tensor(self, shape):
+        for item in shape:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100)
+            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            self.assertEqual(cpu_output, npu_output)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def case_exec_scalar(self, shape):
+        for item in shape:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
+            npu_output = self.npu_op_exec(npu_input1, item[1])
+            npu_output_out = self.npu_op_exec_out(npu_input1, item[1], npu_input3)
+            self.assertEqual(cpu_output, npu_output)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def case_exec_tensor_fp16(self, shape):
+        for item in shape:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100)
+            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def case_exec_scalar_fp32(self, shape):
+        for item in shape:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
+            npu_output = self.npu_op_exec(npu_input1, item[1])
+            npu_output_out = self.npu_op_exec_out(npu_input1, item[1], npu_input3)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+            self.assertEqual(npu_output_out, npu_output)
+
+    def test_fmod_shape_format_fp32(self, device):
+        format_list = [0, 3]
+        shape_list = [[5, 6], [3, 4, 5]]
+        shape_format_tensor = [[[np.float32, i, j], [np.float32, i, j]]
+                               for i in format_list for j in shape_list]
+        shape_format_scalar_tensor = [
+            [[np.float32, i, j], 5] for i in format_list for j in shape_list
+        ]
+        self.case_exec_tensor(shape_format_tensor)
+        self.case_exec_scalar(shape_format_scalar_tensor)
+
+    def test_fmod_shape_format_fp16(self, device):
+        format_list = [0, 3]
+        shape_list = [[5, 6], [3, 4, 5]]
+        shape_format_tensor = [[[np.float16, i, j], [np.float16, i, j]]
+                               for i in format_list for j in shape_list]
+        shape_format_scalar_tensor = [
+            [[np.float16, i, j], 5] for i in format_list for j in shape_list
+        ]
+        self.case_exec_tensor_fp16(shape_format_tensor)
+        self.case_exec_scalar_fp32(shape_format_scalar_tensor)
+
+    def test_fmod_mix_dtype(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        npu_input3, npu_input4 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input2, npu_input4)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestFmod, globals(), except_for="cpu")
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_format_div.py b/test/test_npu/test_network_ops/test_format_div.py
index bec9df2c4afc543fdc5c4c0514b13b31c73c356f..f3cb28c3eaa867dd8f26e99d5cdd1b95293965dd 100644
--- a/test/test_npu/test_network_ops/test_format_div.py
+++ b/test/test_npu/test_network_ops/test_format_div.py
@@ -1,53 +1,53 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests, formats
-from util_test import create_common_tensor, create_dtype_tensor
-
-
-class TestDiv(TestCase):
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.div(input1, input2)
-        return output.numpy()
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.div(input1, input2)
-        output = output.to("cpu")
-        return output.numpy()
-        
-    @formats(0, 3)
-    def test_div_shape_format(self, device, npu_format):
-        shape_list = [6]
-        shape_format = [
-            [np.float16, npu_format, j] for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestDiv, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests, formats
+from util_test import create_common_tensor, create_dtype_tensor
+
+
+class TestDiv(TestCase):
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.div(input1, input2)
+        return output.numpy()
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.div(input1, input2)
+        output = output.to("cpu")
+        return output.numpy()
+        
+    @formats(0, 3)
+    def test_div_shape_format(self, device, npu_format):
+        shape_list = [6]
+        shape_format = [
+            [np.float16, npu_format, j] for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 1, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestDiv, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_full.py b/test/test_npu/test_network_ops/test_full.py
old mode 100644
new mode 100755
index 0d9f7829fa001d7941678a1bb956f906862efbdb..723443f0d471f2d5025611f6d0044f869726d3ac
--- a/test/test_npu/test_network_ops/test_full.py
+++ b/test/test_npu/test_network_ops/test_full.py
@@ -1,55 +1,55 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestFull(TestCase):
-    def test_full_shape_format_fp16(self, device):
-        format_list = [0, 3]
-        dtype_list = [torch.float32, torch.float16, torch.int32]
-        shape_list = [[5, 8], [2, 4, 1, 1], [16]]
-        shape_format = [[[np.float16, i, j], k]
-                        for i in format_list for j in shape_list for k in dtype_list]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output = torch.full(cpu_input.size(), 6, dtype=item[1])
-            cpu_output = cpu_output.numpy()
-            npu_output = torch.full(npu_input.size(), 6, dtype=item[1])
-            npu_output = npu_output.to("cpu")
-            npu_output = npu_output.numpy()
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_full_shape_format_fp32(self, device):
-        format_list = [0, 3]
-        dtype_list = [torch.float32, torch.float16, torch.int32]
-        shape_list = [[5, 8], [2, 4, 1, 1], [16]]
-        shape_format = [[[np.float32, i, j], k]
-                        for i in format_list for j in shape_list for k in dtype_list]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output = torch.full(cpu_input.size(), 6, dtype=item[1])
-            cpu_output = cpu_output.numpy()
-            npu_output = torch.full(npu_input.size(), 6, dtype=item[1])
-            npu_output = npu_output.to("cpu")
-            npu_output = npu_output.numpy()
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestFull, globals(), except_for="cpu")
-if __name__ == '__main__':
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestFull(TestCase):
+    def test_full_shape_format_fp16(self, device):
+        format_list = [0, 3]
+        dtype_list = [torch.float32, torch.float16, torch.int32]
+        shape_list = [[5, 8], [2, 4, 1, 1], [16]]
+        shape_format = [[[np.float16, i, j], k]
+                        for i in format_list for j in shape_list for k in dtype_list]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output = torch.full(cpu_input.size(), 6, dtype=item[1])
+            cpu_output = cpu_output.numpy()
+            npu_output = torch.full(npu_input.size(), 6, dtype=item[1])
+            npu_output = npu_output.to("cpu")
+            npu_output = npu_output.numpy()
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_full_shape_format_fp32(self, device):
+        format_list = [0, 3]
+        dtype_list = [torch.float32, torch.float16, torch.int32]
+        shape_list = [[5, 8], [2, 4, 1, 1], [16]]
+        shape_format = [[[np.float32, i, j], k]
+                        for i in format_list for j in shape_list for k in dtype_list]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output = torch.full(cpu_input.size(), 6, dtype=item[1])
+            cpu_output = cpu_output.numpy()
+            npu_output = torch.full(npu_input.size(), 6, dtype=item[1])
+            npu_output = npu_output.to("cpu")
+            npu_output = npu_output.numpy()
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestFull, globals(), except_for="cpu")
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_gather.py b/test/test_npu/test_network_ops/test_gather.py
index 041d68ff1de7bd9e12c256034ccdb2bbcfedb37a..891f07268e3b405394fba7c171eaa2bcf9e350ee 100644
--- a/test/test_npu/test_network_ops/test_gather.py
+++ b/test/test_npu/test_network_ops/test_gather.py
@@ -1,76 +1,76 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestIndex(TestCase):
-    def cpu_op_exec(self, input1, dim, index):
-        output = torch.index_select(input1, dim, index)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, dim, index):
-        index = index.to("npu")
-        output = torch.index_select(input1, dim, index)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_index_shape_format_fp32(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
-        shape_format = [
-            [[np.float32, i, j], [np.int64, 0, [2]]] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            dim = np.random.randint(0, len(item[0][2]))
-            index1 = np.random.uniform(0, item[0][2][dim], item[1][2]).astype(np.int64)
-            index = torch.from_numpy(index1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, dim, index)
-            npu_output = self.npu_op_exec(npu_input1, dim, index)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_index_shape_format_fp16(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
-        shape_format = [
-            [[np.float16, i, j], [np.int64, 0, [2]]] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            dim = np.random.randint(0, len(item[0][2]))
-            index1 = np.random.uniform(0, item[0][2][dim], item[1][2]).astype(np.int64)
-            index = torch.from_numpy(index1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, dim, index)
-            npu_output = self.npu_op_exec(npu_input1, dim, index)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestIndex, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestIndex(TestCase):
+    def cpu_op_exec(self, input1, dim, index):
+        output = torch.index_select(input1, dim, index)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, dim, index):
+        index = index.to("npu")
+        output = torch.index_select(input1, dim, index)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_index_shape_format_fp32(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
+        shape_format = [
+            [[np.float32, i, j], [np.int64, 0, [2]]] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            dim = np.random.randint(0, len(item[0][2]))
+            index1 = np.random.uniform(0, item[0][2][dim], item[1][2]).astype(np.int64)
+            index = torch.from_numpy(index1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, dim, index)
+            npu_output = self.npu_op_exec(npu_input1, dim, index)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_index_shape_format_fp16(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
+        shape_format = [
+            [[np.float16, i, j], [np.int64, 0, [2]]] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            dim = np.random.randint(0, len(item[0][2]))
+            index1 = np.random.uniform(0, item[0][2][dim], item[1][2]).astype(np.int64)
+            index = torch.from_numpy(index1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, dim, index)
+            npu_output = self.npu_op_exec(npu_input1, dim, index)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestIndex, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_ge.py b/test/test_npu/test_network_ops/test_ge.py
old mode 100644
new mode 100755
index b098b29274480835733577f25eebd03476d3e9b1..836387473b4b1a6c796b2927b8b31bf2e45ea6a9
--- a/test/test_npu/test_network_ops/test_ge.py
+++ b/test/test_npu/test_network_ops/test_ge.py
@@ -1,301 +1,301 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestGe(TestCase):
-    def generate_scalar(self, min, max):
-        scalar = np.random.uniform(min, max)
-        return scalar
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.ge(input1, input2)
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_out(self, input1, input2, input3):
-        torch.ge(input1, input2, out = input3)
-        output = input3.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.ge(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, input3):
-        torch.ge(input1, input2, out = input3)
-        output = input3.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec(self, input1, input2):
-        output = input1.ge_(input2)
-        output = input1
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_exec(self, input1, input2):
-        output = input1.ge_(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar(self, input, scalar):
-        output = torch.ge(input, scalar)
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar_out(self, input1, scalar, input2):
-        torch.ge(input1, scalar, out = input2)
-        output = input2.numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input, scalar):
-        output = torch.ge(input, scalar)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar_out(self, input1, scalar, input2):
-        torch.ge(input1, scalar, out = input2)
-        output = input2.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec_scalar(self, input, scalar):
-        output = input.ge_(scalar)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_exec_scalar(self, input, scalar):
-        output = input.ge_(scalar)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def ge_tensor_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
-            cpu_input3 = torch.randn(item[1][2])<0
-            npu_input3 = cpu_input3.npu()
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            if cpu_input3.dtype == torch.float16:
-                cpu_input3 = cpu_input3.to(torch.float32)
-            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_ge_tensor_out(self, device):
-        shape_format = [
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.ge_tensor_out_result(shape_format)
-
-    def ge_scalar_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2 = torch.randn(item[1][2])<0
-            npu_input2 = cpu_input2.npu()
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
-            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_ge_scalar_out(self, device):
-        shape_format = [
-            [[np.float16, 0, [4, 4, 128, 128]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [12, 10, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [16, 3, 1111, 1212]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [16, 16, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 0, [1313, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.ge_scalar_out_result(shape_format)
-
-    def test_ge_bool(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        scalar_list = [True, False]
-        shape_format = [
-            [[np.int32, i, j], k] for i in format_list for j in shape_list 
-            for k in scalar_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
-            cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1])
-            npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1])
-            cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50)
-            npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50)
-            self.assertEqual(cpu_output1, npu_output1)
-            self.assertEqual(cpu_output2, npu_output2)
-
-    def test_ge_scalar_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_ge_scalar_float16(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_ge_scalar_int32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.int32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_ge_tensor_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_ge_tensor_float16(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_ge_inplace_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_ge_inplace_float16(self, device):
-        format_list = [0, 3]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_ge_inplace_scalar_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_ge_inplace_scalar_float16(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_ge_mix_dtype(self, device):
-        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
-        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
-        npu_output = self.npu_op_exec(npu_input2, npu_input4)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestGe, globals(), except_for="cpu")
-if __name__ == '__main__':
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestGe(TestCase):
+    def generate_scalar(self, min, max):
+        scalar = np.random.uniform(min, max)
+        return scalar
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.ge(input1, input2)
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_out(self, input1, input2, input3):
+        torch.ge(input1, input2, out = input3)
+        output = input3.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.ge(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, input3):
+        torch.ge(input1, input2, out = input3)
+        output = input3.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec(self, input1, input2):
+        output = input1.ge_(input2)
+        output = input1
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec(self, input1, input2):
+        output = input1.ge_(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar(self, input, scalar):
+        output = torch.ge(input, scalar)
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar_out(self, input1, scalar, input2):
+        torch.ge(input1, scalar, out = input2)
+        output = input2.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input, scalar):
+        output = torch.ge(input, scalar)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar_out(self, input1, scalar, input2):
+        torch.ge(input1, scalar, out = input2)
+        output = input2.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec_scalar(self, input, scalar):
+        output = input.ge_(scalar)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec_scalar(self, input, scalar):
+        output = input.ge_(scalar)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def ge_tensor_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
+            cpu_input3 = torch.randn(item[1][2])<0
+            npu_input3 = cpu_input3.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            if cpu_input3.dtype == torch.float16:
+                cpu_input3 = cpu_input3.to(torch.float32)
+            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_ge_tensor_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.ge_tensor_out_result(shape_format)
+
+    def ge_scalar_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2 = torch.randn(item[1][2])<0
+            npu_input2 = cpu_input2.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
+            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_ge_scalar_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [4, 4, 128, 128]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [12, 10, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [16, 3, 1111, 1212]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [16, 16, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [1313, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.ge_scalar_out_result(shape_format)
+
+    def test_ge_bool(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        scalar_list = [True, False]
+        shape_format = [
+            [[np.int32, i, j], k] for i in format_list for j in shape_list 
+            for k in scalar_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
+            cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1])
+            npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1])
+            cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50)
+            npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50)
+            self.assertEqual(cpu_output1, npu_output1)
+            self.assertEqual(cpu_output2, npu_output2)
+
+    def test_ge_scalar_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ge_scalar_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ge_scalar_int32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.int32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ge_tensor_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ge_tensor_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ge_inplace_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ge_inplace_float16(self, device):
+        format_list = [0, 3]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ge_inplace_scalar_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ge_inplace_scalar_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_ge_mix_dtype(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
+        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input2, npu_input4)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestGe, globals(), except_for="cpu")
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_npu/test_gelu_backward.py b/test/test_npu/test_network_ops/test_gelu_backward.py
similarity index 77%
rename from test/test_npu/test_gelu_backward.py
rename to test/test_npu/test_network_ops/test_gelu_backward.py
index a21092c621a4f257e02569101c0fbf0c6f242ab9..439e57e28c7d5e1640670a4765461fe7ec3ef256 100644
--- a/test/test_npu/test_gelu_backward.py
+++ b/test/test_npu/test_network_ops/test_gelu_backward.py
@@ -33,7 +33,7 @@ class TestGeluBackward(TestCase):
         z = output.sum()
         z.backward()
         res = input1.grad        
-        return res.detach()
+        return res.detach().numpy()
 
     def npu_op_exec(self, input1):
         input1 = input1.to("npu")
@@ -42,44 +42,37 @@ class TestGeluBackward(TestCase):
         z = output.sum()
         z.backward()
         res = input1.grad.to("cpu")        
-        return res.detach()
+        return res.detach().numpy()
         
     def test_gelu_backward_float32_1(self, device):
-        input1= self.generate_single_data(0, 100, (4,3,1,1), np.float32)
+        input1= self.generate_single_data(0, 100, (4, 3, 1, 1), np.float32)
         cpu_input1 = copy.deepcopy(input1)
         cpu_output = self.cpu_op_exec(cpu_input1)
         npu_output = self.npu_op_exec(input1)
         self.assertRtolEqual(cpu_output, npu_output)
         
     def test_gelu_backward_float32_2(self, device):
-        input1= self.generate_single_data(0, 100, (4,3,10), np.float32)
+        input1= self.generate_single_data(0, 100, (15, 3, 1), np.float32)
         cpu_input1 = copy.deepcopy(input1)
         cpu_output = self.cpu_op_exec(cpu_input1)
         npu_output = self.npu_op_exec(input1)
         self.assertRtolEqual(cpu_output, npu_output)
 
     def test_gelu_backward_float32_3(self, device):
-        input1= self.generate_single_data(0, 100, (400,30,10), np.float32)
-        cpu_input1 = copy.deepcopy(input1)
-        cpu_output = self.cpu_op_exec(cpu_input1)
-        npu_output = self.npu_op_exec(input1)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_gelu_backward_float32_4(self, device):
-        input1= self.generate_single_data(-30, 0, (4,4), np.float32)
+        input1= self.generate_single_data(0, 100, (4, 4), np.float32)
         cpu_input1 = copy.deepcopy(input1)
         cpu_output = self.cpu_op_exec(cpu_input1)
         npu_output = self.npu_op_exec(input1)
         self.assertRtolEqual(cpu_output, npu_output)       
 
     def test_gelu_backward_float16(self, device):
-        input1 = self.generate_single_data(0, 100, (5, 10, 100) , np.float16)
-        input1 =  input1.to(torch.float32)
-        cpu_input1 = copy.deepcopy(input1)
+        input1 = self.generate_single_data(0, 100, (5, 10, 100), np.float16)
+        cpu_input1 =  input1.to(torch.float32)
         cpu_output = self.cpu_op_exec(cpu_input1)
+        cpu_output = cpu_output.astype(np.float16)
         npu_output = self.npu_op_exec(input1)
         self.assertRtolEqual(cpu_output, npu_output) 
         
 instantiate_device_type_tests(TestGeluBackward, globals(), except_for="cpu")
 if __name__ == "__main__":
-    run_tests()
\ No newline at end of file
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_ger.py b/test/test_npu/test_network_ops/test_ger.py
index 20d41f7b286647af116cfc497317f6c1f73aac2e..3799258025a57b92cc9e72ad653415bbcea5ecc2 100644
--- a/test/test_npu/test_network_ops/test_ger.py
+++ b/test/test_npu/test_network_ops/test_ger.py
@@ -1,102 +1,102 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import copy
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestGer(TestCase):
-
-    def cpu_op_exec(self,input1, input2):
-        output = torch.ger(input1, input2)
-        output = output.numpy()
-
-        return output
-
-    def npu_op_exec(self,input1, input2):
-        output = torch.ger(input1, input2)
-        output = output.to("cpu").numpy()
-
-        return output
-
-    def npu_op_exec_out(self,input1, input2, output):
-        torch.ger(input1, input2, out=output)
-        output = output.to("cpu").numpy()
-
-        return output
-
-    def ger_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def ger_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[2], -100, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            if cpu_input3.dtype == torch.float16:
-                cpu_input3 = cpu_input3.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            cpu_output = cpu_output.astype(npu_output_out.dtype)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-
-    def test_ger_result(self, device):
-        shape_format = [
-            [[np.float16, 0, [128]], [np.float16, 0, [256]]],
-            [[np.float16, 0, [128]], [np.float16, 0, [58]]],
-            [[np.float16, 0, [128]], [np.float16, 0, [3]]],
-            [[np.float16, 0, [128]], [np.float16, 0, [116]]],
-            [[np.float32, 0, [256]], [np.float32, 0, [128]]],
-            [[np.float32, 0, [256]], [np.float32, 0, [3]]],
-            [[np.float32, 0, [2]],   [np.float32, 0, [3]]],
-            [[np.float32, 0, [128]], [np.float32, 0, [232]]],
-        ]
-        self.ger_result(shape_format)
-
-    def test_ger_out_result(self, device):
-        shape_format = [
-            [[np.float16, 0, [128]], [np.float16, 0, [256]], [np.float16, 0, [256, 116]]],
-            [[np.float16, 0, [128]], [np.float16, 0, [58]],  [np.float16, 0, [58, 58, 1, 1]]],
-            [[np.float16, 0, [128]], [np.float16, 0, [3]],   [np.float16, 0, [3, 3]]],
-            [[np.float16, 0, [128]], [np.float16, 0, [116]], [np.float16, 0, [128, 116]]],
-            [[np.float32, 0, [256]], [np.float32, 0, [128]], [np.float32, 0, [128, 128, 3, 3]]],
-            [[np.float32, 0, [256]], [np.float32, 0, [3]],   [np.float32, 0, [256, 3]]],
-            [[np.float32, 0, [2]],   [np.float32, 0, [3]],   [np.float32, 0, [3, 1, 3, 3]]],
-            [[np.float32, 0, [128]], [np.float32, 0, [232]], [np.float32, 0, [232, 232]]],
-        ]
-        self.ger_out_result(shape_format)
-
-
-instantiate_device_type_tests(TestGer, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import copy
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestGer(TestCase):
+
+    def cpu_op_exec(self,input1, input2):
+        output = torch.ger(input1, input2)
+        output = output.numpy()
+
+        return output
+
+    def npu_op_exec(self,input1, input2):
+        output = torch.ger(input1, input2)
+        output = output.to("cpu").numpy()
+
+        return output
+
+    def npu_op_exec_out(self,input1, input2, output):
+        torch.ger(input1, input2, out=output)
+        output = output.to("cpu").numpy()
+
+        return output
+
+    def ger_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def ger_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[2], -100, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            if cpu_input3.dtype == torch.float16:
+                cpu_input3 = cpu_input3.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output = cpu_output.astype(npu_output_out.dtype)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+
+    def test_ger_result(self, device):
+        shape_format = [
+            [[np.float16, 0, [128]], [np.float16, 0, [256]]],
+            [[np.float16, 0, [128]], [np.float16, 0, [58]]],
+            [[np.float16, 0, [128]], [np.float16, 0, [3]]],
+            [[np.float16, 0, [128]], [np.float16, 0, [116]]],
+            [[np.float32, 0, [256]], [np.float32, 0, [128]]],
+            [[np.float32, 0, [256]], [np.float32, 0, [3]]],
+            [[np.float32, 0, [2]],   [np.float32, 0, [3]]],
+            [[np.float32, 0, [128]], [np.float32, 0, [232]]],
+        ]
+        self.ger_result(shape_format)
+
+    def test_ger_out_result(self, device):
+        shape_format = [
+            [[np.float16, 0, [128]], [np.float16, 0, [256]], [np.float16, 0, [256, 116]]],
+            [[np.float16, 0, [128]], [np.float16, 0, [58]],  [np.float16, 0, [58, 58, 1, 1]]],
+            [[np.float16, 0, [128]], [np.float16, 0, [3]],   [np.float16, 0, [3, 3]]],
+            [[np.float16, 0, [128]], [np.float16, 0, [116]], [np.float16, 0, [128, 116]]],
+            [[np.float32, 0, [256]], [np.float32, 0, [128]], [np.float32, 0, [128, 128, 3, 3]]],
+            [[np.float32, 0, [256]], [np.float32, 0, [3]],   [np.float32, 0, [256, 3]]],
+            [[np.float32, 0, [2]],   [np.float32, 0, [3]],   [np.float32, 0, [3, 1, 3, 3]]],
+            [[np.float32, 0, [128]], [np.float32, 0, [232]], [np.float32, 0, [232, 232]]],
+        ]
+        self.ger_out_result(shape_format)
+
+
+instantiate_device_type_tests(TestGer, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_grid_assign_positive.py b/test/test_npu/test_network_ops/test_grid_assign_positive.py
index 3e3c717523b993ee9fa6edf7ac9221db96d2784b..166c4921a94ce91261f8a0f072a61e886b042c68 100644
--- a/test/test_npu/test_network_ops/test_grid_assign_positive.py
+++ b/test/test_npu/test_network_ops/test_grid_assign_positive.py
@@ -1,52 +1,52 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestGridAssignPositive(TestCase):
-    def npu_op_exec(self, *args):
-        out = torch.npu_grid_assign_positive(*args)
-        out = out.to("cpu")
-        return out.detach().numpy()
-        
-    def test_grid_assign_positive(self, device):
-        assigned_gt_inds = torch.rand((4,), dtype=torch.float32).to("npu")
-        overlaps = torch.rand((2,4), dtype=torch.float32).to("npu")
-        box_responsible_flags = torch.tensor([1,1,1,0], dtype=torch.uint8).to("npu")
-        max_overlap = torch.rand((4,), dtype=torch.float32).to("npu")
-        argmax_overlap = torch.tensor([1,0,1,0], dtype=torch.int32).to("npu")
-        gt_max_overlaps = torch.rand((2,), dtype=torch.float32).to("npu")
-        gt_argmax_overlaps = torch.tensor([1,0],dtype=torch.int32).to("npu")
-        inputs = [assigned_gt_inds,overlaps,box_responsible_flags,max_overlap,
-                             argmax_overlap,gt_max_overlaps,gt_argmax_overlaps]
-        num_gts = 128
-        pos_iou_thr = .5
-        min_pos_iou = .0
-        gt_max_assign_all = True
-        attrs = [num_gts, pos_iou_thr, min_pos_iou, gt_max_assign_all]
-        
-        params = inputs + attrs
-        expect_cpu = torch.tensor([2., 1., 0.25984418, 0.36664134], dtype=torch.float32)
-        npu_output = self.npu_op_exec(*params)
-        self.assertRtolEqual(expect_cpu.numpy(), npu_output)
-
-instantiate_device_type_tests(TestGridAssignPositive, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestGridAssignPositive(TestCase):
+    def npu_op_exec(self, *args):
+        out = torch.npu_grid_assign_positive(*args)
+        out = out.to("cpu")
+        return out.detach().numpy()
+        
+    def test_grid_assign_positive(self, device):
+        assigned_gt_inds = torch.rand((4,), dtype=torch.float32).to("npu")
+        overlaps = torch.rand((2,4), dtype=torch.float32).to("npu")
+        box_responsible_flags = torch.tensor([1,1,1,0], dtype=torch.uint8).to("npu")
+        max_overlap = torch.rand((4,), dtype=torch.float32).to("npu")
+        argmax_overlap = torch.tensor([1,0,1,0], dtype=torch.int32).to("npu")
+        gt_max_overlaps = torch.rand((2,), dtype=torch.float32).to("npu")
+        gt_argmax_overlaps = torch.tensor([1,0],dtype=torch.int32).to("npu")
+        inputs = [assigned_gt_inds,overlaps,box_responsible_flags,max_overlap,
+                             argmax_overlap,gt_max_overlaps,gt_argmax_overlaps]
+        num_gts = 128
+        pos_iou_thr = .5
+        min_pos_iou = .0
+        gt_max_assign_all = True
+        attrs = [num_gts, pos_iou_thr, min_pos_iou, gt_max_assign_all]
+        
+        params = inputs + attrs
+        expect_cpu = torch.tensor([2., 1., 0.25984418, 0.36664134], dtype=torch.float32)
+        npu_output = self.npu_op_exec(*params)
+        self.assertRtolEqual(expect_cpu.numpy(), npu_output)
+
+instantiate_device_type_tests(TestGridAssignPositive, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_grid_sampler.py b/test/test_npu/test_network_ops/test_grid_sampler.py
index 12030a5371d9fe995a1d286cade21fc9b31ec903..dbfb6a746e9aa4aaf1ad4450c1b40c6acf7ed66f 100644
--- a/test/test_npu/test_network_ops/test_grid_sampler.py
+++ b/test/test_npu/test_network_ops/test_grid_sampler.py
@@ -1,74 +1,74 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestGridSampler(TestCase):
-    def test_grid_sampler_fp32(self, device):
-        format_list = [0]
-        shape_list = [[100, 1, 28, 28], [100, 64, 32, 28]]
-        shape_format = [
-            [np.float32, j, k] for j in format_list for k in shape_list
-        ]
-        sample_format = [np.float32, 0, [100, 1, 1, 2]]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_sample, npu_sample = create_common_tensor(sample_format, -1, 1)
-            cpu_output = self.cpu_op_exec(cpu_input, cpu_sample)
-            npu_output = self.npu_op_exec(npu_input, npu_sample)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_grid_sampler_fp16(self, device):
-        format_list = [0]
-        shape_list = [[1, 1, 3, 3], [1, 2, 3, 4]]
-        shape_format = [
-            [np.float16, j, k] for j in format_list for k in shape_list
-        ]
-        sample_format = [np.float32, 0, [1, 2, 2, 2]]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 10)
-            cpu_sample, npu_sample = create_common_tensor(sample_format, -1, 1)
-            cpu_output = self.cpu_op_fp16_exec(cpu_input, cpu_sample)
-            npu_output = self.npu_op_exec(npu_input, npu_sample)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def cpu_op_exec(self, input, sample):
-        output = torch.grid_sampler(input, sample, 0, 0, True)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input, sample):
-        output = torch.grid_sampler(input, sample, 0, 0, True)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_fp16_exec(self, input, sample):
-        input = input.to(torch.float32)
-        sample = sample.to(torch.float32)
-        output = torch.grid_sampler(input, sample, 0, 0, True)
-        output = output.numpy()
-        output = output.astype(np.float16)
-        return output
-
-instantiate_device_type_tests(TestGridSampler, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestGridSampler(TestCase):
+    def test_grid_sampler_fp32(self, device):
+        format_list = [0]
+        shape_list = [[100, 1, 28, 28], [100, 64, 32, 28]]
+        shape_format = [
+            [np.float32, j, k] for j in format_list for k in shape_list
+        ]
+        sample_format = [np.float32, 0, [100, 1, 1, 2]]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_sample, npu_sample = create_common_tensor(sample_format, -1, 1)
+            cpu_output = self.cpu_op_exec(cpu_input, cpu_sample)
+            npu_output = self.npu_op_exec(npu_input, npu_sample)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_grid_sampler_fp16(self, device):
+        format_list = [0]
+        shape_list = [[1, 1, 3, 3], [1, 2, 3, 4]]
+        shape_format = [
+            [np.float16, j, k] for j in format_list for k in shape_list
+        ]
+        sample_format = [np.float32, 0, [1, 2, 2, 2]]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 10)
+            cpu_sample, npu_sample = create_common_tensor(sample_format, -1, 1)
+            cpu_output = self.cpu_op_fp16_exec(cpu_input, cpu_sample)
+            npu_output = self.npu_op_exec(npu_input, npu_sample)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def cpu_op_exec(self, input, sample):
+        output = torch.grid_sampler(input, sample, 0, 0, True)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input, sample):
+        output = torch.grid_sampler(input, sample, 0, 0, True)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_fp16_exec(self, input, sample):
+        input = input.to(torch.float32)
+        sample = sample.to(torch.float32)
+        output = torch.grid_sampler(input, sample, 0, 0, True)
+        output = output.numpy()
+        output = output.astype(np.float16)
+        return output
+
+instantiate_device_type_tests(TestGridSampler, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_gru.py b/test/test_npu/test_network_ops/test_gru.py
index f21975b333b636882c62d11445af21bd3b6efc0a..17fd7b8351c597609a34cccf2ef8b026867204e5 100644
--- a/test/test_npu/test_network_ops/test_gru.py
+++ b/test/test_npu/test_network_ops/test_gru.py
@@ -1,63 +1,63 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestGru(TestCase):
-    def test_gru(self, device):
-        # shape_format:[[dtype, (num_step, batch_size, input_size)], input_size, hidden_size]
-        shape_format = [
-                        [[np.float16, (16, 32, 64)], [np.float16, (1, 32, 32)], 64, 32], 
-                        [[np.float16, (2, 32, 64)], [np.float16, (1, 32, 32)], 64, 32],
-                        [[np.float32, (5, 32, 64)], [np.float32, (1, 32, 32)], 64, 32],
-                        [[np.float32, (10, 33, 128)], [np.float32, (1, 33, 64)], 128, 64],
-        ]
-
-        for item in shape_format: 
-            cpu_gru = torch.nn.GRU(input_size=item[2], hidden_size=item[3], num_layers=1, bidirectional=False)
-            npu_gru = copy.deepcopy(cpu_gru).npu()
-            
-            input1 = np.random.uniform(0, 1, item[0][1]).astype(item[0][0])
-            if item[0][0] == np.float16:
-                cpu_input1 = torch.from_numpy(input1.astype(np.float32))    # cpu only support fp32
-            else:
-                cpu_input1 = torch.from_numpy(input1)
-            npu_input1 = torch.from_numpy(input1).npu()
-
-            h0 = np.random.uniform(0, 1, item[1][1]).astype(item[1][0])
-            if item[1][0] == np.float16:
-                cpu_h0 = torch.from_numpy(h0.astype(np.float32))    # cpu only support fp32
-            else:
-                cpu_h0 = torch.from_numpy(h0)
-            npu_h0 = torch.from_numpy(h0).npu()
-
-            cpu_output_y, cpu_output_h = cpu_gru(cpu_input1, cpu_h0)
-            npu_output_y, npu_output_h = npu_gru(npu_input1, npu_h0)
-
-            self.assertRtolEqual(cpu_output_y.detach().numpy(), npu_output_y.cpu().detach().numpy().astype(np.float32), prec=1.e-1)
-            self.assertRtolEqual(cpu_output_h.detach().numpy(), npu_output_h.cpu().detach().numpy().astype(np.float32), prec=1.e-1)
-
-
-instantiate_device_type_tests(TestGru, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestGru(TestCase):
+    def test_gru(self, device):
+        # shape_format:[[dtype, (num_step, batch_size, input_size)], input_size, hidden_size]
+        shape_format = [
+                        [[np.float16, (16, 32, 64)], [np.float16, (1, 32, 32)], 64, 32], 
+                        [[np.float16, (2, 32, 64)], [np.float16, (1, 32, 32)], 64, 32],
+                        [[np.float32, (5, 32, 64)], [np.float32, (1, 32, 32)], 64, 32],
+                        [[np.float32, (10, 33, 128)], [np.float32, (1, 33, 64)], 128, 64],
+        ]
+
+        for item in shape_format: 
+            cpu_gru = torch.nn.GRU(input_size=item[2], hidden_size=item[3], num_layers=1, bidirectional=False)
+            npu_gru = copy.deepcopy(cpu_gru).npu()
+            
+            input1 = np.random.uniform(0, 1, item[0][1]).astype(item[0][0])
+            if item[0][0] == np.float16:
+                cpu_input1 = torch.from_numpy(input1.astype(np.float32))    # cpu only support fp32
+            else:
+                cpu_input1 = torch.from_numpy(input1)
+            npu_input1 = torch.from_numpy(input1).npu()
+
+            h0 = np.random.uniform(0, 1, item[1][1]).astype(item[1][0])
+            if item[1][0] == np.float16:
+                cpu_h0 = torch.from_numpy(h0.astype(np.float32))    # cpu only support fp32
+            else:
+                cpu_h0 = torch.from_numpy(h0)
+            npu_h0 = torch.from_numpy(h0).npu()
+
+            cpu_output_y, cpu_output_h = cpu_gru(cpu_input1, cpu_h0)
+            npu_output_y, npu_output_h = npu_gru(npu_input1, npu_h0)
+
+            self.assertRtolEqual(cpu_output_y.detach().numpy(), npu_output_y.cpu().detach().numpy().astype(np.float32), prec=1.e-1)
+            self.assertRtolEqual(cpu_output_h.detach().numpy(), npu_output_h.cpu().detach().numpy().astype(np.float32), prec=1.e-1)
+
+
+instantiate_device_type_tests(TestGru, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_gru_backward.py b/test/test_npu/test_network_ops/test_gru_backward.py
index a8dd2ea87ba2992f9564ab8caeb69f0cef1132b7..313bf7f6dd6fd5b55289b47739182348af5cca72 100644
--- a/test/test_npu/test_network_ops/test_gru_backward.py
+++ b/test/test_npu/test_network_ops/test_gru_backward.py
@@ -1,85 +1,85 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestGruBackward(TestCase):
-    def test_gru_backward(self, device):
-        # shape_format:[[dtype, (num_step, batch_size, input_size)], input_size, hidden_size]
-        shape_format = [
-                        [[np.float16, (16, 32, 64)], 64, 32], 
-                        [[np.float16, (5, 32, 64)], 64, 32],
-                        [[np.float32, (5, 32, 64)], 64, 32],
-                        [[np.float32, (5, 32, 64)], 64, 64],
-        ]
-
-        for item in shape_format: 
-            cpu_gru = torch.nn.GRU(input_size=item[1], hidden_size=item[2], num_layers=1, bidirectional=False)
-            cpu_gru.weight_ih_l0.requires_grad_(True)
-            cpu_gru.weight_hh_l0.requires_grad_(True)
-            cpu_gru.bias_ih_l0.requires_grad_(True)
-            cpu_gru.bias_hh_l0.requires_grad_(True)
-            npu_gru = copy.deepcopy(cpu_gru).npu()
-            
-            input1 = np.random.uniform(0, 1, item[0][1]).astype(item[0][0])
-            cpu_input1 = torch.from_numpy(input1.astype(np.float32))
-            cpu_input1.requires_grad_(True)
-            npu_input1 = torch.from_numpy(input1).npu()
-            npu_input1.requires_grad_(True)
-
-            cpu_output_y, cpu_output_h = cpu_gru(cpu_input1)
-            npu_output_y, npu_output_h = npu_gru(npu_input1)
-
-            self.assertRtolEqual(cpu_output_y.detach().numpy(), npu_output_y.cpu().detach().numpy().astype(np.float32), prec=1.e-1)
-            self.assertRtolEqual(cpu_output_h.detach().numpy(), npu_output_h.cpu().detach().numpy().astype(np.float32), prec=1.e-1)
-            
-            cpu_input1.retain_grad()
-            cpu_output_y.backward(torch.ones(cpu_output_y.size(), dtype=torch.float))
-            cpu_dx = cpu_input1.grad
-            cpu_dw_ih = cpu_gru.weight_ih_l0.grad
-            cpu_dw_hh = cpu_gru.weight_hh_l0.grad
-            cpu_db_ih = cpu_gru.bias_ih_l0.grad
-            cpu_db_hh = cpu_gru.bias_hh_l0.grad
-            
-            npu_input1.retain_grad()
-            npu_output_y.backward(torch.ones(npu_output_y.size(), dtype=torch.float).npu())
-            npu_dx = npu_input1.grad
-            npu_dw_ih = npu_gru.weight_ih_l0.grad
-            npu_dw_hh = npu_gru.weight_hh_l0.grad
-            npu_db_ih = npu_gru.bias_ih_l0.grad
-            npu_db_hh = npu_gru.bias_hh_l0.grad
-            
-            self.assertRtolEqual(cpu_dx.numpy(), npu_dx.cpu().numpy().astype(np.float32), prec=1.e-1)
-            self.assertRtolEqual(cpu_dw_ih.numpy(), npu_dw_ih.cpu().numpy().astype(np.float32), prec=1.e-1)
-            self.assertRtolEqual(cpu_dw_hh.numpy(), npu_dw_hh.cpu().numpy().astype(np.float32), prec=1.e-1)
-            # TODO(ascend): Insufficient precision
-            #精度未满足 self.assertRtolEqual(cpu_db_ih.numpy(), npu_db_ih.cpu().numpy().astype(np.float32), prec=1.e-1)
-            self.assertRtolEqual(cpu_db_ih.numpy(), npu_db_ih.cpu().numpy().astype(np.float32), prec=1.e1)
-            # TODO(ascend): Insufficient precision
-            #精度未满足 self.assertRtolEqual(cpu_db_hh.numpy(), npu_db_hh.cpu().numpy().astype(np.float32), prec=1.e-1)
-            self.assertRtolEqual(cpu_db_hh.numpy(), npu_db_hh.cpu().numpy().astype(np.float32), prec=1.e1)
-
-
-instantiate_device_type_tests(TestGruBackward, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestGruBackward(TestCase):
+    def test_gru_backward(self, device):
+        # shape_format:[[dtype, (num_step, batch_size, input_size)], input_size, hidden_size]
+        shape_format = [
+                        [[np.float16, (16, 32, 64)], 64, 32], 
+                        [[np.float16, (5, 32, 64)], 64, 32],
+                        [[np.float32, (5, 32, 64)], 64, 32],
+                        [[np.float32, (5, 32, 64)], 64, 64],
+        ]
+
+        for item in shape_format: 
+            cpu_gru = torch.nn.GRU(input_size=item[1], hidden_size=item[2], num_layers=1, bidirectional=False)
+            cpu_gru.weight_ih_l0.requires_grad_(True)
+            cpu_gru.weight_hh_l0.requires_grad_(True)
+            cpu_gru.bias_ih_l0.requires_grad_(True)
+            cpu_gru.bias_hh_l0.requires_grad_(True)
+            npu_gru = copy.deepcopy(cpu_gru).npu()
+            
+            input1 = np.random.uniform(0, 1, item[0][1]).astype(item[0][0])
+            cpu_input1 = torch.from_numpy(input1.astype(np.float32))
+            cpu_input1.requires_grad_(True)
+            npu_input1 = torch.from_numpy(input1).npu()
+            npu_input1.requires_grad_(True)
+
+            cpu_output_y, cpu_output_h = cpu_gru(cpu_input1)
+            npu_output_y, npu_output_h = npu_gru(npu_input1)
+
+            self.assertRtolEqual(cpu_output_y.detach().numpy(), npu_output_y.cpu().detach().numpy().astype(np.float32), prec=1.e-1)
+            self.assertRtolEqual(cpu_output_h.detach().numpy(), npu_output_h.cpu().detach().numpy().astype(np.float32), prec=1.e-1)
+            
+            cpu_input1.retain_grad()
+            cpu_output_y.backward(torch.ones(cpu_output_y.size(), dtype=torch.float))
+            cpu_dx = cpu_input1.grad
+            cpu_dw_ih = cpu_gru.weight_ih_l0.grad
+            cpu_dw_hh = cpu_gru.weight_hh_l0.grad
+            cpu_db_ih = cpu_gru.bias_ih_l0.grad
+            cpu_db_hh = cpu_gru.bias_hh_l0.grad
+            
+            npu_input1.retain_grad()
+            npu_output_y.backward(torch.ones(npu_output_y.size(), dtype=torch.float).npu())
+            npu_dx = npu_input1.grad
+            npu_dw_ih = npu_gru.weight_ih_l0.grad
+            npu_dw_hh = npu_gru.weight_hh_l0.grad
+            npu_db_ih = npu_gru.bias_ih_l0.grad
+            npu_db_hh = npu_gru.bias_hh_l0.grad
+            
+            self.assertRtolEqual(cpu_dx.numpy(), npu_dx.cpu().numpy().astype(np.float32), prec=1.e-1)
+            self.assertRtolEqual(cpu_dw_ih.numpy(), npu_dw_ih.cpu().numpy().astype(np.float32), prec=1.e-1)
+            self.assertRtolEqual(cpu_dw_hh.numpy(), npu_dw_hh.cpu().numpy().astype(np.float32), prec=1.e-1)
+            # TODO(ascend): Insufficient precision
+            #精度未满足 self.assertRtolEqual(cpu_db_ih.numpy(), npu_db_ih.cpu().numpy().astype(np.float32), prec=1.e-1)
+            self.assertRtolEqual(cpu_db_ih.numpy(), npu_db_ih.cpu().numpy().astype(np.float32), prec=1.e1)
+            # TODO(ascend): Insufficient precision
+            #精度未满足 self.assertRtolEqual(cpu_db_hh.numpy(), npu_db_hh.cpu().numpy().astype(np.float32), prec=1.e-1)
+            self.assertRtolEqual(cpu_db_hh.numpy(), npu_db_hh.cpu().numpy().astype(np.float32), prec=1.e1)
+
+
+instantiate_device_type_tests(TestGruBackward, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests() 
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_gt.py b/test/test_npu/test_network_ops/test_gt.py
old mode 100644
new mode 100755
index d3ec28991001811d22a6eda7da3cb86b7ee4aa02..5dc5e2f8d58d27a324afc6beb7d92adab430c3e5
--- a/test/test_npu/test_network_ops/test_gt.py
+++ b/test/test_npu/test_network_ops/test_gt.py
@@ -1,333 +1,333 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestGt(TestCase):
-    def generate_scalar(self, min, max):
-        scalar = np.random.uniform(min, max)
-        return scalar
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.gt(input1, input2)
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_out(self, input1, input2, input3):
-        torch.gt(input1, input2, out = input3)
-        output = input3.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.gt(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec(self, input1, input2):
-        output = input1.gt_(input2)
-        output = input1
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_exec(self, input1, input2):
-        output = input1.gt_(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, output):
-        torch.gt(input1, input2, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar(self, input, scalar):
-        output = torch.gt(input, scalar)
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar_out(self, input1, scalar, input2):
-        torch.gt(input1, scalar, out = input2)
-        output = input2.numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input, scalar):
-        output = torch.gt(input, scalar)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec_scalar(self, input, scalar):
-        output = input.gt_(scalar)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_exec_scalar(self, input, scalar):
-        input = input.to("npu")
-        output = input.gt_(scalar)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar_out(self, input, scalar, output):
-        torch.gt(input, scalar, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_stride_exec(self, input1, input2):
-        input1 = input1.as_strided([2, 2], [1, 2], 1)
-        input2 = input2.as_strided([2, 2], [1, 2], 1)
-        output = input1.gt_(input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_stride_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        input1 = input1.as_strided([2, 2], [1, 2], 1)
-        input2 = input2.as_strided([2, 2], [1, 2], 1)
-        output = input1.gt_(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_stride_scalar_exec(self, input1, input2):
-        input1 = input1.as_strided([2, 2], [1, 2], 1)
-        output = input1.gt_(input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_stride_scalar_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input1 = input1.as_strided([2, 2], [1, 2], 1)
-        output = input1.gt_(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def gt_tensor_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
-            cpu_input3 = torch.randn(item[1][2])<0
-            npu_input3 = cpu_input3.npu()
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            if cpu_input3.dtype == torch.float16:
-                cpu_input3 = cpu_input3.to(torch.float32)
-            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_gt_bool(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        scalar_list = [True, False]
-        shape_format = [
-            [[np.int32, i, j], k] for i in format_list for j in shape_list 
-            for k in scalar_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
-            cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1])
-            npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1])
-            cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50)
-            npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50)
-            self.assertEqual(cpu_output1, npu_output1)
-            self.assertEqual(cpu_output2, npu_output2)
-
-    def test_gt_tensor_out(self, device):
-        shape_format = [
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.gt_tensor_out_result(shape_format)
-
-    def gt_scalar_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2 = torch.randn(item[1][2])<0
-            npu_input2 = cpu_input2.npu()
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
-            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_gt_scalar_out(self, device):
-        shape_format = [
-            [[np.float16, 0, [12, 4, 12, 121]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [12, 10, 14, 111]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 2, [16, 3, 11, 121, 21]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [16, 16, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 2, [1313, 3, 3, 3, 121]], [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.gt_scalar_out_result(shape_format)
-
-    def test_gt_scalar_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_scalar_float16(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_scalar_int32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.int32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_tensor_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_tensor_float16(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_inplace_float32(self, device):
-        format_list = [0, 3]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_inplace_float16(self, device):
-        format_list = [0, 3]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_inplace_scalar_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_inplace_scalar_float16(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_mix_dtype(self, device):
-        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
-        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
-        npu_output = self.npu_op_exec(npu_input2, npu_input4)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestGt, globals(), except_for="cpu")
-if __name__ == '__main__':
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestGt(TestCase):
+    def generate_scalar(self, min, max):
+        scalar = np.random.uniform(min, max)
+        return scalar
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.gt(input1, input2)
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_out(self, input1, input2, input3):
+        torch.gt(input1, input2, out = input3)
+        output = input3.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.gt(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec(self, input1, input2):
+        output = input1.gt_(input2)
+        output = input1
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec(self, input1, input2):
+        output = input1.gt_(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, output):
+        torch.gt(input1, input2, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar(self, input, scalar):
+        output = torch.gt(input, scalar)
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar_out(self, input1, scalar, input2):
+        torch.gt(input1, scalar, out = input2)
+        output = input2.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input, scalar):
+        output = torch.gt(input, scalar)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec_scalar(self, input, scalar):
+        output = input.gt_(scalar)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec_scalar(self, input, scalar):
+        input = input.to("npu")
+        output = input.gt_(scalar)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar_out(self, input, scalar, output):
+        torch.gt(input, scalar, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_stride_exec(self, input1, input2):
+        input1 = input1.as_strided([2, 2], [1, 2], 1)
+        input2 = input2.as_strided([2, 2], [1, 2], 1)
+        output = input1.gt_(input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_stride_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input1 = input1.as_strided([2, 2], [1, 2], 1)
+        input2 = input2.as_strided([2, 2], [1, 2], 1)
+        output = input1.gt_(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_stride_scalar_exec(self, input1, input2):
+        input1 = input1.as_strided([2, 2], [1, 2], 1)
+        output = input1.gt_(input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_stride_scalar_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input1 = input1.as_strided([2, 2], [1, 2], 1)
+        output = input1.gt_(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def gt_tensor_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
+            cpu_input3 = torch.randn(item[1][2])<0
+            npu_input3 = cpu_input3.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            if cpu_input3.dtype == torch.float16:
+                cpu_input3 = cpu_input3.to(torch.float32)
+            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_gt_bool(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        scalar_list = [True, False]
+        shape_format = [
+            [[np.int32, i, j], k] for i in format_list for j in shape_list 
+            for k in scalar_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
+            cpu_output1 = self.cpu_op_exec_scalar(cpu_input1 > 50, item[1])
+            npu_output1 = self.npu_op_exec_scalar(npu_input1 > 50, item[1])
+            cpu_output2 = self.cpu_op_exec(cpu_input1 > 50, cpu_input2 > 50)
+            npu_output2 = self.npu_op_exec(npu_input1 > 50, npu_input2 > 50)
+            self.assertEqual(cpu_output1, npu_output1)
+            self.assertEqual(cpu_output2, npu_output2)
+
+    def test_gt_tensor_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.gt_tensor_out_result(shape_format)
+
+    def gt_scalar_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2 = torch.randn(item[1][2])<0
+            npu_input2 = cpu_input2.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
+            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_gt_scalar_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [12, 4, 12, 121]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [12, 10, 14, 111]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 2, [16, 3, 11, 121, 21]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [16, 16, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 2, [1313, 3, 3, 3, 121]], [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.gt_scalar_out_result(shape_format)
+
+    def test_gt_scalar_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_scalar_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_scalar_int32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.int32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_tensor_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_tensor_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_inplace_float32(self, device):
+        format_list = [0, 3]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_inplace_float16(self, device):
+        format_list = [0, 3]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_inplace_scalar_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_inplace_scalar_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_mix_dtype(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
+        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input2, npu_input4)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestGt, globals(), except_for="cpu")
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_hardtanh.py b/test/test_npu/test_network_ops/test_hardtanh.py
old mode 100644
new mode 100755
index a20eb8bd935c932f254aa4adacd17347f16b88d0..742620836b9d21e6a8fd9db0765e79a55486b47c
--- a/test/test_npu/test_network_ops/test_hardtanh.py
+++ b/test/test_npu/test_network_ops/test_hardtanh.py
@@ -1,120 +1,120 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestHardtanh(TestCase):
-    def cpu_op_backward_exec(self, input, min, max):
-        w = torch.ones_like(input)
-        input.requires_grad_(True)
-        output = torch.nn.functional.hardtanh(input, min, max)
-        output.backward(w)
-        output = output.detach().numpy() 
-        res = input.grad
-        res = res.numpy()
-        return output, res
-        
-    def npu_op_backward_exec(self, input, min, max):
-        w = torch.ones_like(input)
-        w = w.to("npu")
-        input.requires_grad_(True)
-        output = torch.nn.functional.hardtanh(input, min, max)
-        output.backward(w)    
-        output = output.to("cpu").detach().numpy()    
-        res = input.grad
-        res = res.to("cpu").numpy()
-        return output, res        
-    
-    def hardtanh_result(self, shape_format):
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 2)
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-                
-            cpu_output_forward, cpu_output_backward = self.cpu_op_backward_exec(cpu_input, 0, 1)
-            npu_output_forward, npu_output_backward = self.npu_op_backward_exec(npu_input, 0, 1)
-            cpu_output_forward = cpu_output_forward.astype(npu_output_forward.dtype)
-            cpu_output_backward = cpu_output_backward.astype(npu_output_backward.dtype)
-            
-            self.assertRtolEqual(cpu_output_forward, npu_output_forward)
-            self.assertRtolEqual(cpu_output_backward, npu_output_backward)
-    
-    # 1d do not support format 29
-    def test_hardtanh_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 4]
-        shape_format = [
-            [np.float16, i, [18]]  for i in format_list
-        ]
-        self.hardtanh_result(shape_format)
-        
-    def test_hardtanh_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 4]
-        shape_format = [
-            [np.float32, i, [18]]  for i in format_list
-        ]
-        self.hardtanh_result(shape_format)
-        
-    def test_hardtanh_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [
-            [np.float16, i, [256, 1000]]  for i in format_list
-        ]
-        self.hardtanh_result(shape_format)
-        
-    def test_hardtanh_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [
-            [np.float32, i, [256, 1000]]  for i in format_list
-        ]
-        self.hardtanh_result(shape_format)
-        
-    def test_hardtanh_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [
-            [np.float16, i, [32, 328, 368]]  for i in format_list
-        ]
-        self.hardtanh_result(shape_format)
-        
-    def test_hardtanh_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [
-            [np.float32, i, [32, 328, 368]]  for i in format_list
-        ]
-        self.hardtanh_result(shape_format)
-        
-    def test_hardtanh_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [
-            [np.float16, i, [256, 576, 7, 7]]  for i in format_list
-        ]
-        self.hardtanh_result(shape_format)
-        
-    def test_hardtanh_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [
-            [np.float32, i, [256, 576, 7, 7]]  for i in format_list
-        ]
-        self.hardtanh_result(shape_format)
-                    
-
-instantiate_device_type_tests(TestHardtanh, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()    
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestHardtanh(TestCase):
+    def cpu_op_backward_exec(self, input, min, max):
+        w = torch.ones_like(input)
+        input.requires_grad_(True)
+        output = torch.nn.functional.hardtanh(input, min, max)
+        output.backward(w)
+        output = output.detach().numpy() 
+        res = input.grad
+        res = res.numpy()
+        return output, res
+        
+    def npu_op_backward_exec(self, input, min, max):
+        w = torch.ones_like(input)
+        w = w.to("npu")
+        input.requires_grad_(True)
+        output = torch.nn.functional.hardtanh(input, min, max)
+        output.backward(w)    
+        output = output.to("cpu").detach().numpy()    
+        res = input.grad
+        res = res.to("cpu").numpy()
+        return output, res        
+    
+    def hardtanh_result(self, shape_format):
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 2)
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+                
+            cpu_output_forward, cpu_output_backward = self.cpu_op_backward_exec(cpu_input, 0, 1)
+            npu_output_forward, npu_output_backward = self.npu_op_backward_exec(npu_input, 0, 1)
+            cpu_output_forward = cpu_output_forward.astype(npu_output_forward.dtype)
+            cpu_output_backward = cpu_output_backward.astype(npu_output_backward.dtype)
+            
+            self.assertRtolEqual(cpu_output_forward, npu_output_forward)
+            self.assertRtolEqual(cpu_output_backward, npu_output_backward)
+    
+    # 1d do not support format 29
+    def test_hardtanh_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 4]
+        shape_format = [
+            [np.float16, i, [18]]  for i in format_list
+        ]
+        self.hardtanh_result(shape_format)
+        
+    def test_hardtanh_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 4]
+        shape_format = [
+            [np.float32, i, [18]]  for i in format_list
+        ]
+        self.hardtanh_result(shape_format)
+        
+    def test_hardtanh_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [
+            [np.float16, i, [256, 1000]]  for i in format_list
+        ]
+        self.hardtanh_result(shape_format)
+        
+    def test_hardtanh_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [
+            [np.float32, i, [256, 1000]]  for i in format_list
+        ]
+        self.hardtanh_result(shape_format)
+        
+    def test_hardtanh_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [
+            [np.float16, i, [32, 328, 368]]  for i in format_list
+        ]
+        self.hardtanh_result(shape_format)
+        
+    def test_hardtanh_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [
+            [np.float32, i, [32, 328, 368]]  for i in format_list
+        ]
+        self.hardtanh_result(shape_format)
+        
+    def test_hardtanh_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [
+            [np.float16, i, [256, 576, 7, 7]]  for i in format_list
+        ]
+        self.hardtanh_result(shape_format)
+        
+    def test_hardtanh_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [
+            [np.float32, i, [256, 576, 7, 7]]  for i in format_list
+        ]
+        self.hardtanh_result(shape_format)
+                    
+
+instantiate_device_type_tests(TestHardtanh, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()    
diff --git a/test/test_npu/test_network_ops/test_ifmr.py b/test/test_npu/test_network_ops/test_ifmr.py
index b78cae42a69f917b21374cb532cd759e38f9b36c..24e3381172b36ebbb75218bab9d102314ff49b32 100644
--- a/test/test_npu/test_network_ops/test_ifmr.py
+++ b/test/test_npu/test_network_ops/test_ifmr.py
@@ -1,139 +1,139 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from functools import reduce
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestIFMR(TestCase):
-    def cpu_op_exec(self,
-                    input_data,
-                    with_offset,
-                    bins_num=128,
-                    min_percentile=0.999999,
-                    max_percentile=0.999999,
-                    search_range=[0.7, 1.3],
-                    search_step=0.01):
-        pre_mode = np.float32
-        input_data = input_data.numpy().astype(pre_mode)
-        data_min = input_data.min()
-        data_max = input_data.max()
-        data_num = reduce(lambda x, y: x * y, input_data.shape)
-        data_num = np.array(data_num, pre_mode)
-
-        bins, threshold = np.histogram(input_data, bins_num)
-        cumsum = np.cumsum(bins).astype(np.int32)
-
-        bins_num = np.array(bins_num, pre_mode)
-        cdf = cumsum.astype(pre_mode) / data_num
-        max_index = np.where(cdf >= np.array(max_percentile, pre_mode), 0,
-                             1).sum().astype(pre_mode)
-        min_index = np.where(cdf >= np.array(1 - min_percentile, pre_mode), 0,
-                             1).sum().astype(pre_mode)
-        max_init = max_index / bins_num * (data_max - data_min) + data_min
-        min_init = min_index / bins_num * (data_max - data_min) + data_min
-
-        step = np.arange(search_range[0],
-                         search_range[1],
-                         search_step,
-                         dtype=pre_mode)
-        if with_offset:
-            if max_init < 0:
-                max_init = np.array(0, pre_mode)
-            if min_init > 0:
-                min_init = np.array(0, pre_mode)
-            min_list = min_init * np.ones(step.shape, dtype=pre_mode)
-        else:
-            max_init = np.max([np.abs(max_init), np.abs(min_init)])
-        max_list = max_init * step
-
-        if with_offset:
-            scale = (max_list - min_list) / 255
-            scale = np.where(scale < 1.192092896e-07, 1, scale)
-            offset = np.round(min_list / scale)
-            offset = -(offset + 128)
-        else:
-            scale = max_list / 127
-            offset = np.round(scale * 0)
-
-        loss_list = np.zeros(step.shape, dtype=pre_mode)
-        for i in range(step.size):
-            quant_data = np.round(input_data / scale[i]) + offset[i]
-            np.clip(quant_data, -128, 127, out=quant_data)
-            quant_data = (quant_data - offset[i]) * scale[i]
-            loss_list[i] = np.sum(np.square(quant_data - input_data))
-        index = np.argmin(loss_list)
-        return scale[index], offset[index]
-
-    def npu_op_exec(self, input_data, with_offset):
-        min_value = torch.min(input_data)
-        max_value = torch.max(input_data)
-        min_value = torch.reshape(min_value, (1, ))
-        max_value = torch.reshape(max_value, (1, ))
-        hist = torch.histc(input_data.to('cpu'),
-                           bins=128,
-                           min=min_value[0].to('cpu'),
-                           max=max_value[0].to('cpu'))
-        cdf = torch.cumsum(hist, dim=0).int()
-
-        cdf = cdf.to('npu')
-        scale, offset = torch.npu_ifmr(input_data,
-                                       min_value,
-                                       max_value,
-                                       cdf,
-                                       min_percentile=0.999999,
-                                       max_percentile=0.999999,
-                                       search_start=0.7,
-                                       search_end=1.3,
-                                       search_step=0.01,
-                                       with_offset=with_offset)
-
-        return scale, offset
-
-    def test_ifrm_with_offset(self, device):
-        format_list = [0, 3]
-        shape_list = [(2, 2, 3, 4), (5, 5)]
-        shape_format = [[np.float32, i, j] for i in format_list
-                        for j in shape_list]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -1, 1)
-            scale_cpu, offset_cpu = self.cpu_op_exec(cpu_input,
-                                                     with_offset=True)
-            scale_npu, offset_npu = self.npu_op_exec(npu_input,
-                                                     with_offset=True)
-            self.assertTrue((scale_cpu - scale_npu[0]) / scale_cpu < 0.0001)
-            self.assertEqual(offset_cpu, offset_npu[0])
-
-    def test_ifrm_without_offset(self, device):
-        format_list = [0, 3]
-        shape_list = [(2, 2, 3, 4), (5, 5)]
-        shape_format = [[np.float32, i, j] for i in format_list
-                        for j in shape_list]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -1, 1)
-            scale_cpu, offset_cpu = self.cpu_op_exec(cpu_input,
-                                                     with_offset=False)
-            scale_npu, offset_npu = self.npu_op_exec(npu_input,
-                                                     with_offset=False)
-            self.assertTrue((scale_cpu - scale_npu[0]) / scale_cpu < 0.0001)
-            self.assertEqual(offset_cpu, offset_npu[0])
-
-
-instantiate_device_type_tests(TestIFMR, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import reduce
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestIFMR(TestCase):
+    def cpu_op_exec(self,
+                    input_data,
+                    with_offset,
+                    bins_num=128,
+                    min_percentile=0.999999,
+                    max_percentile=0.999999,
+                    search_range=[0.7, 1.3],
+                    search_step=0.01):
+        pre_mode = np.float32
+        input_data = input_data.numpy().astype(pre_mode)
+        data_min = input_data.min()
+        data_max = input_data.max()
+        data_num = reduce(lambda x, y: x * y, input_data.shape)
+        data_num = np.array(data_num, pre_mode)
+
+        bins, threshold = np.histogram(input_data, bins_num)
+        cumsum = np.cumsum(bins).astype(np.int32)
+
+        bins_num = np.array(bins_num, pre_mode)
+        cdf = cumsum.astype(pre_mode) / data_num
+        max_index = np.where(cdf >= np.array(max_percentile, pre_mode), 0,
+                             1).sum().astype(pre_mode)
+        min_index = np.where(cdf >= np.array(1 - min_percentile, pre_mode), 0,
+                             1).sum().astype(pre_mode)
+        max_init = max_index / bins_num * (data_max - data_min) + data_min
+        min_init = min_index / bins_num * (data_max - data_min) + data_min
+
+        step = np.arange(search_range[0],
+                         search_range[1],
+                         search_step,
+                         dtype=pre_mode)
+        if with_offset:
+            if max_init < 0:
+                max_init = np.array(0, pre_mode)
+            if min_init > 0:
+                min_init = np.array(0, pre_mode)
+            min_list = min_init * np.ones(step.shape, dtype=pre_mode)
+        else:
+            max_init = np.max([np.abs(max_init), np.abs(min_init)])
+        max_list = max_init * step
+
+        if with_offset:
+            scale = (max_list - min_list) / 255
+            scale = np.where(scale < 1.192092896e-07, 1, scale)
+            offset = np.round(min_list / scale)
+            offset = -(offset + 128)
+        else:
+            scale = max_list / 127
+            offset = np.round(scale * 0)
+
+        loss_list = np.zeros(step.shape, dtype=pre_mode)
+        for i in range(step.size):
+            quant_data = np.round(input_data / scale[i]) + offset[i]
+            np.clip(quant_data, -128, 127, out=quant_data)
+            quant_data = (quant_data - offset[i]) * scale[i]
+            loss_list[i] = np.sum(np.square(quant_data - input_data))
+        index = np.argmin(loss_list)
+        return scale[index], offset[index]
+
+    def npu_op_exec(self, input_data, with_offset):
+        min_value = torch.min(input_data)
+        max_value = torch.max(input_data)
+        min_value = torch.reshape(min_value, (1, ))
+        max_value = torch.reshape(max_value, (1, ))
+        hist = torch.histc(input_data.to('cpu'),
+                           bins=128,
+                           min=min_value[0].to('cpu'),
+                           max=max_value[0].to('cpu'))
+        cdf = torch.cumsum(hist, dim=0).int()
+
+        cdf = cdf.to('npu')
+        scale, offset = torch.npu_ifmr(input_data,
+                                       min_value,
+                                       max_value,
+                                       cdf,
+                                       min_percentile=0.999999,
+                                       max_percentile=0.999999,
+                                       search_start=0.7,
+                                       search_end=1.3,
+                                       search_step=0.01,
+                                       with_offset=with_offset)
+
+        return scale, offset
+
+    def test_ifrm_with_offset(self, device):
+        format_list = [0, 3]
+        shape_list = [(2, 2, 3, 4), (5, 5)]
+        shape_format = [[np.float32, i, j] for i in format_list
+                        for j in shape_list]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -1, 1)
+            scale_cpu, offset_cpu = self.cpu_op_exec(cpu_input,
+                                                     with_offset=True)
+            scale_npu, offset_npu = self.npu_op_exec(npu_input,
+                                                     with_offset=True)
+            self.assertTrue((scale_cpu - scale_npu[0]) / scale_cpu < 0.0001)
+            self.assertEqual(offset_cpu, offset_npu[0])
+
+    def test_ifrm_without_offset(self, device):
+        format_list = [0, 3]
+        shape_list = [(2, 2, 3, 4), (5, 5)]
+        shape_format = [[np.float32, i, j] for i in format_list
+                        for j in shape_list]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -1, 1)
+            scale_cpu, offset_cpu = self.cpu_op_exec(cpu_input,
+                                                     with_offset=False)
+            scale_npu, offset_npu = self.npu_op_exec(npu_input,
+                                                     with_offset=False)
+            self.assertTrue((scale_cpu - scale_npu[0]) / scale_cpu < 0.0001)
+            self.assertEqual(offset_cpu, offset_npu[0])
+
+
+instantiate_device_type_tests(TestIFMR, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_im2col_backward.py b/test/test_npu/test_network_ops/test_im2col_backward.py
index 63b309c5bb06a26488b9462cfb340df451cc22b1..54fa10e998542df4563d274029546230ee31c6c5 100644
--- a/test/test_npu/test_network_ops/test_im2col_backward.py
+++ b/test/test_npu/test_network_ops/test_im2col_backward.py
@@ -1,34 +1,34 @@
-import torch
-import numpy as np
-import sys
-import copy
-import os
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestIm2colBackward(TestCase):
-    def test_im2col_backward_fp16(self, device):
-        fold_cpu = torch.nn.Fold(output_size=(18, 18), kernel_size=(3, 3))
-        input_cpu = torch.rand(1, 16 * 3 * 3, 256).half()
-        fold_npu = fold_cpu.npu()
-        input_npu = input_cpu.npu()
-        output_cpu = fold_cpu(input_cpu)
-        output_npu = fold_npu(input_npu)
-
-        self.assertRtolEqual(output_cpu.numpy(), output_npu.cpu().numpy())
-
-    def test_im2col_backward_fp32(self, device):
-        fold_cpu = torch.nn.Fold(output_size=(18, 18), kernel_size=(3, 3))
-        input_cpu = torch.rand(1, 16 * 3 * 3, 256)
-        fold_npu = fold_cpu.npu()
-        input_npu = input_cpu.npu()
-        output_cpu = fold_cpu(input_cpu).numpy()
-        output_npu = fold_npu(input_npu).cpu().numpy()
-
-        self.assertRtolEqual(output_cpu, output_npu)
-
-
-instantiate_device_type_tests(TestIm2colBackward, globals(), except_for='cpu')
-if __name__ == '__main__':
+import torch
+import numpy as np
+import sys
+import copy
+import os
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestIm2colBackward(TestCase):
+    def test_im2col_backward_fp16(self, device):
+        fold_cpu = torch.nn.Fold(output_size=(18, 18), kernel_size=(3, 3))
+        input_cpu = torch.rand(1, 16 * 3 * 3, 256).half()
+        fold_npu = fold_cpu.npu()
+        input_npu = input_cpu.npu()
+        output_cpu = fold_cpu(input_cpu)
+        output_npu = fold_npu(input_npu)
+
+        self.assertRtolEqual(output_cpu.numpy(), output_npu.cpu().numpy())
+
+    def test_im2col_backward_fp32(self, device):
+        fold_cpu = torch.nn.Fold(output_size=(18, 18), kernel_size=(3, 3))
+        input_cpu = torch.rand(1, 16 * 3 * 3, 256)
+        fold_npu = fold_cpu.npu()
+        input_npu = input_cpu.npu()
+        output_cpu = fold_cpu(input_cpu).numpy()
+        output_npu = fold_npu(input_npu).cpu().numpy()
+
+        self.assertRtolEqual(output_cpu, output_npu)
+
+
+instantiate_device_type_tests(TestIm2colBackward, globals(), except_for='cpu')
+if __name__ == '__main__':
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_index_add.py b/test/test_npu/test_network_ops/test_index_add.py
index fcd1749abbacf6a4818e10b1cd3c309f1d9d83d8..7701f21498eac07b30cd898e65316a23f64cbf22 100644
--- a/test/test_npu/test_network_ops/test_index_add.py
+++ b/test/test_npu/test_network_ops/test_index_add.py
@@ -1,175 +1,175 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestIndexAdd(TestCase):
-
-    def cpu_op_exec(self, var, index, source, dim):
-        output = var.index_add(dim=dim, index=index.long(), source=source)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, var, index, source, dim):
-        output = torch.index_add(var, dim, index.int(), source)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def cpu_op_inter_exec(self, var, index, source, dim):
-        output = var.index_add_(dim=dim, index=index.long(), source=source)
-        output = output.numpy()
-        return output
-
-    def npu_op_inter_exec(self, var, index, source, dim):
-        output = var.index_add_(dim, index.int(), source)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def test_index_add_float32(self, device):
-        shape_format = [
-                [[np.float32, -1, (5, 3)], [np.int32, -1, (3, )], [np.float32, -1, (3, 3)], 0],
-                [[np.float32, -1, (6, 4)], [np.int32, -1, (5, )], [np.float32, -1, (5, 4)], 0],
-                [[np.float32, -1, (3, 2)], [np.int32, -1, (2, )], [np.float32, -1, (2, 2)], 0],
-                [[np.float32, -1, (8, 6)], [np.int32, -1, (4, )], [np.float32, -1, (4, 6)], 0],
-                [[np.float32, -1, (3, 5)], [np.int32, -1, (2, )], [np.float32, -1, (3, 2)], 1],
-                [[np.float32, 4, (4, 6)], [np.int32, -1, (5, )], [np.float32, 4, (4, 5)], 1],
-                [[np.float32, 3, (2, 3)], [np.int32, -1, (2, )], [np.float32, 3, (2, 2)], 1],
-                [[np.float32, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.float32, -1, (16, 7, 5, 9, 11)], 4],
-                [[np.float32, 3, (1600, 200)], [np.int32, -1, (200, )], [np.float32, 3, (1600, 200)], 1],     
-        ]
-        for item in shape_format:
-            cpu_var, npu_var = create_common_tensor(item[0], -10, 10)
-            cpu_index, npu_index = create_common_tensor(item[1], 0, 2)
-            cpu_source, npu_source = create_common_tensor(item[2], -10, 10)
-        
-            
-            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_index_add_int32(self, device):
-        shape_format = [   
-                [[np.int32, -1, (5, 3)], [np.int32, -1, (3, )], [np.int32, -1, (3, 3)], 0],
-                [[np.int32, -1, (6, 4)], [np.int32, -1, (5, )], [np.int32, -1, (5, 4)], 0],
-                [[np.int32, -1, (3, 2)], [np.int32, -1, (2, )], [np.int32, -1, (2, 2)], 0],
-                [[np.int32, -1, (3, 5)], [np.int32, -1, (2, )], [np.int32, -1, (3, 2)], 1],
-                [[np.int32, -1, (4, 6)], [np.int32, -1, (5, )], [np.int32, -1, (4, 5)], 1],
-                [[np.int32, -1, (2, 3)], [np.int32, -1, (2, )], [np.int32, -1, (2, 2)], 1],
-                [[np.int32, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.int32, -1, (16, 7, 5, 9, 11)], 4],
-                [[np.int32, -1, (1600, 200)], [np.int32, -1, (200, )], [np.int32, -1, (1600, 200)], 1],        
-        ]
-        for item in shape_format:
-            cpu_var, npu_var = create_common_tensor(item[0], -10, 10)
-            cpu_index, npu_index = create_common_tensor(item[1], 0, 2)
-            cpu_source, npu_source = create_common_tensor(item[2], -10, 10)
-                
-            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_index_add_int8(self, device):
-        shape_format = [  
-                [[np.int8, -1, (5, 3)], [np.int32, -1, (3, )], [np.int8, -1, (3, 3)], 0],
-                [[np.int8, -1, (6, 4)], [np.int32, -1, (5, )], [np.int8, -1, (5, 4)], 0],
-                [[np.int8, -1, (3, 2)], [np.int32, -1, (2, )], [np.int8, -1, (2, 2)], 0],
-                [[np.int8, -1, (3, 5)], [np.int32, -1, (2, )], [np.int8, -1, (3, 2)], 1],
-                [[np.int8, -1, (4, 6)], [np.int32, -1, (5, )], [np.int8, -1, (4, 5)], 1],
-                [[np.int8, -1, (2, 3)], [np.int32, -1, (2, )], [np.int8, -1, (2, 2)], 1],
-                [[np.int8, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.int8, -1, (16, 7, 5, 9, 11)], 4],
-                [[np.int8, -1, (1600, 200)], [np.int32, -1, (200, )], [np.int8, -1, (1600, 200)], 1],     
-        ]
-        for item in shape_format:
-            cpu_var, npu_var = create_common_tensor(item[0], -10, 10)
-            cpu_index = torch.arange(0, item[1][2][0])
-            npu_index = cpu_index.npu()
-            cpu_source, npu_source = create_common_tensor(item[2], -10, 10)
-          
-            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_index_add_uint8(self, device):
-        shape_format = [
-                [[np.uint8, -1, (5, 3)], [np.int32, -1, (3, )], [np.uint8, -1, (3, 3)], 0],
-                [[np.uint8, -1, (6, 4)], [np.int32, -1, (5, )], [np.uint8, -1, (5, 4)], 0],
-                [[np.uint8, -1, (3, 2)], [np.int32, -1, (2, )], [np.uint8, -1, (2, 2)], 0],
-                [[np.uint8, -1, (3, 5)], [np.int32, -1, (2, )], [np.uint8, -1, (3, 2)], 1],
-                [[np.uint8, -1, (4, 6)], [np.int32, -1, (5, )], [np.uint8, -1, (4, 5)], 1],
-                [[np.uint8, -1, (2, 3)], [np.int32, -1, (2, )], [np.uint8, -1, (2, 2)], 1],
-                [[np.uint8, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.uint8, -1, (16, 7, 5, 9, 11)], 4],
-                [[np.uint8, -1, (1600, 200)], [np.int32, -1, (200, )], [np.uint8, -1, (1600, 200)], 1],       
-        ]
-        for item in shape_format:
-            cpu_var, npu_var = create_common_tensor(item[0], 0, 10)
-            cpu_index = torch.arange(0, item[1][2][0])
-            npu_index = cpu_index.npu()
-            cpu_source, npu_source = create_common_tensor(item[2], 0, 10)
-            
-            
-            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_index_add_fp16(self, device):
-        shape_format = [
-                [[np.float16, -1, (5, 3)], [np.int32, -1, (3, )], [np.float16, -1, (3, 3)], 0],
-                [[np.float16, -1, (3, 2)], [np.int32, -1, (2, )], [np.float16, -1, (2, 2)], 0],
-                [[np.float16, -1, (3, 5)], [np.int32, -1, (2, )], [np.float16, -1, (3, 2)], 1],
-                [[np.float16, -1, (2, 6)], [np.int32, -1, (4, )], [np.float16, -1, (2, 4)], 1],
-                [[np.float16, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.float16, -1, (16, 7, 5, 9, 11)], 4],
-                [[np.float16, -1, (1600, 200)], [np.int32, -1, (200, )], [np.float16, -1, (1600, 200)], 1],
-        ]
-        for item in shape_format:
-            cpu_var, npu_var = create_common_tensor(item[0], 0, 10)
-            cpu_index = torch.arange(0, item[1][2][0])
-            npu_index = cpu_index.npu()
-            cpu_source, npu_source = create_common_tensor(item[2], 0, 10)
-            
-            cpu_var = cpu_var.to(torch.float32)
-            cpu_source = cpu_source.to(torch.float32)
-        
-            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
-            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestIndexAdd, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestIndexAdd(TestCase):
+
+    def cpu_op_exec(self, var, index, source, dim):
+        output = var.index_add(dim=dim, index=index.long(), source=source)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, var, index, source, dim):
+        output = torch.index_add(var, dim, index.int(), source)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def cpu_op_inter_exec(self, var, index, source, dim):
+        output = var.index_add_(dim=dim, index=index.long(), source=source)
+        output = output.numpy()
+        return output
+
+    def npu_op_inter_exec(self, var, index, source, dim):
+        output = var.index_add_(dim, index.int(), source)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def test_index_add_float32(self, device):
+        shape_format = [
+                [[np.float32, -1, (5, 3)], [np.int32, -1, (3, )], [np.float32, -1, (3, 3)], 0],
+                [[np.float32, -1, (6, 4)], [np.int32, -1, (5, )], [np.float32, -1, (5, 4)], 0],
+                [[np.float32, -1, (3, 2)], [np.int32, -1, (2, )], [np.float32, -1, (2, 2)], 0],
+                [[np.float32, -1, (8, 6)], [np.int32, -1, (4, )], [np.float32, -1, (4, 6)], 0],
+                [[np.float32, -1, (3, 5)], [np.int32, -1, (2, )], [np.float32, -1, (3, 2)], 1],
+                [[np.float32, 4, (4, 6)], [np.int32, -1, (5, )], [np.float32, 4, (4, 5)], 1],
+                [[np.float32, 3, (2, 3)], [np.int32, -1, (2, )], [np.float32, 3, (2, 2)], 1],
+                [[np.float32, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.float32, -1, (16, 7, 5, 9, 11)], 4],
+                [[np.float32, 3, (1600, 200)], [np.int32, -1, (200, )], [np.float32, 3, (1600, 200)], 1],     
+        ]
+        for item in shape_format:
+            cpu_var, npu_var = create_common_tensor(item[0], -10, 10)
+            cpu_index, npu_index = create_common_tensor(item[1], 0, 2)
+            cpu_source, npu_source = create_common_tensor(item[2], -10, 10)
+        
+            
+            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_index_add_int32(self, device):
+        shape_format = [   
+                [[np.int32, -1, (5, 3)], [np.int32, -1, (3, )], [np.int32, -1, (3, 3)], 0],
+                [[np.int32, -1, (6, 4)], [np.int32, -1, (5, )], [np.int32, -1, (5, 4)], 0],
+                [[np.int32, -1, (3, 2)], [np.int32, -1, (2, )], [np.int32, -1, (2, 2)], 0],
+                [[np.int32, -1, (3, 5)], [np.int32, -1, (2, )], [np.int32, -1, (3, 2)], 1],
+                [[np.int32, -1, (4, 6)], [np.int32, -1, (5, )], [np.int32, -1, (4, 5)], 1],
+                [[np.int32, -1, (2, 3)], [np.int32, -1, (2, )], [np.int32, -1, (2, 2)], 1],
+                [[np.int32, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.int32, -1, (16, 7, 5, 9, 11)], 4],
+                [[np.int32, -1, (1600, 200)], [np.int32, -1, (200, )], [np.int32, -1, (1600, 200)], 1],        
+        ]
+        for item in shape_format:
+            cpu_var, npu_var = create_common_tensor(item[0], -10, 10)
+            cpu_index, npu_index = create_common_tensor(item[1], 0, 2)
+            cpu_source, npu_source = create_common_tensor(item[2], -10, 10)
+                
+            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_index_add_int8(self, device):
+        shape_format = [  
+                [[np.int8, -1, (5, 3)], [np.int32, -1, (3, )], [np.int8, -1, (3, 3)], 0],
+                [[np.int8, -1, (6, 4)], [np.int32, -1, (5, )], [np.int8, -1, (5, 4)], 0],
+                [[np.int8, -1, (3, 2)], [np.int32, -1, (2, )], [np.int8, -1, (2, 2)], 0],
+                [[np.int8, -1, (3, 5)], [np.int32, -1, (2, )], [np.int8, -1, (3, 2)], 1],
+                [[np.int8, -1, (4, 6)], [np.int32, -1, (5, )], [np.int8, -1, (4, 5)], 1],
+                [[np.int8, -1, (2, 3)], [np.int32, -1, (2, )], [np.int8, -1, (2, 2)], 1],
+                [[np.int8, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.int8, -1, (16, 7, 5, 9, 11)], 4],
+                [[np.int8, -1, (1600, 200)], [np.int32, -1, (200, )], [np.int8, -1, (1600, 200)], 1],     
+        ]
+        for item in shape_format:
+            cpu_var, npu_var = create_common_tensor(item[0], -10, 10)
+            cpu_index = torch.arange(0, item[1][2][0])
+            npu_index = cpu_index.npu()
+            cpu_source, npu_source = create_common_tensor(item[2], -10, 10)
+          
+            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_index_add_uint8(self, device):
+        shape_format = [
+                [[np.uint8, -1, (5, 3)], [np.int32, -1, (3, )], [np.uint8, -1, (3, 3)], 0],
+                [[np.uint8, -1, (6, 4)], [np.int32, -1, (5, )], [np.uint8, -1, (5, 4)], 0],
+                [[np.uint8, -1, (3, 2)], [np.int32, -1, (2, )], [np.uint8, -1, (2, 2)], 0],
+                [[np.uint8, -1, (3, 5)], [np.int32, -1, (2, )], [np.uint8, -1, (3, 2)], 1],
+                [[np.uint8, -1, (4, 6)], [np.int32, -1, (5, )], [np.uint8, -1, (4, 5)], 1],
+                [[np.uint8, -1, (2, 3)], [np.int32, -1, (2, )], [np.uint8, -1, (2, 2)], 1],
+                [[np.uint8, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.uint8, -1, (16, 7, 5, 9, 11)], 4],
+                [[np.uint8, -1, (1600, 200)], [np.int32, -1, (200, )], [np.uint8, -1, (1600, 200)], 1],       
+        ]
+        for item in shape_format:
+            cpu_var, npu_var = create_common_tensor(item[0], 0, 10)
+            cpu_index = torch.arange(0, item[1][2][0])
+            npu_index = cpu_index.npu()
+            cpu_source, npu_source = create_common_tensor(item[2], 0, 10)
+            
+            
+            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_index_add_fp16(self, device):
+        shape_format = [
+                [[np.float16, -1, (5, 3)], [np.int32, -1, (3, )], [np.float16, -1, (3, 3)], 0],
+                [[np.float16, -1, (3, 2)], [np.int32, -1, (2, )], [np.float16, -1, (2, 2)], 0],
+                [[np.float16, -1, (3, 5)], [np.int32, -1, (2, )], [np.float16, -1, (3, 2)], 1],
+                [[np.float16, -1, (2, 6)], [np.int32, -1, (4, )], [np.float16, -1, (2, 4)], 1],
+                [[np.float16, -1, (16, 7, 5, 9, 11)], [np.int32, -1, (11, )], [np.float16, -1, (16, 7, 5, 9, 11)], 4],
+                [[np.float16, -1, (1600, 200)], [np.int32, -1, (200, )], [np.float16, -1, (1600, 200)], 1],
+        ]
+        for item in shape_format:
+            cpu_var, npu_var = create_common_tensor(item[0], 0, 10)
+            cpu_index = torch.arange(0, item[1][2][0])
+            npu_index = cpu_index.npu()
+            cpu_source, npu_source = create_common_tensor(item[2], 0, 10)
+            
+            cpu_var = cpu_var.to(torch.float32)
+            cpu_source = cpu_source.to(torch.float32)
+        
+            cpu_output = self.cpu_op_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_exec(npu_var, npu_index, npu_source, item[3])
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+            cpu_output = self.cpu_op_inter_exec(cpu_var, cpu_index, cpu_source, item[3])
+            npu_output = self.npu_op_inter_exec(npu_var, npu_index, npu_source, item[3])
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestIndexAdd, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_index_put.py b/test/test_npu/test_network_ops/test_index_put.py
old mode 100644
new mode 100755
index 2ab3b47b940c269bc9dda2b2719f193b140e2cb1..60cc47d362372f8b2ea2de9d0d003e04949c7b9b
--- a/test/test_npu/test_network_ops/test_index_put.py
+++ b/test/test_npu/test_network_ops/test_index_put.py
@@ -1,134 +1,144 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestIndexPut(TestCase):
-    def cpu_op_exec(self, input, indices, value):
-        output = input.index_put(indices, value)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input, indices, value):
-        output = input.index_put(indices, value)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inp_exec(self, input, indices, value):
-        input.index_put_(indices, value)
-        output = input.numpy()
-        return output
-
-    def npu_op_inp_exec(self, input, indices, value):
-        input.index_put_(indices, value)
-        input = input.to("cpu")
-        output = input.numpy()
-        return output
-
-    def case_exec(self, shape):
-        cpu_indices = []
-        npu_indices = []
-        for item in shape:
-            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
-            for i in range(1, 3):
-                cpu_indices1, npu_indices1 = create_common_tensor(
-                    item[1], 1, 5)
-                cpu_indices.append(cpu_indices1)
-                npu_indices.append(npu_indices1)
-            cpu_value, npu_value = create_common_tensor(item[2], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input, cpu_indices, cpu_value)
-            npu_output = self.npu_op_exec(npu_input, npu_indices, npu_value)
-            self.assertEqual(cpu_output, npu_output)
-
-    def case_exec_fp16(self, shape):
-        cpu_indices = []
-        npu_indices = []
-        for item in shape:
-            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            for i in range(1, 3):
-                cpu_indices1, npu_indices1 = create_common_tensor(
-                    item[1], 1, 5)
-                cpu_indices.append(cpu_indices1)
-                npu_indices.append(npu_indices1)
-            cpu_value, npu_value = create_common_tensor(item[2], 1, 100)
-            cpu_value = cpu_value.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input, cpu_indices, cpu_value)
-            npu_output = self.npu_op_exec(npu_input, npu_indices, npu_value)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def case_inp_exec(self, shape):
-        cpu_indices = []
-        npu_indices = []
-        for item in shape:
-            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
-            for i in range(1, 3):
-                cpu_indices1, npu_indices1 = create_common_tensor(
-                    item[1], 1, 5)
-                cpu_indices.append(cpu_indices1)
-                npu_indices.append(npu_indices1)
-            cpu_value, npu_value = create_common_tensor(item[2], 1, 100)
-            cpu_output = self.cpu_op_inp_exec(
-                cpu_input, cpu_indices, cpu_value)
-            npu_output = self.npu_op_inp_exec(
-                npu_input, npu_indices, npu_value)
-            self.assertEqual(cpu_output, npu_output)
-
-    def case_inp_exec_fp16(self, shape):
-        cpu_indices = []
-        npu_indices = []
-        for item in shape:
-            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            for i in range(1, 3):
-                cpu_indices1, npu_indices1 = create_common_tensor(
-                    item[1], 1, 5)
-                cpu_indices.append(cpu_indices1)
-                npu_indices.append(npu_indices1)
-            cpu_value, npu_value = create_common_tensor(item[2], 1, 100)
-            cpu_value = cpu_value.to(torch.float32)
-            cpu_output = self.cpu_op_inp_exec(
-                cpu_input, cpu_indices, cpu_value)
-            npu_output = self.npu_op_inp_exec(
-                npu_input, npu_indices, npu_value)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_index_put_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list = [(5, 6)]
-        shape_format = [[[np.float32, i, j], [np.int64, 0, [1, 2]], [
-            np.float32, 0, [1, 2]]] for i in format_list for j in shape_list]
-        self.case_exec(shape_format)
-        self.case_inp_exec(shape_format)
-
-    def test_index_put_shape_format_fp16(self, device):
-        format_list = [0]
-        shape_list = [(5, 6)]
-        shape_format = [[[np.float16, i, j], [np.int64, 0, [1, 2]], [
-            np.float16, 0, [1, 2]]] for i in format_list for j in shape_list]
-        self.case_exec_fp16(shape_format)
-        self.case_inp_exec_fp16(shape_format)
-
-
-instantiate_device_type_tests(TestIndexPut, globals(), except_for="cpu")
-
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestIndexPut(TestCase):
+    def cpu_op_exec(self, input, indices, value):
+        output = input.index_put(indices, value)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input, indices, value):
+        output = input.index_put(indices, value)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inp_exec(self, input, indices, value):
+        input.index_put_(indices, value)
+        output = input.numpy()
+        return output
+
+    def npu_op_inp_exec(self, input, indices, value):
+        input.index_put_(indices, value)
+        input = input.to("cpu")
+        output = input.numpy()
+        return output
+
+    def case_exec(self, shape):
+        cpu_indices = []
+        npu_indices = []
+        for item in shape:
+            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
+            for i in range(1, 3):
+                cpu_indices1, npu_indices1 = create_common_tensor(
+                    item[1], 1, 5)
+                cpu_indices.append(cpu_indices1)
+                npu_indices.append(npu_indices1)
+            cpu_value, npu_value = create_common_tensor(item[2], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, cpu_indices, cpu_value)
+            npu_output = self.npu_op_exec(npu_input, npu_indices, npu_value)
+            self.assertEqual(cpu_output, npu_output)
+
+    def case_exec_fp16(self, shape):
+        cpu_indices = []
+        npu_indices = []
+        for item in shape:
+            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            for i in range(1, 3):
+                cpu_indices1, npu_indices1 = create_common_tensor(
+                    item[1], 1, 5)
+                cpu_indices.append(cpu_indices1)
+                npu_indices.append(npu_indices1)
+            cpu_value, npu_value = create_common_tensor(item[2], 1, 100)
+            cpu_value = cpu_value.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input, cpu_indices, cpu_value)
+            npu_output = self.npu_op_exec(npu_input, npu_indices, npu_value)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def case_inp_exec(self, shape):
+        cpu_indices = []
+        npu_indices = []
+        for item in shape:
+            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
+            for i in range(1, 3):
+                cpu_indices1, npu_indices1 = create_common_tensor(
+                    item[1], 1, 5)
+                cpu_indices.append(cpu_indices1)
+                npu_indices.append(npu_indices1)
+            cpu_value, npu_value = create_common_tensor(item[2], 1, 100)
+            cpu_output = self.cpu_op_inp_exec(
+                cpu_input, cpu_indices, cpu_value)
+            npu_output = self.npu_op_inp_exec(
+                npu_input, npu_indices, npu_value)
+            self.assertEqual(cpu_output, npu_output)
+
+    def case_inp_exec_fp16(self, shape):
+        cpu_indices = []
+        npu_indices = []
+        for item in shape:
+            cpu_input, npu_input = create_common_tensor(item[0], 1, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            for i in range(1, 3):
+                cpu_indices1, npu_indices1 = create_common_tensor(
+                    item[1], 1, 5)
+                cpu_indices.append(cpu_indices1)
+                npu_indices.append(npu_indices1)
+            cpu_value, npu_value = create_common_tensor(item[2], 1, 100)
+            cpu_value = cpu_value.to(torch.float32)
+            cpu_output = self.cpu_op_inp_exec(
+                cpu_input, cpu_indices, cpu_value)
+            npu_output = self.npu_op_inp_exec(
+                npu_input, npu_indices, npu_value)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_index_put_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list = [(5, 6)]
+        shape_format = [[[np.float32, i, j], [np.int64, 0, [1, 2]], [
+            np.float32, 0, [1, 2]]] for i in format_list for j in shape_list]
+        self.case_exec(shape_format)
+        self.case_inp_exec(shape_format)
+
+    def test_index_put_shape_format_fp16(self, device):
+        format_list = [0]
+        shape_list = [(5, 6)]
+        shape_format = [[[np.float16, i, j], [np.int64, 0, [1, 2]], [
+            np.float16, 0, [1, 2]]] for i in format_list for j in shape_list]
+        self.case_exec_fp16(shape_format)
+        self.case_inp_exec_fp16(shape_format)
+
+    def test_index_put_null(self, device):
+        cpu_input1 = torch.rand(2, 2)
+        cpu_input2 = torch.rand(2, 2)
+        cpu_mask_index = torch.tensor([[False, False], [False, False]])
+        npu_mask_index = cpu_mask_index.to("npu")
+        npu_input1 = cpu_input1.to("npu")
+        npu_input2 = cpu_input2.to("npu")
+        cpu_input1[cpu_mask_index] = cpu_input2.detach()[cpu_mask_index]
+        npu_input1[npu_mask_index] = npu_input2.detach()[npu_mask_index]
+        self.assertEqual(cpu_input1, npu_input1.to("cpu"))
+
+instantiate_device_type_tests(TestIndexPut, globals(), except_for="cpu")
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_inverse.py b/test/test_npu/test_network_ops/test_inverse.py
index cbf0680ce7e6fd43ef0a1caf451bca002e0aec5b..e2b533d78a6031366557012c16698bfa1f089fba 100644
--- a/test/test_npu/test_network_ops/test_inverse.py
+++ b/test/test_npu/test_network_ops/test_inverse.py
@@ -1,52 +1,52 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestInverse(TestCase):
-    def cpu_op_exec(self, input):
-        output = torch.inverse(input)
-        output = output.numpy()
-        return output
-    
-    def npu_op_exec(self, input):
-        output = torch.inverse(input)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def test_inverse_shape_format(self, device):
-        #aicpu暂不支持5HD format，待支持后增加其他格式测试
-        shape_format = [
-                [np.float32, 0, (4,4)],
-                [np.float32, 0, (0,3,29,29)],
-                [np.float32, 0, (1,2,4,4)]
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestInverse, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestInverse(TestCase):
+    def cpu_op_exec(self, input):
+        output = torch.inverse(input)
+        output = output.numpy()
+        return output
+    
+    def npu_op_exec(self, input):
+        output = torch.inverse(input)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def test_inverse_shape_format(self, device):
+        #aicpu暂不支持5HD format，待支持后增加其他格式测试
+        shape_format = [
+                [np.float32, 0, (4,4)],
+                [np.float32, 0, (0,3,29,29)],
+                [np.float32, 0, (1,2,4,4)]
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestInverse, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_is_floating_point.py b/test/test_npu/test_network_ops/test_is_floating_point.py
index eca2d1fd7a4e43c7eec5e8be3ae3480df2941443..1740cf9cf3f908171f5ac72ce15628921f3219e7 100644
--- a/test/test_npu/test_network_ops/test_is_floating_point.py
+++ b/test/test_npu/test_network_ops/test_is_floating_point.py
@@ -1,47 +1,47 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestIsFloatingPiont(TestCase):
-    def test_is_floating_point(self, device):
-        shape_format = [
-            [0.36, torch.float32],
-            [1, torch.int32],
-            [1, torch.float32],
-            [1, torch.float16],
-            [1, torch.int8],
-        ]
-
-        for item in shape_format:
-            inputs = torch.tensor([item[0]], dtype=item[1])
-            cpu_out = inputs.is_floating_point()
-            cpu_out = np.array(cpu_out, dtype=np.int32)
-            inputs = inputs.to("npu")
-            npu_out = inputs.is_floating_point()
-            npu_out = np.array(npu_out, dtype=np.int32)
-            self.assertRtolEqual(cpu_out, npu_out)
-
-
-instantiate_device_type_tests(TestIsFloatingPiont, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestIsFloatingPiont(TestCase):
+    def test_is_floating_point(self, device):
+        shape_format = [
+            [0.36, torch.float32],
+            [1, torch.int32],
+            [1, torch.float32],
+            [1, torch.float16],
+            [1, torch.int8],
+        ]
+
+        for item in shape_format:
+            inputs = torch.tensor([item[0]], dtype=item[1])
+            cpu_out = inputs.is_floating_point()
+            cpu_out = np.array(cpu_out, dtype=np.int32)
+            inputs = inputs.to("npu")
+            npu_out = inputs.is_floating_point()
+            npu_out = np.array(npu_out, dtype=np.int32)
+            self.assertRtolEqual(cpu_out, npu_out)
+
+
+instantiate_device_type_tests(TestIsFloatingPiont, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_is_nonzero.py b/test/test_npu/test_network_ops/test_is_nonzero.py
index 68f46ebd25561bbe65bb3a80782474a97793b987..252565dcd5df6eeadc1e1575316b7a39d03709cc 100644
--- a/test/test_npu/test_network_ops/test_is_nonzero.py
+++ b/test/test_npu/test_network_ops/test_is_nonzero.py
@@ -1,44 +1,44 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestIsNonzero(TestCase):
-    def cpu_op_exec(self, input1):
-        output = torch.is_nonzero(input1)
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.is_nonzero(input1)
-        return output
-
-    def test_isnonzero_shape_format(self, device):
-        dtype_list = [np.float16, np.float32, np.int32, np.bool_]
-        format_list = [0]
-        shape_list = [[1], [1, 1, 1], [1, 1, 1, 1]]
-        shape_format = [
-            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            cpu_output == npu_output
-
-instantiate_device_type_tests(TestIsNonzero, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestIsNonzero(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.is_nonzero(input1)
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.is_nonzero(input1)
+        return output
+
+    def test_isnonzero_shape_format(self, device):
+        dtype_list = [np.float16, np.float32, np.int32, np.bool_]
+        format_list = [0]
+        shape_list = [[1], [1, 1, 1], [1, 1, 1, 1]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            cpu_output == npu_output
+
+instantiate_device_type_tests(TestIsNonzero, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_isfinite.py b/test/test_npu/test_network_ops/test_isfinite.py
index 8c1e6ecd2363ba79ca8ae2f5fe046e3c6dc38341..19d775fc0e1fcb076069d8c4c0ff2b7fe9f98389 100644
--- a/test/test_npu/test_network_ops/test_isfinite.py
+++ b/test/test_npu/test_network_ops/test_isfinite.py
@@ -1,57 +1,57 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestIsfinite(TestCase):
-    def test_isfinite(self, device):
-        x = torch.Tensor([1, 2, -10]).to("npu")
-        self.assertEqual(torch.isfinite(x).to("cpu"), torch.BoolTensor([True, True, True]))
-    
-    
-    def cpu_op_exec(self, input):
-        output = torch.isfinite(input)
-        output = output.numpy()
-        return output
-    
-    def npu_op_exec(self, input):
-        output = torch.isfinite(input)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def test_isfinite_shape_format(self, device):
-        shape_format = [
-                [np.int16, 0, (1, 2, 2, 5)],
-                [np.int32, 0, (1, 4, 3)],
-                [np.int64, 0, (2, 3)],
-                [np.float32, 0, (8, 4, 3, 9)],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestIsfinite, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestIsfinite(TestCase):
+    def test_isfinite(self, device):
+        x = torch.Tensor([1, 2, -10]).to("npu")
+        self.assertEqual(torch.isfinite(x).to("cpu"), torch.BoolTensor([True, True, True]))
+    
+    
+    def cpu_op_exec(self, input):
+        output = torch.isfinite(input)
+        output = output.numpy()
+        return output
+    
+    def npu_op_exec(self, input):
+        output = torch.isfinite(input)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def test_isfinite_shape_format(self, device):
+        shape_format = [
+                [np.int16, 0, (1, 2, 2, 5)],
+                [np.int32, 0, (1, 4, 3)],
+                [np.int64, 0, (2, 3)],
+                [np.float32, 0, (8, 4, 3, 9)],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestIsfinite, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_item.py b/test/test_npu/test_network_ops/test_item.py
index 313304bdc1202e701d6f2e95ddb624fcb3c1d30d..3348e06a6e78c44efdb1ca077026bb2d3778c6bb 100644
--- a/test/test_npu/test_network_ops/test_item.py
+++ b/test/test_npu/test_network_ops/test_item.py
@@ -1,38 +1,38 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
- 
-class TestItem(TestCase):
-    def test_item_common_shape_format(self, device):        
-        shape_format = [
-                [[np.float16, 0, 1]],
-                [[np.float16, 0, 5]],
-                [[np.float32, 4, 3]],
-                [[np.float32, 29, 4]]
-        ]
-        for shape in shape_format:
-            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
-            cpu_output = np.array([cpu_input[0].item()]).astype(np.float32)
-            npu_output = np.array([npu_input[0].item()]).astype(np.float32)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestItem, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+ 
+class TestItem(TestCase):
+    def test_item_common_shape_format(self, device):        
+        shape_format = [
+                [[np.float16, 0, 1]],
+                [[np.float16, 0, 5]],
+                [[np.float32, 4, 3]],
+                [[np.float32, 29, 4]]
+        ]
+        for shape in shape_format:
+            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
+            cpu_output = np.array([cpu_input[0].item()]).astype(np.float32)
+            npu_output = np.array([npu_input[0].item()]).astype(np.float32)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestItem, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_le.py b/test/test_npu/test_network_ops/test_le.py
old mode 100644
new mode 100755
index 8d7fcc8753fec6fe21e4b02e9f5a9539b595a511..093f55fdf3d6a481c9158f71c4b1dca8e1d5f491
--- a/test/test_npu/test_network_ops/test_le.py
+++ b/test/test_npu/test_network_ops/test_le.py
@@ -1,318 +1,318 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLe(TestCase):
-    def generate_scalar(self, min, max):
-        scalar = np.random.uniform(min, max)
-        return scalar
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.le(input1, input2)
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_out(self, input1, input2, input3):
-        torch.le(input1, input2, out = input3)
-        output = input3.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.le(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec(self, input1, input2):
-        output = input1.le_(input2)
-        output = input1
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_exec(self, input1, input2):
-        output = input1.le_(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, output):
-        torch.le(input1, input2, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar(self, input, scalar):
-        output = torch.le(input, scalar)
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar_out(self, input1, scalar, input2):
-        torch.le(input1, scalar, out = input2)
-        output = input2.numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input, scalar):
-        output = torch.le(input, scalar)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec_scalar(self, input, scalar):
-        output = input.le_(scalar)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_exec_scalar(self, input, scalar):
-        input = input.to("npu")
-        output = input.le_(scalar)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar_out(self, input, scalar, output):
-        torch.le(input, scalar, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_stride_exec(self, input1, input2):
-        input1 = input1.as_strided([2, 2], [1, 2], 1)
-        input2 = input2.as_strided([2, 2], [1, 2], 1)
-        output = input1.le_(input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_stride_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        input1 = input1.as_strided([2, 2], [1, 2], 1)
-        input2 = input2.as_strided([2, 2], [1, 2], 1)
-        output = input1.le_(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_stride_scalar_exec(self, input1, input2):
-        input1 = input1.as_strided([2, 2], [1, 2], 1)
-        output = input1.le_(input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_stride_scalar_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input1 = input1.as_strided([2, 2], [1, 2], 1)
-        output = input1.le_(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def le_tensor_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
-            cpu_input3 = torch.randn(item[1][2])<0
-            npu_input3 = cpu_input3.npu()
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            if cpu_input3.dtype == torch.float16:
-                cpu_input3 = cpu_input3.to(torch.float32)
-            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_le_tensor_out(self, device):
-        shape_format = [
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.le_tensor_out_result(shape_format)
-
-    def le_scalar_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2 = torch.randn(item[1][2])<0
-            npu_input2 = cpu_input2.npu()
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
-            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_le_scalar_out(self, device):
-        shape_format = [
-            [[np.float16, 0, [12, 4, 12, 121]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [12, 10, 14, 111]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 2, [16, 3, 11, 121, 21]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [16, 16, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 2, [1313, 3, 3, 3, 121]], [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.le_scalar_out_result(shape_format)
-
-    def test_le_scalar_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_le_scalar_int32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.int32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_gt_scalar_float16(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_le_tensor_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_le_tensor_float16(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_le_inplace_float32(self, device):
-        format_list = [0, 3]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_le_inplace_float16(self, device):
-        format_list = [0, 3]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
-                        for i in format_list for j in shape_list]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_le_inplace_scalar_float32(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            scalar = self.generate_scalar(0, 100)
-            scalar1 = copy.deepcopy(scalar)
-            ncpu_input = copy.deepcopy(cpu_input)
-            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar1)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_le_inplace_scalar_float16(self, device):
-        format_list = [0]
-        shape_list = [(5, 3), (2, 3, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            scalar = self.generate_scalar(0, 100)
-            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
-            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_le_mix_dtype(self, device):
-        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
-        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
-        npu_output = self.npu_op_exec(npu_input2, npu_input4)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestLe, globals(), except_for="cpu")
-if __name__ == '__main__':
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLe(TestCase):
+    def generate_scalar(self, min, max):
+        scalar = np.random.uniform(min, max)
+        return scalar
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.le(input1, input2)
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_out(self, input1, input2, input3):
+        torch.le(input1, input2, out = input3)
+        output = input3.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.le(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec(self, input1, input2):
+        output = input1.le_(input2)
+        output = input1
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec(self, input1, input2):
+        output = input1.le_(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, output):
+        torch.le(input1, input2, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar(self, input, scalar):
+        output = torch.le(input, scalar)
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar_out(self, input1, scalar, input2):
+        torch.le(input1, scalar, out = input2)
+        output = input2.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input, scalar):
+        output = torch.le(input, scalar)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec_scalar(self, input, scalar):
+        output = input.le_(scalar)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec_scalar(self, input, scalar):
+        input = input.to("npu")
+        output = input.le_(scalar)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar_out(self, input, scalar, output):
+        torch.le(input, scalar, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_stride_exec(self, input1, input2):
+        input1 = input1.as_strided([2, 2], [1, 2], 1)
+        input2 = input2.as_strided([2, 2], [1, 2], 1)
+        output = input1.le_(input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_stride_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        input1 = input1.as_strided([2, 2], [1, 2], 1)
+        input2 = input2.as_strided([2, 2], [1, 2], 1)
+        output = input1.le_(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_stride_scalar_exec(self, input1, input2):
+        input1 = input1.as_strided([2, 2], [1, 2], 1)
+        output = input1.le_(input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_stride_scalar_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input1 = input1.as_strided([2, 2], [1, 2], 1)
+        output = input1.le_(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def le_tensor_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
+            cpu_input3 = torch.randn(item[1][2])<0
+            npu_input3 = cpu_input3.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            if cpu_input3.dtype == torch.float16:
+                cpu_input3 = cpu_input3.to(torch.float32)
+            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_le_tensor_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.le_tensor_out_result(shape_format)
+
+    def le_scalar_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2 = torch.randn(item[1][2])<0
+            npu_input2 = cpu_input2.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
+            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_le_scalar_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [12, 4, 12, 121]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [12, 10, 14, 111]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 2, [16, 3, 11, 121, 21]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [16, 16, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 2, [1313, 3, 3, 3, 121]], [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.le_scalar_out_result(shape_format)
+
+    def test_le_scalar_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_scalar_int32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.int32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_gt_scalar_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_exec_scalar(npu_input, scalar)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_tensor_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_tensor_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_inplace_float32(self, device):
+        format_list = [0, 3]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float32, i, j], [np.float32, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_inplace_float16(self, device):
+        format_list = [0, 3]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [[[np.float16, i, j], [np.float16, i, j]]
+                        for i in format_list for j in shape_list]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_inplace_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_inplace_scalar_float32(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            scalar = self.generate_scalar(0, 100)
+            scalar1 = copy.deepcopy(scalar)
+            ncpu_input = copy.deepcopy(cpu_input)
+            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar1)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_inplace_scalar_float16(self, device):
+        format_list = [0]
+        shape_list = [(5, 3), (2, 3, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            scalar = self.generate_scalar(0, 100)
+            cpu_output = self.cpu_op_inplace_exec_scalar(cpu_input, scalar)
+            npu_output = self.npu_op_inplace_exec_scalar(npu_input, scalar)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_le_mix_dtype(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
+        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input2, npu_input4)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestLe, globals(), except_for="cpu")
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_leaky_relu_backward.py b/test/test_npu/test_network_ops/test_leaky_relu_backward.py
old mode 100644
new mode 100755
index 9f5a6aca6498a972494ca08d072f91bbc430f115..e690f37809a884db76b77f1544aa1a876788f7fd
--- a/test/test_npu/test_network_ops/test_leaky_relu_backward.py
+++ b/test/test_npu/test_network_ops/test_leaky_relu_backward.py
@@ -1,82 +1,82 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLeakyReluBackward(TestCase):
-    def cpu_op_backward_exec(self, input1):
-        w = torch.ones_like(input1)
-        input1.requires_grad_(True)
-        output = torch.nn.functional.leaky_relu(input1)
-        output.backward(w)
-        res = input1.grad
-        res = res.numpy()
-        return res, output
-
-    def npu_op_backward_exec(self, input1):
-        w = torch.ones_like(input1)
-        w = w.to("npu")
-        input1 = input1.to("npu")
-        input1.requires_grad_(True)
-        output = torch.nn.functional.leaky_relu(input1)
-        output.backward(w)
-        output = output.to("cpu")
-        res = input1.grad
-        res = input1.grad.to("cpu")
-        res = res.numpy()
-        return res, output
-
-    def test_leaky_relu_backward_format_fp32(self, device):
-        format_list = [0, 3]
-        shape_list = [(5, 3)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            # print(item)
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 2)
-            cpu_output = self.cpu_op_backward_exec(cpu_input1)
-            npu_output = self.npu_op_backward_exec(npu_input1)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_leaky_relu_backward_format_fp16(self, device):
-        format_list = [0, 3]
-        shape_list = [(5, 3)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            # print(item)
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 2)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output1, cpu_output2 = self.cpu_op_backward_exec(cpu_input1)
-            npu_output1, npu_output2 = self.npu_op_backward_exec(npu_input1)
-            cpu_output1 = cpu_output1.astype(np.float16)
-            self.assertEqual(cpu_output1, npu_output1)
-            self.assertEqual(cpu_output2, npu_output2)
-
-
-instantiate_device_type_tests(
-    TestLeakyReluBackward,
-    globals(),
-    except_for="cpu")
-
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLeakyReluBackward(TestCase):
+    def cpu_op_backward_exec(self, input1):
+        w = torch.ones_like(input1)
+        input1.requires_grad_(True)
+        output = torch.nn.functional.leaky_relu(input1)
+        output.backward(w)
+        res = input1.grad
+        res = res.numpy()
+        return res, output
+
+    def npu_op_backward_exec(self, input1):
+        w = torch.ones_like(input1)
+        w = w.to("npu")
+        input1 = input1.to("npu")
+        input1.requires_grad_(True)
+        output = torch.nn.functional.leaky_relu(input1)
+        output.backward(w)
+        output = output.to("cpu")
+        res = input1.grad
+        res = input1.grad.to("cpu")
+        res = res.numpy()
+        return res, output
+
+    def test_leaky_relu_backward_format_fp32(self, device):
+        format_list = [0, 3]
+        shape_list = [(5, 3)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            # print(item)
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 2)
+            cpu_output = self.cpu_op_backward_exec(cpu_input1)
+            npu_output = self.npu_op_backward_exec(npu_input1)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_leaky_relu_backward_format_fp16(self, device):
+        format_list = [0, 3]
+        shape_list = [(5, 3)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            # print(item)
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 2)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output1, cpu_output2 = self.cpu_op_backward_exec(cpu_input1)
+            npu_output1, npu_output2 = self.npu_op_backward_exec(npu_input1)
+            cpu_output1 = cpu_output1.astype(np.float16)
+            self.assertEqual(cpu_output1, npu_output1)
+            self.assertEqual(cpu_output2, npu_output2)
+
+
+instantiate_device_type_tests(
+    TestLeakyReluBackward,
+    globals(),
+    except_for="cpu")
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_lerp.py b/test/test_npu/test_network_ops/test_lerp.py
index eeb0b7250506026e724a90a5578ecc805cab80b1..497893f832af5d57245472c103b2789437adaf5c 100644
--- a/test/test_npu/test_network_ops/test_lerp.py
+++ b/test/test_npu/test_network_ops/test_lerp.py
@@ -1,225 +1,225 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch 
-import numpy as np 
-import sys 
-import copy 
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-import random
-import math
-
-class TestLerp(TestCase):
-# pylint: disable=unused-variable,unused-argument
-
-    def cpu_op_exec(self, input1, input2, input3): 
-        output = torch.lerp(input1,input2,input3) 
-        output = output.numpy() 
-        return output 
-    
-    def npu_op_exec(self, input1, input2, input3): 
-        output = torch.lerp(input1, input2, input3) 
-        output = output.to("cpu") 
-        output = output.numpy() 
-        return output
-
-    def cpu_op_out_exec(self, input1, input2, input3):
-        output = torch.ones_like(input1) 
-        torch.lerp(input1,input2,input3, out = output) 
-        output = output.numpy() 
-        return output 
-    
-    def npu_op_out_exec(self, input1, input2, input3):
-        output = torch.ones_like(input1) 
-        torch.lerp(input1, input2, input3, out = output) 
-        output = output.to("cpu") 
-        output = output.numpy() 
-        return output
-
-    def cpu_op_scalar_out_exec(self, input1, input2, input3):
-        output = torch.ones_like(input1) 
-        torch.lerp(input1,input2,input3, out = output) 
-        output = output.numpy() 
-        return output 
-    
-    def npu_op_scalar_out_exec(self, input1, input2, input3):
-        output = torch.ones_like(input1) 
-        torch.lerp(input1, input2, input3, out = output) 
-        output = output.to("cpu") 
-        output = output.numpy() 
-        return output
-
-
-    def test_lerp_common_shape_format(self, device):
-        shape_format = [
-            [[np.float32, -1, (4, 2, 2, 3)]],
-            [[np.float32, -1, (2, 2, 3, 4)]],
-            [[np.float32, -1, (3, 3, 3)]],
-            [[np.float32, -1, (4, 4, 4)]]
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 1, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_lerp_float16_shape_format(self, device):
-        def cpu_op_exec_fp16(input1, input2, input3):
-            input1 = input1.to(torch.float32)
-            input2 = input2.to(torch.float32)
-            input3 = input3.to(torch.float32)
-            output = torch.lerp(input1,input2,input3)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        shape_format = [
-            [[np.float16, -1, (100, 4, 5, 5)]],
-            [[np.float16, -1, (100, 5, 5, 4)]],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 10, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[0], 10, 100)
-            cpu_output = cpu_op_exec_fp16(cpu_input1, cpu_input2, cpu_input3)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output, prec=0.003, prec16=0.003)
-
-    
-    def test_lerp_out_common_shape_format(self, device):
-        shape_format = [
-            [[np.float32, -1, (4, 2, 2, 3)]],
-            [[np.float32, -1, (2, 2, 3, 4)]],
-            [[np.float32, -1, (3, 3, 3)]],
-            [[np.float32, -1, (4, 4, 4)]]
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 1, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_out_exec(cpu_input1, cpu_input2, cpu_input3)
-            npu_output = self.npu_op_out_exec(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_lerp_out_float16_shape_format(self, device):
-        def cpu_op_out_exec_fp16(input1, input2, input3):
-            input1 = input1.to(torch.float32)
-            input2 = input2.to(torch.float32)
-            input3 = input3.to(torch.float32)
-            output = torch.ones_like(input1)
-            torch.lerp(input1,input2,input3, out = output)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        shape_format = [
-            [[np.float16, -1, (100, 4, 5, 5)]],
-            [[np.float16, -1, (100, 5, 5, 4)]],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 10, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[0], 10, 100)
-            cpu_output = cpu_op_out_exec_fp16(cpu_input1, cpu_input2, cpu_input3)
-            npu_output = self.npu_op_out_exec(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output, prec=0.003, prec16=0.003)
-
-    def test_lerp_scalar_common_shape_format(self, device):
-        shape_format = [
-            [[np.float32, -1, (4, 2, 2, 3)], 1.0],
-            [[np.float32, -1, (2, 2, 3, 4)], 2.0],
-            [[np.float32, -1, (3, 3, 3)], 1.2],
-            [[np.float32, -1, (4, 4, 4)], 1.2]
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 1, 100)
-            cpu_input3 = item[1]
-            npu_input3 = item[1]
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_lerp_scalar_float16_shape_format(self, device):
-        def cpu_op_scalar_exec_fp16(input1, input2, input3):
-            input1 = input1.to(torch.float32)
-            input2 = input2.to(torch.float32)
-            output = torch.lerp(input1,input2,input3)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        shape_format = [
-            [[np.float16, -1, (100, 4, 5, 5)], 1.2],
-            [[np.float16, -1, (100, 5, 5, 4)], 1.2],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 10, 100)
-            cpu_input3 = item[1]
-            npu_input3 = item[1]
-            cpu_output = cpu_op_scalar_exec_fp16(cpu_input1, cpu_input2, cpu_input3)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output, prec16=0.02)
-
-    
-    def test_lerp_scalar_out_common_shape_format(self, device):
-        shape_format = [
-            [[np.float32, -1, (4, 2, 2, 3)], 1.2],
-            [[np.float32, -1, (2, 2, 3, 4)],1.2],
-            [[np.float32, -1, (3, 3, 3)], 1.0],
-            [[np.float32, -1, (4, 4, 4)], 2.0]
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 1, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_lerp_scalar_out_float16_shape_format(self, device):
-        def cpu_op_scalar_out_exec_fp16(input1, input2, input3):
-            input1 = input1.to(torch.float32)
-            input2 = input2.to(torch.float32)
-            output = torch.ones_like(input1)
-            torch.lerp(input1,input2,input3, out = output)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        shape_format = [
-            [[np.float16, -1, (100, 4, 5, 5)], 1.2],
-            [[np.float16, -1, (100, 5, 5, 4)], 1.2],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 10, 100)
-            cpu_input3 = item[1]
-            npu_input3 = item[1]
-            cpu_output = cpu_op_scalar_out_exec_fp16(cpu_input1, cpu_input2, cpu_input3)
-            npu_output = self.npu_op_scalar_out_exec(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output, prec16=0.02)
-     
-instantiate_device_type_tests(TestLerp, globals(), except_for='cpu')
-if __name__ == '__main__': 
-    run_tests() 
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch 
+import numpy as np 
+import sys 
+import copy 
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+import random
+import math
+
+class TestLerp(TestCase):
+# pylint: disable=unused-variable,unused-argument
+
+    def cpu_op_exec(self, input1, input2, input3): 
+        output = torch.lerp(input1,input2,input3) 
+        output = output.numpy() 
+        return output 
+    
+    def npu_op_exec(self, input1, input2, input3): 
+        output = torch.lerp(input1, input2, input3) 
+        output = output.to("cpu") 
+        output = output.numpy() 
+        return output
+
+    def cpu_op_out_exec(self, input1, input2, input3):
+        output = torch.ones_like(input1) 
+        torch.lerp(input1,input2,input3, out = output) 
+        output = output.numpy() 
+        return output 
+    
+    def npu_op_out_exec(self, input1, input2, input3):
+        output = torch.ones_like(input1) 
+        torch.lerp(input1, input2, input3, out = output) 
+        output = output.to("cpu") 
+        output = output.numpy() 
+        return output
+
+    def cpu_op_scalar_out_exec(self, input1, input2, input3):
+        output = torch.ones_like(input1) 
+        torch.lerp(input1,input2,input3, out = output) 
+        output = output.numpy() 
+        return output 
+    
+    def npu_op_scalar_out_exec(self, input1, input2, input3):
+        output = torch.ones_like(input1) 
+        torch.lerp(input1, input2, input3, out = output) 
+        output = output.to("cpu") 
+        output = output.numpy() 
+        return output
+
+
+    def test_lerp_common_shape_format(self, device):
+        shape_format = [
+            [[np.float32, -1, (4, 2, 2, 3)]],
+            [[np.float32, -1, (2, 2, 3, 4)]],
+            [[np.float32, -1, (3, 3, 3)]],
+            [[np.float32, -1, (4, 4, 4)]]
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 1, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_lerp_float16_shape_format(self, device):
+        def cpu_op_exec_fp16(input1, input2, input3):
+            input1 = input1.to(torch.float32)
+            input2 = input2.to(torch.float32)
+            input3 = input3.to(torch.float32)
+            output = torch.lerp(input1,input2,input3)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        shape_format = [
+            [[np.float16, -1, (100, 4, 5, 5)]],
+            [[np.float16, -1, (100, 5, 5, 4)]],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 10, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[0], 10, 100)
+            cpu_output = cpu_op_exec_fp16(cpu_input1, cpu_input2, cpu_input3)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output, prec=0.003, prec16=0.003)
+
+    
+    def test_lerp_out_common_shape_format(self, device):
+        shape_format = [
+            [[np.float32, -1, (4, 2, 2, 3)]],
+            [[np.float32, -1, (2, 2, 3, 4)]],
+            [[np.float32, -1, (3, 3, 3)]],
+            [[np.float32, -1, (4, 4, 4)]]
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 1, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_out_exec(cpu_input1, cpu_input2, cpu_input3)
+            npu_output = self.npu_op_out_exec(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_lerp_out_float16_shape_format(self, device):
+        def cpu_op_out_exec_fp16(input1, input2, input3):
+            input1 = input1.to(torch.float32)
+            input2 = input2.to(torch.float32)
+            input3 = input3.to(torch.float32)
+            output = torch.ones_like(input1)
+            torch.lerp(input1,input2,input3, out = output)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        shape_format = [
+            [[np.float16, -1, (100, 4, 5, 5)]],
+            [[np.float16, -1, (100, 5, 5, 4)]],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 10, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[0], 10, 100)
+            cpu_output = cpu_op_out_exec_fp16(cpu_input1, cpu_input2, cpu_input3)
+            npu_output = self.npu_op_out_exec(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output, prec=0.003, prec16=0.003)
+
+    def test_lerp_scalar_common_shape_format(self, device):
+        shape_format = [
+            [[np.float32, -1, (4, 2, 2, 3)], 1.0],
+            [[np.float32, -1, (2, 2, 3, 4)], 2.0],
+            [[np.float32, -1, (3, 3, 3)], 1.2],
+            [[np.float32, -1, (4, 4, 4)], 1.2]
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 1, 100)
+            cpu_input3 = item[1]
+            npu_input3 = item[1]
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_lerp_scalar_float16_shape_format(self, device):
+        def cpu_op_scalar_exec_fp16(input1, input2, input3):
+            input1 = input1.to(torch.float32)
+            input2 = input2.to(torch.float32)
+            output = torch.lerp(input1,input2,input3)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        shape_format = [
+            [[np.float16, -1, (100, 4, 5, 5)], 1.2],
+            [[np.float16, -1, (100, 5, 5, 4)], 1.2],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 10, 100)
+            cpu_input3 = item[1]
+            npu_input3 = item[1]
+            cpu_output = cpu_op_scalar_exec_fp16(cpu_input1, cpu_input2, cpu_input3)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output, prec16=0.02)
+
+    
+    def test_lerp_scalar_out_common_shape_format(self, device):
+        shape_format = [
+            [[np.float32, -1, (4, 2, 2, 3)], 1.2],
+            [[np.float32, -1, (2, 2, 3, 4)],1.2],
+            [[np.float32, -1, (3, 3, 3)], 1.0],
+            [[np.float32, -1, (4, 4, 4)], 2.0]
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 1, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, cpu_input3)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_lerp_scalar_out_float16_shape_format(self, device):
+        def cpu_op_scalar_out_exec_fp16(input1, input2, input3):
+            input1 = input1.to(torch.float32)
+            input2 = input2.to(torch.float32)
+            output = torch.ones_like(input1)
+            torch.lerp(input1,input2,input3, out = output)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        shape_format = [
+            [[np.float16, -1, (100, 4, 5, 5)], 1.2],
+            [[np.float16, -1, (100, 5, 5, 4)], 1.2],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 10, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 10, 100)
+            cpu_input3 = item[1]
+            npu_input3 = item[1]
+            cpu_output = cpu_op_scalar_out_exec_fp16(cpu_input1, cpu_input2, cpu_input3)
+            npu_output = self.npu_op_scalar_out_exec(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output, prec16=0.02)
+     
+instantiate_device_type_tests(TestLerp, globals(), except_for='cpu')
+if __name__ == '__main__': 
+    run_tests() 
diff --git a/test/test_npu/test_network_ops/test_log.py b/test/test_npu/test_network_ops/test_log.py
old mode 100644
new mode 100755
index ab6beb1e8c25584df191921ef8affca628dc7a55..8dc68518f2e7ead7103de6f4d347211d98f7a6b9
--- a/test/test_npu/test_network_ops/test_log.py
+++ b/test/test_npu/test_network_ops/test_log.py
@@ -1,193 +1,193 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLog(TestCase):
-    def cpu_op_exec(self, input1):
-        output = torch.log(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        input1 = input1.to("npu")
-        output = torch.log(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        torch.log(input1, out=input2)
-        output = input2.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_inp_op_exec(self, input1):
-        output = torch.log_(input1)
-        output = output.numpy()
-        return output
-
-    def npu_inp_op_exec(self, input1):
-        input1 = input1.to("npu")
-        output = torch.log_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_inp_uncon_op_exec(self, input1):
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.log_(input1)
-        output = output.numpy()
-        return output
-
-    def npu_inp_uncon_op_exec(self, input1):
-        input1 = input1.to("npu")
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.log_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-# TestCase
-    def test_log_shape_format_fp32(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log_shape_format_fp16(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log_inp_shape_format_fp32(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_inp_op_exec(cpu_input1)
-            npu_output = self.npu_inp_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log_inp_shape_format_fp16(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_inp_op_exec(cpu_input1)
-            npu_output = self.npu_inp_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log_inp_uncon_shape_format_fp32(self, device):
-        format_list = [3]
-        shape_list = [(8, 6)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
-            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log_inp_uncon_shape_format_fp16(self, device):
-        format_list = [3]
-        shape_list = [(8, 6)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
-            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(np.float16)
-            print("cpu:", cpu_output, "npu:", npu_output)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log_out_float32_shape_format(self, device):
-        shape_format = [
-            [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]],
-            [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]],
-            [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]],
-            [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]],
-            [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]],
-            [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]],
-            [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]],
-            [[np.float32, 3, [1024]], [np.float32, 3, [1024]]],
-            ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec_out(npu_input, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log_out_float16_shape_format(self, device):
-        shape_format = [
-            [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]],
-            [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]],
-            [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]],
-            [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]],
-            [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]],
-            [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]],
-            [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]],
-            [[np.float16, 3, [1024]], [np.float16, 3, [1024]]],
-            ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
-            if item[0][0] == np.float16:
-                cpu_input = cpu_input.to(torch.float32)
-                cpu_output = cpu_output.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec_out(npu_input, npu_output)
-            if item[0][0] == np.float16:
-                cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestLog, globals(), except_for="cpu")
-if __name__ == '__main__':
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLog(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.log(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        input1 = input1.to("npu")
+        output = torch.log(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        torch.log(input1, out=input2)
+        output = input2.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_inp_op_exec(self, input1):
+        output = torch.log_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        input1 = input1.to("npu")
+        output = torch.log_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_inp_uncon_op_exec(self, input1):
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.log_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_uncon_op_exec(self, input1):
+        input1 = input1.to("npu")
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.log_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+# TestCase
+    def test_log_shape_format_fp32(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log_shape_format_fp16(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log_inp_shape_format_fp32(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_inp_op_exec(cpu_input1)
+            npu_output = self.npu_inp_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log_inp_shape_format_fp16(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_inp_op_exec(cpu_input1)
+            npu_output = self.npu_inp_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log_inp_uncon_shape_format_fp32(self, device):
+        format_list = [3]
+        shape_list = [(8, 6)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
+            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log_inp_uncon_shape_format_fp16(self, device):
+        format_list = [3]
+        shape_list = [(8, 6)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
+            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            print("cpu:", cpu_output, "npu:", npu_output)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log_out_float32_shape_format(self, device):
+        shape_format = [
+            [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]],
+            [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]],
+            [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]],
+            [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]],
+            [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]],
+            [[np.float32, 3, [1024]], [np.float32, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec_out(npu_input, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log_out_float16_shape_format(self, device):
+        shape_format = [
+            [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]],
+            [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]],
+            [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]],
+            [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]],
+            [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]],
+            [[np.float16, 3, [1024]], [np.float16, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
+            if item[0][0] == np.float16:
+                cpu_input = cpu_input.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec_out(npu_input, npu_output)
+            if item[0][0] == np.float16:
+                cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestLog, globals(), except_for="cpu")
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_log10.py b/test/test_npu/test_network_ops/test_log10.py
index c3cc4226ae48d2e5aeaecbdb5e3ad7ddfd28804c..6b99d68c56a2659220df8749a63555d1ea5564c2 100644
--- a/test/test_npu/test_network_ops/test_log10.py
+++ b/test/test_npu/test_network_ops/test_log10.py
@@ -1,183 +1,183 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLog10(TestCase):
-    def cpu_op_exec(self, input1):
-        output = torch.log10(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.log10(input1)
-        output = output.to("cpu").numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2):
-        torch.log10(input1, out=input2)
-        output = input2.to("cpu").numpy()
-        return output
-
-    def cpu_inp_op_exec(self, input1):
-        output = torch.log10_(input1)
-        output = output.numpy()
-        return output
-
-    def npu_inp_op_exec(self, input1):
-        torch.log10_(input1)
-        output = input1.to("cpu").numpy()
-        return output
-
-    def cpu_inp_uncon_op_exec(self, input1):
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.log10_(input1)
-        output = output.numpy()
-        return output
-
-    def npu_inp_uncon_op_exec(self, input1):
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        torch.log10_(input1)
-        output = input1.to("cpu").numpy()
-        return output
-
-    def test_log10_shape_format_fp32(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log10_shape_format_fp16(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log10_inp_shape_format_fp32(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_inp_op_exec(cpu_input1)
-            npu_output = self.npu_inp_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log10_inp_shape_format_fp16(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_inp_op_exec(cpu_input1)
-            npu_output = self.npu_inp_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log10_inp_uncon_shape_format_fp32(self, device):
-        format_list = [3]
-        shape_list = [(8, 6)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
-            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log10_inp_uncon_shape_format_fp16(self, device):
-        format_list = [3]
-        shape_list = [(8, 6)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
-            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log10_out_float32_shape_format(self, device):
-        shape_format = [
-            [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]],
-            [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]],
-            [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]],
-            [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]],
-            [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]],
-            [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]],
-            [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]],
-            [[np.float32, 3, [1024]], [np.float32, 3, [1024]]],
-            ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec_out(npu_input, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log10_out_float16_shape_format(self, device):
-        shape_format = [
-            [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]],
-            [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]],
-            [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]],
-            [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]],
-            [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]],
-            [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]],
-            [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]],
-            [[np.float16, 3, [1024]], [np.float16, 3, [1024]]],
-            ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
-            if item[0][0] == np.float16:
-                cpu_input = cpu_input.to(torch.float32)
-                cpu_output = cpu_output.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec_out(npu_input, npu_output)
-            if item[0][0] == np.float16:
-                cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestLog10, globals(), except_for="cpu")
-if __name__ == '__main__':
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLog10(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.log10(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.log10(input1)
+        output = output.to("cpu").numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        torch.log10(input1, out=input2)
+        output = input2.to("cpu").numpy()
+        return output
+
+    def cpu_inp_op_exec(self, input1):
+        output = torch.log10_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        torch.log10_(input1)
+        output = input1.to("cpu").numpy()
+        return output
+
+    def cpu_inp_uncon_op_exec(self, input1):
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.log10_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_uncon_op_exec(self, input1):
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        torch.log10_(input1)
+        output = input1.to("cpu").numpy()
+        return output
+
+    def test_log10_shape_format_fp32(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_shape_format_fp16(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_inp_shape_format_fp32(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_inp_op_exec(cpu_input1)
+            npu_output = self.npu_inp_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_inp_shape_format_fp16(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_inp_op_exec(cpu_input1)
+            npu_output = self.npu_inp_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_inp_uncon_shape_format_fp32(self, device):
+        format_list = [3]
+        shape_list = [(8, 6)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
+            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_inp_uncon_shape_format_fp16(self, device):
+        format_list = [3]
+        shape_list = [(8, 6)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
+            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_out_float32_shape_format(self, device):
+        shape_format = [
+            [[np.float32, 0, [1024, 32, 7, 7]], [np.float32, 0, [1024, 32, 7, 7]]],
+            [[np.float32, 0, [1024, 32, 7]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024, 32]], [np.float32, 0, [1024, 32]]],
+            [[np.float32, 0, [1024]], [np.float32, 0, [1024, 1]]],
+            [[np.float32, 3, [1024, 32, 7, 7]], [np.float32, 3, [1024, 32, 7, 7]]],
+            [[np.float32, 3, [1024, 32, 7]], [np.float32, 3, [1024, 32]]],
+            [[np.float32, 3, [1024, 32]], [np.float32, 3, [1024, 20]]],
+            [[np.float32, 3, [1024]], [np.float32, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec_out(npu_input, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log10_out_float16_shape_format(self, device):
+        shape_format = [
+            [[np.float16, 0, [1024, 32, 7, 7]], [np.float16, 0, [1024, 32, 7, 7]]],
+            [[np.float16, 0, [1024, 32, 7]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024, 32]], [np.float16, 0, [1024, 32]]],
+            [[np.float16, 0, [1024]], [np.float16, 0, [1024, 1]]],
+            [[np.float16, 3, [1024, 32, 7, 7]], [np.float16, 3, [1024, 32, 7, 7]]],
+            [[np.float16, 3, [1024, 32, 7]], [np.float16, 3, [1024, 32]]],
+            [[np.float16, 3, [1024, 32]], [np.float16, 3, [1024, 20]]],
+            [[np.float16, 3, [1024]], [np.float16, 3, [1024]]],
+            ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output, npu_output = create_common_tensor(item[1], 0, 100)
+            if item[0][0] == np.float16:
+                cpu_input = cpu_input.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec_out(npu_input, npu_output)
+            if item[0][0] == np.float16:
+                cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestLog10, globals(), except_for="cpu")
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_log2.py b/test/test_npu/test_network_ops/test_log2.py
old mode 100644
new mode 100755
index 164f81a004eaef80b42b14b36f4154ecf4e47b2f..925dde63b543112b751d4ccc50b72de65fee396e
--- a/test/test_npu/test_network_ops/test_log2.py
+++ b/test/test_npu/test_network_ops/test_log2.py
@@ -1,152 +1,152 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLog2(TestCase):
-    def cpu_op_exec(self, input1):
-        output = torch.log2(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        input1 = input1.to("npu")
-        output = torch.log2(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1):
-        input1 = input1.to("npu")
-        output = input1.to("npu")
-        torch.log2(input1, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_inp_op_exec(self, input1):
-        output = torch.log2_(input1)
-        output = output.numpy()
-        return output
-
-    def npu_inp_op_exec(self, input1):
-        input1 = input1.to("npu")
-        output = torch.log2_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_inp_uncon_op_exec(self, input1):
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.log2_(input1)
-        output = output.numpy()
-        return output
-
-    def npu_inp_uncon_op_exec(self, input1):
-        input1 = input1.to("npu")
-        input1 = input1.as_strided([2, 2], [1, 2], 2)
-        output = torch.log2_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_log2_shape_format_fp32(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            # print(item)
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log2_shape_format_fp16(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log2_inp_shape_format_fp32(self, device):
-        format_list = [3]
-        shape_list = [(5, 3)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_inp_op_exec(cpu_input1)
-            npu_output = self.npu_inp_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log2_inp_shape_format_fp16(self, device):
-        format_list = [3]
-        shape_list = [(4, 4)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_inp_op_exec(cpu_input1)
-            npu_output = self.npu_inp_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log2_inp_uncon_shape_format_fp32(self, device):
-        format_list = [3]
-        shape_list = [(8, 6)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
-            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_log_inp_uncon_shape_format_fp16(self, device):
-        format_list = [3]
-        shape_list = [(8, 6)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
-            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestLog2, globals(), except_for="cpu")
-if __name__ == '__main__':
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLog2(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.log2(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        input1 = input1.to("npu")
+        output = torch.log2(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1):
+        input1 = input1.to("npu")
+        output = input1.to("npu")
+        torch.log2(input1, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_inp_op_exec(self, input1):
+        output = torch.log2_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        input1 = input1.to("npu")
+        output = torch.log2_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_inp_uncon_op_exec(self, input1):
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.log2_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_uncon_op_exec(self, input1):
+        input1 = input1.to("npu")
+        input1 = input1.as_strided([2, 2], [1, 2], 2)
+        output = torch.log2_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_log2_shape_format_fp32(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            # print(item)
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log2_shape_format_fp16(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log2_inp_shape_format_fp32(self, device):
+        format_list = [3]
+        shape_list = [(5, 3)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_inp_op_exec(cpu_input1)
+            npu_output = self.npu_inp_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log2_inp_shape_format_fp16(self, device):
+        format_list = [3]
+        shape_list = [(4, 4)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_inp_op_exec(cpu_input1)
+            npu_output = self.npu_inp_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log2_inp_uncon_shape_format_fp32(self, device):
+        format_list = [3]
+        shape_list = [(8, 6)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
+            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_log_inp_uncon_shape_format_fp16(self, device):
+        format_list = [3]
+        shape_list = [(8, 6)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_inp_uncon_op_exec(cpu_input1)
+            npu_output = self.npu_inp_uncon_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestLog2, globals(), except_for="cpu")
+if __name__ == '__main__':
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_log_softmax.py b/test/test_npu/test_network_ops/test_log_softmax.py
old mode 100644
new mode 100755
index 12c0569864fecb0b2c4e3916d040ca1e5ced3e5b..8d5a10668f1e5e50e6e5e71bcac7562b2c592d76
--- a/test/test_npu/test_network_ops/test_log_softmax.py
+++ b/test/test_npu/test_network_ops/test_log_softmax.py
@@ -1,109 +1,109 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLogSoftmax(TestCase):
-    def cpu_op_exec(self, input1, dim):
-        output = torch.nn.functional.log_softmax(input1, dim)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_new(self, input1, dim):
-        output = torch.nn.functional.log_softmax(input1, dim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def logsoftmax_result(self, shape_format):
-        for item in shape_format:
-            dim = np.random.randint(0, len(item[2]))
-            print(item, " dim=", dim)
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 10)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-           
-            cpu_output = self.cpu_op_exec(cpu_input1, 0)
-            npu_output = self.npu_op_exec_new(npu_input1, 0)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            
-    def test_logsoftmax_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [1024]] for i in format_list
-        ]
-        self.logsoftmax_result(shape_format)
-        
-    def test_logsoftmax_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [1024]] for i in format_list
-        ]
-        self.logsoftmax_result(shape_format)
-        
-    def test_logsoftmax_shape_format_fp16_2d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [256, 1000]] for i in format_list 
-        ]
-        self.logsoftmax_result(shape_format)
-        
-    def test_logsoftmax_shape_format_fp32_2d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [256, 1000]] for i in format_list 
-        ]
-        self.logsoftmax_result(shape_format)
-        
-    def test_logsoftmax_shape_format_fp16_3d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [32, 48, 64]] for i in format_list 
-        ]
-        self.logsoftmax_result(shape_format)
-        
-    def test_logsoftmax_shape_format_fp32_3d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [32, 48, 1024]] for i in format_list
-        ]
-        self.logsoftmax_result(shape_format)
-        
-    def test_logsoftmax_shape_format_fp16_4d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [32, 24, 18, 1000]] for i in format_list
-        ]
-        self.logsoftmax_result(shape_format)
-
-    def test_logsoftmax_shape_format_fp32_4d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [32, 24, 18, 1000]] for i in format_list
-        ]
-        self.logsoftmax_result(shape_format)
-            
-
-instantiate_device_type_tests(TestLogSoftmax, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLogSoftmax(TestCase):
+    def cpu_op_exec(self, input1, dim):
+        output = torch.nn.functional.log_softmax(input1, dim)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_new(self, input1, dim):
+        output = torch.nn.functional.log_softmax(input1, dim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def logsoftmax_result(self, shape_format):
+        for item in shape_format:
+            dim = np.random.randint(0, len(item[2]))
+            print(item, " dim=", dim)
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 10)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+           
+            cpu_output = self.cpu_op_exec(cpu_input1, 0)
+            npu_output = self.npu_op_exec_new(npu_input1, 0)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+            
+    def test_logsoftmax_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [1024]] for i in format_list
+        ]
+        self.logsoftmax_result(shape_format)
+        
+    def test_logsoftmax_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [1024]] for i in format_list
+        ]
+        self.logsoftmax_result(shape_format)
+        
+    def test_logsoftmax_shape_format_fp16_2d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [256, 1000]] for i in format_list 
+        ]
+        self.logsoftmax_result(shape_format)
+        
+    def test_logsoftmax_shape_format_fp32_2d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [256, 1000]] for i in format_list 
+        ]
+        self.logsoftmax_result(shape_format)
+        
+    def test_logsoftmax_shape_format_fp16_3d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [32, 48, 64]] for i in format_list 
+        ]
+        self.logsoftmax_result(shape_format)
+        
+    def test_logsoftmax_shape_format_fp32_3d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [32, 48, 1024]] for i in format_list
+        ]
+        self.logsoftmax_result(shape_format)
+        
+    def test_logsoftmax_shape_format_fp16_4d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [32, 24, 18, 1000]] for i in format_list
+        ]
+        self.logsoftmax_result(shape_format)
+
+    def test_logsoftmax_shape_format_fp32_4d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [32, 24, 18, 1000]] for i in format_list
+        ]
+        self.logsoftmax_result(shape_format)
+            
+
+instantiate_device_type_tests(TestLogSoftmax, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_log_softmax_backward.py b/test/test_npu/test_network_ops/test_log_softmax_backward.py
old mode 100644
new mode 100755
index 688b7b01b8211e1f81a06445c48ca90db27e34e8..cbc0907ba3af4ebf21beb1f246c7d04974de3bab
--- a/test/test_npu/test_network_ops/test_log_softmax_backward.py
+++ b/test/test_npu/test_network_ops/test_log_softmax_backward.py
@@ -1,109 +1,109 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestLogSoftmaxBackward(TestCase):
-    def cpu_op_exec(self, input1, input2, n):
-        output = torch._log_softmax_backward_data(input1, input2, n, input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_new(self, input1, input2, n):
-        output = torch._log_softmax_backward_data(input1, input2, n, input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def logsoftmax_backward_result(self, shape_format, min_lmt, max_lmt):
-        for item in shape_format:
-            dim = np.random.randint(0, len(item[2]))
-            print(item," dim=", dim)
-            cpu_input1, npu_input1 = create_common_tensor(item, min_lmt, max_lmt)
-            cpu_input2, npu_input2 = create_common_tensor(item, min_lmt, max_lmt)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, dim)
-            npu_output = self.npu_op_exec_new(npu_input1, npu_input2, dim)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            
-    def test_logsoftmax_backward_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [18]] for i in format_list 
-        ]
-        self.logsoftmax_backward_result(shape_format, 0, 2)
-        
-    def test_logsoftmax_backward_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [18]] for i in format_list 
-        ]
-        self.logsoftmax_backward_result(shape_format, 0, 50)
-        
-    def test_logsoftmax_backward_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [256, 1000]] for i in format_list 
-        ]
-        self.logsoftmax_backward_result(shape_format, 0, 2)
-        
-    def test_logsoftmax_backward_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float32, i, [256, 1000]] for i in format_list 
-        ]
-        self.logsoftmax_backward_result(shape_format, 0, 50)
-        
-    def test_logsoftmax_backward_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [32, 48, 64]] for i in format_list 
-        ]
-        self.logsoftmax_backward_result(shape_format, 0, 2)
-        
-    def test_logsoftmax_backward_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float32, i, [32, 48, 64]] for i in format_list 
-        ]
-        self.logsoftmax_backward_result(shape_format, 0, 50)
-        
-    def test_logsoftmax_backward_shape_format_fp16_4d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [32, 24, 18, 18]] for i in format_list 
-        ]
-        self.logsoftmax_backward_result(shape_format, 0, 2)
-        
-    def test_logsoftmax_backward_shape_format_fp32_4d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [32, 24, 18, 18]] for i in format_list 
-        ]
-        self.logsoftmax_backward_result(shape_format, 0, 50)
-            
-
-instantiate_device_type_tests(TestLogSoftmaxBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestLogSoftmaxBackward(TestCase):
+    def cpu_op_exec(self, input1, input2, n):
+        output = torch._log_softmax_backward_data(input1, input2, n, input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_new(self, input1, input2, n):
+        output = torch._log_softmax_backward_data(input1, input2, n, input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def logsoftmax_backward_result(self, shape_format, min_lmt, max_lmt):
+        for item in shape_format:
+            dim = np.random.randint(0, len(item[2]))
+            print(item," dim=", dim)
+            cpu_input1, npu_input1 = create_common_tensor(item, min_lmt, max_lmt)
+            cpu_input2, npu_input2 = create_common_tensor(item, min_lmt, max_lmt)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, dim)
+            npu_output = self.npu_op_exec_new(npu_input1, npu_input2, dim)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+            
+    def test_logsoftmax_backward_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [18]] for i in format_list 
+        ]
+        self.logsoftmax_backward_result(shape_format, 0, 2)
+        
+    def test_logsoftmax_backward_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [18]] for i in format_list 
+        ]
+        self.logsoftmax_backward_result(shape_format, 0, 50)
+        
+    def test_logsoftmax_backward_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [256, 1000]] for i in format_list 
+        ]
+        self.logsoftmax_backward_result(shape_format, 0, 2)
+        
+    def test_logsoftmax_backward_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float32, i, [256, 1000]] for i in format_list 
+        ]
+        self.logsoftmax_backward_result(shape_format, 0, 50)
+        
+    def test_logsoftmax_backward_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [32, 48, 64]] for i in format_list 
+        ]
+        self.logsoftmax_backward_result(shape_format, 0, 2)
+        
+    def test_logsoftmax_backward_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float32, i, [32, 48, 64]] for i in format_list 
+        ]
+        self.logsoftmax_backward_result(shape_format, 0, 50)
+        
+    def test_logsoftmax_backward_shape_format_fp16_4d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [32, 24, 18, 18]] for i in format_list 
+        ]
+        self.logsoftmax_backward_result(shape_format, 0, 2)
+        
+    def test_logsoftmax_backward_shape_format_fp32_4d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [32, 24, 18, 18]] for i in format_list 
+        ]
+        self.logsoftmax_backward_result(shape_format, 0, 50)
+            
+
+instantiate_device_type_tests(TestLogSoftmaxBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_logical_and.py b/test/test_npu/test_network_ops/test_logical_and.py
index ee82bac600c23868615541195812cf6d27ef7a86..a796a129ae8e697d788412b0d8c22d4d59703d73 100644
--- a/test/test_npu/test_network_ops/test_logical_and.py
+++ b/test/test_npu/test_network_ops/test_logical_and.py
@@ -1,129 +1,129 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLogicalAnd(TestCase):
-
-    def generate_single_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        #modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-
-        return npu_input1, npu_input2
-    
-    def generate_three_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input3 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        #modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-        npu_input3 = torch.from_numpy(input3)
-        
-        return npu_input1, npu_input2, npu_input3
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.logical_and(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.logical_and(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_out(self, input1, input2, input3):
-        torch.logical_and(input1, input2, out=input3)
-        output = input3.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, input3):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = input3.to("npu")
-        torch.logical_and(input1, input2, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output 
-
-    def cpu_op_exec_(self, input1, input2):
-        output = torch.Tensor.logical_and_(input1, input2)
-        output = output.numpy()
-        return output
- 
-    def npu_op_exec_(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.Tensor.logical_and_(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def logical_and_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1 = torch.randn(item[0])<0
-            cpu_input2 = torch.randn(item[0])<0
-            cpu_input3 = torch.randn(item[1])<0
-            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
-            npu_output_out = self.npu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
-
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_logical_and_out(self, device):
-        shape_format = [
-            [[128, 116, 14, 14], [256, 116, 1, 1, 28]],
-            [[128, 3, 224, 224], [3, 3, 3]],
-            [[128, 116, 14, 14], [128, 116, 14, 14]],
-            [[256, 128, 7, 7], [128, 256, 3, 3, 28]],
-            [[2, 3, 3, 3], [3, 1, 3]],
-            [[128, 232, 7, 7], [128, 232, 7, 7]],
-        ]
-        self.logical_and_out_result(shape_format)
-
-    def test_logical_and_bool(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_logical_and_inplace_bool(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool)
-        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestLogicalAnd, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLogicalAnd(TestCase):
+
+    def generate_single_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        #modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+
+        return npu_input1, npu_input2
+    
+    def generate_three_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input3 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        #modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+        npu_input3 = torch.from_numpy(input3)
+        
+        return npu_input1, npu_input2, npu_input3
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.logical_and(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.logical_and(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_out(self, input1, input2, input3):
+        torch.logical_and(input1, input2, out=input3)
+        output = input3.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, input3):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = input3.to("npu")
+        torch.logical_and(input1, input2, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output 
+
+    def cpu_op_exec_(self, input1, input2):
+        output = torch.Tensor.logical_and_(input1, input2)
+        output = output.numpy()
+        return output
+ 
+    def npu_op_exec_(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.Tensor.logical_and_(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def logical_and_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1 = torch.randn(item[0])<0
+            cpu_input2 = torch.randn(item[0])<0
+            cpu_input3 = torch.randn(item[1])<0
+            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
+            npu_output_out = self.npu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
+
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_logical_and_out(self, device):
+        shape_format = [
+            [[128, 116, 14, 14], [256, 116, 1, 1, 28]],
+            [[128, 3, 224, 224], [3, 3, 3]],
+            [[128, 116, 14, 14], [128, 116, 14, 14]],
+            [[256, 128, 7, 7], [128, 256, 3, 3, 28]],
+            [[2, 3, 3, 3], [3, 1, 3]],
+            [[128, 232, 7, 7], [128, 232, 7, 7]],
+        ]
+        self.logical_and_out_result(shape_format)
+
+    def test_logical_and_bool(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_logical_and_inplace_bool(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 2, (2, 5), np.bool)
+        cpu_output = self.cpu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        npu_output = self.npu_op_exec_(npu_input1, npu_input2).astype(np.float32)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestLogicalAnd, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_logspace.py b/test/test_npu/test_network_ops/test_logspace.py
index d3fb5afc10a18dfd9fce1efdfe0a0a55e755ed85..4dacbb08ec01af1e68963c59017188770d3438e5 100644
--- a/test/test_npu/test_network_ops/test_logspace.py
+++ b/test/test_npu/test_network_ops/test_logspace.py
@@ -1,92 +1,92 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestLogSpace(TestCase):
-
-    def cpu_op_exec(self, start, end, steps, base):
-        output = torch.logspace(start=start, end=end, steps=steps, base=base)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, start, end, steps, base):
-        output = torch.logspace(start=start, end=end, steps=steps, base=base, device="npu")
-        output = output.to("cpu")
-        output = output.numpy()
-        return output    
-
-    def npu_op_exec_out(self, start, end, steps, base, dtype):
-        output = torch.randn(steps)
-        torch.logspace(start=start, end=end, steps=steps, base=base, dtype=dtype, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_logspace_common_shape_format(self, device):
-        shape_format = [
-                [0.0, 1.0, 10, 0.2, torch.float32],
-                [2.0, 3.0, 10, 0.05, torch.float32],
-                [10.0, 10.5, 11, 0.2, torch.float32],
-                [10.0, 10.5, 110, 0.2, torch.float32],
-                [0.0, 0.1, 20, 1.2, torch.float32],
-                [0.5, 1.0, 50, 8.0, torch.float32],
-                [1.0, 2.0, 2, -0.5, torch.float32],
-                [0.0, 0.0, 1, 0.0, torch.float32],
-                [1.0, 1.0, 1, 0.0, torch.float32],
-                [1.0, 1.0, 0, 0.0, torch.float32],
-                [1.0, 2.0, 9, 0.0, torch.float32]
-        ] 
-
-        for item in shape_format:
-            cpu_output = self.cpu_op_exec(item[0], item[1], item[2], item[3])
-            npu_output = self.npu_op_exec(item[0], item[1], item[2], item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-            npu_out_output = self.npu_op_exec_out(item[0], item[1], item[2], item[3], item[4])
-            self.assertRtolEqual(cpu_output, npu_out_output)
-    def test_logspace_float16_shape_format(self, device):
-        def cpu_op_exec_fp16(start, end, steps, base, dtype):
-            output = torch.logspace(start=start, end=end, steps=steps, base=base, dtype=torch.float32)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        def npu_op_exec(start, end, steps, base, dtype):
-            output = torch.logspace( start=start, end=end, steps=steps, base=base, dtype=dtype, device="npu" )
-            output = output.to("cpu")
-            output = output.numpy()
-            return output
-
-        shape_format = [
-                [-2.0, 2.0, 32, 32, torch.float16],
-                [0.0, 1.0, 10, 0.2, torch.float16],
-                [2.0, 3.0, 10, 0.05, torch.float16],
-                [0.0, 0.1, 20, 1.2, torch.float16],
-                [0.5, 1.0, 50, 8.0, torch.float16],
-                [1.0, 2.0, 2, -0.5, torch.float16],
-                [0.0, 0.0, 1, 0.0, torch.float16]
-        ] 
-
-        for item in shape_format:
-            cpu_output = cpu_op_exec_fp16(item[0], item[1], item[2], item[3], item[4])
-            npu_output = npu_op_exec(item[0], item[1], item[2], item[3], item[4])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestLogSpace, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestLogSpace(TestCase):
+
+    def cpu_op_exec(self, start, end, steps, base):
+        output = torch.logspace(start=start, end=end, steps=steps, base=base)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, start, end, steps, base):
+        output = torch.logspace(start=start, end=end, steps=steps, base=base, device="npu")
+        output = output.to("cpu")
+        output = output.numpy()
+        return output    
+
+    def npu_op_exec_out(self, start, end, steps, base, dtype):
+        output = torch.randn(steps)
+        torch.logspace(start=start, end=end, steps=steps, base=base, dtype=dtype, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_logspace_common_shape_format(self, device):
+        shape_format = [
+                [0.0, 1.0, 10, 0.2, torch.float32],
+                [2.0, 3.0, 10, 0.05, torch.float32],
+                [10.0, 10.5, 11, 0.2, torch.float32],
+                [10.0, 10.5, 110, 0.2, torch.float32],
+                [0.0, 0.1, 20, 1.2, torch.float32],
+                [0.5, 1.0, 50, 8.0, torch.float32],
+                [1.0, 2.0, 2, -0.5, torch.float32],
+                [0.0, 0.0, 1, 0.0, torch.float32],
+                [1.0, 1.0, 1, 0.0, torch.float32],
+                [1.0, 1.0, 0, 0.0, torch.float32],
+                [1.0, 2.0, 9, 0.0, torch.float32]
+        ] 
+
+        for item in shape_format:
+            cpu_output = self.cpu_op_exec(item[0], item[1], item[2], item[3])
+            npu_output = self.npu_op_exec(item[0], item[1], item[2], item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+            npu_out_output = self.npu_op_exec_out(item[0], item[1], item[2], item[3], item[4])
+            self.assertRtolEqual(cpu_output, npu_out_output)
+    def test_logspace_float16_shape_format(self, device):
+        def cpu_op_exec_fp16(start, end, steps, base, dtype):
+            output = torch.logspace(start=start, end=end, steps=steps, base=base, dtype=torch.float32)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        def npu_op_exec(start, end, steps, base, dtype):
+            output = torch.logspace( start=start, end=end, steps=steps, base=base, dtype=dtype, device="npu" )
+            output = output.to("cpu")
+            output = output.numpy()
+            return output
+
+        shape_format = [
+                [-2.0, 2.0, 32, 32, torch.float16],
+                [0.0, 1.0, 10, 0.2, torch.float16],
+                [2.0, 3.0, 10, 0.05, torch.float16],
+                [0.0, 0.1, 20, 1.2, torch.float16],
+                [0.5, 1.0, 50, 8.0, torch.float16],
+                [1.0, 2.0, 2, -0.5, torch.float16],
+                [0.0, 0.0, 1, 0.0, torch.float16]
+        ] 
+
+        for item in shape_format:
+            cpu_output = cpu_op_exec_fp16(item[0], item[1], item[2], item[3], item[4])
+            npu_output = npu_op_exec(item[0], item[1], item[2], item[3], item[4])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestLogSpace, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_lt.py b/test/test_npu/test_network_ops/test_lt.py
old mode 100644
new mode 100755
index bea84c4534abf1df3ff81c7c28a6f81feb562c79..f9bec34f489995f2a8578135e83d009df919dc51
--- a/test/test_npu/test_network_ops/test_lt.py
+++ b/test/test_npu/test_network_ops/test_lt.py
@@ -1,256 +1,256 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestLt(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = torch.lt(input1, input2)
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def cpu_op_exec_out(self, input1, input2, input3):
-        torch.lt(input1, input2, out = input3)
-        output = input3.numpy().astype(np.int32)
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.lt(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def cpu_op_inplace_exec(self, input1, input2):
-        output = input1.lt_(input2)
-        output = input1.numpy().astype(np.int32)
-        return output
-
-    def npu_op_inplace_exec(self, input1, input2):
-        output = input1.lt_(input2)
-        output = output.to("cpu")
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def npu_op_exec_out(self, input1, input2, out):
-        torch.lt(input1, input2, out=out)
-        output = out.to("cpu")
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def cpu_op_exec_scalar(self, input, scalar):
-        output = torch.lt(input, scalar)
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def cpu_op_exec_scalar_out(self, input1, scalar, input2):
-        torch.lt(input1, scalar, out = input2)
-        output = input2.numpy().astype(np.int32)
-        return output
-
-    def npu_op_exec_scalar(self, input, scalar):
-        output = torch.lt(input, scalar)
-        output = output.to("cpu")
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def npu_op_exec_scalar_out(self, input, scalar, out):
-        torch.lt(input, scalar, out=out)
-        output = out.to("cpu")
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def lt_tensor_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
-            cpu_input3 = torch.randn(item[1][2])<0
-            npu_input3 = cpu_input3.npu()
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            if cpu_input3.dtype == torch.float16:
-                cpu_input3 = cpu_input3.to(torch.float32)
-            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_lt_tensor_out(self, device):
-        shape_format = [
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.lt_tensor_out_result(shape_format)
-
-    def lt_scalar_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2 = torch.randn(item[1][2])<0
-            npu_input2 = cpu_input2.npu()
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            scalar = np.random.uniform(0, 100)
-            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
-            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
-            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
-            self.assertRtolEqual(cpu_output_out, npu_output_out)
-
-    def test_lt_scalar_out(self, device):
-        shape_format = [
-            [[np.float16, 0, [4, 4, 128, 128]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [12, 10, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [16, 3, 1111, 1212]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [16, 16, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 0, [1313, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.lt_scalar_out_result(shape_format)
-
-    def lt_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            cpu_output_inp = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output_inp = self.npu_op_inplace_exec(npu_input1, npu_input2)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def lt_scalar_result(self, shape_format):
-        for item in shape_format:
-            scalar = np.random.uniform(0, 100)
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_scalar = self.cpu_op_exec_scalar(cpu_input1, scalar)
-            npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar)
-
-            cpu_output_scalar = cpu_output_scalar.astype(npu_output_scalar.dtype)
-
-            self.assertRtolEqual(cpu_output_scalar, npu_output_scalar)
-
-    def test_lt_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float16, i, 5] for i in format_list]
-        self.lt_result(shape_format)
-
-    def test_lt_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[np.float32, i, 5] for i in format_list]
-        self.lt_result(shape_format)
-
-    def test_lt_shape_format_fp16_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float16, i, [5, 3]] for i in format_list]
-        self.lt_result(shape_format)
-
-    def test_lt_shape_format_fp32_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float32, i, [5, 3]] for i in format_list]
-        self.lt_result(shape_format)
-
-    def test_lt_shape_format_fp16_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float16, i, [16, 640, 640]] for i in format_list]
-        self.lt_result(shape_format)
-
-    def test_lt_shape_format_fp32_3d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[np.float32, i, [16, 640, 640]] for i in format_list]
-        self.lt_result(shape_format)
-
-    def test_lt_shape_format_fp16_4d(self, device):
-        format_list = [-1, 3]
-        shape_format = [[np.float16, i, [32, 3, 3, 3]] for i in format_list]
-        self.lt_result(shape_format)
-
-    def test_lt_shape_format_fp32_4d(self, device):
-        format_list = [-1, 3]
-        shape_format = [[np.float32, i, [32, 3, 3, 3]] for i in format_list]
-        self.lt_result(shape_format)
-
-    # scalar-----------------------------------------------------------------------
-    def test_lt_scalar_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float16, i, 18] for i in format_list]
-        self.lt_scalar_result(shape_format)
-
-    def test_lt_scalar_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float32, i, [18]] for i in format_list]
-        self.lt_scalar_result(shape_format)
-
-    def test_lt_scalar_shape_format_fp16_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float16, i, [5, 8]] for i in format_list]
-        self.lt_scalar_result(shape_format)
-
-    def test_lt_scalar_shape_format_fp32_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float32, i, [5, 8]] for i in format_list]
-        self.lt_scalar_result(shape_format)
-
-    def test_lt_scalar_shape_format_fp16_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float16, i, [4, 16, 32]] for i in format_list]
-        self.lt_scalar_result(shape_format)
-
-    def test_lt_scalar_shape_format_fp32_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float32, i, [4, 16, 32]] for i in format_list]
-        self.lt_scalar_result(shape_format)
-
-    def test_lt_scalar_shape_format_fp16_4d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float16, i, [32, 3, 3, 3]] for i in format_list]
-        self.lt_scalar_result(shape_format)
-
-    def test_lt_scalar_shape_format_fp32_4d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.float32, i, [32, 3, 3, 3]] for i in format_list]
-        self.lt_scalar_result(shape_format)
-
-    def test_lt_mix_dtype(self, device):
-        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
-        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
-        npu_output = self.npu_op_exec(npu_input2, npu_input4)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestLt, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestLt(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.lt(input1, input2)
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def cpu_op_exec_out(self, input1, input2, input3):
+        torch.lt(input1, input2, out = input3)
+        output = input3.numpy().astype(np.int32)
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.lt(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def cpu_op_inplace_exec(self, input1, input2):
+        output = input1.lt_(input2)
+        output = input1.numpy().astype(np.int32)
+        return output
+
+    def npu_op_inplace_exec(self, input1, input2):
+        output = input1.lt_(input2)
+        output = output.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def npu_op_exec_out(self, input1, input2, out):
+        torch.lt(input1, input2, out=out)
+        output = out.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def cpu_op_exec_scalar(self, input, scalar):
+        output = torch.lt(input, scalar)
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def cpu_op_exec_scalar_out(self, input1, scalar, input2):
+        torch.lt(input1, scalar, out = input2)
+        output = input2.numpy().astype(np.int32)
+        return output
+
+    def npu_op_exec_scalar(self, input, scalar):
+        output = torch.lt(input, scalar)
+        output = output.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def npu_op_exec_scalar_out(self, input, scalar, out):
+        torch.lt(input, scalar, out=out)
+        output = out.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def lt_tensor_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
+            cpu_input3 = torch.randn(item[1][2])<0
+            npu_input3 = cpu_input3.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            if cpu_input3.dtype == torch.float16:
+                cpu_input3 = cpu_input3.to(torch.float32)
+            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, cpu_input3)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_lt_tensor_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.lt_tensor_out_result(shape_format)
+
+    def lt_scalar_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2 = torch.randn(item[1][2])<0
+            npu_input2 = cpu_input2.npu()
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            scalar = np.random.uniform(0, 100)
+            cpu_output_out = self.cpu_op_exec_scalar_out(cpu_input1, scalar, cpu_input2)
+            npu_output_out = self.npu_op_exec_scalar_out(npu_input1, scalar, npu_input2)
+            cpu_output_out = cpu_output_out.astype(npu_output_out.dtype)
+            self.assertRtolEqual(cpu_output_out, npu_output_out)
+
+    def test_lt_scalar_out(self, device):
+        shape_format = [
+            [[np.float16, 0, [4, 4, 128, 128]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [12, 10, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [16, 3, 1111, 1212]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [16, 16, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [20, 10, 7, 7]], [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [1313, 3, 3, 3]], [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [16, 22, 7, 7]], [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.lt_scalar_out_result(shape_format)
+
+    def lt_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output_inp = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output_inp = self.npu_op_inplace_exec(npu_input1, npu_input2)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def lt_scalar_result(self, shape_format):
+        for item in shape_format:
+            scalar = np.random.uniform(0, 100)
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_scalar = self.cpu_op_exec_scalar(cpu_input1, scalar)
+            npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar)
+
+            cpu_output_scalar = cpu_output_scalar.astype(npu_output_scalar.dtype)
+
+            self.assertRtolEqual(cpu_output_scalar, npu_output_scalar)
+
+    def test_lt_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float16, i, 5] for i in format_list]
+        self.lt_result(shape_format)
+
+    def test_lt_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[np.float32, i, 5] for i in format_list]
+        self.lt_result(shape_format)
+
+    def test_lt_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float16, i, [5, 3]] for i in format_list]
+        self.lt_result(shape_format)
+
+    def test_lt_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float32, i, [5, 3]] for i in format_list]
+        self.lt_result(shape_format)
+
+    def test_lt_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float16, i, [16, 640, 640]] for i in format_list]
+        self.lt_result(shape_format)
+
+    def test_lt_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[np.float32, i, [16, 640, 640]] for i in format_list]
+        self.lt_result(shape_format)
+
+    def test_lt_shape_format_fp16_4d(self, device):
+        format_list = [-1, 3]
+        shape_format = [[np.float16, i, [32, 3, 3, 3]] for i in format_list]
+        self.lt_result(shape_format)
+
+    def test_lt_shape_format_fp32_4d(self, device):
+        format_list = [-1, 3]
+        shape_format = [[np.float32, i, [32, 3, 3, 3]] for i in format_list]
+        self.lt_result(shape_format)
+
+    # scalar-----------------------------------------------------------------------
+    def test_lt_scalar_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float16, i, 18] for i in format_list]
+        self.lt_scalar_result(shape_format)
+
+    def test_lt_scalar_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float32, i, [18]] for i in format_list]
+        self.lt_scalar_result(shape_format)
+
+    def test_lt_scalar_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float16, i, [5, 8]] for i in format_list]
+        self.lt_scalar_result(shape_format)
+
+    def test_lt_scalar_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float32, i, [5, 8]] for i in format_list]
+        self.lt_scalar_result(shape_format)
+
+    def test_lt_scalar_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float16, i, [4, 16, 32]] for i in format_list]
+        self.lt_scalar_result(shape_format)
+
+    def test_lt_scalar_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float32, i, [4, 16, 32]] for i in format_list]
+        self.lt_scalar_result(shape_format)
+
+    def test_lt_scalar_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float16, i, [32, 3, 3, 3]] for i in format_list]
+        self.lt_scalar_result(shape_format)
+
+    def test_lt_scalar_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.float32, i, [32, 3, 3, 3]] for i in format_list]
+        self.lt_scalar_result(shape_format)
+
+    def test_lt_mix_dtype(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.float16, 0, (2, 3)], 1, 100)
+        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input2, npu_input4)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestLt, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_masked_scatter.py b/test/test_npu/test_network_ops/test_masked_scatter.py
index c48f27cd61d30e6a8a31d2dbdd703f56cfc05ce5..fcfdc8ed014dbcaba0c6899397f3b668febe1a01 100644
--- a/test/test_npu/test_network_ops/test_masked_scatter.py
+++ b/test/test_npu/test_network_ops/test_masked_scatter.py
@@ -1,87 +1,87 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestMaskedScatter(TestCase):
-    def cpu_op_exec(self, input, maskbool, source):
-        cpu_output = torch.masked_scatter(input, maskbool, source)
-        return cpu_output.numpy()
-
-    def npu_op_exec(self, input, maskbool, source):
-        input = input.to("npu")
-        maskbool = maskbool.to("npu")
-        source = source.to("npu")
-        npu_output = torch.masked_scatter(input, maskbool, source)
-        npu_output = npu_output.to("cpu")
-        return npu_output.numpy()
-
-    def cpu_inp_op_exec(self, input, maskbool, source):
-        cpu_output = input.masked_scatter_(maskbool, source)
-        return cpu_output.numpy()
-
-    def npu_inp_op_exec(self, input, maskbool, source):
-        maskbool = maskbool.to("npu")
-        npu_output = input.masked_scatter_(maskbool, source)
-        npu_output = npu_output.to("cpu")
-        return npu_output.numpy()
-
-    def test_masked_scatter_float(self, device):
-        dtype_list = [np.float32]
-        format_list = [0, 3]
-        shape_list = [[4, 5],[3, 4, 5], [2, 3, 4, 5]]
-        shape_format = [
-            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
-        ]
-        mask = torch.randn(4, 1)
-        maskbool = mask.ge(0.5)
-        
-        for item in shape_format:
-          print(item)
-          cpu_input, npu_input = create_common_tensor(item, 0, 100)
-          cpu_source, npu_source = create_common_tensor(item, 0, 100)
-          cpu_output2 = self.cpu_inp_op_exec(cpu_input, maskbool, cpu_source)
-          npu_output2 = self.npu_inp_op_exec(npu_input, maskbool, npu_source)
-          self.assertRtolEqual(cpu_output2, npu_output2)
-          
-    def test_masked_scatter_int(self, device):
-        dtype_list = [np.int32, np.int64]
-        format_list = [0]
-        shape_list = [[4, 5],[3, 4, 5], [2, 3, 4, 5]]
-        shape_format = [
-            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
-        ]
-        mask = torch.randn(4, 1)
-        maskbool = mask.ge(0.5)
-        
-        for item in shape_format:
-          print(item)
-          cpu_input, npu_input = create_common_tensor(item, 0, 100)
-          cpu_source, npu_source = create_common_tensor(item, 0, 100)
-          cpu_output2 = self.cpu_inp_op_exec(cpu_input, maskbool, cpu_source)
-          npu_output2 = self.npu_inp_op_exec(npu_input, maskbool, npu_source)
-          self.assertRtolEqual(cpu_output2, npu_output2)
-
-instantiate_device_type_tests(TestMaskedScatter, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestMaskedScatter(TestCase):
+    def cpu_op_exec(self, input, maskbool, source):
+        cpu_output = torch.masked_scatter(input, maskbool, source)
+        return cpu_output.numpy()
+
+    def npu_op_exec(self, input, maskbool, source):
+        input = input.to("npu")
+        maskbool = maskbool.to("npu")
+        source = source.to("npu")
+        npu_output = torch.masked_scatter(input, maskbool, source)
+        npu_output = npu_output.to("cpu")
+        return npu_output.numpy()
+
+    def cpu_inp_op_exec(self, input, maskbool, source):
+        cpu_output = input.masked_scatter_(maskbool, source)
+        return cpu_output.numpy()
+
+    def npu_inp_op_exec(self, input, maskbool, source):
+        maskbool = maskbool.to("npu")
+        npu_output = input.masked_scatter_(maskbool, source)
+        npu_output = npu_output.to("cpu")
+        return npu_output.numpy()
+
+    def test_masked_scatter_float(self, device):
+        dtype_list = [np.float32]
+        format_list = [0, 3]
+        shape_list = [[4, 5],[3, 4, 5], [2, 3, 4, 5]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        mask = torch.randn(4, 1)
+        maskbool = mask.ge(0.5)
+        
+        for item in shape_format:
+          print(item)
+          cpu_input, npu_input = create_common_tensor(item, 0, 100)
+          cpu_source, npu_source = create_common_tensor(item, 0, 100)
+          cpu_output2 = self.cpu_inp_op_exec(cpu_input, maskbool, cpu_source)
+          npu_output2 = self.npu_inp_op_exec(npu_input, maskbool, npu_source)
+          self.assertRtolEqual(cpu_output2, npu_output2)
+          
+    def test_masked_scatter_int(self, device):
+        dtype_list = [np.int32, np.int64]
+        format_list = [0]
+        shape_list = [[4, 5],[3, 4, 5], [2, 3, 4, 5]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        mask = torch.randn(4, 1)
+        maskbool = mask.ge(0.5)
+        
+        for item in shape_format:
+          print(item)
+          cpu_input, npu_input = create_common_tensor(item, 0, 100)
+          cpu_source, npu_source = create_common_tensor(item, 0, 100)
+          cpu_output2 = self.cpu_inp_op_exec(cpu_input, maskbool, cpu_source)
+          npu_output2 = self.npu_inp_op_exec(npu_input, maskbool, npu_source)
+          self.assertRtolEqual(cpu_output2, npu_output2)
+
+instantiate_device_type_tests(TestMaskedScatter, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_matmul.py b/test/test_npu/test_network_ops/test_matmul.py
old mode 100644
new mode 100755
index 74755b14579a3a3332f6a82718a0ffc7489c29a7..7cfb770ea2080e86cace00aa54ff6e6bcf34372d
--- a/test/test_npu/test_network_ops/test_matmul.py
+++ b/test/test_npu/test_network_ops/test_matmul.py
@@ -1,166 +1,166 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import torch
-import numpy as np
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestMatMul(TestCase):
-    def op_exec_cpu(self, mat1, mat2):
-        input1 = mat1
-        input2 = mat2
-        input1.requires_grad = True
-        input2.requires_grad = True
-
-        cpu_output = torch.matmul(input1, input2)
-        tmp = torch.ones_like(cpu_output)
-        cpu_output.backward(tmp)
-
-        return cpu_output.detach().numpy(), input1.grad.numpy(), input2.grad.numpy()
-
-    def op_exec_npu(self, mat1, mat2):
-        input1 = mat1
-        input2 = mat2
-        input1.requires_grad = True
-        input2.requires_grad = True
-        
-        npu_output = torch.matmul(input1, input2)
-        tmp = torch.ones_like(npu_output)
-        npu_output.backward(tmp)
-        npu_output = npu_output.cpu()
-        return npu_output.detach().cpu().numpy(), input1.grad.cpu().numpy(), input2.grad.cpu().numpy()
-
-    def matmul_backward_result(self, shape_format):
-        for item in shape_format:
-            mat1_cpu, mat1_npu = create_common_tensor(item[0], -10, 10)
-            if mat1_cpu.dtype == torch.float16:
-                mat1_cpu = mat1_cpu.to(torch.float32)
-            mat2_cpu, mat2_npu = create_common_tensor(item[1], -10, 10)
-            if mat2_cpu.dtype == torch.float16:
-                mat2_cpu = mat2_cpu.to(torch.float32)
-            cpu_output, cpu_mat1_grad, cpu_mat2_grad = self.op_exec_cpu(mat1_cpu, mat2_cpu)
-            npu_output, npu_mat1_grad, npu_mat2_grad = self.op_exec_npu(mat1_npu, mat2_npu)
-
-            
-            self.assertRtolEqual(cpu_output.astype(npu_output.dtype), npu_output)
-            self.assertRtolEqual(cpu_mat1_grad.astype(npu_mat1_grad.dtype), npu_mat1_grad)
-            self.assertRtolEqual(cpu_mat2_grad.astype(npu_mat2_grad.dtype), npu_mat2_grad)
-
-    def test_matmul_backward_shape_format_fp16_case1(self, device):
-        shape_format = [
-            # mat1 1dim, mat2 1dim       
-            [[np.float16, 2, [5]], [np.float16, 2, [5]]],
-            [[np.float16, 2, [2560]], [np.float16, 2, [2560]]],
-        ]
-        self.matmul_backward_result(shape_format)
-
-    # 暂不支持
-    # def test_matmul_backward_shape_format_fp16_case2(self, device):
-    #     shape_format = [  # mat1 2dim, mat2 1dim       
-    #         [[np.float16, 2, [3,5]], [np.float16, 2, [5]]],
-    #         [[np.float16, 2, [2560,4680]], [np.float16, 2, [4680]]],
-    #         [[np.float16, 2, [100,200]], [np.float16, 2, [200]]],
-    #         [[np.float16, 2, [4,4]], [np.float16, 2, [4]]],
-            
-    #     ]
-    #     self.matmul_backward_result(shape_format)
-    
-    def test_matmul_backward_shape_format_fp16_case3(self, device):
-        shape_format = [
-            # mat1 1dim, mat2 2dim       
-            [[np.float16, 2, [5]], [np.float16, 2, [5,6]]],
-            [[np.float16, 2, [2560]], [np.float16, 2, [2560,4680]]],
-            [[np.float16, 2, [5]], [np.float16, 2, [5,5]]],
-            
-        ]
-        self.matmul_backward_result(shape_format)
-        
-    def test_matmul_backward_shape_format_fp16_case4(self, device):
-        shape_format = [
-            # mat1 1dim, mat2 2dim       
-            [[np.float16, 2, [5,7]], [np.float16, 2, [7,10]]],
-            [[np.float16, 2, [3750,2560]], [np.float16, 2, [2560,4680]]],
-            [[np.float16, 2, [5,10]], [np.float16, 2, [10,20]]],
-        ]
-        self.matmul_backward_result(shape_format)
-    
-    def test_matmul_backward_shape_format_fp16_case5(self, device):
-        shape_format = [
-            # mat1 1dim, mat2 2dim       
-            [[np.float16, 2, [5,7,10]], [np.float16, 2, [10]]],
-            [[np.float16, 2, [168,3750,256]], [np.float16, 2, [256]]],
-            [[np.float16, 2, [4,5,10]], [np.float16, 2, [10]]],
-            #[[np.float16, 2, [5,10,20,30]], [np.float16, 2, [30]]],  # 该shape无法通过
-            #[[np.float16, 2, [20,30,40,50,60]], [np.float16, 2, [60]]], batch 三维 精度不行
-            [[np.float16, 2, [3,4,5,6,7,16]], [np.float16, 2, [16]]],
-        ]
-        self.matmul_backward_result(shape_format)
-        
-    def test_matmul_backward_shape_format_fp16_case6(self, device):
-        shape_format = [
-            # mat1 >2dim, mat2 2dim       
-            [[np.float16, 2, [5,7,10]], [np.float16, 2, [10,16]]],
-            #[[np.float16, 2, [5,10,20,30]], [np.float16, 2, [30,25]]], # 该shape无法过
-            [[np.float16, 2, [2,5,7,8,19,80]], [np.float16, 2, [80,32]]],
-        ]
-        self.matmul_backward_result(shape_format)
-
-    def test_matmul_backward_shape_format_fp16_case7(self, device):
-        shape_format = [
-            # mat1 1dim, mat2 >2dim       
-            [[np.float16, 2, [7]], [np.float16, 2, [5,7,10]]],
-            [[np.float16, 2, [5,]], [np.float16, 2, [4,5,10]]],
-            # [[np.float16, 2, [20]], [np.float16, 2, [5,10,20,30]]], # 该shape无法过
-            [[np.float16, 2, [7]], [np.float16, 2, [3,4,5,6,7,16]]],
-        ]
-        self.matmul_backward_result(shape_format)
-        
-    def test_matmul_backward_shape_format_fp16_case8(self, device):
-        shape_format = [
-            # mat1 2dim, mat2 >2dim       
-            [[np.float16, 2, [5,7]], [np.float16, 2, [5,7,10]]],
-            [[np.float16, 2, [12,5]], [np.float16, 2, [4,5,10]]],
-            # [[np.float16, 2, [44,20]], [np.float16, 2, [5,10,20,30]]], # 该shape无法过
-            # [[np.float16, 2, [75,50]], [np.float16, 2, [2,3,40,50,60]]], # 该shape无法过
-            [[np.float16, 2, [188,7]], [np.float16, 2, [3,4,5,6,7,16]]],
-        ]
-        self.matmul_backward_result(shape_format)
-        
-    def test_matmul_backward_shape_format_fp16_case9(self, device):
-        shape_format = [       
-            [[np.float16, 2, [5,7,10]], [np.float16, 2, [5,10,15]]],
-            [[np.float16, 2, [168,3750,256]], [np.float16, 2, [168,256,43]]],
-            # TODO(ascend): Insufficient precision
-            # 在两个输入shape不一致的情况下,会通过expand将两个tensor shape对齐。反向时expand的反向会调用sum(dim)，在fp16下与CPU比较不过。
-            # 但是结果与CUDA比对通过。所以只放开两个tensor batch部分一致的用例
-            # [[np.float16, 2, [1,6,7,65]], [np.float16, 2, [5,6,65,17]]],#该shape无法过
-            # [[np.float16, 2, [4,5,10,15]], [np.float16, 2, [5,15,20]]],
-            # [[np.float16, 2, [5,10,20,30]], [np.float16, 2, [1,30,40]]],
-            # [[np.float16, 2, [20,30,40,50,60]], [np.float16, 2, [40,60,6]]],
-            # [[np.float16, 2, [6,7,16]], [np.float16, 2, [4,5,6,16,17]]],
-            # [[np.float16, 2, [5,6,7,33]], [np.float16, 2, [12,23,5,6,33,17]]],
-            # [[np.float16, 2, [3,4,6,7,44]], [np.float16, 2, [2,3,4,6,44,17]]],
-            # [[np.float16, 2, [42,2,3,41]], [np.float16, 2, [1,2,42,2,41,17]]],
-        ]
-        self.matmul_backward_result(shape_format)
-        
-
-instantiate_device_type_tests(TestMatMul, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import torch
+import numpy as np
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestMatMul(TestCase):
+    def op_exec_cpu(self, mat1, mat2):
+        input1 = mat1
+        input2 = mat2
+        input1.requires_grad = True
+        input2.requires_grad = True
+
+        cpu_output = torch.matmul(input1, input2)
+        tmp = torch.ones_like(cpu_output)
+        cpu_output.backward(tmp)
+
+        return cpu_output.detach().numpy(), input1.grad.numpy(), input2.grad.numpy()
+
+    def op_exec_npu(self, mat1, mat2):
+        input1 = mat1
+        input2 = mat2
+        input1.requires_grad = True
+        input2.requires_grad = True
+        
+        npu_output = torch.matmul(input1, input2)
+        tmp = torch.ones_like(npu_output)
+        npu_output.backward(tmp)
+        npu_output = npu_output.cpu()
+        return npu_output.detach().cpu().numpy(), input1.grad.cpu().numpy(), input2.grad.cpu().numpy()
+
+    def matmul_backward_result(self, shape_format):
+        for item in shape_format:
+            mat1_cpu, mat1_npu = create_common_tensor(item[0], -10, 10)
+            if mat1_cpu.dtype == torch.float16:
+                mat1_cpu = mat1_cpu.to(torch.float32)
+            mat2_cpu, mat2_npu = create_common_tensor(item[1], -10, 10)
+            if mat2_cpu.dtype == torch.float16:
+                mat2_cpu = mat2_cpu.to(torch.float32)
+            cpu_output, cpu_mat1_grad, cpu_mat2_grad = self.op_exec_cpu(mat1_cpu, mat2_cpu)
+            npu_output, npu_mat1_grad, npu_mat2_grad = self.op_exec_npu(mat1_npu, mat2_npu)
+
+            
+            self.assertRtolEqual(cpu_output.astype(npu_output.dtype), npu_output)
+            self.assertRtolEqual(cpu_mat1_grad.astype(npu_mat1_grad.dtype), npu_mat1_grad)
+            self.assertRtolEqual(cpu_mat2_grad.astype(npu_mat2_grad.dtype), npu_mat2_grad)
+
+    def test_matmul_backward_shape_format_fp16_case1(self, device):
+        shape_format = [
+            # mat1 1dim, mat2 1dim       
+            [[np.float16, 2, [5]], [np.float16, 2, [5]]],
+            [[np.float16, 2, [2560]], [np.float16, 2, [2560]]],
+        ]
+        self.matmul_backward_result(shape_format)
+
+    # 暂不支持
+    # def test_matmul_backward_shape_format_fp16_case2(self, device):
+    #     shape_format = [  # mat1 2dim, mat2 1dim       
+    #         [[np.float16, 2, [3,5]], [np.float16, 2, [5]]],
+    #         [[np.float16, 2, [2560,4680]], [np.float16, 2, [4680]]],
+    #         [[np.float16, 2, [100,200]], [np.float16, 2, [200]]],
+    #         [[np.float16, 2, [4,4]], [np.float16, 2, [4]]],
+            
+    #     ]
+    #     self.matmul_backward_result(shape_format)
+    
+    def test_matmul_backward_shape_format_fp16_case3(self, device):
+        shape_format = [
+            # mat1 1dim, mat2 2dim       
+            [[np.float16, 2, [5]], [np.float16, 2, [5,6]]],
+            [[np.float16, 2, [2560]], [np.float16, 2, [2560,4680]]],
+            [[np.float16, 2, [5]], [np.float16, 2, [5,5]]],
+            
+        ]
+        self.matmul_backward_result(shape_format)
+        
+    def test_matmul_backward_shape_format_fp16_case4(self, device):
+        shape_format = [
+            # mat1 1dim, mat2 2dim       
+            [[np.float16, 2, [5,7]], [np.float16, 2, [7,10]]],
+            [[np.float16, 2, [3750,2560]], [np.float16, 2, [2560,4680]]],
+            [[np.float16, 2, [5,10]], [np.float16, 2, [10,20]]],
+        ]
+        self.matmul_backward_result(shape_format)
+    
+    def test_matmul_backward_shape_format_fp16_case5(self, device):
+        shape_format = [
+            # mat1 1dim, mat2 2dim       
+            [[np.float16, 2, [5,7,10]], [np.float16, 2, [10]]],
+            [[np.float16, 2, [168,3750,256]], [np.float16, 2, [256]]],
+            [[np.float16, 2, [4,5,10]], [np.float16, 2, [10]]],
+            #[[np.float16, 2, [5,10,20,30]], [np.float16, 2, [30]]],  # 该shape无法通过
+            #[[np.float16, 2, [20,30,40,50,60]], [np.float16, 2, [60]]], batch 三维 精度不行
+            [[np.float16, 2, [3,4,5,6,7,16]], [np.float16, 2, [16]]],
+        ]
+        self.matmul_backward_result(shape_format)
+        
+    def test_matmul_backward_shape_format_fp16_case6(self, device):
+        shape_format = [
+            # mat1 >2dim, mat2 2dim       
+            [[np.float16, 2, [5,7,10]], [np.float16, 2, [10,16]]],
+            #[[np.float16, 2, [5,10,20,30]], [np.float16, 2, [30,25]]], # 该shape无法过
+            [[np.float16, 2, [2,5,7,8,19,80]], [np.float16, 2, [80,32]]],
+        ]
+        self.matmul_backward_result(shape_format)
+
+    def test_matmul_backward_shape_format_fp16_case7(self, device):
+        shape_format = [
+            # mat1 1dim, mat2 >2dim       
+            [[np.float16, 2, [7]], [np.float16, 2, [5,7,10]]],
+            [[np.float16, 2, [5,]], [np.float16, 2, [4,5,10]]],
+            # [[np.float16, 2, [20]], [np.float16, 2, [5,10,20,30]]], # 该shape无法过
+            [[np.float16, 2, [7]], [np.float16, 2, [3,4,5,6,7,16]]],
+        ]
+        self.matmul_backward_result(shape_format)
+        
+    def test_matmul_backward_shape_format_fp16_case8(self, device):
+        shape_format = [
+            # mat1 2dim, mat2 >2dim       
+            [[np.float16, 2, [5,7]], [np.float16, 2, [5,7,10]]],
+            [[np.float16, 2, [12,5]], [np.float16, 2, [4,5,10]]],
+            # [[np.float16, 2, [44,20]], [np.float16, 2, [5,10,20,30]]], # 该shape无法过
+            # [[np.float16, 2, [75,50]], [np.float16, 2, [2,3,40,50,60]]], # 该shape无法过
+            [[np.float16, 2, [188,7]], [np.float16, 2, [3,4,5,6,7,16]]],
+        ]
+        self.matmul_backward_result(shape_format)
+        
+    def test_matmul_backward_shape_format_fp16_case9(self, device):
+        shape_format = [       
+            [[np.float16, 2, [5,7,10]], [np.float16, 2, [5,10,15]]],
+            [[np.float16, 2, [168,3750,256]], [np.float16, 2, [168,256,43]]],
+            # TODO(ascend): Insufficient precision
+            # 在两个输入shape不一致的情况下,会通过expand将两个tensor shape对齐。反向时expand的反向会调用sum(dim)，在fp16下与CPU比较不过。
+            # 但是结果与CUDA比对通过。所以只放开两个tensor batch部分一致的用例
+            # [[np.float16, 2, [1,6,7,65]], [np.float16, 2, [5,6,65,17]]],#该shape无法过
+            # [[np.float16, 2, [4,5,10,15]], [np.float16, 2, [5,15,20]]],
+            # [[np.float16, 2, [5,10,20,30]], [np.float16, 2, [1,30,40]]],
+            # [[np.float16, 2, [20,30,40,50,60]], [np.float16, 2, [40,60,6]]],
+            # [[np.float16, 2, [6,7,16]], [np.float16, 2, [4,5,6,16,17]]],
+            # [[np.float16, 2, [5,6,7,33]], [np.float16, 2, [12,23,5,6,33,17]]],
+            # [[np.float16, 2, [3,4,6,7,44]], [np.float16, 2, [2,3,4,6,44,17]]],
+            # [[np.float16, 2, [42,2,3,41]], [np.float16, 2, [1,2,42,2,41,17]]],
+        ]
+        self.matmul_backward_result(shape_format)
+        
+
+instantiate_device_type_tests(TestMatMul, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_max.py b/test/test_npu/test_network_ops/test_max.py
old mode 100644
new mode 100755
index af0171f837e2ad468d95d81bfcfcb213f1b5c292..64f19c4fb090a4cb8071c04a4ff6d0b361a1a4ed
--- a/test/test_npu/test_network_ops/test_max.py
+++ b/test/test_npu/test_network_ops/test_max.py
@@ -1,570 +1,570 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestMax(TestCase):
-    def cpu_op_exec(self, input1):
-        '''
-        调用算子  torch.max(input) → Tensor
-        '''
-        output = torch.max(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        '''
-        调用适配算子函数  Tensor max_npu(const Tensor& self)
-        '''
-        output = torch.max(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_other_exec(self, input1, input2):
-        '''
-        调用算子  torch.max(input, other, out=None) → Tensor
-        '''
-        output = torch.max(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_other_exec(self, input1, input2):
-        '''
-        适配算子函数  Tensor max_npu(const Tensor& self, const Tensor& other)
-        '''
-        output = torch.max(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, out):
-        torch.max(input1, input2, out=out)
-        output = out.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_dim_exec(self, input1, dim, keepdim):
-        '''
-        调用算子  torch.max(input, dim, keepdim=False, out=None) -> (Tensor, LongTensor)
-        '''
-        output1, output2 = torch.max(input1, dim, keepdim)
-        output1 = output1.numpy()
-        # 这里需要将索引从64位转32位 便于拿去与npu的对比
-        output2 = output2.int().numpy()  
-        return output1, output2
-
-    def npu_op_dim_exec(self, input1, dim, keepdim):
-        '''
-        适配算子函数  tuple<Tensor, Tensor> max_npu(const Tensor& self, int64_t dim, bool keepdim)
-        '''
-        output1, output2 = torch.max(input1, dim, keepdim)
-        output1 = output1.to("cpu")
-        output2 = output2.to("cpu")
-        output1 = output1.numpy()
-        output2 = output2.numpy()
-        return output1, output2
-    
-    def cpu_max_values_exec(self, input):
-        output = input.max()
-        output = output.numpy()
-        return output
-        
-    def npu_max_values_exec(self, input):
-        output = input.max()
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def _cpu_op_dim_exec(self, input1, dim, keepdim):
-        output1, output2 = torch._max(input1, dim, keepdim)
-        output1 = output1.numpy()
-        output2 = output2.numpy()
-        return output1, output2
-
-    def _npu_op_dim_exec(self, input1, dim, keepdim):
-        output1, output2 = torch._max(input1, dim, keepdim)
-        output1 = output1.to("cpu")
-        output2 = output2.to("cpu")
-        output1 = output1.numpy()
-        output2 = output2.numpy()
-        return output1, output2
-
-    def cpu_op_dim_exec_out(self, input1, dim, keepdim):
-        out = torch.tensor(0).to(input1.dtype)
-        indices = torch.tensor(0).to(torch.long)
-        torch.max(input1, dim=dim, keepdim=keepdim, out=(out,indices))
-        out = out.numpy()
-        indices = indices.numpy()
-        return out,indices
-
-    def npu_op_dim_exec_out(self, input1, dim, keepdim):
-        out = torch.tensor(0).to(input1.dtype).npu() 
-        indices = torch.tensor(0).to(torch.long).npu()
-        torch.max(input1, dim=dim, keepdim=keepdim, out=(out,indices))
-        out = out.to("cpu").numpy()
-        indices = indices.to("cpu").numpy()
-        return out,indices
-
-    def max_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def max_result_dim(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
-            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(npu_input1, item[1], item[2])
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-    
-    def _max_result_dim(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_dim, cpu_output_indices = self._cpu_op_dim_exec(cpu_input1, item[1], item[2])
-            npu_output_dim, npu_output_indices = self._npu_op_dim_exec(npu_input1, item[1], item[2])
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-
-    def max_result_other(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 10)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output_other = self.cpu_op_other_exec(cpu_input1, cpu_input2)
-            npu_output_other = self.npu_op_other_exec(npu_input1, npu_input2)
-
-            cpu_output_other = cpu_output_other.astype(npu_output_other.dtype)
-            self.assertRtolEqual(cpu_output_other, npu_output_other)
-
-    def max_out_result_other(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[0], -100, 100)
-            cpu_input4, npu_input4 = create_common_tensor(item[1], -100, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_other_exec(cpu_input1, cpu_input2)
-            npu_output_out1 = self.npu_op_other_exec(npu_input1, npu_input2)
-            npu_output_out2 = self.npu_op_exec_out(npu_input1, npu_input2, npu_input4)
-            cpu_output = cpu_output.astype(npu_output_out1.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output_out1)
-            self.assertRtolEqual(cpu_output, npu_output_out2)
-
-            cpu_out_dim, cpu_out_indices = self.cpu_op_dim_exec_out(cpu_input1, dim=0, keepdim=True)
-            npu_out_dim, npu_out_indices = self.npu_op_dim_exec_out(npu_input1, dim=0, keepdim=True)
-            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(npu_input1, dim=0, keepdim=True)
-            cpu_out_dim = cpu_out_dim.astype(npu_out_dim.dtype)
-            if npu_out_dim.dtype != np.float16:
-                self.assertRtolEqual(npu_out_dim, cpu_out_dim)
-                self.assertRtolEqual(npu_out_indices, cpu_out_indices)
-            else:
-                self.assertRtolEqual(npu_out_dim, npu_output_dim)
-                self.assertRtolEqual(npu_out_indices, npu_output_indices)
-    
-    # Npu and cpu have different logic to find the maximum value index. 
-    # The existence of two maximum values will cause the second output to be different. 
-    def max_name_result_other(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input1.names = item[0][3]
-            npu_input1.names = item[0][3]
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
-            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(cpu_input1, item[1], item[2])
-
-            if npu_output_dim.dtype != np.float16:
-                self.assertRtolEqual(npu_output_dim, cpu_output_dim)
-                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
-            else:
-                self.assertRtolEqual( npu_output_dim, cpu_output_dim.astype(np.float16))
-                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
-
-    # Npu and cpu have different logic to find the maximum value index. 
-    # The existence of two maximum values will cause the second output to be different. 
-    def max_name_out_result_other(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input1.names = item[0][3]
-            npu_input1.names = item[0][3]
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec_out(cpu_input1, item[1], item[2])
-            npu_output_dim, npu_output_indices = self.npu_op_dim_exec_out(npu_input1, item[1], item[2])
-
-            if npu_output_dim.dtype != np.float16:
-                self.assertRtolEqual(npu_output_dim, cpu_output_dim)
-                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
-            else:
-                self.assertRtolEqual( npu_output_dim, cpu_output_dim.astype(np.float16))
-                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
-                
-    def max_values_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_max_values_exec(cpu_input1)
-            npu_output = self.npu_max_values_exec(npu_input1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)            
-
-    def test_max_out_result(self, device):
-        shape_format = [
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [128, 58, 28, 28]],  [np.float16, 0, [58, 58, 1, 1]]],
-            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [3, 3, 7, 7]]],
-            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.max_out_result_other(shape_format)
-
-    def test_max_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
-                        ]
-        self.max_result(shape_format)
-
-    def test_max_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result(shape_format)
-
-    def test_max_shape_format_fp16_2d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result(shape_format)
-
-    def test_max_shape_format_fp32_2d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result(shape_format)
-
-    def test_max_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result(shape_format)
-
-    def test_max_shape_format_fp32_3d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result(shape_format)
-
-    def test_max_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self.max_result(shape_format)
-
-    def test_max_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.max_result(shape_format)
-
-    # ---------------------------------------dim
-    def test_max_dim_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
-                        ]
-        self.max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self.max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self.max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp16_1d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
-                        ]
-        self._max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp32_1d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._max_result_dim(shape_format)
-    
-    #One-dimensional NZ to ND result is wrong, CCB has given a conclusion 
-    def test_max_dim_shape_format_fp16_2d_(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._max_result_dim(shape_format)
-    
-    #One-dimensional NZ to ND result is wrong, CCB has given a conclusion 
-    def test_max_dim_shape_format_fp32_2d_(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp16_3d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp32_3d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp16_4d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self._max_result_dim(shape_format)
-
-    def test_max_dim_shape_format_fp32_4d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self._max_result_dim(shape_format)
-
-    # -----------------------------other
-
-    def test_max_other_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
-                        ]
-        self.max_result_other(shape_format)
-
-    def test_max_other_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_other(shape_format)
-
-    def test_max_other_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_other(shape_format)
-
-    def test_max_other_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_other(shape_format)
-
-    def test_max_other_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_other(shape_format)
-
-    def test_max_other_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.max_result_other(shape_format)
-
-    def test_max_other_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self.max_result_other(shape_format)
-
-    def test_max_other_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.max_result_other(shape_format)
-    
-    def test_max_dimname_shape_format(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
-         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.max_name_result_other(shape_format)
-    
-    def test_max_dimname_shape_format_fp16(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
-         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.max_name_result_other(shape_format)
-    
-    def test_max_dimname_out_shape_format(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
-         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.max_name_out_result_other(shape_format)
-    
-    def test_max_dimname_out_shape_format_fp16(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
-         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.max_name_out_result_other(shape_format)
-
-    def test_max_values_shape_format(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]  
-        self.max_values_result(shape_format)
-
-instantiate_device_type_tests(TestMax, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestMax(TestCase):
+    def cpu_op_exec(self, input1):
+        '''
+        调用算子  torch.max(input) → Tensor
+        '''
+        output = torch.max(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        '''
+        调用适配算子函数  Tensor max_npu(const Tensor& self)
+        '''
+        output = torch.max(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_other_exec(self, input1, input2):
+        '''
+        调用算子  torch.max(input, other, out=None) → Tensor
+        '''
+        output = torch.max(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_other_exec(self, input1, input2):
+        '''
+        适配算子函数  Tensor max_npu(const Tensor& self, const Tensor& other)
+        '''
+        output = torch.max(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, out):
+        torch.max(input1, input2, out=out)
+        output = out.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_dim_exec(self, input1, dim, keepdim):
+        '''
+        调用算子  torch.max(input, dim, keepdim=False, out=None) -> (Tensor, LongTensor)
+        '''
+        output1, output2 = torch.max(input1, dim, keepdim)
+        output1 = output1.numpy()
+        # 这里需要将索引从64位转32位 便于拿去与npu的对比
+        output2 = output2.int().numpy()  
+        return output1, output2
+
+    def npu_op_dim_exec(self, input1, dim, keepdim):
+        '''
+        适配算子函数  tuple<Tensor, Tensor> max_npu(const Tensor& self, int64_t dim, bool keepdim)
+        '''
+        output1, output2 = torch.max(input1, dim, keepdim)
+        output1 = output1.to("cpu")
+        output2 = output2.to("cpu")
+        output1 = output1.numpy()
+        output2 = output2.numpy()
+        return output1, output2
+    
+    def cpu_max_values_exec(self, input):
+        output = input.max()
+        output = output.numpy()
+        return output
+        
+    def npu_max_values_exec(self, input):
+        output = input.max()
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def _cpu_op_dim_exec(self, input1, dim, keepdim):
+        output1, output2 = torch._max(input1, dim, keepdim)
+        output1 = output1.numpy()
+        output2 = output2.numpy()
+        return output1, output2
+
+    def _npu_op_dim_exec(self, input1, dim, keepdim):
+        output1, output2 = torch._max(input1, dim, keepdim)
+        output1 = output1.to("cpu")
+        output2 = output2.to("cpu")
+        output1 = output1.numpy()
+        output2 = output2.numpy()
+        return output1, output2
+
+    def cpu_op_dim_exec_out(self, input1, dim, keepdim):
+        out = torch.tensor(0).to(input1.dtype)
+        indices = torch.tensor(0).to(torch.long)
+        torch.max(input1, dim=dim, keepdim=keepdim, out=(out,indices))
+        out = out.numpy()
+        indices = indices.numpy()
+        return out,indices
+
+    def npu_op_dim_exec_out(self, input1, dim, keepdim):
+        out = torch.tensor(0).to(input1.dtype).npu() 
+        indices = torch.tensor(0).to(torch.long).npu()
+        torch.max(input1, dim=dim, keepdim=keepdim, out=(out,indices))
+        out = out.to("cpu").numpy()
+        indices = indices.to("cpu").numpy()
+        return out,indices
+
+    def max_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def max_result_dim(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
+            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(npu_input1, item[1], item[2])
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+    
+    def _max_result_dim(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_dim, cpu_output_indices = self._cpu_op_dim_exec(cpu_input1, item[1], item[2])
+            npu_output_dim, npu_output_indices = self._npu_op_dim_exec(npu_input1, item[1], item[2])
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+
+    def max_result_other(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 10)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output_other = self.cpu_op_other_exec(cpu_input1, cpu_input2)
+            npu_output_other = self.npu_op_other_exec(npu_input1, npu_input2)
+
+            cpu_output_other = cpu_output_other.astype(npu_output_other.dtype)
+            self.assertRtolEqual(cpu_output_other, npu_output_other)
+
+    def max_out_result_other(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[0], -100, 100)
+            cpu_input4, npu_input4 = create_common_tensor(item[1], -100, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_other_exec(cpu_input1, cpu_input2)
+            npu_output_out1 = self.npu_op_other_exec(npu_input1, npu_input2)
+            npu_output_out2 = self.npu_op_exec_out(npu_input1, npu_input2, npu_input4)
+            cpu_output = cpu_output.astype(npu_output_out1.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output_out1)
+            self.assertRtolEqual(cpu_output, npu_output_out2)
+
+            cpu_out_dim, cpu_out_indices = self.cpu_op_dim_exec_out(cpu_input1, dim=0, keepdim=True)
+            npu_out_dim, npu_out_indices = self.npu_op_dim_exec_out(npu_input1, dim=0, keepdim=True)
+            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(npu_input1, dim=0, keepdim=True)
+            cpu_out_dim = cpu_out_dim.astype(npu_out_dim.dtype)
+            if npu_out_dim.dtype != np.float16:
+                self.assertRtolEqual(npu_out_dim, cpu_out_dim)
+                self.assertRtolEqual(npu_out_indices, cpu_out_indices)
+            else:
+                self.assertRtolEqual(npu_out_dim, npu_output_dim)
+                self.assertRtolEqual(npu_out_indices, npu_output_indices)
+    
+    # Npu and cpu have different logic to find the maximum value index. 
+    # The existence of two maximum values will cause the second output to be different. 
+    def max_name_result_other(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input1.names = item[0][3]
+            npu_input1.names = item[0][3]
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
+            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(cpu_input1, item[1], item[2])
+
+            if npu_output_dim.dtype != np.float16:
+                self.assertRtolEqual(npu_output_dim, cpu_output_dim)
+                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
+            else:
+                self.assertRtolEqual( npu_output_dim, cpu_output_dim.astype(np.float16))
+                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
+
+    # Npu and cpu have different logic to find the maximum value index. 
+    # The existence of two maximum values will cause the second output to be different. 
+    def max_name_out_result_other(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input1.names = item[0][3]
+            npu_input1.names = item[0][3]
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec_out(cpu_input1, item[1], item[2])
+            npu_output_dim, npu_output_indices = self.npu_op_dim_exec_out(npu_input1, item[1], item[2])
+
+            if npu_output_dim.dtype != np.float16:
+                self.assertRtolEqual(npu_output_dim, cpu_output_dim)
+                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
+            else:
+                self.assertRtolEqual( npu_output_dim, cpu_output_dim.astype(np.float16))
+                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
+                
+    def max_values_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_max_values_exec(cpu_input1)
+            npu_output = self.npu_max_values_exec(npu_input1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)            
+
+    def test_max_out_result(self, device):
+        shape_format = [
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 58, 28, 28]],  [np.float16, 0, [58, 58, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [3, 3, 7, 7]]],
+            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.max_out_result_other(shape_format)
+
+    def test_max_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
+                        ]
+        self.max_result(shape_format)
+
+    def test_max_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result(shape_format)
+
+    def test_max_shape_format_fp16_2d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result(shape_format)
+
+    def test_max_shape_format_fp32_2d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result(shape_format)
+
+    def test_max_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result(shape_format)
+
+    def test_max_shape_format_fp32_3d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result(shape_format)
+
+    def test_max_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self.max_result(shape_format)
+
+    def test_max_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.max_result(shape_format)
+
+    # ---------------------------------------dim
+    def test_max_dim_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
+                        ]
+        self.max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self.max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self.max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp16_1d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
+                        ]
+        self._max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp32_1d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._max_result_dim(shape_format)
+    
+    #One-dimensional NZ to ND result is wrong, CCB has given a conclusion 
+    def test_max_dim_shape_format_fp16_2d_(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._max_result_dim(shape_format)
+    
+    #One-dimensional NZ to ND result is wrong, CCB has given a conclusion 
+    def test_max_dim_shape_format_fp32_2d_(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp16_3d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp32_3d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp16_4d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self._max_result_dim(shape_format)
+
+    def test_max_dim_shape_format_fp32_4d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self._max_result_dim(shape_format)
+
+    # -----------------------------other
+
+    def test_max_other_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
+                        ]
+        self.max_result_other(shape_format)
+
+    def test_max_other_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_other(shape_format)
+
+    def test_max_other_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_other(shape_format)
+
+    def test_max_other_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_other(shape_format)
+
+    def test_max_other_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_other(shape_format)
+
+    def test_max_other_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.max_result_other(shape_format)
+
+    def test_max_other_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self.max_result_other(shape_format)
+
+    def test_max_other_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.max_result_other(shape_format)
+    
+    def test_max_dimname_shape_format(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
+         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.max_name_result_other(shape_format)
+    
+    def test_max_dimname_shape_format_fp16(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
+         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.max_name_result_other(shape_format)
+    
+    def test_max_dimname_out_shape_format(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
+         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.max_name_out_result_other(shape_format)
+    
+    def test_max_dimname_out_shape_format_fp16(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
+         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.max_name_out_result_other(shape_format)
+
+    def test_max_values_shape_format(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]  
+        self.max_values_result(shape_format)
+
+instantiate_device_type_tests(TestMax, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_max_backward.py b/test/test_npu/test_network_ops/test_max_backward.py
index 280abdb341c8ec9deaa90941ac2f3fa399f3b5e3..ea5ff2c62ff1753637e694075d4bc8b9fe53f4b9 100644
--- a/test/test_npu/test_network_ops/test_max_backward.py
+++ b/test/test_npu/test_network_ops/test_max_backward.py
@@ -1,78 +1,78 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestMaxBackward(TestCase):
-    def cpu_op_exec(self, input):
-        input.requires_grad = True
-        output = input.max(0, False)
-        output[0].backward(torch.ones_like(output[0]))
-        output_grad = input.grad
-        output_grad = output_grad.detach().numpy()
-        output1 = output[0].detach().numpy()
-        output2 = output[1].detach().numpy()
-        return output_grad, output1, output2
-
-    def npu_op_exec(self, input):
-        input.requires_grad = True
-        output = input.max(0, False)
-        output[0].backward(torch.ones_like(output[0]))
-        output_grad = input.grad
-        output_grad = output_grad.to("cpu")
-        output_grad = output_grad.detach().numpy()
-        output1 = output[0].detach().cpu().numpy()
-        output2 = output[1].detach().cpu().numpy()
-        return output_grad, output1, output2
-
-    def test_avg_pool2d_backward_shape_format_fp32(self, device):
-        format_list = [-1]
-        shape_list = [(2,32,8,600,40),(2,32,16,300,40)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_output_grad, cpu_output1, cpu_output2= self.cpu_op_exec(cpu_input)
-            npu_output_grad, npu_output1, npu_output2 = self.npu_op_exec(npu_input)
-
-            self.assertRtolEqual(cpu_output1, npu_output1)
-            self.assertRtolEqual(cpu_output_grad, npu_output_grad)
-
-    def test_avg_pool2d_backward_shape_format_fp16(self, device):
-        format_list = [-1]
-        shape_list = [(2,32,8,600),(2,32,16,300,40)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output_grad, cpu_output1, cpu_output2= self.cpu_op_exec(cpu_input)
-            npu_output_grad, npu_output1, npu_output2 = self.npu_op_exec(npu_input)
-            cpu_output1 = cpu_output1.astype(npu_output1.dtype)
-            cpu_output_grad = cpu_output_grad.astype(npu_output_grad.dtype)
-            self.assertRtolEqual(cpu_output_grad, npu_output_grad)
-
-instantiate_device_type_tests(
-    TestMaxBackward,
-    globals(),
-    except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestMaxBackward(TestCase):
+    def cpu_op_exec(self, input):
+        input.requires_grad = True
+        output = input.max(0, False)
+        output[0].backward(torch.ones_like(output[0]))
+        output_grad = input.grad
+        output_grad = output_grad.detach().numpy()
+        output1 = output[0].detach().numpy()
+        output2 = output[1].detach().numpy()
+        return output_grad, output1, output2
+
+    def npu_op_exec(self, input):
+        input.requires_grad = True
+        output = input.max(0, False)
+        output[0].backward(torch.ones_like(output[0]))
+        output_grad = input.grad
+        output_grad = output_grad.to("cpu")
+        output_grad = output_grad.detach().numpy()
+        output1 = output[0].detach().cpu().numpy()
+        output2 = output[1].detach().cpu().numpy()
+        return output_grad, output1, output2
+
+    def test_avg_pool2d_backward_shape_format_fp32(self, device):
+        format_list = [-1]
+        shape_list = [(2,32,8,600,40),(2,32,16,300,40)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_output_grad, cpu_output1, cpu_output2= self.cpu_op_exec(cpu_input)
+            npu_output_grad, npu_output1, npu_output2 = self.npu_op_exec(npu_input)
+
+            self.assertRtolEqual(cpu_output1, npu_output1)
+            self.assertRtolEqual(cpu_output_grad, npu_output_grad)
+
+    def test_avg_pool2d_backward_shape_format_fp16(self, device):
+        format_list = [-1]
+        shape_list = [(2,32,8,600),(2,32,16,300,40)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output_grad, cpu_output1, cpu_output2= self.cpu_op_exec(cpu_input)
+            npu_output_grad, npu_output1, npu_output2 = self.npu_op_exec(npu_input)
+            cpu_output1 = cpu_output1.astype(npu_output1.dtype)
+            cpu_output_grad = cpu_output_grad.astype(npu_output_grad.dtype)
+            self.assertRtolEqual(cpu_output_grad, npu_output_grad)
+
+instantiate_device_type_tests(
+    TestMaxBackward,
+    globals(),
+    except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_min.py b/test/test_npu/test_network_ops/test_min.py
old mode 100644
new mode 100755
index badc9c3a2dd71a6ca45c28c1196a24b826d2404a..d34e8c8ca908e96e9ec87910f49d058ea7dfc7da
--- a/test/test_npu/test_network_ops/test_min.py
+++ b/test/test_npu/test_network_ops/test_min.py
@@ -1,574 +1,574 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import copy
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestMin(TestCase):
-    def cpu_op_exec(self, input1):
-        '''
-        调用算子  torch.min(input) → Tensor
-        '''
-        output = torch.min(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        '''
-        调用适配算子函数  Tensor min_npu(const Tensor& self)
-        '''
-        output = torch.min(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_other_exec(self, input1, input2):
-        '''
-        调用算子  torch.min(input, other, out=None) → Tensor
-        '''
-        output = torch.min(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_other_exec(self, input1, input2):
-        '''
-        适配算子函数  Tensor min_npu(const Tensor& self, const Tensor& other)
-        '''
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.min(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_other_exec_out(self, input1, input2, out):
-        torch.min(input1, input2, out=out)
-        output = out.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_dim_exec(self, input1, dim, keepdim):
-        '''
-        调用算子  torch.min(input, dim, keepdim=False, out=None) -> (Tensor, LongTensor)
-        '''
-        output1, output2 = torch.min(input1, dim, keepdim)
-        output1 = output1.numpy()
-        # 这里需要将索引从64位转32位 便于拿去与npu的对比
-        output2 = output2.int().numpy()  
-        return output1, output2
-
-    def npu_op_dim_exec(self, input1, dim, keepdim):
-        '''
-        适配算子函数  tuple<Tensor, Tensor> min_npu(const Tensor& self, int64_t dim, bool keepdim)
-        '''
-        input1 = input1.to("npu")
-        output1, output2 = torch.min(input1, dim, keepdim)
-        output1 = output1.to("cpu")
-        output2 = output2.to("cpu")
-        output1 = output1.numpy()
-        output2 = output2.numpy()
-        return output1, output2
-    
-    def _cpu_op_dim_exec(self, input1, dim, keepdim):
-        output1, output2 = torch._min(input1, dim, keepdim)
-        output1 = output1.numpy()
-        output2 = output2.numpy()
-        return output1, output2
-
-    def _npu_op_dim_exec(self, input1, dim, keepdim):
-        output1, output2 = torch._min(input1, dim, keepdim)
-        output1 = output1.to("cpu")
-        output2 = output2.to("cpu")
-        output1 = output1.numpy()
-        output2 = output2.numpy()
-        return output1, output2
-
-    def cpu_op_dim_exec_out(self, input1, dim, keepdim):
-        out = torch.tensor(0).to(input1.dtype)
-        indices = torch.tensor(0).to(torch.long)
-        torch.min(input1, dim=dim, keepdim=keepdim, out=(out,indices))
-        out = out.numpy()
-        indices = indices.numpy()
-        return out,indices
-
-    def npu_op_dim_exec_out(self, input1, dim, keepdim):
-        out = torch.tensor(0).to(input1.dtype).npu()
-        indices = torch.tensor(0).to(torch.long).npu()
-        torch.min(input1, dim=dim, keepdim=keepdim, out=(out,indices))
-        out = out.to("cpu").numpy()
-        indices = indices.to("cpu").numpy()
-        return out,indices
-        
-    def cpu_min_values_exec(self, input):
-        output = input.min()
-        output = output.numpy()
-        return output
-        
-    def npu_min_values_exec(self, input):
-        output = input.min()
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def min_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def min_result_dim(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
-            npu_output_dim, npu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-    
-    def _min_result_dim(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_dim, cpu_output_indices = self._cpu_op_dim_exec(cpu_input1, item[1], item[2])
-            npu_output_dim, npu_output_indices = self._npu_op_dim_exec(npu_input1, item[1], item[2])
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-
-    def min_result_other(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 10)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output_other = self.cpu_op_other_exec(cpu_input1, cpu_input2)
-            npu_output_other = self.npu_op_other_exec(npu_input1, npu_input2)
-            cpu_output_other = cpu_output_other.astype(npu_output_other.dtype)
-
-            self.assertRtolEqual(cpu_output_other, npu_output_other)
-
-    # Npu and cpu have different logic to find the maximum value index. 
-    # The existence of two maximum values will cause the second output to be different. 
-    def min_out_result_other(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[0], -100, 100)
-            cpu_input4, npu_input4 = create_common_tensor(item[1], -100, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_input2.dtype == torch.float16:
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_other_exec(cpu_input1, cpu_input2)
-            npu_output_out1 = self.npu_op_other_exec(npu_input1, npu_input2)
-            npu_output_out2 = self.npu_op_other_exec_out(npu_input1, npu_input2, npu_input4)
-            cpu_output = cpu_output.astype(npu_output_out1.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output_out1)
-            self.assertRtolEqual(cpu_output, npu_output_out2)
-
-            cpu_out_dim, cpu_out_indices = self.cpu_op_dim_exec_out(cpu_input1, dim=0, keepdim=True)
-            npu_out_dim, npu_out_indices = self.npu_op_dim_exec_out(npu_input1, dim=0, keepdim=True)
-            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(npu_input1, dim=0, keepdim=True)
-            cpu_out_dim = cpu_out_dim.astype(npu_out_dim.dtype)
-            if cpu_out_dim.dtype != np.float16:
-                self.assertRtolEqual(npu_out_dim, cpu_out_dim)
-                #self.assertRtolEqual(npu_out_indices, cpu_out_indices)
-            else:
-                self.assertRtolEqual(npu_out_dim, npu_output_dim)
-                #self.assertRtolEqual(npu_out_indices, npu_output_indices)
-
-    # Npu and cpu have different logic to find the minimum value index. 
-    # The existence of two minimum values will cause the second output to be different.    
-    def min_name_result_other(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input1.names = item[0][3]
-            npu_input1.names = item[0][3]
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
-            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(cpu_input1, item[1], item[2])
-
-            if npu_output_dim.dtype != np.float16:
-                self.assertRtolEqual(npu_output_dim, cpu_output_dim)
-                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
-            else:
-                self.assertRtolEqual( npu_output_dim, cpu_output_dim.astype(np.float16))
-                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
-
-    # Npu and cpu have different logic to find the minimum value index. 
-    # The existence of two minimum values will cause the second output to be different. 
-    def min_name_out_result_other(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input1.names = item[0][3]
-            npu_input1.names = item[0][3]
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec_out(cpu_input1, item[1], item[2])
-            npu_output_dim, npu_output_indices = self.npu_op_dim_exec_out(npu_input1, item[1], item[2])
-            
-            if npu_output_dim.dtype != np.float16:
-                self.assertRtolEqual(npu_output_dim, cpu_output_dim)
-                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
-            else:
-                self.assertRtolEqual( npu_output_dim, cpu_output_dim.astype(np.float16))
-                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
-
-    def min_values_result(self, shape_format):
-        for item in shape_format:
-            print(item)
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_min_values_exec(cpu_input1)
-            npu_output = self.npu_min_values_exec(npu_input1)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            
-    def test_min_out_result(self, device):
-        shape_format = [
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [128, 58, 28, 28]],  [np.float16, 0, [58, 58, 1, 1]]],
-            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
-            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
-            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [3, 3, 7, 7]]],
-            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
-            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
-        ]
-        self.min_out_result_other(shape_format)
-
-    def test_min_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
-                        ]
-        self.min_result(shape_format)
-
-    def test_min_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result(shape_format)
-
-    def test_min_shape_format_fp16_2d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result(shape_format)
-
-    def test_min_shape_format_fp32_2d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result(shape_format)
-
-    def test_min_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result(shape_format)
-
-    def test_min_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result(shape_format)
-
-    def test_min_shape_format_fp16_4d(self, device):
-        format_list = [0, 4, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result(shape_format)
-
-    def test_min_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result(shape_format)
-
-    # ---------------------------------------dim
-    def test_min_dim_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
-                        ]
-        self.min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_dim(shape_format)
-
-    #One-dimensional NZ to ND result is wrong, CCB has given a conclusion    
-    def test_min_dim_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_dim(shape_format)
-
-    #One-dimensional NZ to ND result is wrong, CCB has given a conclusion
-    def test_min_dim_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp16_4d(self, device):
-        format_list = [0, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self.min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self.min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp16_1d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
-                        ]
-        self._min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp32_1d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp16_2d_(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp32_2d_(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp16_3d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp32_3d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self._min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp16_4d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self._min_result_dim(shape_format)
-
-    def test_min_dim_shape_format_fp32_4d_(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self._min_result_dim(shape_format)
-
-    # -----------------------------other
-
-    def test_min_other_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
-                        ]
-        self.min_result_other(shape_format)
-
-    def test_min_other_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 4]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_other(shape_format)
-
-    def test_min_other_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_other(shape_format)
-
-    def test_min_other_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_other(shape_format)
-
-    def test_min_other_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_other(shape_format)
-
-    def test_min_other_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
-                        keepdim_list
-                        ]
-        self.min_result_other(shape_format)
-
-    def test_min_other_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self.min_result_other(shape_format)
-
-    def test_min_other_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
-                        in keepdim_list
-                        ]
-        self.min_result_other(shape_format)
-    
-    def test_min_dimname_shape_format(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
-         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.min_name_result_other(shape_format)
-    
-    def test_min_dimname_shape_format_fp16(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
-         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.min_name_result_other(shape_format)
-    
-    def test_min_dimname_out_shape_format(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
-         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.min_name_out_result_other(shape_format)
-    
-    def test_min_dimname_out_shape_format_fp16(self, device):
-        format_list = [0, 3, 4, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
-         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
-                        in
-                        keepdim_list
-                        ]
-        self.min_name_out_result_other(shape_format)
-
-    def test_min_values_shape_format(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
-                        keepdim_list
-                        ]  
-        self.min_values_result(shape_format) 
-
-instantiate_device_type_tests(TestMin, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import copy
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestMin(TestCase):
+    def cpu_op_exec(self, input1):
+        '''
+        调用算子  torch.min(input) → Tensor
+        '''
+        output = torch.min(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        '''
+        调用适配算子函数  Tensor min_npu(const Tensor& self)
+        '''
+        output = torch.min(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_other_exec(self, input1, input2):
+        '''
+        调用算子  torch.min(input, other, out=None) → Tensor
+        '''
+        output = torch.min(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_other_exec(self, input1, input2):
+        '''
+        适配算子函数  Tensor min_npu(const Tensor& self, const Tensor& other)
+        '''
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.min(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_other_exec_out(self, input1, input2, out):
+        torch.min(input1, input2, out=out)
+        output = out.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_dim_exec(self, input1, dim, keepdim):
+        '''
+        调用算子  torch.min(input, dim, keepdim=False, out=None) -> (Tensor, LongTensor)
+        '''
+        output1, output2 = torch.min(input1, dim, keepdim)
+        output1 = output1.numpy()
+        # 这里需要将索引从64位转32位 便于拿去与npu的对比
+        output2 = output2.int().numpy()  
+        return output1, output2
+
+    def npu_op_dim_exec(self, input1, dim, keepdim):
+        '''
+        适配算子函数  tuple<Tensor, Tensor> min_npu(const Tensor& self, int64_t dim, bool keepdim)
+        '''
+        input1 = input1.to("npu")
+        output1, output2 = torch.min(input1, dim, keepdim)
+        output1 = output1.to("cpu")
+        output2 = output2.to("cpu")
+        output1 = output1.numpy()
+        output2 = output2.numpy()
+        return output1, output2
+    
+    def _cpu_op_dim_exec(self, input1, dim, keepdim):
+        output1, output2 = torch._min(input1, dim, keepdim)
+        output1 = output1.numpy()
+        output2 = output2.numpy()
+        return output1, output2
+
+    def _npu_op_dim_exec(self, input1, dim, keepdim):
+        output1, output2 = torch._min(input1, dim, keepdim)
+        output1 = output1.to("cpu")
+        output2 = output2.to("cpu")
+        output1 = output1.numpy()
+        output2 = output2.numpy()
+        return output1, output2
+
+    def cpu_op_dim_exec_out(self, input1, dim, keepdim):
+        out = torch.tensor(0).to(input1.dtype)
+        indices = torch.tensor(0).to(torch.long)
+        torch.min(input1, dim=dim, keepdim=keepdim, out=(out,indices))
+        out = out.numpy()
+        indices = indices.numpy()
+        return out,indices
+
+    def npu_op_dim_exec_out(self, input1, dim, keepdim):
+        out = torch.tensor(0).to(input1.dtype).npu()
+        indices = torch.tensor(0).to(torch.long).npu()
+        torch.min(input1, dim=dim, keepdim=keepdim, out=(out,indices))
+        out = out.to("cpu").numpy()
+        indices = indices.to("cpu").numpy()
+        return out,indices
+        
+    def cpu_min_values_exec(self, input):
+        output = input.min()
+        output = output.numpy()
+        return output
+        
+    def npu_min_values_exec(self, input):
+        output = input.min()
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def min_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def min_result_dim(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
+            npu_output_dim, npu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+    
+    def _min_result_dim(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_dim, cpu_output_indices = self._cpu_op_dim_exec(cpu_input1, item[1], item[2])
+            npu_output_dim, npu_output_indices = self._npu_op_dim_exec(npu_input1, item[1], item[2])
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+
+    def min_result_other(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 10)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output_other = self.cpu_op_other_exec(cpu_input1, cpu_input2)
+            npu_output_other = self.npu_op_other_exec(npu_input1, npu_input2)
+            cpu_output_other = cpu_output_other.astype(npu_output_other.dtype)
+
+            self.assertRtolEqual(cpu_output_other, npu_output_other)
+
+    # Npu and cpu have different logic to find the maximum value index. 
+    # The existence of two maximum values will cause the second output to be different. 
+    def min_out_result_other(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[0], -100, 100)
+            cpu_input4, npu_input4 = create_common_tensor(item[1], -100, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_input2.dtype == torch.float16:
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_other_exec(cpu_input1, cpu_input2)
+            npu_output_out1 = self.npu_op_other_exec(npu_input1, npu_input2)
+            npu_output_out2 = self.npu_op_other_exec_out(npu_input1, npu_input2, npu_input4)
+            cpu_output = cpu_output.astype(npu_output_out1.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output_out1)
+            self.assertRtolEqual(cpu_output, npu_output_out2)
+
+            cpu_out_dim, cpu_out_indices = self.cpu_op_dim_exec_out(cpu_input1, dim=0, keepdim=True)
+            npu_out_dim, npu_out_indices = self.npu_op_dim_exec_out(npu_input1, dim=0, keepdim=True)
+            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(npu_input1, dim=0, keepdim=True)
+            cpu_out_dim = cpu_out_dim.astype(npu_out_dim.dtype)
+            if cpu_out_dim.dtype != np.float16:
+                self.assertRtolEqual(npu_out_dim, cpu_out_dim)
+                #self.assertRtolEqual(npu_out_indices, cpu_out_indices)
+            else:
+                self.assertRtolEqual(npu_out_dim, npu_output_dim)
+                #self.assertRtolEqual(npu_out_indices, npu_output_indices)
+
+    # Npu and cpu have different logic to find the minimum value index. 
+    # The existence of two minimum values will cause the second output to be different.    
+    def min_name_result_other(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input1.names = item[0][3]
+            npu_input1.names = item[0][3]
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
+            npu_output_dim, npu_output_indices = self.npu_op_dim_exec(cpu_input1, item[1], item[2])
+
+            if npu_output_dim.dtype != np.float16:
+                self.assertRtolEqual(npu_output_dim, cpu_output_dim)
+                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
+            else:
+                self.assertRtolEqual( npu_output_dim, cpu_output_dim.astype(np.float16))
+                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
+
+    # Npu and cpu have different logic to find the minimum value index. 
+    # The existence of two minimum values will cause the second output to be different. 
+    def min_name_out_result_other(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input1.names = item[0][3]
+            npu_input1.names = item[0][3]
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_dim, cpu_output_indices = self.cpu_op_dim_exec_out(cpu_input1, item[1], item[2])
+            npu_output_dim, npu_output_indices = self.npu_op_dim_exec_out(npu_input1, item[1], item[2])
+            
+            if npu_output_dim.dtype != np.float16:
+                self.assertRtolEqual(npu_output_dim, cpu_output_dim)
+                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
+            else:
+                self.assertRtolEqual( npu_output_dim, cpu_output_dim.astype(np.float16))
+                #self.assertRtolEqual(npu_output_indices.astype(np.int32), cpu_output_indices.astype(np.int32))
+
+    def min_values_result(self, shape_format):
+        for item in shape_format:
+            print(item)
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_min_values_exec(cpu_input1)
+            npu_output = self.npu_min_values_exec(npu_input1)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+            
+    def test_min_out_result(self, device):
+        shape_format = [
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 58, 28, 28]],  [np.float16, 0, [58, 58, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3]]],
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [128, 116, 14, 14]]],
+            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 256, 3, 3]]],
+            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [3, 3, 7, 7]]],
+            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3]]],
+            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [128, 232, 7, 7]]],
+        ]
+        self.min_out_result_other(shape_format)
+
+    def test_min_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
+                        ]
+        self.min_result(shape_format)
+
+    def test_min_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result(shape_format)
+
+    def test_min_shape_format_fp16_2d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result(shape_format)
+
+    def test_min_shape_format_fp32_2d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result(shape_format)
+
+    def test_min_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result(shape_format)
+
+    def test_min_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result(shape_format)
+
+    def test_min_shape_format_fp16_4d(self, device):
+        format_list = [0, 4, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result(shape_format)
+
+    def test_min_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result(shape_format)
+
+    # ---------------------------------------dim
+    def test_min_dim_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
+                        ]
+        self.min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_dim(shape_format)
+
+    #One-dimensional NZ to ND result is wrong, CCB has given a conclusion    
+    def test_min_dim_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_dim(shape_format)
+
+    #One-dimensional NZ to ND result is wrong, CCB has given a conclusion
+    def test_min_dim_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp16_4d(self, device):
+        format_list = [0, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self.min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self.min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp16_1d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
+                        ]
+        self._min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp32_1d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp16_2d_(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp32_2d_(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp16_3d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp32_3d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self._min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp16_4d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self._min_result_dim(shape_format)
+
+    def test_min_dim_shape_format_fp32_4d_(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self._min_result_dim(shape_format)
+
+    # -----------------------------other
+
+    def test_min_other_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in keepdim_list
+                        ]
+        self.min_result_other(shape_format)
+
+    def test_min_other_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 4]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_other(shape_format)
+
+    def test_min_other_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_other(shape_format)
+
+    def test_min_other_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_other(shape_format)
+
+    def test_min_other_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_other(shape_format)
+
+    def test_min_other_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64]], np.random.randint(0, 3), j] for i in format_list for j in
+                        keepdim_list
+                        ]
+        self.min_result_other(shape_format)
+
+    def test_min_other_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self.min_result_other(shape_format)
+
+    def test_min_other_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34]], np.random.randint(0, 4), j] for i in format_list for j
+                        in keepdim_list
+                        ]
+        self.min_result_other(shape_format)
+    
+    def test_min_dimname_shape_format(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
+         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.min_name_result_other(shape_format)
+    
+    def test_min_dimname_shape_format_fp16(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
+         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.min_name_result_other(shape_format)
+    
+    def test_min_dimname_out_shape_format(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
+         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.min_name_out_result_other(shape_format)
+    
+    def test_min_dimname_out_shape_format_fp16(self, device):
+        format_list = [0, 3, 4, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256, 64, 34], ('N', 'C', 'H', 'W')],
+         np.random.choice(['N', 'C', 'H', 'W']), j] for i in format_list for j
+                        in
+                        keepdim_list
+                        ]
+        self.min_name_out_result_other(shape_format)
+
+    def test_min_values_shape_format(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list for j in
+                        keepdim_list
+                        ]  
+        self.min_values_result(shape_format) 
+
+instantiate_device_type_tests(TestMin, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_mish.py b/test/test_npu/test_network_ops/test_mish.py
index a79f7c426fc9edc3a6a0b01268aaca4430b659f6..6bb41262a391d45a7132526d21b9be76e388e4f0 100644
--- a/test/test_npu/test_network_ops/test_mish.py
+++ b/test/test_npu/test_network_ops/test_mish.py
@@ -1,59 +1,59 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn.functional as F
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestMish(TestCase):
-    def npu_op_exec(self, input1):
-        output = torch.npu_mish(input1)
-        output = output.cpu().numpy()
-        return output
-
-    def cpu_op_exec(self, input1):
-        output = input1 * (torch.tanh(F.softplus(input1)))
-        output = output.numpy()
-        return output
-
-    def test_mish_fp32(self, device):
-        shape_format = [
-            [[np.float32, -1, [10,30,10]]],
-            [[np.float32, -1, [20,30,20]]],
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_mish_fp16(self, device):
-        shape_format = [
-            [[np.float16, -1, [10,30,10]]],
-            [[np.float16, -1, [20,30,20]]],
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input.float()).astype(np.float16)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestMish, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestMish(TestCase):
+    def npu_op_exec(self, input1):
+        output = torch.npu_mish(input1)
+        output = output.cpu().numpy()
+        return output
+
+    def cpu_op_exec(self, input1):
+        output = input1 * (torch.tanh(F.softplus(input1)))
+        output = output.numpy()
+        return output
+
+    def test_mish_fp32(self, device):
+        shape_format = [
+            [[np.float32, -1, [10,30,10]]],
+            [[np.float32, -1, [20,30,20]]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_mish_fp16(self, device):
+        shape_format = [
+            [[np.float16, -1, [10,30,10]]],
+            [[np.float16, -1, [20,30,20]]],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input.float()).astype(np.float16)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestMish, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_mish_backward.py b/test/test_npu/test_network_ops/test_mish_backward.py
index 1240cb55b3ac30f375b59ba3cca882b1a2a0fd6d..5656539db1c0b7c42fd6a09e9964e45a50febc49 100644
--- a/test/test_npu/test_network_ops/test_mish_backward.py
+++ b/test/test_npu/test_network_ops/test_mish_backward.py
@@ -1,55 +1,55 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn.functional as F
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestMishBackward(TestCase):
-    def npu_op_exec(self, input1):
-        input1.requires_grad = True
-        output = torch.npu_mish(input1)
-        output.backward(torch.ones_like(output))
-        output_grad = input1.grad
-        output_grad = output_grad.to("cpu")
-        output_grad = output_grad.detach().numpy()
-        output = output.cpu().detach().numpy()
-        return output_grad, output
-    
-    def cpu_op_exec(self, input1):
-        input1.requires_grad = True
-        output = input1 * (torch.tanh(F.softplus(input1)))
-        output.backward(torch.ones_like(output))
-        output_grad = input1.grad
-        output_grad = output_grad.to("cpu")
-        output_grad = output_grad.detach().numpy()
-        output = output.detach().numpy()
-        return output_grad, output
-
-    def test_mish_fp32(self, device):
-        npu_input = torch.tensor([1.,2.,3.,4.,5.,6.,7.,8.,9.,10.]).npu()
-        cpu_input = torch.tensor([1.,2.,3.,4.,5.,6.,7.,8.,9.,10.])
-        output_grad, npu_output = self.npu_op_exec(npu_input)
-        ep_output_grad, ep_npu_output = self.cpu_op_exec(cpu_input)
-        self.assertRtolEqual(ep_output_grad, output_grad)
-        self.assertRtolEqual(ep_npu_output, npu_output)
-
-instantiate_device_type_tests(TestMishBackward, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestMishBackward(TestCase):
+    def npu_op_exec(self, input1):
+        input1.requires_grad = True
+        output = torch.npu_mish(input1)
+        output.backward(torch.ones_like(output))
+        output_grad = input1.grad
+        output_grad = output_grad.to("cpu")
+        output_grad = output_grad.detach().numpy()
+        output = output.cpu().detach().numpy()
+        return output_grad, output
+    
+    def cpu_op_exec(self, input1):
+        input1.requires_grad = True
+        output = input1 * (torch.tanh(F.softplus(input1)))
+        output.backward(torch.ones_like(output))
+        output_grad = input1.grad
+        output_grad = output_grad.to("cpu")
+        output_grad = output_grad.detach().numpy()
+        output = output.detach().numpy()
+        return output_grad, output
+
+    def test_mish_fp32(self, device):
+        npu_input = torch.tensor([1.,2.,3.,4.,5.,6.,7.,8.,9.,10.]).npu()
+        cpu_input = torch.tensor([1.,2.,3.,4.,5.,6.,7.,8.,9.,10.])
+        output_grad, npu_output = self.npu_op_exec(npu_input)
+        ep_output_grad, ep_npu_output = self.cpu_op_exec(cpu_input)
+        self.assertRtolEqual(ep_output_grad, output_grad)
+        self.assertRtolEqual(ep_npu_output, npu_output)
+
+instantiate_device_type_tests(TestMishBackward, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_mm.py b/test/test_npu/test_network_ops/test_mm.py
old mode 100644
new mode 100755
index 8b55ddcbc5ed6be782ec1c0750771119d694a309..46a75f7c13ebb91fe23d18a153e46c321f59b355
--- a/test/test_npu/test_network_ops/test_mm.py
+++ b/test/test_npu/test_network_ops/test_mm.py
@@ -1,93 +1,93 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-class TestMatMul(TestCase):
-    def trans_tensor(self, mat1, mat2):
-        if mat1.size(1) == mat2.size(0):
-            return mat1, mat2
-        mat = mat1.t()
-        if mat.size(1) == mat2.size(0):
-            return mat, mat2
-        mat = mat2.t()
-        if mat1.size(1) == mat.size(0):
-            return mat1, mat
-        return mat1, mat2
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.mm(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.mm(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def mm_auto_list_exec(self, shape):
-        for item in shape:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_input_1, cpu_input_2 = self.trans_tensor(cpu_input1, cpu_input2)
-            npu_input_1, npu_input_2 = self.trans_tensor(npu_input1, npu_input2)
-            cpu_output = self.cpu_op_exec(cpu_input_1, cpu_input_2)
-            npu_output = self.npu_op_exec(npu_input_1, npu_input_2)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_muls_shape_format_fp16(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [[1024, 1000], [1000, 1024],
-                      [1024, 1024]]
-        shape_format1 = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        shape_format2 = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        shape_format = [
-            [i, j] for i in shape_format1 for j in shape_format2
-        ]
-        self.mm_auto_list_exec(shape_format)
-
-    def test_matmul_shape_format_fp32(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [[256, 1280], [1000, 1280],
-        ]
-        shape_format1 = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        shape_format2 = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        shape_format = [
-            [i, j] for i in shape_format1 for j in shape_format2
-        ]
-        self.mm_auto_list_exec(shape_format)
-
-instantiate_device_type_tests(TestMatMul, globals(), except_for="cpu")
-
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+class TestMatMul(TestCase):
+    def trans_tensor(self, mat1, mat2):
+        if mat1.size(1) == mat2.size(0):
+            return mat1, mat2
+        mat = mat1.t()
+        if mat.size(1) == mat2.size(0):
+            return mat, mat2
+        mat = mat2.t()
+        if mat1.size(1) == mat.size(0):
+            return mat1, mat
+        return mat1, mat2
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.mm(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.mm(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def mm_auto_list_exec(self, shape):
+        for item in shape:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_input_1, cpu_input_2 = self.trans_tensor(cpu_input1, cpu_input2)
+            npu_input_1, npu_input_2 = self.trans_tensor(npu_input1, npu_input2)
+            cpu_output = self.cpu_op_exec(cpu_input_1, cpu_input_2)
+            npu_output = self.npu_op_exec(npu_input_1, npu_input_2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_muls_shape_format_fp16(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [[1024, 1000], [1000, 1024],
+                      [1024, 1024]]
+        shape_format1 = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        shape_format2 = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        shape_format = [
+            [i, j] for i in shape_format1 for j in shape_format2
+        ]
+        self.mm_auto_list_exec(shape_format)
+
+    def test_matmul_shape_format_fp32(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [[256, 1280], [1000, 1280],
+        ]
+        shape_format1 = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        shape_format2 = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        shape_format = [
+            [i, j] for i in shape_format1 for j in shape_format2
+        ]
+        self.mm_auto_list_exec(shape_format)
+
+instantiate_device_type_tests(TestMatMul, globals(), except_for="cpu")
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_muls.py b/test/test_npu/test_network_ops/test_muls.py
old mode 100644
new mode 100755
diff --git a/test/test_npu/test_network_ops/test_mv.py b/test/test_npu/test_network_ops/test_mv.py
index a501b40b3b5991a07a801ed2ba9b9c7b4febd400..670839b67f0f32ee0948a34f3b0811bde7e4e307 100644
--- a/test/test_npu/test_network_ops/test_mv.py
+++ b/test/test_npu/test_network_ops/test_mv.py
@@ -1,72 +1,72 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestMv(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        cpu_output = torch.mv(input1, input2)
-        cpu_output = cpu_output.numpy()
-        return cpu_output
-
-    def npu_op_exec(self, input1, input2):
-        npu_output = torch.mv(input1, input2)
-        npu_output = npu_output.cpu()
-        npu_output = npu_output.numpy()
-        return npu_output
-
-    def npu_op_exec_out(self, input1, input2, output):
-        torch.mv(input1, input2, out=output)
-        output = output.cpu()
-        output = output.numpy()
-        return output
-
-    def test_mv_shape_format(self, device):
-        shape_format = [
-                [[np.float32, -1, (3, 3)], [np.float32, -1, (3)]],
-                [[np.float32, -1, (5, 8)], [np.float32, -1, (8)]],
-                [[np.float32, -1, (8, 9)], [np.float32, -1, (9)]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_mv_out_shape_format(self, device):
-        shape_format = [
-                [[np.float32, -1, (3, 3)], [np.float32, -1, (3)], [np.float32, -1, (3)]],
-                [[np.float32, -1, (5, 8)], [np.float32, -1, (8)], [np.float32, -1, (5)]],
-                [[np.float32, -1, (8, 9)], [np.float32, -1, (9)], [np.float32, -1, (8)]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[2], -100, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-
-instantiate_device_type_tests(TestMv, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestMv(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        cpu_output = torch.mv(input1, input2)
+        cpu_output = cpu_output.numpy()
+        return cpu_output
+
+    def npu_op_exec(self, input1, input2):
+        npu_output = torch.mv(input1, input2)
+        npu_output = npu_output.cpu()
+        npu_output = npu_output.numpy()
+        return npu_output
+
+    def npu_op_exec_out(self, input1, input2, output):
+        torch.mv(input1, input2, out=output)
+        output = output.cpu()
+        output = output.numpy()
+        return output
+
+    def test_mv_shape_format(self, device):
+        shape_format = [
+                [[np.float32, -1, (3, 3)], [np.float32, -1, (3)]],
+                [[np.float32, -1, (5, 8)], [np.float32, -1, (8)]],
+                [[np.float32, -1, (8, 9)], [np.float32, -1, (9)]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_mv_out_shape_format(self, device):
+        shape_format = [
+                [[np.float32, -1, (3, 3)], [np.float32, -1, (3)], [np.float32, -1, (3)]],
+                [[np.float32, -1, (5, 8)], [np.float32, -1, (8)], [np.float32, -1, (5)]],
+                [[np.float32, -1, (8, 9)], [np.float32, -1, (9)], [np.float32, -1, (8)]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[2], -100, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+
+instantiate_device_type_tests(TestMv, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_neg.py b/test/test_npu/test_network_ops/test_neg.py
old mode 100644
new mode 100755
index 2456670193398aaa3aace6b23615d552f25b4839..5ee20c1d7fd04ba8c3f0c0fe115d6c3c24501824
--- a/test/test_npu/test_network_ops/test_neg.py
+++ b/test/test_npu/test_network_ops/test_neg.py
@@ -1,139 +1,139 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestNeg(TestCase):
-    def cpu_op_exec(self, input1):
-        output = torch.neg(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.neg(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2):
-        torch.neg(input1, out=input2)
-        output = input2.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_inp_op_exec(self, input1):
-        torch.neg_(input1)
-        output = input1.numpy()
-        return output
-
-    def npu_inp_op_exec(self, input1):
-        torch.neg_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def neg_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-            cpu_input_inp, npu_input_inp = create_common_tensor(item[0], -100, 100)
-            if cpu_input_inp.dtype == torch.float16:
-                cpu_input_inp = cpu_input_inp.to(torch.float32)
-            cpu_output_inp = self.cpu_inp_op_exec(cpu_input_inp)
-            npu_output_inp = self.npu_inp_op_exec(npu_input_inp)
-            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def neg_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[1], -100, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output_out1 = self.npu_op_exec_out(npu_input1, npu_input2)
-            npu_output_out2 = self.npu_op_exec_out(npu_input1, npu_input3)
-            cpu_output = cpu_output.astype(npu_output_out1.dtype)
-            self.assertRtolEqual(cpu_output, npu_output_out1)
-            self.assertRtolEqual(cpu_output, npu_output_out2)
-
-    def test_neg_out_result(self, device):
-        shape_format = [
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
-            [[np.float16, 0, [128, 58, 28, 28]],  [np.float16, 0, [58, 58, 1, 1]]],
-            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3, 3]]],
-            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [116, 116, 1, 1]]],
-            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 128, 3, 3]]],
-            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [3, 3, 7, 7]]],
-            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3, 3]]],
-            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [232, 232, 1, 1]]],
-        ]
-        self.neg_out_result(shape_format)
-
-    def test_neg_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[[np.float16, i, [96]]] for i in format_list]
-        self.neg_result(shape_format)
-
-    def test_neg_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[[np.float32, i, [96]]] for i in format_list]
-        self.neg_result(shape_format)
-
-    def test_neg_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float16, i, [448, 1]]] for i in format_list]
-        self.neg_result(shape_format)
-
-    def test_neg_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float32, i, [448, 1]]] for i in format_list]
-        self.neg_result(shape_format)
-
-    def test_neg_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float16, i, [64, 24, 38]]] for i in format_list]
-        self.neg_result(shape_format)
-
-    def test_neg_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float32, i, [64, 24, 38]]] for i in format_list]
-        self.neg_result(shape_format)
-
-    def test_neg_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float16, i, [32, 3, 3, 3]]] for i in format_list]
-        self.neg_result(shape_format)
-
-    def test_neg_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float32, i, [32, 3, 3, 3]]] for i in format_list]
-        self.neg_result(shape_format)
-
-
-instantiate_device_type_tests(TestNeg, globals(), except_for="cpu")
-
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestNeg(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.neg(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.neg(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        torch.neg(input1, out=input2)
+        output = input2.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_inp_op_exec(self, input1):
+        torch.neg_(input1)
+        output = input1.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        torch.neg_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def neg_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+            cpu_input_inp, npu_input_inp = create_common_tensor(item[0], -100, 100)
+            if cpu_input_inp.dtype == torch.float16:
+                cpu_input_inp = cpu_input_inp.to(torch.float32)
+            cpu_output_inp = self.cpu_inp_op_exec(cpu_input_inp)
+            npu_output_inp = self.npu_inp_op_exec(npu_input_inp)
+            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def neg_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], -100, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[1], -100, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output_out1 = self.npu_op_exec_out(npu_input1, npu_input2)
+            npu_output_out2 = self.npu_op_exec_out(npu_input1, npu_input3)
+            cpu_output = cpu_output.astype(npu_output_out1.dtype)
+            self.assertRtolEqual(cpu_output, npu_output_out1)
+            self.assertRtolEqual(cpu_output, npu_output_out2)
+
+    def test_neg_out_result(self, device):
+        shape_format = [
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 58, 28, 28]],  [np.float16, 0, [58, 58, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3, 3]]],
+            [[np.float16, 0, [128, 116, 14, 14]], [np.float16, 0, [116, 116, 1, 1]]],
+            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 128, 3, 3]]],
+            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [3, 3, 7, 7]]],
+            [[np.float32, 0, [2, 3, 3, 3]],       [np.float32, 0, [3, 1, 3, 3]]],
+            [[np.float32, 0, [128, 232, 7, 7]],   [np.float32, 0, [232, 232, 1, 1]]],
+        ]
+        self.neg_out_result(shape_format)
+
+    def test_neg_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[[np.float16, i, [96]]] for i in format_list]
+        self.neg_result(shape_format)
+
+    def test_neg_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[[np.float32, i, [96]]] for i in format_list]
+        self.neg_result(shape_format)
+
+    def test_neg_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float16, i, [448, 1]]] for i in format_list]
+        self.neg_result(shape_format)
+
+    def test_neg_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float32, i, [448, 1]]] for i in format_list]
+        self.neg_result(shape_format)
+
+    def test_neg_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float16, i, [64, 24, 38]]] for i in format_list]
+        self.neg_result(shape_format)
+
+    def test_neg_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float32, i, [64, 24, 38]]] for i in format_list]
+        self.neg_result(shape_format)
+
+    def test_neg_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3, 3]]] for i in format_list]
+        self.neg_result(shape_format)
+
+    def test_neg_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float32, i, [32, 3, 3, 3]]] for i in format_list]
+        self.neg_result(shape_format)
+
+
+instantiate_device_type_tests(TestNeg, globals(), except_for="cpu")
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_new_full.py b/test/test_npu/test_network_ops/test_new_full.py
index ada9cbe2c43d36c739b35b73bb0920f9837449f9..2fdc8e7feb2db5dad81fe1ebd8e24003c139642f 100644
--- a/test/test_npu/test_network_ops/test_new_full.py
+++ b/test/test_npu/test_network_ops/test_new_full.py
@@ -1,56 +1,56 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestNewFull(TestCase):
-    def cpu_op_exec(self, input1, size, value):
-        output = input1.new_full(size, value)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, size, value):
-        output = input1.new_full(size,value)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_new_full_shape_format(self, device):
-        shape = [
-                [np.float32, 0, (4, 3)],
-                [np.float32, 4, (2, 3, 7)],
-                [np.float16, 0, (2, 3, 7)],
-        ]
-        size = [(2, 2), (1, 2)]
-        value = [-100, 0, 100]
-        
-        shape_format = [
-                [i, j, k] for i in shape for j in size for k in value
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2])
-            npu_output = self.npu_op_exec(npu_input1, item[1], item[2])
-            self.assertEqual(cpu_output.shape, npu_output.shape)
-
-
-instantiate_device_type_tests(TestNewFull, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestNewFull(TestCase):
+    def cpu_op_exec(self, input1, size, value):
+        output = input1.new_full(size, value)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, size, value):
+        output = input1.new_full(size,value)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_new_full_shape_format(self, device):
+        shape = [
+                [np.float32, 0, (4, 3)],
+                [np.float32, 4, (2, 3, 7)],
+                [np.float16, 0, (2, 3, 7)],
+        ]
+        size = [(2, 2), (1, 2)]
+        value = [-100, 0, 100]
+        
+        shape_format = [
+                [i, j, k] for i in shape for j in size for k in value
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2])
+            npu_output = self.npu_op_exec(npu_input1, item[1], item[2])
+            self.assertEqual(cpu_output.shape, npu_output.shape)
+
+
+instantiate_device_type_tests(TestNewFull, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_nllloss.py b/test/test_npu/test_network_ops/test_nllloss.py
old mode 100644
new mode 100755
index 9994116093bb4b300c074d829cb93276d8c7d5ae..170ce07f2cef7c05b427a60f9667ea6d3bafa383
--- a/test/test_npu/test_network_ops/test_nllloss.py
+++ b/test/test_npu/test_network_ops/test_nllloss.py
@@ -1,85 +1,85 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestNllloss(TestCase):
-    def cpu_op_exec_new(self, input1, target, reduction, ignore_index):
-        if not ignore_index:
-            ignore_index = -100 # 默认值
-        output = torch.nn.functional.nll_loss(input1, target, reduction=reduction, ignore_index=ignore_index)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_new(self, input1, target, reduction, ignore_index):
-        if not ignore_index:
-            ignore_index = -100 # 默认值
-        target = target.to(torch.int32)
-        target = target.to("npu")
-        output = torch.nn.functional.nll_loss(input1, target, reduction=reduction, ignore_index=ignore_index)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_nllloss_shape_format_fp32(self, device):
-        # 当前仅支持设置正数, 若np.sum(ignore_index == np_target) == 0,则ignore_index设置任意数值不影响
-        ignore_index = 1 
-        for reduction in ['mean', 'none', 'sum']:
-            shape_format = [
-                [[np.float32, 0, [256, 100]], [np.int32, 0, [256]], reduction, None],
-                [[np.float32, 3, [256, 100]], [np.int32, 0, [256]], reduction, ignore_index],
-                [[np.float32, 0, [4800, 3003]], [np.int32, 0, [4800]], reduction, ignore_index],
-                [[np.float32, 3, [4800, 3003]], [np.int32, 0, [4800]], reduction, ignore_index],
-                [[np.float32, 0, [4800, 3003]], [np.int32, 0, [4800]], reduction, None],
-                ]
-            for item in shape_format:
-                np_target = np.random.randint(0, item[0][2][1], (item[1][2])).astype(np.long)
-                target = torch.from_numpy(np_target)
-                cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-                cpu_output = self.cpu_op_exec_new(cpu_input1, target, item[2], item[3])
-                npu_output = self.npu_op_exec_new(npu_input1, target, item[2], item[3])
-                self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_nllloss_shape_format_fp16(self, device):
-        # 当前仅支持设置正数, 若np.sum(ignore_index == np_target) == 0,则ignore_index设置任意数值不影响
-        ignore_index = 1
-        for reduction in ['mean', 'none', 'sum']:
-            shape_format = [
-                [[np.float16, 0, [256, 100]], [np.int32, 0, [256]], reduction, ignore_index],
-                [[np.float16, 3, [256, 100]], [np.int32, 0, [256]], reduction, ignore_index],
-                [[np.float16, 0, [4800, 3003]], [np.int32, 0, [4800]], reduction, ignore_index],
-                [[np.float16, 3, [4800, 3003]], [np.int32, 0, [4800]], reduction, ignore_index],
-                [[np.float16, 0, [4800, 3003]], [np.int32, 0, [4800]], reduction, None],
-                ]
-            for item in shape_format:
-                np_target = np.random.uniform(0, item[0][2][1], (item[1][2])).astype(np.long)
-                target = torch.from_numpy(np_target)
-                cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_output = self.cpu_op_exec_new(cpu_input1, target, item[2], item[3])
-                npu_output = self.npu_op_exec_new(npu_input1, target, item[2], item[3])
-                cpu_output = cpu_output.astype(np.float16)
-                self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestNllloss, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestNllloss(TestCase):
+    def cpu_op_exec_new(self, input1, target, reduction, ignore_index):
+        if not ignore_index:
+            ignore_index = -100 # 默认值
+        output = torch.nn.functional.nll_loss(input1, target, reduction=reduction, ignore_index=ignore_index)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_new(self, input1, target, reduction, ignore_index):
+        if not ignore_index:
+            ignore_index = -100 # 默认值
+        target = target.to(torch.int32)
+        target = target.to("npu")
+        output = torch.nn.functional.nll_loss(input1, target, reduction=reduction, ignore_index=ignore_index)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_nllloss_shape_format_fp32(self, device):
+        # 当前仅支持设置正数, 若np.sum(ignore_index == np_target) == 0,则ignore_index设置任意数值不影响
+        ignore_index = 1 
+        for reduction in ['mean', 'none', 'sum']:
+            shape_format = [
+                [[np.float32, 0, [256, 100]], [np.int32, 0, [256]], reduction, None],
+                [[np.float32, 3, [256, 100]], [np.int32, 0, [256]], reduction, ignore_index],
+                [[np.float32, 0, [4800, 3003]], [np.int32, 0, [4800]], reduction, ignore_index],
+                [[np.float32, 3, [4800, 3003]], [np.int32, 0, [4800]], reduction, ignore_index],
+                [[np.float32, 0, [4800, 3003]], [np.int32, 0, [4800]], reduction, None],
+                ]
+            for item in shape_format:
+                np_target = np.random.randint(0, item[0][2][1], (item[1][2])).astype(np.long)
+                target = torch.from_numpy(np_target)
+                cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+                cpu_output = self.cpu_op_exec_new(cpu_input1, target, item[2], item[3])
+                npu_output = self.npu_op_exec_new(npu_input1, target, item[2], item[3])
+                self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_nllloss_shape_format_fp16(self, device):
+        # 当前仅支持设置正数, 若np.sum(ignore_index == np_target) == 0,则ignore_index设置任意数值不影响
+        ignore_index = 1
+        for reduction in ['mean', 'none', 'sum']:
+            shape_format = [
+                [[np.float16, 0, [256, 100]], [np.int32, 0, [256]], reduction, ignore_index],
+                [[np.float16, 3, [256, 100]], [np.int32, 0, [256]], reduction, ignore_index],
+                [[np.float16, 0, [4800, 3003]], [np.int32, 0, [4800]], reduction, ignore_index],
+                [[np.float16, 3, [4800, 3003]], [np.int32, 0, [4800]], reduction, ignore_index],
+                [[np.float16, 0, [4800, 3003]], [np.int32, 0, [4800]], reduction, None],
+                ]
+            for item in shape_format:
+                np_target = np.random.uniform(0, item[0][2][1], (item[1][2])).astype(np.long)
+                target = torch.from_numpy(np_target)
+                cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_output = self.cpu_op_exec_new(cpu_input1, target, item[2], item[3])
+                npu_output = self.npu_op_exec_new(npu_input1, target, item[2], item[3])
+                cpu_output = cpu_output.astype(np.float16)
+                self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestNllloss, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_nms_v4.py b/test/test_npu/test_network_ops/test_nms_v4.py
index 7737ced69345e038a07549c3d54ed8b5367ebac9..0714914dfb2aa87e210cc7625808bdabe4630012 100644
--- a/test/test_npu/test_network_ops/test_nms_v4.py
+++ b/test/test_npu/test_network_ops/test_nms_v4.py
@@ -1,41 +1,41 @@
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestNmsV4(TestCase):
-    def generate_data(self, min, max, shape, dtype):
-        input = np.random.uniform(min, max, shape).astype(dtype)
-        npu_input = torch.from_numpy(input)
-        return npu_input
-
-    def npu_op_exec(self, boxes, scores, max_output_size, iou_threshold, scores_threshold):
-        boxes            = boxes.to("npu")
-        scores           = scores.to("npu")
-        iou_threshold    = iou_threshold.to("npu")
-        scores_threshold = scores_threshold.to("npu")
-        npu_output = torch.npu_nms_v4(boxes, scores, max_output_size, iou_threshold, scores_threshold)
-        #npu_output = npu_output.to("cpu")
-        print("===npu_output===")
-        print(npu_output)
-        return npu_output
-
-
-    def test_nms_v4_float32(self, device):
-        boxes = self.generate_data(0, 100, (100, 4), np.float32)
-        scores = self.generate_data(0, 1, (100), np.float32)
-        max_output_size = 20
-        iou_threshold = torch.tensor(0.5)
-        scores_threshold = torch.tensor(0.3)
-
-        npu_output = self.npu_op_exec(boxes, scores, max_output_size, iou_threshold, scores_threshold)
-
-
-instantiate_device_type_tests(TestNmsV4, globals(), except_for='cpu')
-if __name__ == "__main__":
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestNmsV4(TestCase):
+    def generate_data(self, min, max, shape, dtype):
+        input = np.random.uniform(min, max, shape).astype(dtype)
+        npu_input = torch.from_numpy(input)
+        return npu_input
+
+    def npu_op_exec(self, boxes, scores, max_output_size, iou_threshold, scores_threshold):
+        boxes            = boxes.to("npu")
+        scores           = scores.to("npu")
+        iou_threshold    = iou_threshold.to("npu")
+        scores_threshold = scores_threshold.to("npu")
+        npu_output = torch.npu_nms_v4(boxes, scores, max_output_size, iou_threshold, scores_threshold)
+        #npu_output = npu_output.to("cpu")
+        print("===npu_output===")
+        print(npu_output)
+        return npu_output
+
+
+    def test_nms_v4_float32(self, device):
+        boxes = self.generate_data(0, 100, (100, 4), np.float32)
+        scores = self.generate_data(0, 1, (100), np.float32)
+        max_output_size = 20
+        iou_threshold = torch.tensor(0.5)
+        scores_threshold = torch.tensor(0.3)
+
+        npu_output = self.npu_op_exec(boxes, scores, max_output_size, iou_threshold, scores_threshold)
+
+
+instantiate_device_type_tests(TestNmsV4, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests() 
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_nms_with_mask.py b/test/test_npu/test_network_ops/test_nms_with_mask.py
index 39ee878cd77f69b10b2c882544839ecc3a4ef533..57b46884179a64babeca82db81d03ffd2db67b0c 100644
--- a/test/test_npu/test_network_ops/test_nms_with_mask.py
+++ b/test/test_npu/test_network_ops/test_nms_with_mask.py
@@ -1,52 +1,52 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestNmsWithMask(TestCase):
-    def npu_op_exec(self, input1, iou_threshold):
-        npu_output1, npu_output2, npu_output3, = torch.npu_nms_with_mask(input1, iou_threshold)
-        npu_output1 = npu_output1.to("cpu")
-        npu_output2 = npu_output2.to("cpu")
-        npu_output3 = npu_output3.to("cpu")
-
-        return npu_output1, npu_output2, npu_output3
-
-    def test_nms_with_mask_float32(self, device):
-        input1 = torch.tensor([[0.0, 1.0, 2.0, 3.0, 0.6], [6.0, 7.0, 8.0, 9.0, 0.4]]).npu()
-        iou_threshold = 0.5
-
-        eq_output1 = torch.tensor([[0.0000, 1.0000, 2.0000, 3.0000, 0.6001],
-                                   [6.0000, 7.0000, 8.0000, 9.0000, 0.3999]])
-        eq_output2 = torch.tensor([0, 1], dtype=torch.int32)
-        eq_output3 = torch.tensor([1, 1], dtype=torch.uint8)
-
-        npu_output1, npu_output2, npu_output3 = self.npu_op_exec(input1, iou_threshold)
-
-        self.assertRtolEqual(eq_output1, npu_output1)
-        self.assertRtolEqual(eq_output2, npu_output2)
-        self.assertRtolEqual(eq_output3, npu_output3)
-
-
-instantiate_device_type_tests(TestNmsWithMask, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestNmsWithMask(TestCase):
+    def npu_op_exec(self, input1, iou_threshold):
+        npu_output1, npu_output2, npu_output3, = torch.npu_nms_with_mask(input1, iou_threshold)
+        npu_output1 = npu_output1.to("cpu")
+        npu_output2 = npu_output2.to("cpu")
+        npu_output3 = npu_output3.to("cpu")
+
+        return npu_output1, npu_output2, npu_output3
+
+    def test_nms_with_mask_float32(self, device):
+        input1 = torch.tensor([[0.0, 1.0, 2.0, 3.0, 0.6], [6.0, 7.0, 8.0, 9.0, 0.4]]).npu()
+        iou_threshold = 0.5
+
+        eq_output1 = torch.tensor([[0.0000, 1.0000, 2.0000, 3.0000, 0.6001],
+                                   [6.0000, 7.0000, 8.0000, 9.0000, 0.3999]])
+        eq_output2 = torch.tensor([0, 1], dtype=torch.int32)
+        eq_output3 = torch.tensor([1, 1], dtype=torch.uint8)
+
+        npu_output1, npu_output2, npu_output3 = self.npu_op_exec(input1, iou_threshold)
+
+        self.assertRtolEqual(eq_output1, npu_output1)
+        self.assertRtolEqual(eq_output2, npu_output2)
+        self.assertRtolEqual(eq_output3, npu_output3)
+
+
+instantiate_device_type_tests(TestNmsWithMask, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests() 
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_norm.py b/test/test_npu/test_network_ops/test_norm.py
index ccbaf189c8ab7de6af8a4298313e28799053477c..9212c8e98622c047d1cb49ddd3328c5709ba30aa 100644
--- a/test/test_npu/test_network_ops/test_norm.py
+++ b/test/test_npu/test_network_ops/test_norm.py
@@ -1,80 +1,80 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import torch.nn.functional as F
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-import time
-
-class TestNorm(TestCase):
-    def norm_output_size(self, data, dimVal, keepdimVal):
-        output_size = list(data.size())
-        for i in dimVal:
-          if i < 0:
-            i = i + data.dim()
-          if i < data.dim() and keepdimVal == True:
-            output_size[i] = 1
-          if  i < data.dim() and keepdimVal == False:
-            output_size.pop(i)  
-        return output_size
-        
-    def cpu_dtype_out_exec(self, data, pVal, dimVal, keepdimVal, dtypeVal):
-        output_size = self.norm_output_size(data, dimVal, keepdimVal)
-        cpu_output = torch.randn(output_size)
-        torch.norm(data, p=pVal, dim = dimVal, keepdim = keepdimVal, out=cpu_output, dtype = dtypeVal)
-        return cpu_output.numpy()
-    
-    def npu_dtype_out_exec(self, data, pVal, dimVal, keepdimVal, dtypeVal):
-        output_size = self.norm_output_size(data, dimVal, keepdimVal)
-        npu_output = torch.randn(output_size).npu()
-        torch.norm(data, p=pVal, dim = dimVal, keepdim = keepdimVal, out=npu_output, dtype = dtypeVal)
-        return npu_output.cpu().numpy()
-        
-    def dtype_out_test(self, item):
-        cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-        cpu_out = self.cpu_dtype_out_exec(cpu_input, 2, [1,2], True, torch.float)
-        npu_out = self.npu_dtype_out_exec(npu_input, 2, [1,2], True, torch.float)
-        self.assertRtolEqual(cpu_out, npu_out)
-        
-        cpu_out = self.cpu_dtype_out_exec(cpu_input, 2, [1,2], False, torch.float)
-        npu_out = self.npu_dtype_out_exec(npu_input, 2, [1,2], False, torch.float)
-        self.assertRtolEqual(cpu_out, npu_out)
-        
-        cpu_out = self.cpu_dtype_out_exec(cpu_input, 1, [1,2], False, torch.float)
-        npu_out = self.npu_dtype_out_exec(npu_input, 1, [1,2], False, torch.float)
-        self.assertRtolEqual(cpu_out, npu_out)
-        
-        cpu_out = self.cpu_dtype_out_exec(cpu_input, 3, [1,2], False, torch.float)
-        npu_out = self.npu_dtype_out_exec(npu_input, 3, [1,2], False, torch.float)
-        self.assertRtolEqual(cpu_out, npu_out)
-        
-        cpu_out = self.cpu_dtype_out_exec(cpu_input, float("-inf"), [1,2], False, torch.float)
-        npu_out = self.npu_dtype_out_exec(npu_input, float("-inf"), [1,2], False, torch.float)
-        self.assertRtolEqual(cpu_out, npu_out)
-
-    def test_norm_shape_format(self, device):
-        shape_format = [
-                        [[np.float32, 0, (64, 64, 64, 64)]],
-                        ]
-
-        for item in shape_format:
-            # norm.dtype_out
-            self.dtype_out_test(item)
-
-instantiate_device_type_tests(TestNorm, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+import time
+
+class TestNorm(TestCase):
+    def norm_output_size(self, data, dimVal, keepdimVal):
+        output_size = list(data.size())
+        for i in dimVal:
+          if i < 0:
+            i = i + data.dim()
+          if i < data.dim() and keepdimVal == True:
+            output_size[i] = 1
+          if  i < data.dim() and keepdimVal == False:
+            output_size.pop(i)  
+        return output_size
+        
+    def cpu_dtype_out_exec(self, data, pVal, dimVal, keepdimVal, dtypeVal):
+        output_size = self.norm_output_size(data, dimVal, keepdimVal)
+        cpu_output = torch.randn(output_size)
+        torch.norm(data, p=pVal, dim = dimVal, keepdim = keepdimVal, out=cpu_output, dtype = dtypeVal)
+        return cpu_output.numpy()
+    
+    def npu_dtype_out_exec(self, data, pVal, dimVal, keepdimVal, dtypeVal):
+        output_size = self.norm_output_size(data, dimVal, keepdimVal)
+        npu_output = torch.randn(output_size).npu()
+        torch.norm(data, p=pVal, dim = dimVal, keepdim = keepdimVal, out=npu_output, dtype = dtypeVal)
+        return npu_output.cpu().numpy()
+        
+    def dtype_out_test(self, item):
+        cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+        cpu_out = self.cpu_dtype_out_exec(cpu_input, 2, [1,2], True, torch.float)
+        npu_out = self.npu_dtype_out_exec(npu_input, 2, [1,2], True, torch.float)
+        self.assertRtolEqual(cpu_out, npu_out)
+        
+        cpu_out = self.cpu_dtype_out_exec(cpu_input, 2, [1,2], False, torch.float)
+        npu_out = self.npu_dtype_out_exec(npu_input, 2, [1,2], False, torch.float)
+        self.assertRtolEqual(cpu_out, npu_out)
+        
+        cpu_out = self.cpu_dtype_out_exec(cpu_input, 1, [1,2], False, torch.float)
+        npu_out = self.npu_dtype_out_exec(npu_input, 1, [1,2], False, torch.float)
+        self.assertRtolEqual(cpu_out, npu_out)
+        
+        cpu_out = self.cpu_dtype_out_exec(cpu_input, 3, [1,2], False, torch.float)
+        npu_out = self.npu_dtype_out_exec(npu_input, 3, [1,2], False, torch.float)
+        self.assertRtolEqual(cpu_out, npu_out)
+        
+        cpu_out = self.cpu_dtype_out_exec(cpu_input, float("-inf"), [1,2], False, torch.float)
+        npu_out = self.npu_dtype_out_exec(npu_input, float("-inf"), [1,2], False, torch.float)
+        self.assertRtolEqual(cpu_out, npu_out)
+
+    def test_norm_shape_format(self, device):
+        shape_format = [
+                        [[np.float32, 0, (64, 64, 64, 64)]],
+                        ]
+
+        for item in shape_format:
+            # norm.dtype_out
+            self.dtype_out_test(item)
+
+instantiate_device_type_tests(TestNorm, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_norm_ext.py b/test/test_npu/test_network_ops/test_norm_ext.py
index 8e5e51224cf3493e38b3b1e0fb40300874c678eb..0b8b13c90c0535e77816770d21d937423e09d05c 100644
--- a/test/test_npu/test_network_ops/test_norm_ext.py
+++ b/test/test_npu/test_network_ops/test_norm_ext.py
@@ -1,130 +1,130 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestNorm(TestCase):
-    def norm_output_size(self, input, dim, keepdim):
-        output_size = list(input.size())
-        for i in dim:
-          if i < 0:
-            i = i + input.dim()
-          if i < input.dim() and keepdim == True:
-            output_size[i] = 1
-          if  i < input.dim() and keepdim == False:
-            output_size.pop(i)  
-        return output_size
-
-    def cpu_out_exec(self, input, p1, dim1, keepdim1, dtype1):
-        output_size = self.norm_output_size(input, dim1, keepdim1)
-        cpu_out = torch.randn(output_size)
-        output = torch.norm(input, p = p1, dim = dim1 , keepdim = keepdim1, out = cpu_out, dtype = dtype1)
-        return output
-    
-    def npu_out_exec(self, input, p1, dim1, keepdim1, dtype1):
-        output_size = self.norm_output_size(input, dim1, keepdim1)
-        npu_out = torch.randn(output_size).npu()
-        output1 = torch.norm(input, p = p1, dim = dim1 , keepdim = keepdim1, out = npu_out, dtype = dtype1)
-        output = output1.to("cpu")
-        return output
-        
-    def test_norm_shape_format_0(self, device):
-        shape_format = [
-                [[np.float16, 0, (1)]],
-                [[np.float32, 0, (1)]],
-        ] 
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_out_exec(cpu_input, 0, [0], True, torch.float)
-            npu_output = self.npu_out_exec(npu_input, 0, [0], True, torch.float)
-            cpu_output = cpu_output.to(npu_output.dtype)
-            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
-       
-    def test_norm_shape_format_1(self, device):
-        shape_format = [
-                [[np.float16, 0, (12, 33)]],
-                [[np.float32, 0, (12, 33)]],
-        ] 
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_out_exec(cpu_input, 1, [0,1], True, torch.float)
-            npu_output = self.npu_out_exec(npu_input, 1, [0,1], True, torch.float)
-            cpu_output = cpu_output.to(npu_output.dtype)
-            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
-            
-    def test_norm_shape_format_2(self, device):
-        shape_format = [
-                # [[np.float16, 0, (12, 33)]],  # result error
-                [[np.float32, 0, (12, 33)]],
-        ] 
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_out_exec(cpu_input, 2, [0], False, torch.float)
-            npu_output = self.npu_out_exec(npu_input, 2, [0], False, torch.float)
-            npu_output = npu_output.to(cpu_output.dtype)
-            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
-            
-    def test_norm_shape_format_3(self, device):
-        shape_format = [
-                # [[np.float16, 0, (10, 24, 56, 2048)]], # result error
-                [[np.float32, 0, (10, 24, 56, 2048)]],
-        ] 
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_out_exec(cpu_input, 3, [1,2], True, torch.float)
-            npu_output = self.npu_out_exec(npu_input, 3, [1,2], True, torch.float)
-            cpu_output = cpu_output.to(npu_output.dtype)
-            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
-           
-    def test_norm_shape_format_inf(self, device):
-        shape_format = [
-                [[np.float16, 0, (64, 64, 64, 64)]],
-                [[np.float32, 0, (64, 64, 64, 64)]],
-        ] 
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_out_exec(cpu_input, float("inf"), [1,2], True, torch.float)
-            npu_output = self.npu_out_exec(npu_input, float("inf"), [1,2], True, torch.float)
-            cpu_output = cpu_output.to(npu_output.dtype)
-            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
-           
-    def test_norm_shape_format_inf1(self, device):
-        shape_format = [
-                [[np.float16, 0, (64, 64, 64, 64)]],
-                [[np.float32, 0, (64, 64, 64, 64)]],
-        ] 
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            cpu_output = self.cpu_out_exec(cpu_input, float("-inf"), [1,2], False, torch.float)
-            npu_output = self.npu_out_exec(npu_input, float("-inf"), [1,2], False, torch.float)
-            cpu_output = cpu_output.to(npu_output.dtype)
-            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
-     
-instantiate_device_type_tests(TestNorm, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestNorm(TestCase):
+    def norm_output_size(self, input, dim, keepdim):
+        output_size = list(input.size())
+        for i in dim:
+          if i < 0:
+            i = i + input.dim()
+          if i < input.dim() and keepdim == True:
+            output_size[i] = 1
+          if  i < input.dim() and keepdim == False:
+            output_size.pop(i)  
+        return output_size
+
+    def cpu_out_exec(self, input, p1, dim1, keepdim1, dtype1):
+        output_size = self.norm_output_size(input, dim1, keepdim1)
+        cpu_out = torch.randn(output_size)
+        output = torch.norm(input, p = p1, dim = dim1 , keepdim = keepdim1, out = cpu_out, dtype = dtype1)
+        return output
+    
+    def npu_out_exec(self, input, p1, dim1, keepdim1, dtype1):
+        output_size = self.norm_output_size(input, dim1, keepdim1)
+        npu_out = torch.randn(output_size).npu()
+        output1 = torch.norm(input, p = p1, dim = dim1 , keepdim = keepdim1, out = npu_out, dtype = dtype1)
+        output = output1.to("cpu")
+        return output
+        
+    def test_norm_shape_format_0(self, device):
+        shape_format = [
+                [[np.float16, 0, (1)]],
+                [[np.float32, 0, (1)]],
+        ] 
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_out_exec(cpu_input, 0, [0], True, torch.float)
+            npu_output = self.npu_out_exec(npu_input, 0, [0], True, torch.float)
+            cpu_output = cpu_output.to(npu_output.dtype)
+            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+       
+    def test_norm_shape_format_1(self, device):
+        shape_format = [
+                [[np.float16, 0, (12, 33)]],
+                [[np.float32, 0, (12, 33)]],
+        ] 
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_out_exec(cpu_input, 1, [0,1], True, torch.float)
+            npu_output = self.npu_out_exec(npu_input, 1, [0,1], True, torch.float)
+            cpu_output = cpu_output.to(npu_output.dtype)
+            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+            
+    def test_norm_shape_format_2(self, device):
+        shape_format = [
+                # [[np.float16, 0, (12, 33)]],  # result error
+                [[np.float32, 0, (12, 33)]],
+        ] 
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_out_exec(cpu_input, 2, [0], False, torch.float)
+            npu_output = self.npu_out_exec(npu_input, 2, [0], False, torch.float)
+            npu_output = npu_output.to(cpu_output.dtype)
+            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+            
+    def test_norm_shape_format_3(self, device):
+        shape_format = [
+                # [[np.float16, 0, (10, 24, 56, 2048)]], # result error
+                [[np.float32, 0, (10, 24, 56, 2048)]],
+        ] 
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_out_exec(cpu_input, 3, [1,2], True, torch.float)
+            npu_output = self.npu_out_exec(npu_input, 3, [1,2], True, torch.float)
+            cpu_output = cpu_output.to(npu_output.dtype)
+            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+           
+    def test_norm_shape_format_inf(self, device):
+        shape_format = [
+                [[np.float16, 0, (64, 64, 64, 64)]],
+                [[np.float32, 0, (64, 64, 64, 64)]],
+        ] 
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_out_exec(cpu_input, float("inf"), [1,2], True, torch.float)
+            npu_output = self.npu_out_exec(npu_input, float("inf"), [1,2], True, torch.float)
+            cpu_output = cpu_output.to(npu_output.dtype)
+            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+           
+    def test_norm_shape_format_inf1(self, device):
+        shape_format = [
+                [[np.float16, 0, (64, 64, 64, 64)]],
+                [[np.float32, 0, (64, 64, 64, 64)]],
+        ] 
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            cpu_output = self.cpu_out_exec(cpu_input, float("-inf"), [1,2], False, torch.float)
+            npu_output = self.npu_out_exec(npu_input, float("-inf"), [1,2], False, torch.float)
+            cpu_output = cpu_output.to(npu_output.dtype)
+            self.assertRtolEqual(cpu_output.numpy(), npu_output.numpy())
+     
+instantiate_device_type_tests(TestNorm, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_not_equal.py b/test/test_npu/test_network_ops/test_not_equal.py
old mode 100644
new mode 100755
index 09fe26b416f8f39b1bc3594cc62188984305284f..d64cc713d90a34d7c7b73d2fecf24214d81f01a9
--- a/test/test_npu/test_network_ops/test_not_equal.py
+++ b/test/test_npu/test_network_ops/test_not_equal.py
@@ -1,190 +1,190 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestNotEqual(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = torch.ne(input1, input2)
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.ne(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def cpu_op_inplace_exec(self, input1, input2):
-        input1.ne_(input2)
-        output = input1.numpy().astype(np.int32)
-        return output
-
-    def npu_op_inplace_exec(self, input1, input2):
-        input1.ne_(input2)
-        output = input1.to("cpu")
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def npu_op_exec_out(self, input1, input2, out):
-        torch.ne(input1, input2, out=out)
-        output = out.to("cpu")
-        output = output.numpy().astype(np.int32)
-        return output
-
-    def not_equal_scalar_result(self, shape_format):
-        for item in shape_format:
-            scalar = np.random.uniform(0, 100)
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            npu_input3 = copy.deepcopy(cpu_input1).to("npu").to(torch.bool)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input1, scalar)
-            npu_output = self.npu_op_exec(npu_input1, scalar)
-            npu_output_out = self.npu_op_exec_out(npu_input1, scalar, npu_input3)
-
-            cpu_output_inp = self.cpu_op_inplace_exec(cpu_input1, scalar)
-            npu_output_inp = self.npu_op_inplace_exec(npu_input1, scalar)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def not_equal_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            npu_input3 = copy.deepcopy(cpu_input1).to("npu").to(torch.bool)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-
-            cpu_output_inp = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output_inp = self.npu_op_inplace_exec(npu_input1, npu_input2)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def test_not_equal_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float16, i, [16]], [np.float16, i, [16]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float32, i, [16]], [np.float32, i, [16]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_fp16_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float16, i, [448, 1]], [np.float16, i, [448, 1]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_fp32_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float32, i, [448, 1]], [np.float32, i, [448, 1]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_fp16_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float16, i, [16, 640, 640]], [np.float16, i, [16, 640, 640]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_fp32_3d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float32, i, [16, 640, 640]], [np.float32, i, [16, 640, 640]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_fp16_4d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_fp32_4d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, [32, 3, 3, 3]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    # scala-----------------------------------------------------------------
-
-    def test_not_equal_scalar_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float16, i, 18]] for i in format_list]
-        self.not_equal_scalar_result(shape_format)
-
-    def test_not_equal_scalar_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float32, i, [18]]] for i in format_list]
-        self.not_equal_scalar_result(shape_format)
-
-    def test_not_equal_scalar_shape_format_fp16_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float16, i, [64, 7]]] for i in format_list]
-        self.not_equal_scalar_result(shape_format)
-
-    def test_not_equal_scalar_shape_format_fp32_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float32, i, [64, 7]]] for i in format_list]
-        self.not_equal_scalar_result(shape_format)
-
-    def test_not_equal_scalar_shape_format_fp32_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float32, i, [64, 24, 38]]] for i in format_list]
-        self.not_equal_scalar_result(shape_format)
-
-    def test_not_equal_scalar_shape_format_fp16_4d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float16, i, [32, 3, 3, 3]]] for i in format_list]
-        self.not_equal_scalar_result(shape_format)
-
-    def test_not_equal_scalar_shape_format_fp32_4d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float32, i, [32, 3, 3, 3]]] for i in format_list]
-        self.not_equal_scalar_result(shape_format)
-
-    def test_not_equal_shape_format_int32_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [16]], [np.int32, i, [16]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_int32_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [448, 1]], [np.int32, i, [448, 1]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_int32_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [16, 640, 640]], [np.int32, i, [16, 640, 640]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-    def test_not_equal_shape_format_int32_4d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [32, 3, 3, 3]], [np.int32, i, [32, 3, 3, 3]]] for i in format_list]
-        self.not_equal_result(shape_format)
-
-
-instantiate_device_type_tests(TestNotEqual, globals(), except_for="cpu")
-
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestNotEqual(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.ne(input1, input2)
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.ne(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def cpu_op_inplace_exec(self, input1, input2):
+        input1.ne_(input2)
+        output = input1.numpy().astype(np.int32)
+        return output
+
+    def npu_op_inplace_exec(self, input1, input2):
+        input1.ne_(input2)
+        output = input1.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def npu_op_exec_out(self, input1, input2, out):
+        torch.ne(input1, input2, out=out)
+        output = out.to("cpu")
+        output = output.numpy().astype(np.int32)
+        return output
+
+    def not_equal_scalar_result(self, shape_format):
+        for item in shape_format:
+            scalar = np.random.uniform(0, 100)
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            npu_input3 = copy.deepcopy(cpu_input1).to("npu").to(torch.bool)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input1, scalar)
+            npu_output = self.npu_op_exec(npu_input1, scalar)
+            npu_output_out = self.npu_op_exec_out(npu_input1, scalar, npu_input3)
+
+            cpu_output_inp = self.cpu_op_inplace_exec(cpu_input1, scalar)
+            npu_output_inp = self.npu_op_inplace_exec(npu_input1, scalar)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def not_equal_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            npu_input3 = copy.deepcopy(cpu_input1).to("npu").to(torch.bool)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+
+            cpu_output_inp = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output_inp = self.npu_op_inplace_exec(npu_input1, npu_input2)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def test_not_equal_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [16]], [np.float16, i, [16]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [16]], [np.float32, i, [16]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [448, 1]], [np.float16, i, [448, 1]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [448, 1]], [np.float32, i, [448, 1]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [16, 640, 640]], [np.float16, i, [16, 640, 640]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float32, i, [16, 640, 640]], [np.float32, i, [16, 640, 640]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, [32, 3, 3, 3]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    # scala-----------------------------------------------------------------
+
+    def test_not_equal_scalar_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float16, i, 18]] for i in format_list]
+        self.not_equal_scalar_result(shape_format)
+
+    def test_not_equal_scalar_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float32, i, [18]]] for i in format_list]
+        self.not_equal_scalar_result(shape_format)
+
+    def test_not_equal_scalar_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [64, 7]]] for i in format_list]
+        self.not_equal_scalar_result(shape_format)
+
+    def test_not_equal_scalar_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [64, 7]]] for i in format_list]
+        self.not_equal_scalar_result(shape_format)
+
+    def test_not_equal_scalar_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [64, 24, 38]]] for i in format_list]
+        self.not_equal_scalar_result(shape_format)
+
+    def test_not_equal_scalar_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32, 3, 3, 3]]] for i in format_list]
+        self.not_equal_scalar_result(shape_format)
+
+    def test_not_equal_scalar_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float32, i, [32, 3, 3, 3]]] for i in format_list]
+        self.not_equal_scalar_result(shape_format)
+
+    def test_not_equal_shape_format_int32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [16]], [np.int32, i, [16]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_int32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [448, 1]], [np.int32, i, [448, 1]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_int32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [16, 640, 640]], [np.int32, i, [16, 640, 640]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+    def test_not_equal_shape_format_int32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [32, 3, 3, 3]], [np.int32, i, [32, 3, 3, 3]]] for i in format_list]
+        self.not_equal_result(shape_format)
+
+
+instantiate_device_type_tests(TestNotEqual, globals(), except_for="cpu")
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_npu_bert_apply_adam.py b/test/test_npu/test_network_ops/test_npu_bert_apply_adam.py
index a3b02059ab466903385045fc88ea75eba6ba09fb..589454b227a22b67a4a8d79ac18f4626c6a485ec 100644
--- a/test/test_npu/test_network_ops/test_npu_bert_apply_adam.py
+++ b/test/test_npu/test_network_ops/test_npu_bert_apply_adam.py
@@ -1,55 +1,55 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestNpuBertApplyAdam(TestCase):
-    def test_npu_bert_apply_adam(self, device):
-        seed = 3
-        torch.manual_seed(seed)
-        torch.npu.manual_seed(seed)
-        torch.npu.manual_seed_all(seed)
-
-        var_in = torch.rand(321538).uniform_(-32., 21.).npu()
-        m_in = torch.zeros(321538).npu()
-        v_in = torch.zeros(321538).npu()
-        grad = torch.rand(321538).uniform_(-0.05, 0.03).npu()
-
-        var_ans = torch.tensor([13.1862, -30.1250, -20.4954])
-        m_ans = torch.tensor([0.0014, 0.0018, -0.0021])
-        v_ans = torch.tensor([1.8999e-06, 3.2629e-06, 4.4347e-06])
-
-        max_grad_norm = -1.
-        beta1 = 0.9
-        beta2 = 0.99
-        weight_decay = 0.
-        lr = 0.
-        epsilon = 1e-06
-        global_grad_norm = 0.
-
-        var_out, m_out, v_out = torch.npu_bert_apply_adam(
-            var_in, m_in, v_in, lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay)
-
-        self.assertRtolEqual(var_out[:3].cpu(), var_ans)
-        self.assertRtolEqual(m_out[:3].cpu(), m_ans)
-        self.assertRtolEqual(v_out[:3].cpu(), v_ans)
-
-instantiate_device_type_tests(TestNpuBertApplyAdam, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestNpuBertApplyAdam(TestCase):
+    def test_npu_bert_apply_adam(self, device):
+        seed = 3
+        torch.manual_seed(seed)
+        torch.npu.manual_seed(seed)
+        torch.npu.manual_seed_all(seed)
+
+        var_in = torch.rand(321538).uniform_(-32., 21.).npu()
+        m_in = torch.zeros(321538).npu()
+        v_in = torch.zeros(321538).npu()
+        grad = torch.rand(321538).uniform_(-0.05, 0.03).npu()
+
+        var_ans = torch.tensor([13.1862, -30.1250, -20.4954])
+        m_ans = torch.tensor([0.0014, 0.0018, -0.0021])
+        v_ans = torch.tensor([1.8999e-06, 3.2629e-06, 4.4347e-06])
+
+        max_grad_norm = -1.
+        beta1 = 0.9
+        beta2 = 0.99
+        weight_decay = 0.
+        lr = 0.
+        epsilon = 1e-06
+        global_grad_norm = 0.
+
+        var_out, m_out, v_out = torch.npu_bert_apply_adam(
+            var_in, m_in, v_in, lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay)
+
+        self.assertRtolEqual(var_out[:3].cpu(), var_ans)
+        self.assertRtolEqual(m_out[:3].cpu(), m_ans)
+        self.assertRtolEqual(v_out[:3].cpu(), v_ans)
+
+instantiate_device_type_tests(TestNpuBertApplyAdam, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_npu_giou.py b/test/test_npu/test_network_ops/test_npu_giou.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6f55768d074081684971fb48a5f5469e31c9536
--- /dev/null
+++ b/test/test_npu/test_network_ops/test_npu_giou.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import math
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestNpuGiou(TestCase):
+    def generate_giou_data(self, n, m, dtype):
+        data_bboxes = np.array([]).astype(dtype)
+        for i in range(4):
+            data_bboxes_array = i // 2 + math.pow(-1, i // 2) * 0.5 * np.random.rand(1, n).astype(dtype)
+            data_bboxes = np.append(data_bboxes, data_bboxes_array)
+        data_bboxes = data_bboxes.reshape([4, n])
+        data_gtboxes = np.array([]).astype(dtype)
+        for i in range(4):
+            data_gtboxes_array = i // 2 + math.pow(-1, i // 2) * 0.5 * np.random.rand(1, m).astype(dtype)
+            data_gtboxes = np.append(data_gtboxes, data_gtboxes_array)
+        data_gtboxes = data_gtboxes.reshape([4, m])
+        cpu_input1 = torch.from_numpy(data_bboxes)
+        cpu_input2 = torch.from_numpy(data_gtboxes)
+        npu_input1 = cpu_input1.npu()
+        npu_input2 = cpu_input2.npu()
+        return cpu_input1, cpu_input2, npu_input1, npu_input2
+
+    def cpu_op_exec(self, box1, box2, trans=False, is_cross=False, mode="iou"):
+        box1 = box1.numpy()
+        box2 = box2.numpy()
+        dtype = box1.dtype
+        _, n = box1.shape
+        _, m = box2.shape
+        if trans:
+            b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
+            b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
+            b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
+            b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2
+        else:
+            b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
+            b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
+        w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1
+        w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1
+        area1 = w1 * h1
+        area2 = w2 * h2
+        giou_res =np.array([], dtype=dtype)
+        
+        for i in range(n):
+            for j in range(m):
+                inter_x1 = max(b1_x1[i], b2_x1[j])
+                inter_x2 = min(b1_x2[i], b2_x2[j])
+                inter_y1 = max(b1_y1[i], b2_y1[j])
+                inter_y2 = min(b1_y2[i], b2_y2[j])
+                outer_x1 = min(b1_x1[i], b2_x1[j])
+                outer_x2 = max(b1_x2[i], b2_x2[j])
+                outer_y1 = min(b1_y1[i], b2_y1[j])
+                outer_y2 = max(b1_y2[i], b2_y2[j])
+                inter_area = max(0, (inter_x2 - inter_x1)) * max(0, (inter_y2 - inter_y1))
+                outer_area = abs(outer_x2 - outer_x1) * abs(outer_y2 - outer_y1)
+                union_area = area1[i] + area2[j] - inter_area + 1e-16
+                other_area = outer_area - union_area
+                giou_ij = inter_area / union_area - other_area / outer_area
+                if not is_cross:
+                    if i == j:
+                        giou_res = np.append(giou_res, giou_ij)
+                else:
+                    giou_res = np.append(giou_res, giou_ij)
+        
+        if not is_cross:
+            res = giou_res.reshape(1, n)
+        else:
+            res = giou_res.reshape(n, m)
+            res = np.transpose(res)
+        res = np.transpose(res)
+        return res
+
+    def npu_op_exec(self,  box1, box2, trans=False, is_cross=False, mode=0):
+        output = torch.npu_giou(box1, box2, trans, is_cross, mode)
+        output = output.detach().cpu().numpy()
+        return output
+    
+    def test_npu_giou_shape_format_fp32(self, device):
+        self._test_npu_giou_shape_format(np.float32)
+    
+    def test_npu_giou_shape_format_fp16(self, device):
+        self._test_npu_giou_shape_format(np.float16)
+    
+    def _test_npu_giou_shape_format(self, dtype):
+        shape_list = [
+            [10, 10],
+            [12, 10],
+            [100, 100]
+        ]
+        is_trans_list = [False]
+        mode_list = ["iou"]
+        # TODO(Ascend): 反向只支持 mode=="iof", is_cross==False,
+        # is_trans==Fasle场景，这里同步验证相同场景
+        shape_format = [[j, k, m]
+                        for j in shape_list
+                        for k in is_trans_list
+                        for m in mode_list]
+
+        for item in shape_format:
+            mode_digit = 0 if item[-1] == "iou" else 1
+            is_cross = False if item[0][0] == item[0][1] else True
+            cpu_input1, cpu_input2, npu_input1, npu_input2 = self.generate_giou_data(*item[0], dtype)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, item[1], is_cross, item[-1])
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, item[1], is_cross, mode_digit)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            if dtype == np.float16:
+                # TODO(Ascend): fp16 insufficient precision
+                self.assertRtolEqual(cpu_output, npu_output, prec16=1e-2)
+            else:
+                self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestNpuGiou, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_npu_giou_backward.py b/test/test_npu/test_network_ops/test_npu_giou_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cf564d74b5263401d311c4486f9335c82fa7a8b
--- /dev/null
+++ b/test/test_npu/test_network_ops/test_npu_giou_backward.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import math
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestNpuGiouBackward(TestCase):
+    def generate_giou_data(self, n, m, dtype):
+        data_bboxes = np.array([]).astype(dtype)
+        for i in range(4):
+            data_bboxes_array = i // 2 + math.pow(-1, i // 2) * 0.5 * np.random.rand(1, n).astype(dtype)
+            data_bboxes = np.append(data_bboxes, data_bboxes_array)
+        data_bboxes = data_bboxes.reshape([4, n])
+        data_gtboxes = np.array([]).astype(dtype)
+        for i in range(4):
+            data_gtboxes_array = i // 2 + math.pow(-1, i // 2) * 0.5 * np.random.rand(1, m).astype(dtype)
+            data_gtboxes = np.append(data_gtboxes, data_gtboxes_array)
+        data_gtboxes = data_gtboxes.reshape([4, m])
+        cpu_input1 = torch.from_numpy(data_bboxes)
+        cpu_input2 = torch.from_numpy(data_gtboxes)
+        npu_input1 = cpu_input1.npu()
+        npu_input2 = cpu_input2.npu()
+        return cpu_input1, cpu_input2, npu_input1, npu_input2
+
+    def npu_op_exec(self,  box1, box2, trans=False, is_cross=False, mode=0):
+        box1.requires_grad = True
+        box2.requires_grad = True
+        output = torch.npu_giou(box1, box2, trans, is_cross, mode)
+        output.backward(torch.ones_like(output))
+        box1_grad = box1.grad
+        box2_grad = box2.grad
+        box1_grad = box1_grad.detach().cpu().numpy()
+        box2_grad = box2_grad.detach().cpu().numpy()
+        output = output.detach().cpu().numpy()
+        return output, box1_grad, box2_grad
+
+    def test_npu_giou_backward_shape_format(self, dtype):
+        shape_list = [
+            [1, 1]
+        ]
+        is_trans_list = [False]
+        mode_list = ["iou"]
+        # TODO(Ascend): only support mode=="iof", is_cross==False, 
+        # is_trans==Fasle currently
+        shape_format = [[j, k, m]
+                        for j in shape_list
+                        for k in is_trans_list
+                        for m in mode_list]
+
+        for item in shape_format:
+            mode_digit = 0 if item[-1] == "iou" else 1
+            is_cross = False if item[0][0] == item[0][1] else True
+            expected_cpu_grad1 = np.array([[0.51091206],
+                                           [-0.70909655],
+                                           [0.3726323],
+                                           [0.349545]], dtype=np.float32)
+            expected_cpu_grad2 = np.array([[-0.51091206],
+                                           [0.70909655],
+                                           [0.3599837],
+                                           [0.47306436]], dtype=np.float32)
+            _, _, npu_input1, npu_input2 = self.generate_giou_data(*item[0], np.float32)
+            _, npu_grad1, npu_grad2 = self.npu_op_exec(npu_input1, npu_input2, item[1], is_cross, mode_digit)
+            self.assertRtolEqual(expected_cpu_grad1, npu_grad1)
+            self.assertRtolEqual(expected_cpu_grad2, npu_grad2)
+
+
+instantiate_device_type_tests(TestNpuGiouBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_npu_linear.py b/test/test_npu/test_network_ops/test_npu_linear.py
index ea9e7c2e2f507d69f4bcf3446babe2c4141cf6c0..5ac981b4df519cc1a43816bc0c60c1ec46fdd387 100644
--- a/test/test_npu/test_network_ops/test_npu_linear.py
+++ b/test/test_npu/test_network_ops/test_npu_linear.py
@@ -1,62 +1,62 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestNpuLinear(TestCase):
-    def cpu_op_exec(self, x, weight, bias):
-        output = torch.nn.functional.linear(x, weight, bias)
-        output = output.numpy()
-        return output
-    
-    def npu_op_exec(self, x, weight, bias):
-        output = torch.npu_linear(x, weight, bias)
-        output = output.cpu().numpy()
-        return output
-
-    def test_npu_linear_shape_format_fp32(self, device):
-        shape_format = [
-                [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
-                [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
-        ]
-
-        for item in shape_format:
-            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
-            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
-            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
-            cpu_output = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
-            npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
-            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
-    
-    def test_npu_linear_shape_format_fp16(self, device):
-        shape_format = [
-                [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
-                [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
-        ]
-
-        for item in shape_format:
-            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
-            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
-            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
-            cpu_output = self.cpu_op_exec(cpu_x.float(), cpu_w.float(), cpu_b.float()).astype(np.float16)
-            npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestNpuLinear, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
-
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestNpuLinear(TestCase):
+    def cpu_op_exec(self, x, weight, bias):
+        output = torch.nn.functional.linear(x, weight, bias)
+        output = output.numpy()
+        return output
+    
+    def npu_op_exec(self, x, weight, bias):
+        output = torch.npu_linear(x, weight, bias)
+        output = output.cpu().numpy()
+        return output
+
+    def test_npu_linear_shape_format_fp32(self, device):
+        shape_format = [
+                [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
+                [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
+            npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
+    
+    def test_npu_linear_shape_format_fp16(self, device):
+        shape_format = [
+                [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
+                [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output = self.cpu_op_exec(cpu_x.float(), cpu_w.float(), cpu_b.float()).astype(np.float16)
+            npu_output = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestNpuLinear, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_npu/test_network_ops/test_npu_linear_backward.py b/test/test_npu/test_network_ops/test_npu_linear_backward.py
index 66f8a47f4143ac56fa0afe457ecbe0f9ebdc9268..f17921bfca16badd64a2bedbfd2fc804ee1a87f8 100644
--- a/test/test_npu/test_network_ops/test_npu_linear_backward.py
+++ b/test/test_npu/test_network_ops/test_npu_linear_backward.py
@@ -1,77 +1,77 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestNpuLinearBackward(TestCase):
-    def cpu_op_exec(self, x, weight, bias):
-        x.requires_grad = True
-        weight.requires_grad = True
-        bias.requires_grad = True
-        output = torch.nn.functional.linear(x, weight, bias)
-        loss = output.sum()
-        loss.backward()
-        return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()
-    
-    def npu_op_exec(self, x, weight, bias):
-        x.requires_grad = True
-        weight.requires_grad = True
-        bias.requires_grad = True
-        output = torch.npu_linear(x, weight, bias)
-        loss = output.sum()
-        loss.backward()
-        return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()
-
-    def test_npu_linear_backward_shape_format_fp32(self, device):
-        shape_format = [
-                [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
-                [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
-        ]
-
-        for item in shape_format:
-            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
-            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
-            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
-            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
-            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
-            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
-            self.assertRtolEqual(cpu_x_grad, npu_x_grad)
-            self.assertRtolEqual(cpu_w_grad, npu_w_grad)
-            self.assertRtolEqual(cpu_b_grad, npu_b_grad)
-    
-    def test_npu_linear_shape_format_fp16(self, device):
-        shape_format = [
-                [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
-                [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
-        ]
-
-        for item in shape_format:
-            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
-            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
-            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
-            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(
-                cpu_x.float(), cpu_w.float(), cpu_b.float())
-            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
-            self.assertRtolEqual(cpu_output.astype(np.float16), npu_output)
-            self.assertRtolEqual(cpu_x_grad.astype(np.float16), npu_x_grad)
-            self.assertRtolEqual(cpu_w_grad.astype(np.float16), npu_w_grad)
-            self.assertRtolEqual(cpu_b_grad.astype(np.float16), npu_b_grad)
-
-instantiate_device_type_tests(TestNpuLinearBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
-
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestNpuLinearBackward(TestCase):
+    def cpu_op_exec(self, x, weight, bias):
+        x.requires_grad = True
+        weight.requires_grad = True
+        bias.requires_grad = True
+        output = torch.nn.functional.linear(x, weight, bias)
+        loss = output.sum()
+        loss.backward()
+        return output.detach().numpy(), x.grad.numpy(), weight.grad.numpy(), bias.grad.numpy()
+    
+    def npu_op_exec(self, x, weight, bias):
+        x.requires_grad = True
+        weight.requires_grad = True
+        bias.requires_grad = True
+        output = torch.npu_linear(x, weight, bias)
+        loss = output.sum()
+        loss.backward()
+        return output.cpu().detach().numpy(), x.grad.cpu().numpy(), weight.grad.cpu().numpy(), bias.grad.cpu().numpy()
+
+    def test_npu_linear_backward_shape_format_fp32(self, device):
+        shape_format = [
+                [[np.float32, -1, (6144, 1024)], [np.float32, -1, (256, 1024)], [np.float32, -1, (256)]],
+                [[np.float32, -1, (123, 456)], [np.float32, -1, (789, 456)], [np.float32, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(cpu_x, cpu_w, cpu_b)
+            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output, npu_output, 0.0002)
+            self.assertRtolEqual(cpu_x_grad, npu_x_grad)
+            self.assertRtolEqual(cpu_w_grad, npu_w_grad)
+            self.assertRtolEqual(cpu_b_grad, npu_b_grad)
+    
+    def test_npu_linear_shape_format_fp16(self, device):
+        shape_format = [
+                [[np.float16, -1, (6144, 1024)], [np.float16, -1, (256, 1024)], [np.float16, -1, (256)]],
+                [[np.float16, -1, (123, 456)], [np.float16, -1, (789, 456)], [np.float16, -1, (789)]],
+        ]
+
+        for item in shape_format:
+            cpu_x, npu_x = create_common_tensor(item[0], -2, 2)
+            cpu_w, npu_w = create_common_tensor(item[1], -2, 2)
+            cpu_b, npu_b = create_common_tensor(item[2], -2, 2)
+            cpu_output, cpu_x_grad, cpu_w_grad, cpu_b_grad = self.cpu_op_exec(
+                cpu_x.float(), cpu_w.float(), cpu_b.float())
+            npu_output, npu_x_grad, npu_w_grad, npu_b_grad = self.npu_op_exec(npu_x, npu_w, npu_b)
+            self.assertRtolEqual(cpu_output.astype(np.float16), npu_output)
+            self.assertRtolEqual(cpu_x_grad.astype(np.float16), npu_x_grad)
+            self.assertRtolEqual(cpu_w_grad.astype(np.float16), npu_w_grad)
+            self.assertRtolEqual(cpu_b_grad.astype(np.float16), npu_b_grad)
+
+instantiate_device_type_tests(TestNpuLinearBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_npu/test_network_ops/test_npu_pad.py b/test/test_npu/test_network_ops/test_npu_pad.py
index 2580b1a65dfe8249e1bb911e54fbe873ceaf4617..7c91f8f751608e69363e610266fe5692801f327c 100644
--- a/test/test_npu/test_network_ops/test_npu_pad.py
+++ b/test/test_npu/test_network_ops/test_npu_pad.py
@@ -1,35 +1,35 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestNpuPad(TestCase):
-    def test_npu_pad(self, device):
-        npu_input = torch.ones(2, 2).npu()
-        pads = (1, 1, 1, 1)
-        benchmark = torch.tensor([[0., 0., 0., 0.],
-                                  [0., 1., 1., 0.],
-                                  [0., 1., 1., 0.],
-                                  [0., 0., 0., 0.]])
-        npu_output = torch.npu_pad(npu_input, pads)
-        npu_output = npu_output.cpu().detach()
-        self.assertRtolEqual(benchmark, npu_output)
-
-instantiate_device_type_tests(TestNpuPad, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestNpuPad(TestCase):
+    def test_npu_pad(self, device):
+        npu_input = torch.ones(2, 2).npu()
+        pads = (1, 1, 1, 1)
+        benchmark = torch.tensor([[0., 0., 0., 0.],
+                                  [0., 1., 1., 0.],
+                                  [0., 1., 1., 0.],
+                                  [0., 0., 0., 0.]])
+        npu_output = torch.npu_pad(npu_input, pads)
+        npu_output = npu_output.cpu().detach()
+        self.assertRtolEqual(benchmark, npu_output)
+
+instantiate_device_type_tests(TestNpuPad, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_numpy_T.py b/test/test_npu/test_network_ops/test_numpy_T.py
index f8a5e1af64d52a630b527885a848aee1e5c80376..37527db346ddfd1e50788e6985667c552bf16b6a 100644
--- a/test/test_npu/test_network_ops/test_numpy_T.py
+++ b/test/test_npu/test_network_ops/test_numpy_T.py
@@ -1,48 +1,48 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
- 
-class TestNumpyT(TestCase):
-    def test_numpy_T_common_shape_format(self, device):
-        def cpu_op_exec(input):
-            output = input.T
-            output = output.numpy()
-            return output
-    
-        def npu_op_exec(input):
-            output = input.T
-            output = output.to("cpu")
-            output = output.numpy() 
-            return output  
-        
-        shape_format = [
-                [[np.float16, 0, (64, 10)]],
-                [[np.float32, 4, (32, 1, 3, 3)]],
-                [[np.float32, 29, (10, 128)]]
-        ]
-        for shape in shape_format:
-            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
-            cpu_output = cpu_op_exec(cpu_input)
-            npu_output = npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-
-instantiate_device_type_tests(TestNumpyT, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+ 
+class TestNumpyT(TestCase):
+    def test_numpy_T_common_shape_format(self, device):
+        def cpu_op_exec(input):
+            output = input.T
+            output = output.numpy()
+            return output
+    
+        def npu_op_exec(input):
+            output = input.T
+            output = output.to("cpu")
+            output = output.numpy() 
+            return output  
+        
+        shape_format = [
+                [[np.float16, 0, (64, 10)]],
+                [[np.float32, 4, (32, 1, 3, 3)]],
+                [[np.float32, 29, (10, 128)]]
+        ]
+        for shape in shape_format:
+            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
+            cpu_output = cpu_op_exec(cpu_input)
+            npu_output = npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+
+instantiate_device_type_tests(TestNumpyT, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_ones_like.py b/test/test_npu/test_network_ops/test_ones_like.py
index 924c4d12022a51286a084802fa59bcaa71a5fede..7b0b11ed38aab5e752e7bf942a955ea50bf1e104 100644
--- a/test/test_npu/test_network_ops/test_ones_like.py
+++ b/test/test_npu/test_network_ops/test_ones_like.py
@@ -1,65 +1,65 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestOnesLike(TestCase):
-    def cpu_op_exec(self, input1):
-        output = torch.ones_like(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.ones_like(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_op_shape_format_fp16(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            print(item)
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertEqual(cpu_output, npu_output)
-
-    def test_op_shape_format_fp32(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            print(item)
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestOnesLike, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestOnesLike(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.ones_like(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.ones_like(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_op_shape_format_fp16(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            print(item)
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertEqual(cpu_output, npu_output)
+
+    def test_op_shape_format_fp32(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            print(item)
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestOnesLike, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_permute.py b/test/test_npu/test_network_ops/test_permute.py
index 1f5fb99d7510940b202958289895253ae8992fde..daf1a347c724567dd0028a7e7507f0a5ba740eb3 100644
--- a/test/test_npu/test_network_ops/test_permute.py
+++ b/test/test_npu/test_network_ops/test_permute.py
@@ -1,54 +1,54 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestPermute(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = input1.permute(input2);
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        output = input1.permute(input2);
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_permute(self, device):
-        shape_format = [
-            [[2, 3, 5], (2, 0, 1), torch.float32],
-            [[2, 5, 6, 9], (2, 0, 3, 1), torch.float32],
-            [[2, 4, 6, 8, 10], (2, 3, 4, 0, 1), torch.float32],
-        ]
-        for item in shape_format:
-            cpu_input1 =  torch.randn(item[0], dtype=item[2])
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(cpu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestPermute, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestPermute(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = input1.permute(input2);
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        output = input1.permute(input2);
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_permute(self, device):
+        shape_format = [
+            [[2, 3, 5], (2, 0, 1), torch.float32],
+            [[2, 5, 6, 9], (2, 0, 3, 1), torch.float32],
+            [[2, 4, 6, 8, 10], (2, 3, 4, 0, 1), torch.float32],
+        ]
+        for item in shape_format:
+            cpu_input1 =  torch.randn(item[0], dtype=item[2])
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
+            npu_output = self.npu_op_exec(cpu_input1, item[1])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestPermute, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_pow.py b/test/test_npu/test_network_ops/test_pow.py
old mode 100644
new mode 100755
index 31fabef610bfe9a3859d38acdf4c122d56fd32bf..1a25cadfd483f0c647a30f73f06f82ed8d7a889f
--- a/test/test_npu/test_network_ops/test_pow.py
+++ b/test/test_npu/test_network_ops/test_pow.py
@@ -1,296 +1,296 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestPow(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = torch.pow(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.pow(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, out):
-        torch.pow(input1, input2, out=out)
-        output = out.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec(self, input1, input2):
-        input1.pow_(input2)
-        output = input1.numpy()
-        return output
-
-    def npu_op_inplace_exec(self, input1, input2):
-        input1.pow_(input2)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_tensor_scalar(self, input1, n):
-        output = torch.pow(input1, n)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_tensor_scalar(self, input1, n):
-        # input1 = input1.to("npu")
-        output = torch.pow(input1, n)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_tensor_scalar_out(self, input1, n, out):
-        # input1 = input1.to("npu")
-        output = torch.pow(input1, n, out=out)
-        output = out.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_scalar_tensor(self, n, input1):
-        output = torch.pow(n, input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar_tensor(self, n, input1):
-        output = torch.pow(n, input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar_tensor_out(self, n, input1, out):
-        torch.pow(n, input1, out=out)
-        output = out.to("cpu")
-        output = output.numpy()
-        return output
-
-    def pow_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 1)
-            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            cpu_output_inp = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output_inp = self.npu_op_inplace_exec(npu_input1, npu_input2)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def pow_result_scalar_tensor(self, shape_format):
-        for item in shape_format:
-            scalar = np.random.randint(0, 1)
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 1)
-            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_scalar = self.cpu_op_exec_scalar_tensor(scalar, cpu_input1)
-            npu_output_scalar = self.npu_op_exec_scalar_tensor(scalar, npu_input1)
-            npu_output_scalar_out = self.npu_op_exec_scalar_tensor_out(scalar, npu_input1, npu_input3)
-
-            cpu_output_scalar = cpu_output_scalar.astype(npu_output_scalar.dtype)
-            self.assertRtolEqual(cpu_output_scalar, npu_output_scalar)
-            self.assertRtolEqual(cpu_output_scalar, npu_output_scalar_out)
-
-    def pow_result_tensor_scalar_(self, shape_format):
-        for item in shape_format:
-            scalar = np.random.randint(0, 1)
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 1)
-            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_tensor_scalar = self.cpu_op_exec_tensor_scalar(cpu_input1, scalar)
-            npu_output_tensor_scalar = self.npu_op_exec_tensor_scalar(npu_input1, scalar)
-            npu_output_tensor_scalar_out = self.npu_op_exec_tensor_scalar_out(npu_input1, scalar, npu_input3)
-
-            cpu_output_tensor_scalar = cpu_output_tensor_scalar.astype(npu_output_tensor_scalar.dtype)
-            self.assertRtolEqual(cpu_output_tensor_scalar, npu_output_tensor_scalar)
-            self.assertRtolEqual(cpu_output_tensor_scalar, npu_output_tensor_scalar_out)
-
-    # scalar_tensor-------------------------------------------------------
-    def test_pow_shape_format_scalar_tensor_fp16_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[np.float16, i, [18]] for i in format_list]
-        self.pow_result_scalar_tensor(shape_format)
-
-    def test_pow_shape_format_scalar_tensor_fp32_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[np.float32, i, [18]] for i in format_list]
-        # shape_format = [np.float32, 0, [18]]
-        self.pow_result_scalar_tensor(shape_format)
-
-    def test_pow_shape_format_scalar_tensor_fp16_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float16, i, [18, 64]] for i in format_list]
-        self.pow_result_scalar_tensor(shape_format)
-
-    def test_pow_shape_format_scalar_tensor_fp32_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float32, i, [18, 64]] for i in format_list]
-        self.pow_result_scalar_tensor(shape_format)
-
-    def test_pow_shape_format_scalar_tensor_fp16_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float16, i, [18, 64, 128]] for i in format_list]
-        self.pow_result_scalar_tensor(shape_format)
-
-    def test_pow_shape_format_scalar_tensor_fp32_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float32, i, [18, 64, 128]] for i in format_list]
-        self.pow_result_scalar_tensor(shape_format)
-
-    def test_pow_shape_format_scalar_tensor_fp16_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float16, i, [18, 64, 128, 256]] for i in format_list]
-        self.pow_result_scalar_tensor(shape_format)
-
-    def test_pow_shape_format_scalar_tensor_fp32_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float32, i, [18, 64, 128, 256]] for i in format_list]
-        self.pow_result_scalar_tensor(shape_format)
-
-    # tensor_scalar-----------------------------------------------------------
-    def test_pow_shape_format_tensor_scala_fp16_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[np.float16, i, [18]] for i in format_list]
-        self.pow_result_tensor_scalar_(shape_format)
-
-    def test_pow_shape_format_tensor_scalar_fp32_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[np.float32, i, [18]] for i in format_list]
-        self.pow_result_tensor_scalar_(shape_format)
-
-    def test_pow_shape_format_tensor_scala_fp16_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float16, i, [18, 64]] for i in format_list]
-        self.pow_result_tensor_scalar_(shape_format)
-
-    def test_pow_shape_format_tensor_scalar_fp32_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float32, i, [18, 64]] for i in format_list]
-        self.pow_result_tensor_scalar_(shape_format)
-
-    def test_pow_shape_format_tensor_scala_fp16_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float16, i, [18, 64, 128]] for i in format_list]
-        self.pow_result_tensor_scalar_(shape_format)
-
-    def test_pow_shape_format_tensor_scalar_fp32_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float32, i, [18, 64, 128]] for i in format_list]
-        self.pow_result_tensor_scalar_(shape_format)
-
-    def test_pow_shape_format_tensor_scala_fp16_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float16, i, [18, 64, 128, 256]] for i in format_list]
-        self.pow_result_tensor_scalar_(shape_format)
-
-    def test_pow_shape_format_tensor_scalar_fp32_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float32, i, [18, 64, 128, 256]] for i in format_list]
-        self.pow_result_tensor_scalar_(shape_format)
-
-    # tensor_tensor-----------------------------------------------------------
-    def test_pow_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float16, i, [5]], [np.float16, i, []]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0, 3,]
-        shape_format = [[[np.float32, i, [5]], [np.float32, i, []]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp16_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [448, 1]], [np.float16, i, []]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp32_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [448, 1]], [np.float32, i, []]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp16_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [16, 640, 640]], [np.float16, i, []]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp32_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [16, 640, 640]], [np.float32, i, []]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp16_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, []]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp32_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, []]] for i in format_list]
-        self.pow_result(shape_format)
-    
-    #broadcast
-    def test_pow_shape_format_fp16_2d_broadcast(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [448, 20]], [np.float16, i, [448,1]]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp32_2d_broadcast(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [448, 20]], [np.float32, i, [448,1]]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp16_3d_broadcast(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [16, 640, 640]], [np.float16, i, [16, 640, 1]]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp32_3d_broadcast(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [16, 640, 640]], [np.float32, i, [16, 1, 1]]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp16_4d_broadcast(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 1, 1, 1]]] for i in format_list]
-        self.pow_result(shape_format)
-
-    def test_pow_shape_format_fp32_4d_broadcast(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, [32, 3, 1, 1]]] for i in format_list]
-        self.pow_result(shape_format)
-
-
-instantiate_device_type_tests(TestPow, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestPow(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.pow(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.pow(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, out):
+        torch.pow(input1, input2, out=out)
+        output = out.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec(self, input1, input2):
+        input1.pow_(input2)
+        output = input1.numpy()
+        return output
+
+    def npu_op_inplace_exec(self, input1, input2):
+        input1.pow_(input2)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_tensor_scalar(self, input1, n):
+        output = torch.pow(input1, n)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_tensor_scalar(self, input1, n):
+        # input1 = input1.to("npu")
+        output = torch.pow(input1, n)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_tensor_scalar_out(self, input1, n, out):
+        # input1 = input1.to("npu")
+        output = torch.pow(input1, n, out=out)
+        output = out.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_scalar_tensor(self, n, input1):
+        output = torch.pow(n, input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar_tensor(self, n, input1):
+        output = torch.pow(n, input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar_tensor_out(self, n, input1, out):
+        torch.pow(n, input1, out=out)
+        output = out.to("cpu")
+        output = output.numpy()
+        return output
+
+    def pow_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 1)
+            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output_inp = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output_inp = self.npu_op_inplace_exec(npu_input1, npu_input2)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def pow_result_scalar_tensor(self, shape_format):
+        for item in shape_format:
+            scalar = np.random.randint(0, 1)
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 1)
+            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_scalar = self.cpu_op_exec_scalar_tensor(scalar, cpu_input1)
+            npu_output_scalar = self.npu_op_exec_scalar_tensor(scalar, npu_input1)
+            npu_output_scalar_out = self.npu_op_exec_scalar_tensor_out(scalar, npu_input1, npu_input3)
+
+            cpu_output_scalar = cpu_output_scalar.astype(npu_output_scalar.dtype)
+            self.assertRtolEqual(cpu_output_scalar, npu_output_scalar)
+            self.assertRtolEqual(cpu_output_scalar, npu_output_scalar_out)
+
+    def pow_result_tensor_scalar_(self, shape_format):
+        for item in shape_format:
+            scalar = np.random.randint(0, 1)
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 1)
+            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_tensor_scalar = self.cpu_op_exec_tensor_scalar(cpu_input1, scalar)
+            npu_output_tensor_scalar = self.npu_op_exec_tensor_scalar(npu_input1, scalar)
+            npu_output_tensor_scalar_out = self.npu_op_exec_tensor_scalar_out(npu_input1, scalar, npu_input3)
+
+            cpu_output_tensor_scalar = cpu_output_tensor_scalar.astype(npu_output_tensor_scalar.dtype)
+            self.assertRtolEqual(cpu_output_tensor_scalar, npu_output_tensor_scalar)
+            self.assertRtolEqual(cpu_output_tensor_scalar, npu_output_tensor_scalar_out)
+
+    # scalar_tensor-------------------------------------------------------
+    def test_pow_shape_format_scalar_tensor_fp16_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[np.float16, i, [18]] for i in format_list]
+        self.pow_result_scalar_tensor(shape_format)
+
+    def test_pow_shape_format_scalar_tensor_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[np.float32, i, [18]] for i in format_list]
+        # shape_format = [np.float32, 0, [18]]
+        self.pow_result_scalar_tensor(shape_format)
+
+    def test_pow_shape_format_scalar_tensor_fp16_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float16, i, [18, 64]] for i in format_list]
+        self.pow_result_scalar_tensor(shape_format)
+
+    def test_pow_shape_format_scalar_tensor_fp32_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float32, i, [18, 64]] for i in format_list]
+        self.pow_result_scalar_tensor(shape_format)
+
+    def test_pow_shape_format_scalar_tensor_fp16_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float16, i, [18, 64, 128]] for i in format_list]
+        self.pow_result_scalar_tensor(shape_format)
+
+    def test_pow_shape_format_scalar_tensor_fp32_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float32, i, [18, 64, 128]] for i in format_list]
+        self.pow_result_scalar_tensor(shape_format)
+
+    def test_pow_shape_format_scalar_tensor_fp16_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float16, i, [18, 64, 128, 256]] for i in format_list]
+        self.pow_result_scalar_tensor(shape_format)
+
+    def test_pow_shape_format_scalar_tensor_fp32_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float32, i, [18, 64, 128, 256]] for i in format_list]
+        self.pow_result_scalar_tensor(shape_format)
+
+    # tensor_scalar-----------------------------------------------------------
+    def test_pow_shape_format_tensor_scala_fp16_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[np.float16, i, [18]] for i in format_list]
+        self.pow_result_tensor_scalar_(shape_format)
+
+    def test_pow_shape_format_tensor_scalar_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[np.float32, i, [18]] for i in format_list]
+        self.pow_result_tensor_scalar_(shape_format)
+
+    def test_pow_shape_format_tensor_scala_fp16_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float16, i, [18, 64]] for i in format_list]
+        self.pow_result_tensor_scalar_(shape_format)
+
+    def test_pow_shape_format_tensor_scalar_fp32_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float32, i, [18, 64]] for i in format_list]
+        self.pow_result_tensor_scalar_(shape_format)
+
+    def test_pow_shape_format_tensor_scala_fp16_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float16, i, [18, 64, 128]] for i in format_list]
+        self.pow_result_tensor_scalar_(shape_format)
+
+    def test_pow_shape_format_tensor_scalar_fp32_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float32, i, [18, 64, 128]] for i in format_list]
+        self.pow_result_tensor_scalar_(shape_format)
+
+    def test_pow_shape_format_tensor_scala_fp16_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float16, i, [18, 64, 128, 256]] for i in format_list]
+        self.pow_result_tensor_scalar_(shape_format)
+
+    def test_pow_shape_format_tensor_scalar_fp32_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float32, i, [18, 64, 128, 256]] for i in format_list]
+        self.pow_result_tensor_scalar_(shape_format)
+
+    # tensor_tensor-----------------------------------------------------------
+    def test_pow_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float16, i, [5]], [np.float16, i, []]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0, 3,]
+        shape_format = [[[np.float32, i, [5]], [np.float32, i, []]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [448, 1]], [np.float16, i, []]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [448, 1]], [np.float32, i, []]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [16, 640, 640]], [np.float16, i, []]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [16, 640, 640]], [np.float32, i, []]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, []]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, []]] for i in format_list]
+        self.pow_result(shape_format)
+    
+    #broadcast
+    def test_pow_shape_format_fp16_2d_broadcast(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [448, 20]], [np.float16, i, [448,1]]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp32_2d_broadcast(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [448, 20]], [np.float32, i, [448,1]]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp16_3d_broadcast(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [16, 640, 640]], [np.float16, i, [16, 640, 1]]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp32_3d_broadcast(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [16, 640, 640]], [np.float32, i, [16, 1, 1]]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp16_4d_broadcast(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 1, 1, 1]]] for i in format_list]
+        self.pow_result(shape_format)
+
+    def test_pow_shape_format_fp32_4d_broadcast(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [32, 3, 3, 3]], [np.float32, i, [32, 3, 1, 1]]] for i in format_list]
+        self.pow_result(shape_format)
+
+
+instantiate_device_type_tests(TestPow, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_prod.py b/test/test_npu/test_network_ops/test_prod.py
old mode 100644
new mode 100755
index 985bf5912d833af19f86093cb59d9b40c1563220..1e97410c12307f2e073bf66e2db3bd4f8b911005
--- a/test/test_npu/test_network_ops/test_prod.py
+++ b/test/test_npu/test_network_ops/test_prod.py
@@ -1,457 +1,457 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestProd(TestCase):
-    def create_input_tensor(self, dtype, npu_format, shape, minValue, maxValue):
-        input1 = np.random.uniform(minValue, maxValue, shape).astype(dtype)
-        cpu_input = torch.from_numpy(input1)
-        npu_input = torch.from_numpy(input1).npu()
-        if npu_format != -1:
-            npu_input = npu_input.npu_format_cast(npu_format)
-        return cpu_input, npu_input
-
-    def cpu_op_exec(self, input1):
-        output = torch.prod(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.prod(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_dataType_exec(self, input1):
-        output = torch.prod(input1, dtype=torch.float32)
-        output = output.numpy()
-        return output
-
-    def npu_op_dataType_exec(self, input1):
-        output = torch.prod(input1, dtype=torch.float32)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_dim_exec(self, input1, dim, keepdim):
-        output = torch.prod(input1, dim, keepdim)
-        output = output.numpy()
-        return output
-
-    def npu_op_dim_exec(self, input1, dim, keepdim):
-        output = torch.prod(input1, dim, keepdim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def cpu_op_dim_out_exec(self, input1, dim, keepdim, output):    
-        output = torch.prod(input1, dim, keepdim, out = output)
-        output = output.numpy()
-        return output
-
-    def npu_op_dim_out_exec(self, input1, dim, keepdim, output):
-        output = torch.prod(input1, dim, keepdim, out = output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_dimname_exec(self, input1, dim, keepdim):
-        output = torch.prod(input1, dim, keepdim)
-        output = output.numpy()
-        return output
-
-    def npu_op_dimname_exec(self, input1, dim, keepdim):
-        output = torch.prod(input1, dim, keepdim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_dimname_out_exec(self, input1, dim, keepdim, output):
-        output = torch.prod(input1, dim, keepdim, out = output)
-        output = output.numpy()
-        return output
-
-    def npu_op_dimname_out_exec(self, input1, dim, keepdim, output):
-        output = torch.prod(input1, dim, keepdim, out = output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def prod_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            cpu_output_dataType = self.cpu_op_dataType_exec(cpu_input1)
-            npu_output_dataType = self.npu_op_dataType_exec(npu_input1)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_output_dataType = cpu_output_dataType.astype(npu_output_dataType.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output_dataType, npu_output_dataType)
-
-    def prod_dim_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output_dim = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
-            npu_output_dim = self.npu_op_dim_exec(npu_input1, item[1], item[2])
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-
-    def output_shape(self, item):
-        output_size = list(item[0][2])
-        dims = len(item[0][2])
-        keepdim = item[2]
-        dim = item[1]
-        if dim < dims and keepdim == True:
-            output_size[dim] = 1
-        if  dim < dims and keepdim == False:
-            output_size.pop(dim) 
-        return output_size
-
-    def prod_dim_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
-            shapes = self.output_shape(item)
-            cpu_output, npu_output = self.create_input_tensor(item[0][0],item[0][1],shapes, 0, 1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_output = cpu_output.to(torch.float32)
-            cpu_output_dim = self.cpu_op_dim_out_exec(cpu_input1, item[1], item[2], cpu_output)
-            npu_output_dim = self.npu_op_dim_out_exec(npu_input1, item[1], item[2], npu_output)
-
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-
-            cpu_out = torch.tensor(0).to(input1.dtype)
-            npu_out = cpu_out.npu()
-            cpu_output_dim = self.cpu_op_dim_out_exec(cpu_input1, item[1], item[2], cpu_out)
-            npu_output_dim = self.npu_op_dim_out_exec(npu_input1, item[1], item[2], npu_out)
-            cpu_out = cpu_out.astype(npu_out.dtype)
-            self.assertRtolEqual(cpu_out, npu_out)
-
-    def prod_dim_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
-            shapes = self.output_shape(item)
-            cpu_output, npu_output = self.create_input_tensor(item[0][0],item[0][1],shapes, 0, 1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_output = cpu_output.to(torch.float32)
-            cpu_output_dim = self.cpu_op_dim_out_exec(cpu_input1, item[1], item[2], cpu_output)
-            npu_output_dim = self.npu_op_dim_out_exec(npu_input1, item[1], item[2], npu_output)
-
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-    
-    def prod_dim_name_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input1.names = item[0][3]
-            npu_input1.names = item[0][3]
-
-            cpu_output_dim = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
-            npu_output_dim = self.npu_op_dim_exec(npu_input1, item[1], item[2])
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-    
-    def prod_dim_name_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
-            shapes = self.output_shape(item)
-            cpu_output, npu_output = self.create_input_tensor(item[0][0],item[0][1],shapes, 0, 1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_output = cpu_output.to(torch.float32)
-            cpu_input1.names = item[0][3]
-            npu_input1.names = item[0][3]
-            cpu_output_dim = self.cpu_op_dim_out_exec(cpu_input1, item[1], item[2], cpu_output)
-            npu_output_dim = self.npu_op_dim_out_exec(npu_input1, item[1], item[2], npu_output)
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-
-    def test_prod_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_result(shape_format)
-
-    def test_prod_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_result(shape_format)
-
-    def test_prod_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_result(shape_format)
-
-    def test_prod_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 25]], np.random.randint(0, 2), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_result(shape_format)
-
-    def test_prod_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_result(shape_format)
-
-    def test_prod_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_result(shape_format)
-
-    def test_prod_shape_format_fp16_4d(self, device):
-        format_list = [0]
-        keepdim_list = [True]
-        shape_format = [[[np.float16, i, [18, 64, 32, 128]], np.random.randint(0, 4), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_result(shape_format)
-
-    def test_prod_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32, 128]], np.random.randint(0, 4), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_result(shape_format)
-
-    # dim-------------------------------------------------------------
-
-    def test_prod_dim_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_result(shape_format)
-
-    def test_prod_dim_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_result(shape_format)
-
-    def test_prod_dim_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_result(shape_format)
-
-    def test_prod_dim_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_result(shape_format)
-
-    def test_prod_dim_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_result(shape_format)
-
-    def test_prod_dim_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_result(shape_format)
-
-    def test_prod_dim_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 64, 32, 128]], np.random.randint(0, 4), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_result(shape_format)
-
-    def test_prod_dim_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32, 128]], np.random.randint(0, 4), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_result(shape_format)
-
-    #prod.int_out
-
-    def test_prod_dim_out_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_out_result(shape_format)
-
-    def test_prod_dim_out_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_out_result(shape_format)
-
-    def test_prod_dim_out_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_out_result(shape_format)
-
-    def test_prod_dim_out_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64]], np.random.randint(0, 1), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_out_result(shape_format)
-
-    def test_prod_dim_out_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float16, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_out_result(shape_format)
-
-    def test_prod_dim_out_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
-                        for j in keepdim_list
-                        ]
-        self.prod_dim_out_result(shape_format)
-
-    def test_prod_dim_out_shape_format_fp16_4d(self, device):
-        format_list = [0,3,29]
-        keepdim_list = [True]
-        shape_format = [[[np.float16, i, [18, 64, 32, 128]], np.random.randint(0, 4), j]
-                        for i in format_list for j in keepdim_list
-                        ]
-        self.prod_dim_out_result(shape_format)
-
-    def test_prod_dim_out_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32, 128]], np.random.randint(0, 4), j]
-                         for i in format_list for j in keepdim_list
-                       ]
-        self.prod_dim_out_result(shape_format)
-
-    def test_prod_dim_name_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18], ('N',)], np.random.randint(0, 1), j]
-                         for i in format_list for j in keepdim_list
-                       ]
-        self.prod_dim_name_result(shape_format)
-    
-    def test_prod_dim_name_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64], ('N','C')], np.random.randint(0, 2), j]
-                         for i in format_list for j in keepdim_list
-                       ]
-        self.prod_dim_name_result(shape_format)
-    
-    def test_prod_dim_name_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32], ('N','C','H')], np.random.randint(0, 3), j]
-                         for i in format_list for j in keepdim_list
-                       ]
-        self.prod_dim_name_result(shape_format)
-    
-    def test_prod_dim_name_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32, 128], ('N','C','H','W')], np.random.randint(0, 4), j]
-                         for i in format_list for j in keepdim_list
-                       ]
-        self.prod_dim_name_result(shape_format)
-
-    def test_prod_dim_name_out_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18], ('N',)], np.random.randint(0, 1), j]
-                         for i in format_list for j in keepdim_list
-                       ]
-        self.prod_dim_name_out_result(shape_format)
-
-    def test_prod_dim_name_out_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64], ('N','C')], np.random.randint(0, 1), j]
-                         for i in format_list for j in keepdim_list
-                       ]
-        self.prod_dim_name_out_result(shape_format)
-    
-    def test_prod_dim_name_out_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32], ('N','C','H')], np.random.randint(0, 3), j]
-                         for i in format_list for j in keepdim_list
-                       ]
-        self.prod_dim_name_out_result(shape_format)
-
-    def test_prod_dim_name_out_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        keepdim_list = [True, False]
-        shape_format = [[[np.float32, i, [18, 64, 32, 128], ('N','C','H','W')], np.random.randint(0, 4), j]
-                         for i in format_list for j in keepdim_list
-                       ]
-        self.prod_dim_name_out_result(shape_format)
-
-instantiate_device_type_tests(TestProd, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestProd(TestCase):
+    def create_input_tensor(self, dtype, npu_format, shape, minValue, maxValue):
+        input1 = np.random.uniform(minValue, maxValue, shape).astype(dtype)
+        cpu_input = torch.from_numpy(input1)
+        npu_input = torch.from_numpy(input1).npu()
+        if npu_format != -1:
+            npu_input = npu_input.npu_format_cast(npu_format)
+        return cpu_input, npu_input
+
+    def cpu_op_exec(self, input1):
+        output = torch.prod(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.prod(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_dataType_exec(self, input1):
+        output = torch.prod(input1, dtype=torch.float32)
+        output = output.numpy()
+        return output
+
+    def npu_op_dataType_exec(self, input1):
+        output = torch.prod(input1, dtype=torch.float32)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_dim_exec(self, input1, dim, keepdim):
+        output = torch.prod(input1, dim, keepdim)
+        output = output.numpy()
+        return output
+
+    def npu_op_dim_exec(self, input1, dim, keepdim):
+        output = torch.prod(input1, dim, keepdim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def cpu_op_dim_out_exec(self, input1, dim, keepdim, output):    
+        output = torch.prod(input1, dim, keepdim, out = output)
+        output = output.numpy()
+        return output
+
+    def npu_op_dim_out_exec(self, input1, dim, keepdim, output):
+        output = torch.prod(input1, dim, keepdim, out = output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_dimname_exec(self, input1, dim, keepdim):
+        output = torch.prod(input1, dim, keepdim)
+        output = output.numpy()
+        return output
+
+    def npu_op_dimname_exec(self, input1, dim, keepdim):
+        output = torch.prod(input1, dim, keepdim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_dimname_out_exec(self, input1, dim, keepdim, output):
+        output = torch.prod(input1, dim, keepdim, out = output)
+        output = output.numpy()
+        return output
+
+    def npu_op_dimname_out_exec(self, input1, dim, keepdim, output):
+        output = torch.prod(input1, dim, keepdim, out = output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def prod_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output_dataType = self.cpu_op_dataType_exec(cpu_input1)
+            npu_output_dataType = self.npu_op_dataType_exec(npu_input1)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_output_dataType = cpu_output_dataType.astype(npu_output_dataType.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output_dataType, npu_output_dataType)
+
+    def prod_dim_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output_dim = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
+            npu_output_dim = self.npu_op_dim_exec(npu_input1, item[1], item[2])
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+
+    def output_shape(self, item):
+        output_size = list(item[0][2])
+        dims = len(item[0][2])
+        keepdim = item[2]
+        dim = item[1]
+        if dim < dims and keepdim == True:
+            output_size[dim] = 1
+        if  dim < dims and keepdim == False:
+            output_size.pop(dim) 
+        return output_size
+
+    def prod_dim_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
+            shapes = self.output_shape(item)
+            cpu_output, npu_output = self.create_input_tensor(item[0][0],item[0][1],shapes, 0, 1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_output_dim = self.cpu_op_dim_out_exec(cpu_input1, item[1], item[2], cpu_output)
+            npu_output_dim = self.npu_op_dim_out_exec(npu_input1, item[1], item[2], npu_output)
+
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+
+            cpu_out = torch.tensor(0).to(input1.dtype)
+            npu_out = cpu_out.npu()
+            cpu_output_dim = self.cpu_op_dim_out_exec(cpu_input1, item[1], item[2], cpu_out)
+            npu_output_dim = self.npu_op_dim_out_exec(npu_input1, item[1], item[2], npu_out)
+            cpu_out = cpu_out.astype(npu_out.dtype)
+            self.assertRtolEqual(cpu_out, npu_out)
+
+    def prod_dim_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
+            shapes = self.output_shape(item)
+            cpu_output, npu_output = self.create_input_tensor(item[0][0],item[0][1],shapes, 0, 1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_output_dim = self.cpu_op_dim_out_exec(cpu_input1, item[1], item[2], cpu_output)
+            npu_output_dim = self.npu_op_dim_out_exec(npu_input1, item[1], item[2], npu_output)
+
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+    
+    def prod_dim_name_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input1.names = item[0][3]
+            npu_input1.names = item[0][3]
+
+            cpu_output_dim = self.cpu_op_dim_exec(cpu_input1, item[1], item[2])
+            npu_output_dim = self.npu_op_dim_exec(npu_input1, item[1], item[2])
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+    
+    def prod_dim_name_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 1)
+            shapes = self.output_shape(item)
+            cpu_output, npu_output = self.create_input_tensor(item[0][0],item[0][1],shapes, 0, 1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_input1.names = item[0][3]
+            npu_input1.names = item[0][3]
+            cpu_output_dim = self.cpu_op_dim_out_exec(cpu_input1, item[1], item[2], cpu_output)
+            npu_output_dim = self.npu_op_dim_out_exec(npu_input1, item[1], item[2], npu_output)
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+
+    def test_prod_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_result(shape_format)
+
+    def test_prod_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_result(shape_format)
+
+    def test_prod_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 2), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_result(shape_format)
+
+    def test_prod_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 25]], np.random.randint(0, 2), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_result(shape_format)
+
+    def test_prod_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_result(shape_format)
+
+    def test_prod_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_result(shape_format)
+
+    def test_prod_shape_format_fp16_4d(self, device):
+        format_list = [0]
+        keepdim_list = [True]
+        shape_format = [[[np.float16, i, [18, 64, 32, 128]], np.random.randint(0, 4), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_result(shape_format)
+
+    def test_prod_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32, 128]], np.random.randint(0, 4), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_result(shape_format)
+
+    # dim-------------------------------------------------------------
+
+    def test_prod_dim_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_result(shape_format)
+
+    def test_prod_dim_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_result(shape_format)
+
+    def test_prod_dim_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_result(shape_format)
+
+    def test_prod_dim_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_result(shape_format)
+
+    def test_prod_dim_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_result(shape_format)
+
+    def test_prod_dim_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_result(shape_format)
+
+    def test_prod_dim_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 64, 32, 128]], np.random.randint(0, 4), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_result(shape_format)
+
+    def test_prod_dim_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32, 128]], np.random.randint(0, 4), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_result(shape_format)
+
+    #prod.int_out
+
+    def test_prod_dim_out_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_out_result(shape_format)
+
+    def test_prod_dim_out_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_out_result(shape_format)
+
+    def test_prod_dim_out_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 256]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_out_result(shape_format)
+
+    def test_prod_dim_out_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64]], np.random.randint(0, 1), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_out_result(shape_format)
+
+    def test_prod_dim_out_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float16, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_out_result(shape_format)
+
+    def test_prod_dim_out_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32]], np.random.randint(0, 3), j] for i in format_list
+                        for j in keepdim_list
+                        ]
+        self.prod_dim_out_result(shape_format)
+
+    def test_prod_dim_out_shape_format_fp16_4d(self, device):
+        format_list = [0,3,29]
+        keepdim_list = [True]
+        shape_format = [[[np.float16, i, [18, 64, 32, 128]], np.random.randint(0, 4), j]
+                        for i in format_list for j in keepdim_list
+                        ]
+        self.prod_dim_out_result(shape_format)
+
+    def test_prod_dim_out_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32, 128]], np.random.randint(0, 4), j]
+                         for i in format_list for j in keepdim_list
+                       ]
+        self.prod_dim_out_result(shape_format)
+
+    def test_prod_dim_name_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18], ('N',)], np.random.randint(0, 1), j]
+                         for i in format_list for j in keepdim_list
+                       ]
+        self.prod_dim_name_result(shape_format)
+    
+    def test_prod_dim_name_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64], ('N','C')], np.random.randint(0, 2), j]
+                         for i in format_list for j in keepdim_list
+                       ]
+        self.prod_dim_name_result(shape_format)
+    
+    def test_prod_dim_name_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32], ('N','C','H')], np.random.randint(0, 3), j]
+                         for i in format_list for j in keepdim_list
+                       ]
+        self.prod_dim_name_result(shape_format)
+    
+    def test_prod_dim_name_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32, 128], ('N','C','H','W')], np.random.randint(0, 4), j]
+                         for i in format_list for j in keepdim_list
+                       ]
+        self.prod_dim_name_result(shape_format)
+
+    def test_prod_dim_name_out_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18], ('N',)], np.random.randint(0, 1), j]
+                         for i in format_list for j in keepdim_list
+                       ]
+        self.prod_dim_name_out_result(shape_format)
+
+    def test_prod_dim_name_out_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64], ('N','C')], np.random.randint(0, 1), j]
+                         for i in format_list for j in keepdim_list
+                       ]
+        self.prod_dim_name_out_result(shape_format)
+    
+    def test_prod_dim_name_out_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32], ('N','C','H')], np.random.randint(0, 3), j]
+                         for i in format_list for j in keepdim_list
+                       ]
+        self.prod_dim_name_out_result(shape_format)
+
+    def test_prod_dim_name_out_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        keepdim_list = [True, False]
+        shape_format = [[[np.float32, i, [18, 64, 32, 128], ('N','C','H','W')], np.random.randint(0, 4), j]
+                         for i in format_list for j in keepdim_list
+                       ]
+        self.prod_dim_name_out_result(shape_format)
+
+instantiate_device_type_tests(TestProd, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_ptmuls.py b/test/test_npu/test_network_ops/test_ptmuls.py
index e8952eb6c5fcb8470e93fc183d01f1784ceed1e9..5a23432b69b7cb2f27bcf331d3f5b2b22eb69239 100644
--- a/test/test_npu/test_network_ops/test_ptmuls.py
+++ b/test/test_npu/test_network_ops/test_ptmuls.py
@@ -1,69 +1,69 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestPtMuls(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = torch.mul(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.mul(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_ptmuls_shape_format_fp16(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_list = [(64, 10), (32, 3, 3), (256, 2048, 7, 7)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
-            cpu_input2 = 4.0
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, cpu_input2)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_ptmuls_shape_format_fp32(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_list = [(64, 10), (32, 3, 3), (256, 2048, 7, 7)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
-            cpu_input2 = 6.2
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, cpu_input2)
-            self.assertRtolEqual(cpu_output, npu_output)
-            cpu_output1 = self.cpu_op_exec(cpu_input2, cpu_input1)
-            npu_output1 = self.npu_op_exec(cpu_input2, npu_input1)
-            self.assertRtolEqual(cpu_output1, npu_output1)
-
-instantiate_device_type_tests(TestPtMuls, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
-
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestPtMuls(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.mul(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.mul(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_ptmuls_shape_format_fp16(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_list = [(64, 10), (32, 3, 3), (256, 2048, 7, 7)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2 = 4.0
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, cpu_input2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_ptmuls_shape_format_fp32(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_list = [(64, 10), (32, 3, 3), (256, 2048, 7, 7)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 1, 100)
+            cpu_input2 = 6.2
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, cpu_input2)
+            self.assertRtolEqual(cpu_output, npu_output)
+            cpu_output1 = self.cpu_op_exec(cpu_input2, cpu_input1)
+            npu_output1 = self.npu_op_exec(cpu_input2, npu_input1)
+            self.assertRtolEqual(cpu_output1, npu_output1)
+
+instantiate_device_type_tests(TestPtMuls, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_npu/test_network_ops/test_qr.py b/test/test_npu/test_network_ops/test_qr.py
index 52b2f94914f000593e7ffcfc83ef7a1ef5235de6..5b73a615e1ff54a23506077b2ee956c063fda56d 100644
--- a/test/test_npu/test_network_ops/test_qr.py
+++ b/test/test_npu/test_network_ops/test_qr.py
@@ -1,123 +1,123 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestQr(TestCase):
-    def cpu_op_exec(self, input1, some):
-        q, r = torch.qr(input1, some)
-        return q.numpy(), r.numpy()
-
-    def cpu_op_exec_tuple(self, input1, some):
-        out = torch.qr(input1, some)
-        output_q = out.Q
-        output_r = out.R
-        output_q = output_q.numpy()
-        output_r = output_r.numpy()
-        return output_q, output_r, out
-
-    def npu_op_exec(self, input1, some):
-        q, r = torch.qr(input1, some)
-        qout = q.to("cpu").numpy()
-        rout = r.to("cpu").numpy()
-        return qout, rout
-
-    def npu_op_exec_tuple(self, input1, some):
-        out = torch.qr(input1.to("npu"), some)
-        output_q = out.Q
-        output_r = out.R
-        output_q = output_q.to("cpu")
-        output_r = output_r.to("cpu")
-        output_q = output_q.numpy()
-        output_r = output_r.numpy()
-        return output_q, output_r, out
-
-    def npu_op_exec_out(self, input1, some, input2, input3):
-        torch.qr(input1, some, out=(input2, input3))
-        qout = input2.to("cpu").numpy()
-        rout = input3.to("cpu").numpy()
-        return qout, rout
-
-    def test_qr_shape_format(self, device):
-        # TODO(ascend): 算子目前 暂不支持fp16, 后续开发中
-        dtype_list = [np.float32]
-        format_list = [-1]
-        # Note:
-        # precision may be lost if the magnitudes of the elements of input are large
-        shape_list = [
-            [3, 4],
-            [2, 30, 30],
-            [20, 10, 20],
-            [8, 6, 50, 20],
-            [10, 4, 6, 15, 13]
-        ]
-        somes_list = [True, False]
-        shape_format = [
-            [[d, i, j], l] for d in dtype_list for i in format_list 
-                     for j in shape_list for l in somes_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10)
-            npu_input2 = torch.empty(0).npu().to(cpu_input1.dtype)
-            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output1, cpu_output2 = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output1, npu_output2 = self.npu_op_exec(npu_input1, item[1])
-            npu_output1_out, npu_output2_out = self.npu_op_exec_out(npu_input1, item[1], npu_input2, npu_input3)
-            cpu_output1 = cpu_output1.astype(npu_output1.dtype)
-            cpu_output2 = cpu_output2.astype(npu_output2.dtype)
-
-            self.assertRtolEqual(cpu_output1, npu_output1)
-            self.assertRtolEqual(cpu_output2, npu_output2)
-            self.assertRtolEqual(npu_output1_out, npu_output1)
-            self.assertRtolEqual(npu_output2_out, npu_output2)
-
-    def test_qr_common_shape_format(self, device):
-        shape_format = [
-            [np.float32, -1, (5, 3)],
-            [np.float32, -1, (1, 64, 147, 147)],
-            [np.float32, -1, (65536, 14, 7, 1)],
-            [np.int32, -1, (1000000, 3, 3, 1)],
-            [np.int32, -1, (1024, 107, 31, 2)],
-            [np.int32, -1, (1, 128, 1, 1)]
-        ]
-        somes = [True, False]
-        for item in shape_format:
-            for some in somes:
-                cpu_input1, npu_input1 = create_common_tensor(item, -0.001, 0.001)
-                if cpu_input1.dtype == torch.int32:
-                    cpu_input1 = cpu_input1.to(torch.float32)
-                if npu_input1.dtype == torch.int32:
-                    npu_input1 = npu_input1.to(torch.float32)       
-                cpu_output_q, cpu_output_r, cpu_out = self.cpu_op_exec_tuple(cpu_input1, some)
-                npu_output_q, npu_output_r, npu_out = self.npu_op_exec_tuple(npu_input1, some)
-                npu_output = np.matmul(npu_output_q, npu_output_r)
-
-                self.assertRtolEqual(cpu_output_q, npu_output_q)
-                self.assertRtolEqual(cpu_output_r, npu_output_r)
-                self.assertRtolEqual(cpu_input1.numpy(), npu_output)
-
-instantiate_device_type_tests(TestQr, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestQr(TestCase):
+    def cpu_op_exec(self, input1, some):
+        q, r = torch.qr(input1, some)
+        return q.numpy(), r.numpy()
+
+    def cpu_op_exec_tuple(self, input1, some):
+        out = torch.qr(input1, some)
+        output_q = out.Q
+        output_r = out.R
+        output_q = output_q.numpy()
+        output_r = output_r.numpy()
+        return output_q, output_r, out
+
+    def npu_op_exec(self, input1, some):
+        q, r = torch.qr(input1, some)
+        qout = q.to("cpu").numpy()
+        rout = r.to("cpu").numpy()
+        return qout, rout
+
+    def npu_op_exec_tuple(self, input1, some):
+        out = torch.qr(input1.to("npu"), some)
+        output_q = out.Q
+        output_r = out.R
+        output_q = output_q.to("cpu")
+        output_r = output_r.to("cpu")
+        output_q = output_q.numpy()
+        output_r = output_r.numpy()
+        return output_q, output_r, out
+
+    def npu_op_exec_out(self, input1, some, input2, input3):
+        torch.qr(input1, some, out=(input2, input3))
+        qout = input2.to("cpu").numpy()
+        rout = input3.to("cpu").numpy()
+        return qout, rout
+
+    def test_qr_shape_format(self, device):
+        # TODO(ascend): 算子目前 暂不支持fp16, 后续开发中
+        dtype_list = [np.float32]
+        format_list = [-1]
+        # Note:
+        # precision may be lost if the magnitudes of the elements of input are large
+        shape_list = [
+            [3, 4],
+            [2, 30, 30],
+            [20, 10, 20],
+            [8, 6, 50, 20],
+            [10, 4, 6, 15, 13]
+        ]
+        somes_list = [True, False]
+        shape_format = [
+            [[d, i, j], l] for d in dtype_list for i in format_list 
+                     for j in shape_list for l in somes_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 10)
+            npu_input2 = torch.empty(0).npu().to(cpu_input1.dtype)
+            npu_input3 = torch.empty(0).npu().to(cpu_input1.dtype)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output1, cpu_output2 = self.cpu_op_exec(cpu_input1, item[1])
+            npu_output1, npu_output2 = self.npu_op_exec(npu_input1, item[1])
+            npu_output1_out, npu_output2_out = self.npu_op_exec_out(npu_input1, item[1], npu_input2, npu_input3)
+            cpu_output1 = cpu_output1.astype(npu_output1.dtype)
+            cpu_output2 = cpu_output2.astype(npu_output2.dtype)
+
+            self.assertRtolEqual(cpu_output1, npu_output1)
+            self.assertRtolEqual(cpu_output2, npu_output2)
+            self.assertRtolEqual(npu_output1_out, npu_output1)
+            self.assertRtolEqual(npu_output2_out, npu_output2)
+
+    def test_qr_common_shape_format(self, device):
+        shape_format = [
+            [np.float32, -1, (5, 3)],
+            [np.float32, -1, (1, 64, 147, 147)],
+            [np.float32, -1, (65536, 14, 7, 1)],
+            [np.int32, -1, (1000000, 3, 3, 1)],
+            [np.int32, -1, (1024, 107, 31, 2)],
+            [np.int32, -1, (1, 128, 1, 1)]
+        ]
+        somes = [True, False]
+        for item in shape_format:
+            for some in somes:
+                cpu_input1, npu_input1 = create_common_tensor(item, -0.001, 0.001)
+                if cpu_input1.dtype == torch.int32:
+                    cpu_input1 = cpu_input1.to(torch.float32)
+                if npu_input1.dtype == torch.int32:
+                    npu_input1 = npu_input1.to(torch.float32)       
+                cpu_output_q, cpu_output_r, cpu_out = self.cpu_op_exec_tuple(cpu_input1, some)
+                npu_output_q, npu_output_r, npu_out = self.npu_op_exec_tuple(npu_input1, some)
+                npu_output = np.matmul(npu_output_q, npu_output_r)
+
+                self.assertRtolEqual(cpu_output_q, npu_output_q)
+                self.assertRtolEqual(cpu_output_r, npu_output_r)
+                self.assertRtolEqual(cpu_input1.numpy(), npu_output)
+
+instantiate_device_type_tests(TestQr, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_reciprocal.py b/test/test_npu/test_network_ops/test_reciprocal.py
old mode 100644
new mode 100755
index fac894fb43ee9e933053c14ff3687a47e551b3d9..058188af46c9ffcf78068fef81fa939008b2be29
--- a/test/test_npu/test_network_ops/test_reciprocal.py
+++ b/test/test_npu/test_network_ops/test_reciprocal.py
@@ -1,156 +1,156 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestReciprocal(TestCase):
-    def cpu_op_exec(self, input1):
-        output = torch.reciprocal(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.reciprocal(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2):
-        output = input2.to("npu")
-        torch.reciprocal(input1, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_inp_op_exec(self, input1):
-        output = torch.reciprocal_(input1)
-        output = output.numpy()
-        return output
-
-    def npu_inp_op_exec(self, input1):
-        output = torch.reciprocal_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def reciprocal_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            cpu_output_inp = self.cpu_inp_op_exec(cpu_input1)
-            npu_output_inp = self.npu_inp_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def reciprocal_result_out(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(npu_output_out.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output_out)
-
-    def test_reciprocal_shape_format_fp16_out(self, device):
-        shape_format = [[[np.float16, 0, [18]], [np.float16, 0, [18, 20]]],
-                        [[np.float16, 0, [18, 20, 30]], [np.float16, 0, [18, 20, 30]]],
-                        [[np.float16, 0, [18, 10, 10, 20]], [np.float16, 0, [18, 10, 20]]],
-                        [[np.float16, 3, [18]], [np.float16, 3, [18, 20]]],
-                        [[np.float16, 3, [18, 20, 30]], [np.float16, 3, [18, 20, 30]]],
-                        [[np.float16, 3, [18, 10, 10, 20]], [np.float16, 3, [18, 10, 20]]],
-                        [[np.float16, 4, [18]], [np.float16, 4, [18, 20]]],
-                        [[np.float16, 4, [18, 20, 30]], [np.float16, 4, [18, 20, 30]]],
-                        [[np.float16, 4, [18, 10, 10, 20]], [np.float16, 4, [18, 10, 20]]],
-                        ]
-        self.reciprocal_result_out(shape_format)
-
-    def test_reciprocal_shape_format_fp32_out(self, device):
-        shape_format = [[[np.float32, 0, [18]], [np.float32, 0, [18, 20]]],
-                        [[np.float32, 0, [18, 20, 30]], [np.float32, 0, [18, 20, 30]]],
-                        [[np.float32, 0, [18, 10, 10, 20]], [np.float32, 0, [18, 10, 20]]],
-                        [[np.float32, 3, [18]], [np.float32, 3, [18, 20]]],
-                        [[np.float32, 3, [18, 20, 30]], [np.float32, 3, [18, 20, 30]]],
-                        [[np.float32, 3, [18, 10, 10, 20]], [np.float32, 3, [18, 10, 20]]],
-                        [[np.float32, 4, [18]], [np.float32, 4, [18, 20]]],
-                        [[np.float32, 4, [18, 20, 30]], [np.float32, 4, [18, 20, 30]]],
-                        [[np.float32, 4, [18, 10, 10, 20]], [np.float32, 4, [18, 10, 20]]],
-                        ]
-        self.reciprocal_result_out(shape_format)
-
-    def test_reciprocal_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 4]
-        shape_format = [[np.float16, i, [18]] for i in format_list
-                        ]
-        self.reciprocal_result(shape_format)
-
-    def test_reciprocal_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 4]
-        shape_format = [[np.float32, i, [256]] for i in format_list
-                        ]
-        self.reciprocal_result(shape_format)
-
-    def test_reciprocal_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [[np.float16, i, [64, 516]] for i in format_list
-                        ]
-        self.reciprocal_result(shape_format)
-
-    def test_reciprocal_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [[np.float32, i, [64, 516]] for i in format_list
-                        ]
-        self.reciprocal_result(shape_format)
-
-    def test_reciprocal_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [[np.float16, i, [64, 124, 516]] for i in format_list
-                        ]
-        self.reciprocal_result(shape_format)
-
-    def test_reciprocal_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [[np.float32, i, [64, 124, 516]] for i in format_list
-                        ]
-        self.reciprocal_result(shape_format)
-
-    def test_reciprocal_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [[np.float16, i, [64, 128, 516, 32]] for i in format_list
-                        ]
-        self.reciprocal_result(shape_format)
-
-    def test_reciprocal_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [[np.float32, i, [64, 128, 516, 32]] for i in format_list
-                        ]
-        self.reciprocal_result(shape_format)
-
-
-instantiate_device_type_tests(TestReciprocal, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestReciprocal(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.reciprocal(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.reciprocal(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        output = input2.to("npu")
+        torch.reciprocal(input1, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_inp_op_exec(self, input1):
+        output = torch.reciprocal_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        output = torch.reciprocal_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def reciprocal_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output_inp = self.cpu_inp_op_exec(cpu_input1)
+            npu_output_inp = self.npu_inp_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def reciprocal_result_out(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(npu_output_out.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output_out)
+
+    def test_reciprocal_shape_format_fp16_out(self, device):
+        shape_format = [[[np.float16, 0, [18]], [np.float16, 0, [18, 20]]],
+                        [[np.float16, 0, [18, 20, 30]], [np.float16, 0, [18, 20, 30]]],
+                        [[np.float16, 0, [18, 10, 10, 20]], [np.float16, 0, [18, 10, 20]]],
+                        [[np.float16, 3, [18]], [np.float16, 3, [18, 20]]],
+                        [[np.float16, 3, [18, 20, 30]], [np.float16, 3, [18, 20, 30]]],
+                        [[np.float16, 3, [18, 10, 10, 20]], [np.float16, 3, [18, 10, 20]]],
+                        [[np.float16, 4, [18]], [np.float16, 4, [18, 20]]],
+                        [[np.float16, 4, [18, 20, 30]], [np.float16, 4, [18, 20, 30]]],
+                        [[np.float16, 4, [18, 10, 10, 20]], [np.float16, 4, [18, 10, 20]]],
+                        ]
+        self.reciprocal_result_out(shape_format)
+
+    def test_reciprocal_shape_format_fp32_out(self, device):
+        shape_format = [[[np.float32, 0, [18]], [np.float32, 0, [18, 20]]],
+                        [[np.float32, 0, [18, 20, 30]], [np.float32, 0, [18, 20, 30]]],
+                        [[np.float32, 0, [18, 10, 10, 20]], [np.float32, 0, [18, 10, 20]]],
+                        [[np.float32, 3, [18]], [np.float32, 3, [18, 20]]],
+                        [[np.float32, 3, [18, 20, 30]], [np.float32, 3, [18, 20, 30]]],
+                        [[np.float32, 3, [18, 10, 10, 20]], [np.float32, 3, [18, 10, 20]]],
+                        [[np.float32, 4, [18]], [np.float32, 4, [18, 20]]],
+                        [[np.float32, 4, [18, 20, 30]], [np.float32, 4, [18, 20, 30]]],
+                        [[np.float32, 4, [18, 10, 10, 20]], [np.float32, 4, [18, 10, 20]]],
+                        ]
+        self.reciprocal_result_out(shape_format)
+
+    def test_reciprocal_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 4]
+        shape_format = [[np.float16, i, [18]] for i in format_list
+                        ]
+        self.reciprocal_result(shape_format)
+
+    def test_reciprocal_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 4]
+        shape_format = [[np.float32, i, [256]] for i in format_list
+                        ]
+        self.reciprocal_result(shape_format)
+
+    def test_reciprocal_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [[np.float16, i, [64, 516]] for i in format_list
+                        ]
+        self.reciprocal_result(shape_format)
+
+    def test_reciprocal_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [[np.float32, i, [64, 516]] for i in format_list
+                        ]
+        self.reciprocal_result(shape_format)
+
+    def test_reciprocal_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [[np.float16, i, [64, 124, 516]] for i in format_list
+                        ]
+        self.reciprocal_result(shape_format)
+
+    def test_reciprocal_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [[np.float32, i, [64, 124, 516]] for i in format_list
+                        ]
+        self.reciprocal_result(shape_format)
+
+    def test_reciprocal_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [[np.float16, i, [64, 128, 516, 32]] for i in format_list
+                        ]
+        self.reciprocal_result(shape_format)
+
+    def test_reciprocal_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [[np.float32, i, [64, 128, 516, 32]] for i in format_list
+                        ]
+        self.reciprocal_result(shape_format)
+
+
+instantiate_device_type_tests(TestReciprocal, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_relu.py b/test/test_npu/test_network_ops/test_relu.py
old mode 100644
new mode 100755
index c05e82ec5a045be515930a512f2fa1c21f03272d..7d2f578d46ef792c6909ccafc8f9995ae86d3609
--- a/test/test_npu/test_network_ops/test_relu.py
+++ b/test/test_npu/test_network_ops/test_relu.py
@@ -1,107 +1,107 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestRelu(TestCase):
-    def cpu_op_back_exec(self, input1):
-        w = torch.ones_like(input1)
-        input1.requires_grad_(True)
-        output = torch.relu(input1)
-        output.backward(w)
-        res = input1.grad
-        res = res.numpy()
-        return output.detach().numpy(), res
-
-    def npu_op_back_exec(self, input1):
-        w = torch.ones_like(input1)
-        input1.requires_grad_(True)
-        output = torch.relu(input1)
-        output.backward(w)
-        output = output.to("cpu")
-        res = input1.grad.to("cpu")
-        res = res.numpy()
-        return output.detach().numpy(), res
-
-    def cpu_inp_op_exec(self, input1):
-        output = torch.relu_(input1)
-        output = output.numpy()
-        return output
-
-    def npu_inp_op_exec(self, input1):
-        output = torch.relu_(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_relu_shape_format_fp32(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output, cpu_res = self.cpu_op_back_exec(cpu_input1)
-            npu_output, npu_res = self.npu_op_back_exec(npu_input1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_res = cpu_res.astype(npu_res.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_res, npu_res)
-
-    def test_relu_shape_format_fp16(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output, cpu_res = self.cpu_op_back_exec(cpu_input1)
-            npu_output, npu_res = self.npu_op_back_exec(npu_input1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_res = cpu_res.astype(npu_res.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_res, npu_res)
-
-    def test_relu_shape_format_fp16_inp(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_inp_op_exec(cpu_input1)
-            npu_output = self.npu_inp_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestRelu, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestRelu(TestCase):
+    def cpu_op_back_exec(self, input1):
+        w = torch.ones_like(input1)
+        input1.requires_grad_(True)
+        output = torch.relu(input1)
+        output.backward(w)
+        res = input1.grad
+        res = res.numpy()
+        return output.detach().numpy(), res
+
+    def npu_op_back_exec(self, input1):
+        w = torch.ones_like(input1)
+        input1.requires_grad_(True)
+        output = torch.relu(input1)
+        output.backward(w)
+        output = output.to("cpu")
+        res = input1.grad.to("cpu")
+        res = res.numpy()
+        return output.detach().numpy(), res
+
+    def cpu_inp_op_exec(self, input1):
+        output = torch.relu_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        output = torch.relu_(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_relu_shape_format_fp32(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output, cpu_res = self.cpu_op_back_exec(cpu_input1)
+            npu_output, npu_res = self.npu_op_back_exec(npu_input1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_res = cpu_res.astype(npu_res.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_res, npu_res)
+
+    def test_relu_shape_format_fp16(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output, cpu_res = self.cpu_op_back_exec(cpu_input1)
+            npu_output, npu_res = self.npu_op_back_exec(npu_input1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_res = cpu_res.astype(npu_res.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_res, npu_res)
+
+    def test_relu_shape_format_fp16_inp(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_list = [(1000, 1280), (32, 3, 3), (1024, 464, 7, 7)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_inp_op_exec(cpu_input1)
+            npu_output = self.npu_inp_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestRelu, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_remainder.py b/test/test_npu/test_network_ops/test_remainder.py
old mode 100644
new mode 100755
index 4d017a53c5189eb0cf6cc43e31676d63a38a9bf5..64f6311b8a6e2856f334bcd29a4f77b4004ac9d2
--- a/test/test_npu/test_network_ops/test_remainder.py
+++ b/test/test_npu/test_network_ops/test_remainder.py
@@ -1,228 +1,228 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestRemainder(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = torch.remainder(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.remainder(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, out):
-        # output = out.to("npu")
-        output = torch.remainder(input1, input2, out=out)
-        output = out.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec(self, input1, input2):
-        output = input1.remainder_(input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_exec(self, input1, input2):
-        output = input1.remainder_(input2)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input1, input2):
-        # input1 = input1.to("npu")
-        output = torch.remainder(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def remainder_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            npu_input3 = torch.randn(6).to("npu")
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-
-    def remainder_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
-            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
-            cpu_output_inplace = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
-            npu_output_inplace = self.npu_op_inplace_exec(npu_input1, npu_input2)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_output_inplace = cpu_output_inplace.astype(npu_output_inplace.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output_inplace, npu_output_inplace)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-
-    def remainder_scalar_result(self, shape_format):
-        for item in shape_format:
-            scalar = np.random.uniform(0, 100)
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 2)
-            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, scalar)
-            npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar)
-            npu_output_out = self.npu_op_exec_out(npu_input1, scalar, npu_input3)
-
-            cpu_output = cpu_output.astype(npu_output_scalar.dtype)
-            self.assertRtolEqual(cpu_output, npu_output_scalar)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-
-    def test_remainder_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float16, i, [4]] for i in format_list
-                        ]
-        self.remainder_result(shape_format)
-
-    def test_remainder_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float32, i, [4]] for i in format_list
-                        ]
-        self.remainder_result(shape_format)
-
-    def test_remainder_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [4, 18]] for i in format_list
-                        ]
-        self.remainder_result(shape_format)
-
-    def test_remainder_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [4, 18]] for i in format_list
-                        ]
-        self.remainder_result(shape_format)
-
-    def test_remainder_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [4, 18, 32]] for i in format_list
-                        ]
-        self.remainder_result(shape_format)
-
-    def test_remainder_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [4, 18, 32]] for i in format_list
-                        ]
-        self.remainder_result(shape_format)
-
-    def test_remainder_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [4, 18, 32, 128]] for i in format_list
-                        ]
-        self.remainder_result(shape_format)
-
-    def test_remainder_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [4, 18, 32, 128]] for i in format_list
-                        ]
-        self.remainder_result(shape_format)
-
-    # scalar----------------------------------------------------------
-    def test_remainder_scalar_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float16, i, [4]] for i in format_list
-                        ]
-        self.remainder_scalar_result(shape_format)
-
-    def test_remainder_scalar_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float32, i, [4]] for i in format_list
-                        ]
-        self.remainder_scalar_result(shape_format)
-
-    def test_remainder_scalar_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [4, 18]] for i in format_list
-                        ]
-        self.remainder_scalar_result(shape_format)
-
-    def test_remainder_scalar_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [4, 18]] for i in format_list
-                        ]
-        self.remainder_scalar_result(shape_format)
-
-    def test_remainder_scalar_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [4, 18, 32]] for i in format_list
-                        ]
-        self.remainder_scalar_result(shape_format)
-
-    def test_remainder_scalar_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [4, 18, 32]] for i in format_list
-                        ]
-        self.remainder_scalar_result(shape_format)
-
-    def test_remainder_scalar_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [4, 18, 32, 128]] for i in format_list
-                        ]
-        self.remainder_scalar_result(shape_format)
-
-    def test_remainder_scalar_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [4, 18, 32, 128]] for i in format_list
-                        ]
-        self.remainder_scalar_result(shape_format)
-
-    def test_remainder_mix_dtype_1(self, device):
-        npu_input1, npu_input2 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100)
-        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
-        npu_output = self.npu_op_exec(npu_input1, npu_input3)
-        self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test_remainder_mix_dtype_2(self, device):
-        npu_input1, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
-        npu_input3 = torch.tensor(3).int()
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
-        npu_output = self.npu_op_exec(npu_input1, npu_input3)
-        self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test_remainder_scalar_shape_format_fp32_out_4d(self, device):
-        format_list = [0]
-        shape_format = [[np.float32, i, [4, 18, 32, 128]] for i in format_list
-                        ]
-        self.remainder_out_result(shape_format)
-
-
-instantiate_device_type_tests(TestRemainder, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestRemainder(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.remainder(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.remainder(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, out):
+        # output = out.to("npu")
+        output = torch.remainder(input1, input2, out=out)
+        output = out.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec(self, input1, input2):
+        output = input1.remainder_(input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec(self, input1, input2):
+        output = input1.remainder_(input2)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        # input1 = input1.to("npu")
+        output = torch.remainder(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def remainder_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            npu_input3 = torch.randn(6).to("npu")
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+
+    def remainder_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 0, 100)
+            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, npu_input3)
+            cpu_output_inplace = self.cpu_op_inplace_exec(cpu_input1, cpu_input2)
+            npu_output_inplace = self.npu_op_inplace_exec(npu_input1, npu_input2)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_output_inplace = cpu_output_inplace.astype(npu_output_inplace.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output_inplace, npu_output_inplace)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+
+    def remainder_scalar_result(self, shape_format):
+        for item in shape_format:
+            scalar = np.random.uniform(0, 100)
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 2)
+            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, scalar)
+            npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar)
+            npu_output_out = self.npu_op_exec_out(npu_input1, scalar, npu_input3)
+
+            cpu_output = cpu_output.astype(npu_output_scalar.dtype)
+            self.assertRtolEqual(cpu_output, npu_output_scalar)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+
+    def test_remainder_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float16, i, [4]] for i in format_list
+                        ]
+        self.remainder_result(shape_format)
+
+    def test_remainder_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float32, i, [4]] for i in format_list
+                        ]
+        self.remainder_result(shape_format)
+
+    def test_remainder_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [4, 18]] for i in format_list
+                        ]
+        self.remainder_result(shape_format)
+
+    def test_remainder_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [4, 18]] for i in format_list
+                        ]
+        self.remainder_result(shape_format)
+
+    def test_remainder_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [4, 18, 32]] for i in format_list
+                        ]
+        self.remainder_result(shape_format)
+
+    def test_remainder_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [4, 18, 32]] for i in format_list
+                        ]
+        self.remainder_result(shape_format)
+
+    def test_remainder_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [4, 18, 32, 128]] for i in format_list
+                        ]
+        self.remainder_result(shape_format)
+
+    def test_remainder_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [4, 18, 32, 128]] for i in format_list
+                        ]
+        self.remainder_result(shape_format)
+
+    # scalar----------------------------------------------------------
+    def test_remainder_scalar_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float16, i, [4]] for i in format_list
+                        ]
+        self.remainder_scalar_result(shape_format)
+
+    def test_remainder_scalar_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float32, i, [4]] for i in format_list
+                        ]
+        self.remainder_scalar_result(shape_format)
+
+    def test_remainder_scalar_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [4, 18]] for i in format_list
+                        ]
+        self.remainder_scalar_result(shape_format)
+
+    def test_remainder_scalar_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [4, 18]] for i in format_list
+                        ]
+        self.remainder_scalar_result(shape_format)
+
+    def test_remainder_scalar_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [4, 18, 32]] for i in format_list
+                        ]
+        self.remainder_scalar_result(shape_format)
+
+    def test_remainder_scalar_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [4, 18, 32]] for i in format_list
+                        ]
+        self.remainder_scalar_result(shape_format)
+
+    def test_remainder_scalar_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [4, 18, 32, 128]] for i in format_list
+                        ]
+        self.remainder_scalar_result(shape_format)
+
+    def test_remainder_scalar_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [4, 18, 32, 128]] for i in format_list
+                        ]
+        self.remainder_scalar_result(shape_format)
+
+    def test_remainder_mix_dtype_1(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.int32, 0, (2, 3)], 1, 100)
+        npu_input3, npu_input4 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input1, npu_input3)
+        self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test_remainder_mix_dtype_2(self, device):
+        npu_input1, npu_input2 = create_common_tensor([np.float32, 0, (2, 3)], 1, 100)
+        npu_input3 = torch.tensor(3).int()
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input1, npu_input3)
+        self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test_remainder_scalar_shape_format_fp32_out_4d(self, device):
+        format_list = [0]
+        shape_format = [[np.float32, i, [4, 18, 32, 128]] for i in format_list
+                        ]
+        self.remainder_out_result(shape_format)
+
+
+instantiate_device_type_tests(TestRemainder, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_resize_.py b/test/test_npu/test_network_ops/test_resize_.py
index bafaff5300e2777698a8fbc57f1e899c1f305bd9..230ecd156a53dbf0a8354bab2dbdbcdc81aaaf43 100644
--- a/test/test_npu/test_network_ops/test_resize_.py
+++ b/test/test_npu/test_network_ops/test_resize_.py
@@ -1,69 +1,69 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION.
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import itertools
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-
-
-class TestResize(TestCase):
-    def cpu_op_exec(self, cpu_in, cpu_out, shape, op):
-        cpu_out.resize_(shape)
-        op(cpu_in, cpu_in, out=cpu_out)
-        return cpu_out
-
-    def npu_op_exec(self, npu_in, npu_out, shape, op):
-        npu_out.resize_(shape)
-        op(npu_in, npu_in, out=npu_out)
-        return npu_out
-
-    def op_result_cmp_(self, shape_a, shape_b, op, is_contiguous=False):
-        a = torch.rand(shape_a)
-        b = torch.full(shape_b, 100.)
-        if is_contiguous:
-            b = b.t()
-        cpu = self.cpu_op_exec(a, b, shape_a, op)
-
-        nb = torch.full(shape_b, 100.)
-        a_npu = a.npu()
-        b_npu = nb.npu()
-        if is_contiguous:
-            b_npu = b_npu.t()
-        npu = self.npu_op_exec(a_npu, b_npu, shape_a, op)
-
-        cpu.add_(10)
-        npu.add_(10)
-        self.assertRtolEqual(cpu.numpy(), npu.cpu().numpy())
-
-    def test_op_resize_(self, device):
-        operators = [torch.add, torch.mul, torch.matmul]
-        shape_a = (5, 5)
-        contiguous = [True, False]
-
-        smalls = [(0, ), (1, ), (3, 1), (2, 3)]
-        for shape_b, op, is_contiguous in itertools.product(smalls, operators, contiguous):
-            self.op_result_cmp_(shape_a, shape_b, op, is_contiguous)
-
-        bigs = [(10, 9), (11, 11), (8, 11)]
-        for shape_b, op, is_contiguous in itertools.product(bigs, operators, contiguous):
-            self.op_result_cmp_(shape_a, shape_b, op, is_contiguous)
-
-
-instantiate_device_type_tests(TestResize, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import itertools
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+
+
+class TestResize(TestCase):
+    def cpu_op_exec(self, cpu_in, cpu_out, shape, op):
+        cpu_out.resize_(shape)
+        op(cpu_in, cpu_in, out=cpu_out)
+        return cpu_out
+
+    def npu_op_exec(self, npu_in, npu_out, shape, op):
+        npu_out.resize_(shape)
+        op(npu_in, npu_in, out=npu_out)
+        return npu_out
+
+    def op_result_cmp_(self, shape_a, shape_b, op, is_contiguous=False):
+        a = torch.rand(shape_a)
+        b = torch.full(shape_b, 100.)
+        if is_contiguous:
+            b = b.t()
+        cpu = self.cpu_op_exec(a, b, shape_a, op)
+
+        nb = torch.full(shape_b, 100.)
+        a_npu = a.npu()
+        b_npu = nb.npu()
+        if is_contiguous:
+            b_npu = b_npu.t()
+        npu = self.npu_op_exec(a_npu, b_npu, shape_a, op)
+
+        cpu.add_(10)
+        npu.add_(10)
+        self.assertRtolEqual(cpu.numpy(), npu.cpu().numpy())
+
+    def test_op_resize_(self, device):
+        operators = [torch.add, torch.mul, torch.matmul]
+        shape_a = (5, 5)
+        contiguous = [True, False]
+
+        smalls = [(0, ), (1, ), (3, 1), (2, 3)]
+        for shape_b, op, is_contiguous in itertools.product(smalls, operators, contiguous):
+            self.op_result_cmp_(shape_a, shape_b, op, is_contiguous)
+
+        bigs = [(10, 9), (11, 11), (8, 11)]
+        for shape_b, op, is_contiguous in itertools.product(bigs, operators, contiguous):
+            self.op_result_cmp_(shape_a, shape_b, op, is_contiguous)
+
+
+instantiate_device_type_tests(TestResize, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_resize_as.py b/test/test_npu/test_network_ops/test_resize_as.py
index 15363c6a6f6af5bcd30e16652778470c8572f52d..c1fc5835be0a311a4d42565eb4829cc0efefc260 100644
--- a/test/test_npu/test_network_ops/test_resize_as.py
+++ b/test/test_npu/test_network_ops/test_resize_as.py
@@ -1,61 +1,61 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestResizeAs(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = torch.resize_as_(input1, input2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.resize_as_(input1, input2)
-        output = output.cpu()
-        output = output.numpy()
-        return output
-
-    def test_resize_as_type_format(self, device):
-        shape_format = [
-                [torch.float32, (1, 2), (3, 4)],
-                [torch.float32, (1, 2, 5), (3, 4, 7)],
-                [torch.float16, (2, 3, 4), (5, 6, 7)]
-        ]
-
-        for item in shape_format:
-            cpu_input1 = torch.randn(item[1])
-            cpu_input2 = torch.randn(item[2])
-
-            if item[0] == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float16)
-                cpu_input2 = cpu_input2.to(torch.float16)
-
-            npu_input1 = cpu_input1.npu()
-            npu_input2 = cpu_input2.npu()
-
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-
-            self.assertEqual(cpu_output.shape, npu_output.shape)
-
-
-instantiate_device_type_tests(TestResizeAs, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
-
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestResizeAs(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.resize_as_(input1, input2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.resize_as_(input1, input2)
+        output = output.cpu()
+        output = output.numpy()
+        return output
+
+    def test_resize_as_type_format(self, device):
+        shape_format = [
+                [torch.float32, (1, 2), (3, 4)],
+                [torch.float32, (1, 2, 5), (3, 4, 7)],
+                [torch.float16, (2, 3, 4), (5, 6, 7)]
+        ]
+
+        for item in shape_format:
+            cpu_input1 = torch.randn(item[1])
+            cpu_input2 = torch.randn(item[2])
+
+            if item[0] == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float16)
+                cpu_input2 = cpu_input2.to(torch.float16)
+
+            npu_input1 = cpu_input1.npu()
+            npu_input2 = cpu_input2.npu()
+
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+
+            self.assertEqual(cpu_output.shape, npu_output.shape)
+
+
+instantiate_device_type_tests(TestResizeAs, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_npu/test_network_ops/test_round.py b/test/test_npu/test_network_ops/test_round.py
index 70dd7ca2d24ec6182f5d9f1bd1d525b6c2eadcbb..a9b137471034bd83eb0172d5b7215c977a1c2311 100644
--- a/test/test_npu/test_network_ops/test_round.py
+++ b/test/test_npu/test_network_ops/test_round.py
@@ -1,111 +1,111 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestRound(TestCase):
-    
-    def cpu_op_exec(self,input1):
-        output = torch.round(input1)
-        output = output.numpy()
-        return output
-        
-    def npu_op_exec(self,input1):
-        output = torch.round(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-        
-    def cpu_op_exec_(self,input1):
-        output = torch.round_(input1)
-        output = input1.numpy()
-        return output
-            
-    def npu_op_exec_(self,input1):
-        output = torch.round_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def cpu_op_exec_out(self,input1,cpu_out):
-        output = torch.round(input1, out = cpu_out)
-        output = cpu_out.numpy()
-        return output
-        
-    def npu_op_exec_out(self,input1,npu_out):
-        output = torch.round(input1, out = npu_out)
-        output = npu_out.to("cpu")
-        output = output.numpy()
-        return output
-        
-    def test_round_float32_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32, -1, (3)]], 
-                [[np.float32, -1, (4, 23)]],
-                [[np.float32, -1, (2, 3)]],
-                [[np.float32, -1, (12, 23)]]
-        ]
-        for item in shape_format:            
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-            
-    def test_round_inp_float32_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32, -1, (14)]], 
-                [[np.float32, -1, (4, 3)]],
-                [[np.float32, -1, (12, 32)]],
-                [[np.float32, -1, (22, 38)]]
-        ]
-        for item in shape_format:       
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_exec_(cpu_input1)
-            npu_output = self.npu_op_exec_(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-	
-    def test_round_out_common_shape_format(self, device):
-        shape_format = [
-                [[np.float16, -1, (10, 5)], [np.float16, -1, (5, 2)]],
-                [[np.float16, -1, (4, 1, 5)], [np.float16, -1, (8, 1, 10)]],
-                [[np.float32, -1, (10)], [np.float32, -1, (5)]],
-                [[np.float32, -1, (4, 1, 5)], [np.float32, -1, (8, 1, 3)]],
-                [[np.float32, -1, (2, 3, 8)], [np.float32, -1, (2, 3, 16)]],
-                [[np.float32, -1, (2, 13, 56)], [np.float32, -1, (1, 26, 56)]],
-                [[np.float32, -1, (2, 13, 56)], [np.float32, -1, (1, 26)]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_out1, npu_out1 = create_common_tensor(item[0], 1, 100)
-            cpu_out2, npu_out2 = create_common_tensor(item[1], 1, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            if cpu_out1.dtype == torch.float16:
-                cpu_out1 = cpu_out1.to(torch.float32)
-            cpu_output = self.cpu_op_exec_out(cpu_input1,cpu_out1)
-            npu_output1 = self.npu_op_exec_out(npu_input1,npu_out1)
-            npu_output2 = self.npu_op_exec_out(npu_input1,npu_out2)
-            cpu_output = cpu_output.astype(npu_output1.dtype)
-            self.assertRtolEqual(cpu_output, npu_output1)
-            self.assertRtolEqual(cpu_output, npu_output2)
-
-instantiate_device_type_tests(TestRound, globals(), except_for="cpu")
-
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestRound(TestCase):
+    
+    def cpu_op_exec(self,input1):
+        output = torch.round(input1)
+        output = output.numpy()
+        return output
+        
+    def npu_op_exec(self,input1):
+        output = torch.round(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+        
+    def cpu_op_exec_(self,input1):
+        output = torch.round_(input1)
+        output = input1.numpy()
+        return output
+            
+    def npu_op_exec_(self,input1):
+        output = torch.round_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def cpu_op_exec_out(self,input1,cpu_out):
+        output = torch.round(input1, out = cpu_out)
+        output = cpu_out.numpy()
+        return output
+        
+    def npu_op_exec_out(self,input1,npu_out):
+        output = torch.round(input1, out = npu_out)
+        output = npu_out.to("cpu")
+        output = output.numpy()
+        return output
+        
+    def test_round_float32_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32, -1, (3)]], 
+                [[np.float32, -1, (4, 23)]],
+                [[np.float32, -1, (2, 3)]],
+                [[np.float32, -1, (12, 23)]]
+        ]
+        for item in shape_format:            
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+            
+    def test_round_inp_float32_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32, -1, (14)]], 
+                [[np.float32, -1, (4, 3)]],
+                [[np.float32, -1, (12, 32)]],
+                [[np.float32, -1, (22, 38)]]
+        ]
+        for item in shape_format:       
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec_(cpu_input1)
+            npu_output = self.npu_op_exec_(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+	
+    def test_round_out_common_shape_format(self, device):
+        shape_format = [
+                [[np.float16, -1, (10, 5)], [np.float16, -1, (5, 2)]],
+                [[np.float16, -1, (4, 1, 5)], [np.float16, -1, (8, 1, 10)]],
+                [[np.float32, -1, (10)], [np.float32, -1, (5)]],
+                [[np.float32, -1, (4, 1, 5)], [np.float32, -1, (8, 1, 3)]],
+                [[np.float32, -1, (2, 3, 8)], [np.float32, -1, (2, 3, 16)]],
+                [[np.float32, -1, (2, 13, 56)], [np.float32, -1, (1, 26, 56)]],
+                [[np.float32, -1, (2, 13, 56)], [np.float32, -1, (1, 26)]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_out1, npu_out1 = create_common_tensor(item[0], 1, 100)
+            cpu_out2, npu_out2 = create_common_tensor(item[1], 1, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            if cpu_out1.dtype == torch.float16:
+                cpu_out1 = cpu_out1.to(torch.float32)
+            cpu_output = self.cpu_op_exec_out(cpu_input1,cpu_out1)
+            npu_output1 = self.npu_op_exec_out(npu_input1,npu_out1)
+            npu_output2 = self.npu_op_exec_out(npu_input1,npu_out2)
+            cpu_output = cpu_output.astype(npu_output1.dtype)
+            self.assertRtolEqual(cpu_output, npu_output1)
+            self.assertRtolEqual(cpu_output, npu_output2)
+
+instantiate_device_type_tests(TestRound, globals(), except_for="cpu")
+
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_rsqrt.py b/test/test_npu/test_network_ops/test_rsqrt.py
old mode 100644
new mode 100755
index 5c3e453e088a391c8c60bd5df3fcc4269c045f26..348b1c5a293a8f5743549345fff8b4bd8a143045
--- a/test/test_npu/test_network_ops/test_rsqrt.py
+++ b/test/test_npu/test_network_ops/test_rsqrt.py
@@ -1,139 +1,139 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestRsqrt(TestCase):
-    def cpu_op_exec(self, input1):
-        output = torch.rsqrt(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.rsqrt(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2):
-        output = input2
-        torch.rsqrt(input1, out=output)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_inp_op_exec(self, input1):
-        output = torch.rsqrt_(input1)
-        output = output.numpy()
-        return output
-
-    def npu_inp_op_exec(self, input1):
-        output = torch.rsqrt_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def rsqrt_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-
-            cpu_output_inp = self.cpu_inp_op_exec(cpu_input1)
-            npu_output_inp = self.npu_inp_op_exec(npu_input1)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def rsqrt_out_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
-            cpu_input3, npu_input3 = create_common_tensor(item[1], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output_out1 = self.npu_op_exec_out(npu_input1, npu_input2)
-            npu_output_out2 = self.npu_op_exec_out(npu_input1, npu_input3)
-            cpu_output = cpu_output.astype(npu_output_out1.dtype)
-            self.assertRtolEqual(cpu_output, npu_output_out1)
-            self.assertRtolEqual(cpu_output, npu_output_out2)
-
-    def test_rsqrt_out_result(self, device):
-        shape_format = [
-            [[np.float16, -1, [128, 116, 14, 14]], [np.float16, -1, [256, 116, 1, 1]]],
-            [[np.float16, 0, [128, 58, 28, 28]],  [np.float16, 0, [58, 58, 1, 1]]],
-            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3, 3]]],
-            [[np.float16, -1, [128, 116, 14, 14]], [np.float16, -1, [128, 116, 14, 14]]],
-            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 128, 3, 3]]],
-            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [3, 3, 7, 7]]],
-            [[np.float32, -1, [2, 3, 3, 3]],       [np.float32, -1, [3, 1, 3, 3]]],
-            [[np.float32, -1, [128, 232, 7, 7]],   [np.float32, -1, [128, 232, 7, 7]]],
-        ]
-        self.rsqrt_out_result(shape_format)
-
-    def test_rsqrt_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[np.float16, i, [16]] for i in format_list]
-        self.rsqrt_result(shape_format)
-
-    def test_rsqrt_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[np.float32, i, [16]] for i in format_list]
-        self.rsqrt_result(shape_format)
-
-    def test_rsqrt_shape_format_fp16_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float16, i, [16, 32]] for i in format_list]
-        self.rsqrt_result(shape_format)
-
-    def test_rsqrt_shape_format_fp32_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float32, i, [16, 32]] for i in format_list]
-        self.rsqrt_result(shape_format)
-
-    def test_rsqrt_shape_format_fp16_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float16, i, [16, 32, 64]] for i in format_list]
-        self.rsqrt_result(shape_format)
-
-    def test_rsqrt_shape_format_fp32_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float32, i, [16, 32, 64]] for i in format_list]
-        self.rsqrt_result(shape_format)
-
-    def test_rsqrt_shape_format_fp16_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float16, i, [16, 32, 64, 128]] for i in format_list]
-        self.rsqrt_result(shape_format)
-
-    def test_rsqrt_shape_format_fp32_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[np.float32, i, [16, 32, 64, 128]] for i in format_list]
-        self.rsqrt_result(shape_format)
-
-
-instantiate_device_type_tests(TestRsqrt, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestRsqrt(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.rsqrt(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.rsqrt(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        output = input2
+        torch.rsqrt(input1, out=output)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_inp_op_exec(self, input1):
+        output = torch.rsqrt_(input1)
+        output = output.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        output = torch.rsqrt_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def rsqrt_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+
+            cpu_output_inp = self.cpu_inp_op_exec(cpu_input1)
+            npu_output_inp = self.npu_inp_op_exec(npu_input1)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def rsqrt_out_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
+            cpu_input3, npu_input3 = create_common_tensor(item[1], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output_out1 = self.npu_op_exec_out(npu_input1, npu_input2)
+            npu_output_out2 = self.npu_op_exec_out(npu_input1, npu_input3)
+            cpu_output = cpu_output.astype(npu_output_out1.dtype)
+            self.assertRtolEqual(cpu_output, npu_output_out1)
+            self.assertRtolEqual(cpu_output, npu_output_out2)
+
+    def test_rsqrt_out_result(self, device):
+        shape_format = [
+            [[np.float16, -1, [128, 116, 14, 14]], [np.float16, -1, [256, 116, 1, 1]]],
+            [[np.float16, 0, [128, 58, 28, 28]],  [np.float16, 0, [58, 58, 1, 1]]],
+            [[np.float16, 0, [128, 3, 224, 224]], [np.float16, 0, [3, 3, 3, 3]]],
+            [[np.float16, -1, [128, 116, 14, 14]], [np.float16, -1, [128, 116, 14, 14]]],
+            [[np.float32, 0, [256, 128, 7, 7]],   [np.float32, 0, [128, 128, 3, 3]]],
+            [[np.float32, 0, [256, 3, 224, 224]], [np.float32, 0, [3, 3, 7, 7]]],
+            [[np.float32, -1, [2, 3, 3, 3]],       [np.float32, -1, [3, 1, 3, 3]]],
+            [[np.float32, -1, [128, 232, 7, 7]],   [np.float32, -1, [128, 232, 7, 7]]],
+        ]
+        self.rsqrt_out_result(shape_format)
+
+    def test_rsqrt_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[np.float16, i, [16]] for i in format_list]
+        self.rsqrt_result(shape_format)
+
+    def test_rsqrt_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[np.float32, i, [16]] for i in format_list]
+        self.rsqrt_result(shape_format)
+
+    def test_rsqrt_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float16, i, [16, 32]] for i in format_list]
+        self.rsqrt_result(shape_format)
+
+    def test_rsqrt_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float32, i, [16, 32]] for i in format_list]
+        self.rsqrt_result(shape_format)
+
+    def test_rsqrt_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float16, i, [16, 32, 64]] for i in format_list]
+        self.rsqrt_result(shape_format)
+
+    def test_rsqrt_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float32, i, [16, 32, 64]] for i in format_list]
+        self.rsqrt_result(shape_format)
+
+    def test_rsqrt_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float16, i, [16, 32, 64, 128]] for i in format_list]
+        self.rsqrt_result(shape_format)
+
+    def test_rsqrt_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[np.float32, i, [16, 32, 64, 128]] for i in format_list]
+        self.rsqrt_result(shape_format)
+
+
+instantiate_device_type_tests(TestRsqrt, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_rsub.py b/test/test_npu/test_network_ops/test_rsub.py
old mode 100644
new mode 100755
index afd70d99758e41048218132f8a41179330d212f6..50a173c3ecb17d5f1aa42b44ac300dd470e70e16
--- a/test/test_npu/test_network_ops/test_rsub.py
+++ b/test/test_npu/test_network_ops/test_rsub.py
@@ -1,170 +1,170 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestRsub(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = input2 - input1
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = input2 - input1
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input1, input2):
-        output = input1 - input2
-        output = output.to("cpu")
-        output = output.numpy()
-        output = -output
-        return output
-
-    def rsub_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def rsub_scalar_result(self, shape_format):
-        for item in shape_format:
-            scalar = np.random.uniform(0, 100)
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input1, scalar)
-            npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar)
-
-            cpu_output = cpu_output.astype(npu_output_scalar.dtype)
-            self.assertRtolEqual(cpu_output, npu_output_scalar)
-
-    def test_sub_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_fp16_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_fp32_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_fp16_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_fp32_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_fp16_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_fp32_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    # int-------------------------------------------------------------------------------
-    def test_sub_shape_format_int32_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [32]], [np.int32, i, [32]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_int32_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [5, 3]], [np.int32, i, [5, 3]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_int32_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [256, 480, 14]], [np.int32, i, [256, 480, 14]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    def test_sub_shape_format_int32_4d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [32, 3, 3, 3]], [np.int32, i, [32, 3, 3, 3]]] for i in format_list]
-        self.rsub_result(shape_format)
-
-    # scalar----------------------------------------------------------------------------
-    def test_sub_scalar_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float16, i, [32]]] for i in format_list]
-        self.rsub_scalar_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.float16, i, [32]]] for i in format_list]
-        self.rsub_scalar_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp16_2d(self, device):
-        format_list = []
-        shape_format = [[[np.float16, i, [32, 64]]] for i in format_list]
-        self.rsub_scalar_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp32_2d(self, device):
-        format_list = []
-        shape_format = [[[np.float16, i, [32, 64]]] for i in format_list]
-        self.rsub_scalar_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp16_3d(self, device):
-        format_list = []
-        shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list]
-        self.rsub_scalar_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp32_3d(self, device):
-        format_list = []
-        shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list]
-        self.rsub_scalar_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp16_4d(self, device):
-        format_list = []
-        shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list]
-        self.rsub_scalar_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp32_4d(self, device):
-        format_list = []
-        shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list]
-        self.rsub_scalar_result(shape_format)
-
-
-instantiate_device_type_tests(TestRsub, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestRsub(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = input2 - input1
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = input2 - input1
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        output = input1 - input2
+        output = output.to("cpu")
+        output = output.numpy()
+        output = -output
+        return output
+
+    def rsub_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def rsub_scalar_result(self, shape_format):
+        for item in shape_format:
+            scalar = np.random.uniform(0, 100)
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input1, scalar)
+            npu_output_scalar = self.npu_op_exec_scalar(npu_input1, scalar)
+
+            cpu_output = cpu_output.astype(npu_output_scalar.dtype)
+            self.assertRtolEqual(cpu_output, npu_output_scalar)
+
+    def test_sub_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float16, i, [32]], [np.float16, i, [32]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [5, 3]], [np.float16, i, [5, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [256, 480, 14]], [np.float16, i, [256, 480, 14]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3, 3]], [np.float16, i, [32, 3, 3, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    # int-------------------------------------------------------------------------------
+    def test_sub_shape_format_int32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [32]], [np.int32, i, [32]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_int32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [5, 3]], [np.int32, i, [5, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_int32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [256, 480, 14]], [np.int32, i, [256, 480, 14]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    def test_sub_shape_format_int32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [32, 3, 3, 3]], [np.int32, i, [32, 3, 3, 3]]] for i in format_list]
+        self.rsub_result(shape_format)
+
+    # scalar----------------------------------------------------------------------------
+    def test_sub_scalar_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.float16, i, [32]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp16_2d(self, device):
+        format_list = []
+        shape_format = [[[np.float16, i, [32, 64]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp32_2d(self, device):
+        format_list = []
+        shape_format = [[[np.float16, i, [32, 64]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp16_3d(self, device):
+        format_list = []
+        shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp32_3d(self, device):
+        format_list = []
+        shape_format = [[[np.float16, i, [32, 64, 128]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp16_4d(self, device):
+        format_list = []
+        shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp32_4d(self, device):
+        format_list = []
+        shape_format = [[[np.float16, i, [32, 64, 128, 28]]] for i in format_list]
+        self.rsub_scalar_result(shape_format)
+
+
+instantiate_device_type_tests(TestRsub, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_scalar_tensor.py b/test/test_npu/test_network_ops/test_scalar_tensor.py
index d7f5acc0ae513fa7e434ddd99e8c149407fc5007..abf64be8016531c5acddddd0842da7bd0224b00b 100644
--- a/test/test_npu/test_network_ops/test_scalar_tensor.py
+++ b/test/test_npu/test_network_ops/test_scalar_tensor.py
@@ -1,51 +1,51 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestScalarTensor(TestCase):
-    def cpu_op_exec(self, scalar, dtype):
-        output = torch.scalar_tensor(scalar, dtype=dtype, device="cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, scalar, dtype):
-        output = torch.scalar_tensor(scalar, dtype=dtype, device="npu")
-        output = output.cpu()
-        output = output.numpy()
-        return output
-
-    def test_scalar_tensor_shape_format(self, device):
-        scalars = [-50, 0, 50]
-        dtypes = [torch.float16, torch.float32, torch.int32]
-        
-        shape_format = [
-                [i, j] for i in scalars for j in dtypes 
-        ]
-
-        for item in shape_format:
-            cpu_output = self.cpu_op_exec(item[0], item[1])
-            npu_output = self.npu_op_exec(item[0], item[1])
-
-            self.assertEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestScalarTensor, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestScalarTensor(TestCase):
+    def cpu_op_exec(self, scalar, dtype):
+        output = torch.scalar_tensor(scalar, dtype=dtype, device="cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, scalar, dtype):
+        output = torch.scalar_tensor(scalar, dtype=dtype, device="npu")
+        output = output.cpu()
+        output = output.numpy()
+        return output
+
+    def test_scalar_tensor_shape_format(self, device):
+        scalars = [-50, 0, 50]
+        dtypes = [torch.float16, torch.float32, torch.int32]
+        
+        shape_format = [
+                [i, j] for i in scalars for j in dtypes 
+        ]
+
+        for item in shape_format:
+            cpu_output = self.cpu_op_exec(item[0], item[1])
+            npu_output = self.npu_op_exec(item[0], item[1])
+
+            self.assertEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestScalarTensor, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_scatter_add.py b/test/test_npu/test_network_ops/test_scatter_add.py
index e7edf77d536630de799f1eea1d0ad83fd063f5b9..a2f2ee59b800be5303ecdda19394d8f2ab77b33a 100644
--- a/test/test_npu/test_network_ops/test_scatter_add.py
+++ b/test/test_npu/test_network_ops/test_scatter_add.py
@@ -1,108 +1,108 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestScatterAdd(TestCase):
-    def cpu_op_exec_inp(self, input1, dim, index, src):
-        input1.scatter_add_(dim, index, src)
-        output = input1.numpy()
-        return output
-
-    def npu_op_exec_inp(self, input, dim, index, src):
-        input.scatter_add_(dim, index, src)
-        input = input.to("cpu")
-        output = input.numpy()
-        return output        
-
-    def cpu_op_exec(self, input1, dim, index, src):
-        output = torch.scatter_add(input1, dim, index, src)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1,  dim, index, src):
-        output = torch.scatter_add(input1,dim, index, src)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_scatter_add_common_shape_format(self, device):
-        shape_format = [
-                [0,     [np.int64, 0, [10, 20]],       [np.float32, 0, [10, 20]],         [np.float32, 0, [10, 20]]],
-                [1,     [np.int64, 0, [10, 20]],       [np.float32, 0, [10, 20]],         [np.float32, 0, [10, 20]]],
-                [0,     [np.int64, 0, [2, 6]],         [np.float32, 0, [2, 6]],           [np.float32, 0, [2, 6]]],
-                [1,     [np.int64, 0, [2, 6]],         [np.float32, 0, [2, 6]],           [np.float32, 0, [2, 6]]],
-                [0,     [np.int64, 0, [10, 20, 30]],   [np.float32, 0, [10, 20, 30]],     [np.float32, 0, [10, 20, 30]]],
-                [1,     [np.int64, 0, [10, 20, 30]],   [np.float32, 0, [10, 20, 30]],     [np.float32, 0, [10, 20, 30]]],
-                [2,     [np.int64, 0, [10, 20, 30]],   [np.float32, 0, [10, 20, 30]],     [np.float32, 0, [10, 20, 30]]],
-        ]
-
-        for item in shape_format:
-            cpu_input2, npu_input2 = create_common_tensor(item[2], 1, 100)
-            cpu_input1, npu_input1 = create_common_tensor(item[1], 1, (item[1][2][item[0]] - 1))
-            cpu_input3, npu_input3 = create_common_tensor(item[3], 1, 100)
-
-            cpu_output = self.cpu_op_exec(cpu_input3, item[0], cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input3, item[0], npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-            cpu_inp_output = self.cpu_op_exec_inp(cpu_input3, item[0], cpu_input1, cpu_input2)
-            npu_inp_output = self.npu_op_exec_inp(npu_input3, item[0], npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
-    
-    def test_scatter_add_float16_shape_format(self, device):
-        def cpu_op_exec_inp_fp16(input, dim, index, src):
-            input = input.to(torch.float32)
-            src = src.to(torch.float32)
-            input.scatter_add_(dim, index, src)
-            output = input.numpy()
-            output = output.astype(np.float16)
-            return output
-        
-        def cpu_op_exec_fp16(input1, dim, index, src):
-            output = torch.scatter_add(input1,dim, index, src)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-        
-        shape_format = [
-                [0,     [np.int64, 0, [10, 20]],      [np.float16, 0, [10, 20]],         [np.float16, 0, [10, 20]]],
-                [1,     [np.int64, 0, [10, 20]],      [np.float16, 0, [10, 20]],         [np.float16, 0, [10, 20]]],
-                [0,     [np.int64, 0, [2, 6]],         [np.float16, 0, [2, 6]],           [np.float16, 0, [2, 6]]],
-                [1,     [np.int64, 0, [2, 6]],         [np.float16, 0, [2, 6]],           [np.float16, 0, [2, 6]]],
-                [0,     [np.int64, 0, [10, 20, 30]],   [np.float16, 0, [10, 20, 30]],     [np.float16, 0, [10, 20, 30]]],
-                [1,     [np.int64, 0, [10, 20, 30]],   [np.float16, 0, [10, 20, 30]],     [np.float16, 0, [10, 20, 30]]],
-                [2,     [np.int64, 0, [10, 20, 30]],   [np.float16, 0, [10, 20, 30]],     [np.float16, 0, [10, 20, 30]]],
-        ]
-
-        for item in shape_format:
-            cpu_input2, npu_input2 = create_common_tensor(item[2], 1, 100)
-            cpu_input1, npu_input1 = create_common_tensor(item[1], 1, (item[1][2][item[0]] - 1))
-            cpu_input3, npu_input3 = create_common_tensor(item[3], 1, 100)
-
-            cpu_output = cpu_op_exec_fp16(cpu_input3, item[0], cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input3, item[0], npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output, npu_output)
-            cpu_inp_output = cpu_op_exec_inp_fp16(cpu_input3, item[0], cpu_input1, cpu_input2)
-            npu_inp_output = self.npu_op_exec_inp(npu_input3, item[0], npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
-
-instantiate_device_type_tests(TestScatterAdd, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestScatterAdd(TestCase):
+    def cpu_op_exec_inp(self, input1, dim, index, src):
+        input1.scatter_add_(dim, index, src)
+        output = input1.numpy()
+        return output
+
+    def npu_op_exec_inp(self, input, dim, index, src):
+        input.scatter_add_(dim, index, src)
+        input = input.to("cpu")
+        output = input.numpy()
+        return output        
+
+    def cpu_op_exec(self, input1, dim, index, src):
+        output = torch.scatter_add(input1, dim, index, src)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1,  dim, index, src):
+        output = torch.scatter_add(input1,dim, index, src)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_scatter_add_common_shape_format(self, device):
+        shape_format = [
+                [0,     [np.int64, 0, [10, 20]],       [np.float32, 0, [10, 20]],         [np.float32, 0, [10, 20]]],
+                [1,     [np.int64, 0, [10, 20]],       [np.float32, 0, [10, 20]],         [np.float32, 0, [10, 20]]],
+                [0,     [np.int64, 0, [2, 6]],         [np.float32, 0, [2, 6]],           [np.float32, 0, [2, 6]]],
+                [1,     [np.int64, 0, [2, 6]],         [np.float32, 0, [2, 6]],           [np.float32, 0, [2, 6]]],
+                [0,     [np.int64, 0, [10, 20, 30]],   [np.float32, 0, [10, 20, 30]],     [np.float32, 0, [10, 20, 30]]],
+                [1,     [np.int64, 0, [10, 20, 30]],   [np.float32, 0, [10, 20, 30]],     [np.float32, 0, [10, 20, 30]]],
+                [2,     [np.int64, 0, [10, 20, 30]],   [np.float32, 0, [10, 20, 30]],     [np.float32, 0, [10, 20, 30]]],
+        ]
+
+        for item in shape_format:
+            cpu_input2, npu_input2 = create_common_tensor(item[2], 1, 100)
+            cpu_input1, npu_input1 = create_common_tensor(item[1], 1, (item[1][2][item[0]] - 1))
+            cpu_input3, npu_input3 = create_common_tensor(item[3], 1, 100)
+
+            cpu_output = self.cpu_op_exec(cpu_input3, item[0], cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input3, item[0], npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+            cpu_inp_output = self.cpu_op_exec_inp(cpu_input3, item[0], cpu_input1, cpu_input2)
+            npu_inp_output = self.npu_op_exec_inp(npu_input3, item[0], npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
+    
+    def test_scatter_add_float16_shape_format(self, device):
+        def cpu_op_exec_inp_fp16(input, dim, index, src):
+            input = input.to(torch.float32)
+            src = src.to(torch.float32)
+            input.scatter_add_(dim, index, src)
+            output = input.numpy()
+            output = output.astype(np.float16)
+            return output
+        
+        def cpu_op_exec_fp16(input1, dim, index, src):
+            output = torch.scatter_add(input1,dim, index, src)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+        
+        shape_format = [
+                [0,     [np.int64, 0, [10, 20]],      [np.float16, 0, [10, 20]],         [np.float16, 0, [10, 20]]],
+                [1,     [np.int64, 0, [10, 20]],      [np.float16, 0, [10, 20]],         [np.float16, 0, [10, 20]]],
+                [0,     [np.int64, 0, [2, 6]],         [np.float16, 0, [2, 6]],           [np.float16, 0, [2, 6]]],
+                [1,     [np.int64, 0, [2, 6]],         [np.float16, 0, [2, 6]],           [np.float16, 0, [2, 6]]],
+                [0,     [np.int64, 0, [10, 20, 30]],   [np.float16, 0, [10, 20, 30]],     [np.float16, 0, [10, 20, 30]]],
+                [1,     [np.int64, 0, [10, 20, 30]],   [np.float16, 0, [10, 20, 30]],     [np.float16, 0, [10, 20, 30]]],
+                [2,     [np.int64, 0, [10, 20, 30]],   [np.float16, 0, [10, 20, 30]],     [np.float16, 0, [10, 20, 30]]],
+        ]
+
+        for item in shape_format:
+            cpu_input2, npu_input2 = create_common_tensor(item[2], 1, 100)
+            cpu_input1, npu_input1 = create_common_tensor(item[1], 1, (item[1][2][item[0]] - 1))
+            cpu_input3, npu_input3 = create_common_tensor(item[3], 1, 100)
+
+            cpu_output = cpu_op_exec_fp16(cpu_input3, item[0], cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input3, item[0], npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output, npu_output)
+            cpu_inp_output = cpu_op_exec_inp_fp16(cpu_input3, item[0], cpu_input1, cpu_input2)
+            npu_inp_output = self.npu_op_exec_inp(npu_input3, item[0], npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_inp_output, npu_inp_output)
+
+instantiate_device_type_tests(TestScatterAdd, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_scatterv1.py b/test/test_npu/test_network_ops/test_scatterv1.py
index 139aa04221c87d69d2ed9b94f4c87041afd96d5b..0f8ec6a5f846d99c44dafd0c9d91bf9d8f69a5c7 100644
--- a/test/test_npu/test_network_ops/test_scatterv1.py
+++ b/test/test_npu/test_network_ops/test_scatterv1.py
@@ -1,39 +1,39 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestScatterV1(TestCase):
-   def npu_op_exec(self, input1, indices, updates, dim):
-        output = torch.npu_scatter(input1, indices, updates, dim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-   def test_scatterv1(self, device):
-        input    = torch.tensor([[1.6279, 0.1226], [0.9041, 1.0980]]).npu()
-        indices  = torch.tensor([0, 1]).npu().to(torch.int32)
-        updates  = torch.tensor([-1.1993, -1.5247]).npu()
-        dim      = 0
-        exoutput = torch.tensor([[-1.1993, 0.1226], [0.9041, -1.5247]])
-        output   = self.npu_op_exec(input, indices, updates, dim)
-        self.assertRtolEqual(exoutput.numpy(), output) 
-
-instantiate_device_type_tests(TestScatterV1, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestScatterV1(TestCase):
+   def npu_op_exec(self, input1, indices, updates, dim):
+        output = torch.npu_scatter(input1, indices, updates, dim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+   def test_scatterv1(self, device):
+        input    = torch.tensor([[1.6279, 0.1226], [0.9041, 1.0980]]).npu()
+        indices  = torch.tensor([0, 1]).npu().to(torch.int32)
+        updates  = torch.tensor([-1.1993, -1.5247]).npu()
+        dim      = 0
+        exoutput = torch.tensor([[-1.1993, 0.1226], [0.9041, -1.5247]])
+        output   = self.npu_op_exec(input, indices, updates, dim)
+        self.assertRtolEqual(exoutput.numpy(), output) 
+
+instantiate_device_type_tests(TestScatterV1, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_select.py b/test/test_npu/test_network_ops/test_select.py
index 2cf6b00de326812a821d85ddc573b2f4c140ab4f..10a262839e41f47f2d43f8653837c8bf26e1f518 100644
--- a/test/test_npu/test_network_ops/test_select.py
+++ b/test/test_npu/test_network_ops/test_select.py
@@ -1,48 +1,48 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
- 
-class TestSelect(TestCase):
-    def test_select_common_shape_format(self, device):
-        def cpu_op_exec(input, dim, index):
-            output = input.select(dim, index)
-            output = output.numpy()
-            return output
-    
-        def npu_op_exec(input, dim, index):
-            output = input.select(dim,index)
-            output = output.to("cpu")
-            output = output.numpy() 
-            return output  
-        
-        shape_format = [
-                [[np.float16, 0, (64, 10)]],
-                [[np.float32, 4, (32, 1, 3, 3)]],
-                [[np.float32, 29, (10, 128)]]
-        ]
-        for shape in shape_format:
-            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
-            cpu_output = cpu_op_exec(cpu_input, 0, 2)
-            npu_output = npu_op_exec(npu_input, 0, 2)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-
-instantiate_device_type_tests(TestSelect, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+ 
+class TestSelect(TestCase):
+    def test_select_common_shape_format(self, device):
+        def cpu_op_exec(input, dim, index):
+            output = input.select(dim, index)
+            output = output.numpy()
+            return output
+    
+        def npu_op_exec(input, dim, index):
+            output = input.select(dim,index)
+            output = output.to("cpu")
+            output = output.numpy() 
+            return output  
+        
+        shape_format = [
+                [[np.float16, 0, (64, 10)]],
+                [[np.float32, 4, (32, 1, 3, 3)]],
+                [[np.float32, 29, (10, 128)]]
+        ]
+        for shape in shape_format:
+            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
+            cpu_output = cpu_op_exec(cpu_input, 0, 2)
+            npu_output = npu_op_exec(npu_input, 0, 2)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+
+instantiate_device_type_tests(TestSelect, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_select_to_contiguous.py b/test/test_npu/test_network_ops/test_select_to_contiguous.py
index 1249e255a308ea1a282fb0ec6a9865dcb24f06ac..d5af660a232ebcc051f429ea68b37f1aa6251717 100644
--- a/test/test_npu/test_network_ops/test_select_to_contiguous.py
+++ b/test/test_npu/test_network_ops/test_select_to_contiguous.py
@@ -1,39 +1,39 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestSelectToContiguous(TestCase):
-    def test_SelectToContiguous(self, device):
-        dtype_list = [ np.float16 ,np.float32 ]
-        format_list = [0,3,4]
-        shape_list = [[200, 100, 300],[200,200,100,100]]
-        shape_format = [
-            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
-        ]
-        for item in shape_format:    
-            a1_cpu, a1_npu = create_common_tensor(item, 0, 100)
-            for dim in range(1,len(item[2])):
-                npu_out = a1_npu.select(dim,1).contiguous()
-                cpu_out = a1_cpu.select(dim,1).contiguous()
-                self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy())                
-                
-                
-instantiate_device_type_tests(TestSelectToContiguous, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestSelectToContiguous(TestCase):
+    def test_SelectToContiguous(self, device):
+        dtype_list = [ np.float16 ,np.float32 ]
+        format_list = [0,3,4]
+        shape_list = [[200, 100, 300],[200,200,100,100]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        for item in shape_format:    
+            a1_cpu, a1_npu = create_common_tensor(item, 0, 100)
+            for dim in range(1,len(item[2])):
+                npu_out = a1_npu.select(dim,1).contiguous()
+                cpu_out = a1_cpu.select(dim,1).contiguous()
+                self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy())                
+                
+                
+instantiate_device_type_tests(TestSelectToContiguous, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_sign.py b/test/test_npu/test_network_ops/test_sign.py
old mode 100644
new mode 100755
index 5f85e478873a365a4aa33fbca8d0d4ab3900f978..2b9525f83bebb211d070b3dc7c9b6ea4e4326b42
--- a/test/test_npu/test_network_ops/test_sign.py
+++ b/test/test_npu/test_network_ops/test_sign.py
@@ -1,134 +1,134 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestSign(TestCase):
-    def cpu_op_exec(self, input1):
-        cpu_output = torch.sign(input1)
-        cpu_output = cpu_output.numpy()
-        return cpu_output
-
-    def npu_op_exec(self, input1):
-        output = torch.sign(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2):
-        torch.sign(input1, out=input2)
-        output = input2.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_inp_op_exec(self, input1):
-        input1.sign_()
-        output = input1.numpy()
-        return output
-
-    def npu_inp_op_exec(self, input1):
-        input1.sign_()
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def sign_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, -100, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2)
-            cpu_output_inp = self.cpu_inp_op_exec(cpu_input1)
-            npu_output_inp = self.npu_inp_op_exec(npu_input1)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def test_sign_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float16, i, [18]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float32, i, [18]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_int32_1d(self, device):
-        format_list = [0]
-        shape_format = [[np.int32, i, [18]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_int32_2d(self, device):
-        format_list = [0]
-        shape_format = [[np.int32, i, [5, 256]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_int32_3d(self, device):
-        format_list = [0]
-        shape_format = [[np.int32, i, [32, 3, 3]] for i in format_list]
-        self.sign_result(shape_format)
-
-    def test_sign_shape_format_int32_4d(self, device):
-        format_list = [0]
-        shape_format = [[np.int32, i, [64, 112, 7, 7]] for i in format_list]
-        self.sign_result(shape_format)
-
-
-instantiate_device_type_tests(TestSign, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestSign(TestCase):
+    def cpu_op_exec(self, input1):
+        cpu_output = torch.sign(input1)
+        cpu_output = cpu_output.numpy()
+        return cpu_output
+
+    def npu_op_exec(self, input1):
+        output = torch.sign(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        torch.sign(input1, out=input2)
+        output = input2.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_inp_op_exec(self, input1):
+        input1.sign_()
+        output = input1.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        input1.sign_()
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def sign_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, -100, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2)
+            cpu_output_inp = self.cpu_inp_op_exec(cpu_input1)
+            npu_output_inp = self.npu_inp_op_exec(npu_input1)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def test_sign_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float16, i, [18]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float32, i, [18]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_int32_1d(self, device):
+        format_list = [0]
+        shape_format = [[np.int32, i, [18]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_int32_2d(self, device):
+        format_list = [0]
+        shape_format = [[np.int32, i, [5, 256]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_int32_3d(self, device):
+        format_list = [0]
+        shape_format = [[np.int32, i, [32, 3, 3]] for i in format_list]
+        self.sign_result(shape_format)
+
+    def test_sign_shape_format_int32_4d(self, device):
+        format_list = [0]
+        shape_format = [[np.int32, i, [64, 112, 7, 7]] for i in format_list]
+        self.sign_result(shape_format)
+
+
+instantiate_device_type_tests(TestSign, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_size.py b/test/test_npu/test_network_ops/test_size.py
index fef31a6c8687ca96b076ba47d8ffce59910357f4..ec354c63cea95a73132001191c3179ce1c4213d9 100644
--- a/test/test_npu/test_network_ops/test_size.py
+++ b/test/test_npu/test_network_ops/test_size.py
@@ -1,42 +1,42 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
- 
-class TestSize(TestCase):
-    def test_size_common_shape_format(self, device):
-        def op_exec(input):
-            output = input.size()
-            output = np.array(output, dtype=np.int32)
-            return output
-        
-        shape_format = [
-                [[np.float16, 0, (64)]],
-                [[np.float32, 4, (32, 1, 3, 3)]],
-                [[np.float32, 3, (10, 128)]]
-        ]
-        for shape in shape_format:
-            cpu_input, npu_input = create_common_tensor(shape[0], -100, 100)
-            cpu_output = op_exec(cpu_input)
-            npu_output = op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-
-instantiate_device_type_tests(TestSize, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+ 
+class TestSize(TestCase):
+    def test_size_common_shape_format(self, device):
+        def op_exec(input):
+            output = input.size()
+            output = np.array(output, dtype=np.int32)
+            return output
+        
+        shape_format = [
+                [[np.float16, 0, (64)]],
+                [[np.float32, 4, (32, 1, 3, 3)]],
+                [[np.float32, 3, (10, 128)]]
+        ]
+        for shape in shape_format:
+            cpu_input, npu_input = create_common_tensor(shape[0], -100, 100)
+            cpu_output = op_exec(cpu_input)
+            npu_output = op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+
+instantiate_device_type_tests(TestSize, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_slice.py b/test/test_npu/test_network_ops/test_slice.py
index 7a29d001e32d939fca6d06ca8eccb04166b50d00..13b28de868144584c836f6d8f9ef6ea36523fd5e 100644
--- a/test/test_npu/test_network_ops/test_slice.py
+++ b/test/test_npu/test_network_ops/test_slice.py
@@ -1,36 +1,36 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestSlice(TestCase):
-   def npu_op_exec(self, input1, offset, sizes):
-        output = torch.npu_slice(input1, offset, sizes)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-   def test_slice_int32(self, device):
-        input    = torch.tensor([[1,2,3,4,5], [6,7,8,9,10]]).npu()
-        exoutput = torch.tensor([[1,2],[6,7]])
-        output   = self.npu_op_exec(input, [0, 0], [2, 2])
-        self.assertRtolEqual(exoutput.numpy(), output) 
-
-instantiate_device_type_tests(TestSlice, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestSlice(TestCase):
+   def npu_op_exec(self, input1, offset, sizes):
+        output = torch.npu_slice(input1, offset, sizes)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+   def test_slice_int32(self, device):
+        input    = torch.tensor([[1,2,3,4,5], [6,7,8,9,10]]).npu()
+        exoutput = torch.tensor([[1,2],[6,7]])
+        output   = self.npu_op_exec(input, [0, 0], [2, 2])
+        self.assertRtolEqual(exoutput.numpy(), output) 
+
+instantiate_device_type_tests(TestSlice, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_slogdet.py b/test/test_npu/test_network_ops/test_slogdet.py
index dafcd7431976925d1e27a667bba2e145d3904fa7..8164efda4b32f9317633be70d6c381c7d7cc98a9 100644
--- a/test/test_npu/test_network_ops/test_slogdet.py
+++ b/test/test_npu/test_network_ops/test_slogdet.py
@@ -1,55 +1,55 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestSlogdet(TestCase):
-    def cpu_op_exec(self, input):
-        sign, logabsdet = torch.slogdet(input)
-        sign = sign.numpy()
-        logabsdet = logabsdet.numpy()
-        return sign, logabsdet
-
-    def npu_op_exec(self, input):
-        sign, logabsdet = torch.slogdet(input)
-        sign = sign.cpu()
-        logabsdet = logabsdet.cpu()
-        sign = sign.numpy()
-        logabsdet = logabsdet.numpy()
-        return sign, logabsdet
-
-    def test_slogdet_shape_format(self, device):
-        shape_format = [
-                [np.float32, -1, (3, 3)],
-                [np.float32, -1, (4, 3, 3)],
-                [np.float32, -1, (5, 5, 5, 5)],
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, -100, 100)
-            cpu_output, cpu_indices = self.cpu_op_exec(cpu_input)
-            npu_output, npu_indices = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_indices, npu_indices)
-
-
-
-instantiate_device_type_tests(TestSlogdet, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestSlogdet(TestCase):
+    def cpu_op_exec(self, input):
+        sign, logabsdet = torch.slogdet(input)
+        sign = sign.numpy()
+        logabsdet = logabsdet.numpy()
+        return sign, logabsdet
+
+    def npu_op_exec(self, input):
+        sign, logabsdet = torch.slogdet(input)
+        sign = sign.cpu()
+        logabsdet = logabsdet.cpu()
+        sign = sign.numpy()
+        logabsdet = logabsdet.numpy()
+        return sign, logabsdet
+
+    def test_slogdet_shape_format(self, device):
+        shape_format = [
+                [np.float32, -1, (3, 3)],
+                [np.float32, -1, (4, 3, 3)],
+                [np.float32, -1, (5, 5, 5, 5)],
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, -100, 100)
+            cpu_output, cpu_indices = self.cpu_op_exec(cpu_input)
+            npu_output, npu_indices = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_indices, npu_indices)
+
+
+
+instantiate_device_type_tests(TestSlogdet, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_smoothl1loss.py b/test/test_npu/test_network_ops/test_smoothl1loss.py
index 977192f46cac46c78bd2009f2f0837fb57743a5f..bf3d50d26875d7a9ea4efa66d3752904d10a4827 100644
--- a/test/test_npu/test_network_ops/test_smoothl1loss.py
+++ b/test/test_npu/test_network_ops/test_smoothl1loss.py
@@ -1,74 +1,74 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-#torch.nn.functional.smooth_l1_loss 接口没有.out入参。未做.out测试
-class TestSmoothL1loss(TestCase):
-    def cpu_op_exec_new(self, input1, target, reduction):
-        output = torch.nn.functional.smooth_l1_loss(input1, target, reduction = reduction)
-        return output.numpy()
-
-    def npu_op_exec_new(self, input1, target, reduction):
-        target = target.npu()
-        output = torch.nn.functional.smooth_l1_loss(input1, target,reduction = reduction)
-        return output.cpu().numpy()
-
-    def test_smoothl1loss_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list = [[256, 10], [256, 1000], [256, 10000],
-                      [64, 10, 10], [64, 100, 100], [64, 200, 200],
-                      [32, 3, 10, 10], [32, 3, 100, 100], [32, 3, 200, 200]]
-        reduction_list = ['none', 'mean', 'sum']
-        shape_format = [
-            [[np.float32, i, j], [np.float32, 0, j], k] for i in format_list
-             for j in shape_list for k in reduction_list
-        ]
-        for item in shape_format:
-            np_target = np.random.uniform(0, 10, (item[1][2])).astype(item[1][0])
-            target = torch.from_numpy(np_target)
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_output = self.cpu_op_exec_new(cpu_input1, target, item[2])
-            npu_output = self.npu_op_exec_new(npu_input1, target, item[2])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_nllloss_shape_format_fp16(self, device):
-        format_list = [0]
-        shape_list = [[256, 10], [256, 1000], [256, 10000],
-                      [64, 10, 10], [64, 100, 100], [64, 200, 200],
-                      [32, 3, 10, 10], [32, 3, 100, 100], [32, 3, 200, 200]]
-        reduction_list = ['none', 'mean']
-        shape_format = [
-            [[np.float16, i, j], [np.float16, 0, j], k] for i in format_list
-             for j in shape_list for k in reduction_list
-        ]
-        
-        for item in shape_format:
-            np_target = np.random.uniform(0, 10, (item[1][2])).astype(item[1][0])
-            target = torch.from_numpy(np_target)
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output = self.cpu_op_exec_new(cpu_input1, target, item[2])
-            npu_output = self.npu_op_exec_new(npu_input1, target, item[2])
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestSmoothL1loss, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+#torch.nn.functional.smooth_l1_loss 接口没有.out入参。未做.out测试
+class TestSmoothL1loss(TestCase):
+    def cpu_op_exec_new(self, input1, target, reduction):
+        output = torch.nn.functional.smooth_l1_loss(input1, target, reduction = reduction)
+        return output.numpy()
+
+    def npu_op_exec_new(self, input1, target, reduction):
+        target = target.npu()
+        output = torch.nn.functional.smooth_l1_loss(input1, target,reduction = reduction)
+        return output.cpu().numpy()
+
+    def test_smoothl1loss_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list = [[256, 10], [256, 1000], [256, 10000],
+                      [64, 10, 10], [64, 100, 100], [64, 200, 200],
+                      [32, 3, 10, 10], [32, 3, 100, 100], [32, 3, 200, 200]]
+        reduction_list = ['none', 'mean', 'sum']
+        shape_format = [
+            [[np.float32, i, j], [np.float32, 0, j], k] for i in format_list
+             for j in shape_list for k in reduction_list
+        ]
+        for item in shape_format:
+            np_target = np.random.uniform(0, 10, (item[1][2])).astype(item[1][0])
+            target = torch.from_numpy(np_target)
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_output = self.cpu_op_exec_new(cpu_input1, target, item[2])
+            npu_output = self.npu_op_exec_new(npu_input1, target, item[2])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_nllloss_shape_format_fp16(self, device):
+        format_list = [0]
+        shape_list = [[256, 10], [256, 1000], [256, 10000],
+                      [64, 10, 10], [64, 100, 100], [64, 200, 200],
+                      [32, 3, 10, 10], [32, 3, 100, 100], [32, 3, 200, 200]]
+        reduction_list = ['none', 'mean']
+        shape_format = [
+            [[np.float16, i, j], [np.float16, 0, j], k] for i in format_list
+             for j in shape_list for k in reduction_list
+        ]
+        
+        for item in shape_format:
+            np_target = np.random.uniform(0, 10, (item[1][2])).astype(item[1][0])
+            target = torch.from_numpy(np_target)
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output = self.cpu_op_exec_new(cpu_input1, target, item[2])
+            npu_output = self.npu_op_exec_new(npu_input1, target, item[2])
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestSmoothL1loss, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_soft_margin_loss.py b/test/test_npu/test_network_ops/test_soft_margin_loss.py
index 9c8308738fee4d74c75dd8bcb07f848c68cc025a..e1ac2921a77283ba3dc854054d2dea3ade7672e1 100644
--- a/test/test_npu/test_network_ops/test_soft_margin_loss.py
+++ b/test/test_npu/test_network_ops/test_soft_margin_loss.py
@@ -1,127 +1,127 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestSoftMarginLoss(TestCase):
-    def generate_data(self,min_d, max_d, shape1, shape2, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape1).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-        if dtype == np.float16:
-            stype = torch.float16
-        if dtype == np.float32:
-            stype = torch.float32
-        npu_input2 = torch.ones(size=shape2, dtype=stype)
-        return npu_input1, npu_input2
-
-    def cpu_op_exec_default(self,input1, input2):
-        stype=input1.dtype
-        if stype==torch.float16:
-            input1=input1.float()
-            input2=input2.float()
-        loss = torch.nn.SoftMarginLoss()
-        output=loss(input1, input2)
-        if stype==torch.float16:
-            output=output.half()
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_default(self,input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        loss = torch.nn.SoftMarginLoss()
-        output = loss(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec(self,input1, input2, reduct):
-        stype=input1.dtype
-        if stype==torch.float16:
-            input1=input1.float()
-            input2=input2.float()
-        loss = torch.nn.SoftMarginLoss(reduction=reduct)
-        output = loss(input1, input2)
-        if stype==torch.float16:
-            output=output.half()
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self,input1, input2, reduct):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        loss = torch.nn.SoftMarginLoss(reduction=reduct)
-        output = loss(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
- 
-    def test_soft_margin_loss_float16(self, device):
-        npu_input1, npu_input2 =self.generate_data(-2, 2, (5, 13, 2, 7, 18, 83, 5, 22), (5, 13, 2, 7, 18, 83, 5, 22), np.float16)
-        cpu_output = self.cpu_op_exec_default(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec_default(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_soft_margin_loss_float16_mean(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (3, 19, 19, 3, 11, 11, 2), (3, 1, 19, 3, 11, 11, 1), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "mean")
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, "mean")
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_soft_margin_loss_float16_none(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (5, 13, 2, 7, 18, 83, 5, 22), (5, 13, 2, 1, 18, 83, 1, 22), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "none")
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, "none")
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_soft_margin_loss_float16_sum(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (1, 8, 2, 2, 5, 8, 2, 8), 
-                                                   (1, 8, 2, 2, 1, 1, 1, 1), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "sum")
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, "sum")
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_soft_margin_loss_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (416, 192, 272), (416, 1, 272), np.float32)
-        cpu_output = self.cpu_op_exec_default(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec_default(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_soft_margin_loss_float32_mean(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (416, 192, 272), (416, 192, 272), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "mean")
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, "mean")
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_soft_margin_loss_float32_none(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (25, 25, 25), (25, 1, 25), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "none")
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, "none")
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_soft_margin_loss_float32_sum(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (148, 110, 148), (148, 1, 148), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "sum")
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, "sum")
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestSoftMarginLoss, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestSoftMarginLoss(TestCase):
+    def generate_data(self,min_d, max_d, shape1, shape2, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape1).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+        if dtype == np.float16:
+            stype = torch.float16
+        if dtype == np.float32:
+            stype = torch.float32
+        npu_input2 = torch.ones(size=shape2, dtype=stype)
+        return npu_input1, npu_input2
+
+    def cpu_op_exec_default(self,input1, input2):
+        stype=input1.dtype
+        if stype==torch.float16:
+            input1=input1.float()
+            input2=input2.float()
+        loss = torch.nn.SoftMarginLoss()
+        output=loss(input1, input2)
+        if stype==torch.float16:
+            output=output.half()
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_default(self,input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        loss = torch.nn.SoftMarginLoss()
+        output = loss(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec(self,input1, input2, reduct):
+        stype=input1.dtype
+        if stype==torch.float16:
+            input1=input1.float()
+            input2=input2.float()
+        loss = torch.nn.SoftMarginLoss(reduction=reduct)
+        output = loss(input1, input2)
+        if stype==torch.float16:
+            output=output.half()
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self,input1, input2, reduct):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        loss = torch.nn.SoftMarginLoss(reduction=reduct)
+        output = loss(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+ 
+    def test_soft_margin_loss_float16(self, device):
+        npu_input1, npu_input2 =self.generate_data(-2, 2, (5, 13, 2, 7, 18, 83, 5, 22), (5, 13, 2, 7, 18, 83, 5, 22), np.float16)
+        cpu_output = self.cpu_op_exec_default(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec_default(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_soft_margin_loss_float16_mean(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (3, 19, 19, 3, 11, 11, 2), (3, 1, 19, 3, 11, 11, 1), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "mean")
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, "mean")
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_soft_margin_loss_float16_none(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (5, 13, 2, 7, 18, 83, 5, 22), (5, 13, 2, 1, 18, 83, 1, 22), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "none")
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, "none")
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_soft_margin_loss_float16_sum(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (1, 8, 2, 2, 5, 8, 2, 8), 
+                                                   (1, 8, 2, 2, 1, 1, 1, 1), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "sum")
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, "sum")
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_soft_margin_loss_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (416, 192, 272), (416, 1, 272), np.float32)
+        cpu_output = self.cpu_op_exec_default(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec_default(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_soft_margin_loss_float32_mean(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (416, 192, 272), (416, 192, 272), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "mean")
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, "mean")
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_soft_margin_loss_float32_none(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (25, 25, 25), (25, 1, 25), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "none")
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, "none")
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_soft_margin_loss_float32_sum(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (148, 110, 148), (148, 1, 148), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, "sum")
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, "sum")
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestSoftMarginLoss, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_softmax.py b/test/test_npu/test_network_ops/test_softmax.py
old mode 100644
new mode 100755
index 692dba18a13647e3793b3605ccc62c8bf13fd72d..ce624ab6b11d34620bfbd297685033289b6087bb
--- a/test/test_npu/test_network_ops/test_softmax.py
+++ b/test/test_npu/test_network_ops/test_softmax.py
@@ -1,126 +1,126 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestSoftMax(TestCase):
-    def cpu_op_exec(self, input1, dim):
-        output = torch.nn.functional.softmax(input1, dim)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, dim):
-        output = torch.nn.functional.softmax(input1, dim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_dtype(self, input1, dim, dtype):
-        output = torch.nn.functional.softmax(input1, dim, dtype=dtype)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_dtype(self, input1, dim, dtype):
-        output = torch.nn.functional.softmax(input1, dim, dtype=dtype)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_half_float(self, input1, dim):
-        output = torch._softmax(input1, dim, True)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def softmax_result(self, shape_format):
-        for item in shape_format:
-            dim = np.random.randint(0, len(item[2]))
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input1, dim)
-            npu_output = self.npu_op_exec(npu_input1, dim)
-
-            if npu_input1.dtype == torch.float16:
-                npu_output_half = self.npu_op_exec_half_float(npu_input1, dim)
-                npu_output_half = npu_output_half.astype(np.float16)
-
-            cpu_output_inp = self.cpu_op_exec_dtype(cpu_input1, dim, torch.float32)
-            npu_output_inp = self.npu_op_exec_dtype(npu_input1, dim, torch.float32)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            if npu_input1.dtype == torch.float16:
-                self.assertRtolEqual(cpu_output, npu_output_half)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def test_softmax_shape_format_fp16_1d(self, device):
-        format_list = [0]
-        shape_format = [[np.float16, i, [18]] for i in format_list]
-        self.softmax_result(shape_format)
-
-    def test_softmax_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
-        self.softmax_result(shape_format)
-
-    def test_softmax_shape_format_fp16_3d(self, device):
-        format_list = [0, 29]
-        shape_format = [[np.float16, i, [32, 8, 8]] for i in format_list]
-        self.softmax_result(shape_format)
-
-    def test_softmax_shape_format_fp16_4d(self, device):
-        format_list = [0, 29]
-        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
-        self.softmax_result(shape_format)
-
-    def test_softmax_shape_format_fp32_1d(self, device):
-        format_list = [0]
-        shape_format = [[np.float32, i, [18]] for i in format_list]
-        self.softmax_result(shape_format)
-
-    def test_softmax_shape_format_fp32_2d(self, device):
-        format_list = [3, 29]
-        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
-        self.softmax_result(shape_format)
-
-    def test_softmax_shape_format_fp32_3d(self, device):
-        format_list = [0, 29]
-        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
-        self.softmax_result(shape_format)
-
-    def test_softmax_shape_format_fp32_4d(self, device):
-        format_list = [3, 29]
-        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
-        self.softmax_result(shape_format)
-
-    def test_softmax_dimname_shape_format(self, device):
-        cpu_input1 = torch.randn(4, 3, names=('N', 'C'))
-        npu_input1 = cpu_input1.npu()
-        cpu_output = self.cpu_op_exec(cpu_input1, 'N')
-        npu_output = self.npu_op_exec(npu_input1, 'N')
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestSoftMax, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestSoftMax(TestCase):
+    def cpu_op_exec(self, input1, dim):
+        output = torch.nn.functional.softmax(input1, dim)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, dim):
+        output = torch.nn.functional.softmax(input1, dim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_dtype(self, input1, dim, dtype):
+        output = torch.nn.functional.softmax(input1, dim, dtype=dtype)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_dtype(self, input1, dim, dtype):
+        output = torch.nn.functional.softmax(input1, dim, dtype=dtype)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_half_float(self, input1, dim):
+        output = torch._softmax(input1, dim, True)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def softmax_result(self, shape_format):
+        for item in shape_format:
+            dim = np.random.randint(0, len(item[2]))
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input1, dim)
+            npu_output = self.npu_op_exec(npu_input1, dim)
+
+            if npu_input1.dtype == torch.float16:
+                npu_output_half = self.npu_op_exec_half_float(npu_input1, dim)
+                npu_output_half = npu_output_half.astype(np.float16)
+
+            cpu_output_inp = self.cpu_op_exec_dtype(cpu_input1, dim, torch.float32)
+            npu_output_inp = self.npu_op_exec_dtype(npu_input1, dim, torch.float32)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            if npu_input1.dtype == torch.float16:
+                self.assertRtolEqual(cpu_output, npu_output_half)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def test_softmax_shape_format_fp16_1d(self, device):
+        format_list = [0]
+        shape_format = [[np.float16, i, [18]] for i in format_list]
+        self.softmax_result(shape_format)
+
+    def test_softmax_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
+        self.softmax_result(shape_format)
+
+    def test_softmax_shape_format_fp16_3d(self, device):
+        format_list = [0, 29]
+        shape_format = [[np.float16, i, [32, 8, 8]] for i in format_list]
+        self.softmax_result(shape_format)
+
+    def test_softmax_shape_format_fp16_4d(self, device):
+        format_list = [0, 29]
+        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
+        self.softmax_result(shape_format)
+
+    def test_softmax_shape_format_fp32_1d(self, device):
+        format_list = [0]
+        shape_format = [[np.float32, i, [18]] for i in format_list]
+        self.softmax_result(shape_format)
+
+    def test_softmax_shape_format_fp32_2d(self, device):
+        format_list = [3, 29]
+        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
+        self.softmax_result(shape_format)
+
+    def test_softmax_shape_format_fp32_3d(self, device):
+        format_list = [0, 29]
+        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
+        self.softmax_result(shape_format)
+
+    def test_softmax_shape_format_fp32_4d(self, device):
+        format_list = [3, 29]
+        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
+        self.softmax_result(shape_format)
+
+    def test_softmax_dimname_shape_format(self, device):
+        cpu_input1 = torch.randn(4, 3, names=('N', 'C'))
+        npu_input1 = cpu_input1.npu()
+        cpu_output = self.cpu_op_exec(cpu_input1, 'N')
+        npu_output = self.npu_op_exec(npu_input1, 'N')
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestSoftMax, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_softmaxcrossentropywithlogits.py b/test/test_npu/test_network_ops/test_softmaxcrossentropywithlogits.py
index 2ee6e942b5ad4da2483441c99af3c286316d5ba7..4a4ca5b78e59b22d30e16fb8f683d91050b24919 100644
--- a/test/test_npu/test_network_ops/test_softmaxcrossentropywithlogits.py
+++ b/test/test_npu/test_network_ops/test_softmaxcrossentropywithlogits.py
@@ -1,37 +1,37 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestSoftmaxCrossentropyWithLogits(TestCase):
-   def npu_op_exec(self, input1, label):
-        output = torch.npu_softmax_cross_entropy_with_logits(input1, label)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-   def test_SoftmaxCross(self, device):
-        input    = torch.tensor([[1.,2.,3.,4.]]).npu()
-        label    = torch.tensor([[1.,2.,3.,4.]]).npu()
-        exresult = torch.tensor([14.4019])
-        output   = self.npu_op_exec(input, label)
-        self.assertRtolEqual(exresult.numpy(), output)
-
-instantiate_device_type_tests(TestSoftmaxCrossentropyWithLogits, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestSoftmaxCrossentropyWithLogits(TestCase):
+   def npu_op_exec(self, input1, label):
+        output = torch.npu_softmax_cross_entropy_with_logits(input1, label)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+   def test_SoftmaxCross(self, device):
+        input    = torch.tensor([[1.,2.,3.,4.]]).npu()
+        label    = torch.tensor([[1.,2.,3.,4.]]).npu()
+        exresult = torch.tensor([14.4019])
+        output   = self.npu_op_exec(input, label)
+        self.assertRtolEqual(exresult.numpy(), output)
+
+instantiate_device_type_tests(TestSoftmaxCrossentropyWithLogits, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_split.py b/test/test_npu/test_network_ops/test_split.py
old mode 100644
new mode 100755
index 7def0d59dd77d811c5555134d44b24ebed5802a2..ebfe50aa7e1fe9e01394fa1f0c19435050e88035
--- a/test/test_npu/test_network_ops/test_split.py
+++ b/test/test_npu/test_network_ops/test_split.py
@@ -1,111 +1,111 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestSplit(TestCase):
-    def cpu_op_exec(self, input1, sections, dim):
-        output = torch.split(input1, sections, dim)
-        output = list(output)
-        for i in range(len(output)):
-            output[i] = output[i].numpy()
-        return output
-
-    def npu_op_exec(self, input1, sections, dim):
-        output = torch.split(input1, sections, dim)
-        output = list(output)
-        for i in range(len(output)):
-            output[i] = output[i].to("cpu").numpy()
-        return output
-
-    def split_result(self, shape_format):
-        for item in shape_format:
-            dim = np.random.randint(0, len(item[2]))
-            size1 = int(item[2][dim] / 2)
-            size2 = int(item[2][dim] - size1)
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-
-            cpu_output = self.cpu_op_exec(cpu_input1, [size1, size2], dim)
-            npu_output = self.npu_op_exec(npu_input1, [size1, size2], dim)
-
-            for i in range(len(cpu_output)):
-                self.assertRtolEqual(cpu_output[i], npu_output[i])
-
-    def test_split_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float16, i, [18]] for i in format_list]
-        self.split_result(shape_format)
-
-    def test_split_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
-        self.split_result(shape_format)
-
-    def test_split_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
-        self.split_result(shape_format)
-
-    def test_split_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
-        self.split_result(shape_format)
-
-    def test_split_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float32, i, [18]] for i in format_list]
-        self.split_result(shape_format)
-
-    def test_split_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
-        self.split_result(shape_format)
-
-    def test_split_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
-        self.split_result(shape_format)
-
-    def test_split_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
-        self.split_result(shape_format)
-
-    def test_split_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32, 0 , (1, 4, 2, 3)], 3, 1],
-                [[np.float32, 0, (8,4)], [1,2,1,2,2],0],
-                [[np.float16, 0 , (1, 4, 2, 3)], 3, 1],
-                [[np.float16, 0, (8,4)], [1,2,1,2,2],0],
-                [[np.int32, 0 , (1, 4, 2, 3)], 3, 1],
-                [[np.int32, 0, (8,4)], [1,2,1,2,2],0],
-                [[np.int64, 0 , (1, 4, 2, 3)], 3, 1],
-                [[np.int64, 0, (8,4)], [1,2,1,2,2],0],
-                ]
-        for item in shape_format:            
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2])
-            npu_output = self.npu_op_exec(npu_input1, item[1], item[2])
-            for i in range(len(cpu_output)):
-                self.assertRtolEqual(cpu_output[i], npu_output[i])       
-
-
-instantiate_device_type_tests(TestSplit, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestSplit(TestCase):
+    def cpu_op_exec(self, input1, sections, dim):
+        output = torch.split(input1, sections, dim)
+        output = list(output)
+        for i in range(len(output)):
+            output[i] = output[i].numpy()
+        return output
+
+    def npu_op_exec(self, input1, sections, dim):
+        output = torch.split(input1, sections, dim)
+        output = list(output)
+        for i in range(len(output)):
+            output[i] = output[i].to("cpu").numpy()
+        return output
+
+    def split_result(self, shape_format):
+        for item in shape_format:
+            dim = np.random.randint(0, len(item[2]))
+            size1 = int(item[2][dim] / 2)
+            size2 = int(item[2][dim] - size1)
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+
+            cpu_output = self.cpu_op_exec(cpu_input1, [size1, size2], dim)
+            npu_output = self.npu_op_exec(npu_input1, [size1, size2], dim)
+
+            for i in range(len(cpu_output)):
+                self.assertRtolEqual(cpu_output[i], npu_output[i])
+
+    def test_split_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float16, i, [18]] for i in format_list]
+        self.split_result(shape_format)
+
+    def test_split_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
+        self.split_result(shape_format)
+
+    def test_split_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
+        self.split_result(shape_format)
+
+    def test_split_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
+        self.split_result(shape_format)
+
+    def test_split_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float32, i, [18]] for i in format_list]
+        self.split_result(shape_format)
+
+    def test_split_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
+        self.split_result(shape_format)
+
+    def test_split_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
+        self.split_result(shape_format)
+
+    def test_split_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
+        self.split_result(shape_format)
+
+    def test_split_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32, 0 , (1, 4, 2, 3)], 3, 1],
+                [[np.float32, 0, (8,4)], [1,2,1,2,2],0],
+                [[np.float16, 0 , (1, 4, 2, 3)], 3, 1],
+                [[np.float16, 0, (8,4)], [1,2,1,2,2],0],
+                [[np.int32, 0 , (1, 4, 2, 3)], 3, 1],
+                [[np.int32, 0, (8,4)], [1,2,1,2,2],0],
+                [[np.int64, 0 , (1, 4, 2, 3)], 3, 1],
+                [[np.int64, 0, (8,4)], [1,2,1,2,2],0],
+                ]
+        for item in shape_format:            
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 10)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2])
+            npu_output = self.npu_op_exec(npu_input1, item[1], item[2])
+            for i in range(len(cpu_output)):
+                self.assertRtolEqual(cpu_output[i], npu_output[i])       
+
+
+instantiate_device_type_tests(TestSplit, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_sqrt.py b/test/test_npu/test_network_ops/test_sqrt.py
old mode 100644
new mode 100755
index f9378d82efbae220f58341fb9902c996da187738..1b9e859644d6d851d2a378ae89dd4cb7e03b2c28
--- a/test/test_npu/test_network_ops/test_sqrt.py
+++ b/test/test_npu/test_network_ops/test_sqrt.py
@@ -1,143 +1,143 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-import torch
-import copy
-from common_device_type import instantiate_device_type_tests
-from common_utils import TestCase, run_tests
-from util_test import create_common_tensor
-
-
-class TestSqrt(TestCase):
-    def cpu_op_exec(self, input1):
-        cpu_output = torch.sqrt(input1)
-        cpu_output = cpu_output.numpy()
-        return cpu_output
-
-    def npu_op_exec(self, input1):
-        output = torch.sqrt(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2):
-        torch.sqrt(input1, out=input2)
-        output = input2.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def npu_op_exec_out_shape(self, input1):
-        input2 = torch.empty(0, dtype=input1.dtype).npu()
-        torch.sqrt(input1, out=input2)
-        output = input2.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out_contiguous(self, input1):
-        input2 = copy.deepcopy(input1)
-        if input2.dim() > 1 :
-            input2 = input2.transpose(0, 1);
-        torch.sqrt(input1, out=input2)
-        output = input2.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out_input_equal_output(self, input1):
-        input2 = copy.deepcopy(input1)
-        torch.sqrt(input2, out=input2)
-        output = input2.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_inp_op_exec(self, input1):
-        torch.sqrt_(input1)
-        output = input1.numpy()
-        return output
-
-    def npu_inp_op_exec(self, input1):
-        torch.sqrt_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def sqrt_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 2, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item, 2, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2)
-            npu_output_out1 = self.npu_op_exec_out_shape(npu_input1)
-            npu_output_out2 = self.npu_op_exec_out_contiguous(npu_input1)
-            npu_output_out3 = self.npu_op_exec_out_input_equal_output(npu_input1)
-            cpu_output_inp = self.cpu_inp_op_exec(cpu_input1)
-            npu_output_inp = self.npu_inp_op_exec(npu_input1)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-            self.assertRtolEqual(cpu_output, npu_output_out1)
-            self.assertRtolEqual(cpu_output, npu_output_out2)
-            self.assertRtolEqual(cpu_output, npu_output_out3)
-            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
-
-    def test_sqrt_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float16, i, [18]] for i in format_list]
-        self.sqrt_result(shape_format)
-
-    def test_sqrt_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
-        self.sqrt_result(shape_format)
-
-    def test_sqrt_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
-        self.sqrt_result(shape_format)
-
-    def test_sqrt_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
-        self.sqrt_result(shape_format)
-
-    def test_sqrt_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float32, i, [18]] for i in format_list]
-        self.sqrt_result(shape_format)
-
-    def test_sqrt_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
-        self.sqrt_result(shape_format)
-
-    def test_sqrt_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
-        self.sqrt_result(shape_format)
-
-    def test_sqrt_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
-        self.sqrt_result(shape_format)
-
-
-instantiate_device_type_tests(TestSqrt, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import torch
+import copy
+from common_device_type import instantiate_device_type_tests
+from common_utils import TestCase, run_tests
+from util_test import create_common_tensor
+
+
+class TestSqrt(TestCase):
+    def cpu_op_exec(self, input1):
+        cpu_output = torch.sqrt(input1)
+        cpu_output = cpu_output.numpy()
+        return cpu_output
+
+    def npu_op_exec(self, input1):
+        output = torch.sqrt(input1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2):
+        torch.sqrt(input1, out=input2)
+        output = input2.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def npu_op_exec_out_shape(self, input1):
+        input2 = torch.empty(0, dtype=input1.dtype).npu()
+        torch.sqrt(input1, out=input2)
+        output = input2.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out_contiguous(self, input1):
+        input2 = copy.deepcopy(input1)
+        if input2.dim() > 1 :
+            input2 = input2.transpose(0, 1);
+        torch.sqrt(input1, out=input2)
+        output = input2.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out_input_equal_output(self, input1):
+        input2 = copy.deepcopy(input1)
+        torch.sqrt(input2, out=input2)
+        output = input2.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_inp_op_exec(self, input1):
+        torch.sqrt_(input1)
+        output = input1.numpy()
+        return output
+
+    def npu_inp_op_exec(self, input1):
+        torch.sqrt_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def sqrt_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 2, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item, 2, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2)
+            npu_output_out1 = self.npu_op_exec_out_shape(npu_input1)
+            npu_output_out2 = self.npu_op_exec_out_contiguous(npu_input1)
+            npu_output_out3 = self.npu_op_exec_out_input_equal_output(npu_input1)
+            cpu_output_inp = self.cpu_inp_op_exec(cpu_input1)
+            npu_output_inp = self.npu_inp_op_exec(npu_input1)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_output_inp = cpu_output_inp.astype(npu_output_inp.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+            self.assertRtolEqual(cpu_output, npu_output_out1)
+            self.assertRtolEqual(cpu_output, npu_output_out2)
+            self.assertRtolEqual(cpu_output, npu_output_out3)
+            self.assertRtolEqual(cpu_output_inp, npu_output_inp)
+
+    def test_sqrt_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float16, i, [18]] for i in format_list]
+        self.sqrt_result(shape_format)
+
+    def test_sqrt_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
+        self.sqrt_result(shape_format)
+
+    def test_sqrt_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
+        self.sqrt_result(shape_format)
+
+    def test_sqrt_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
+        self.sqrt_result(shape_format)
+
+    def test_sqrt_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float32, i, [18]] for i in format_list]
+        self.sqrt_result(shape_format)
+
+    def test_sqrt_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
+        self.sqrt_result(shape_format)
+
+    def test_sqrt_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
+        self.sqrt_result(shape_format)
+
+    def test_sqrt_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
+        self.sqrt_result(shape_format)
+
+
+instantiate_device_type_tests(TestSqrt, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_squeeze.py b/test/test_npu/test_network_ops/test_squeeze.py
index 073f508c26eab05e1fbc6ec9711c6d68ade209e5..a5ba8c01e1815a8daafa92b75c138ef025d46864 100644
--- a/test/test_npu/test_network_ops/test_squeeze.py
+++ b/test/test_npu/test_network_ops/test_squeeze.py
@@ -1,46 +1,46 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-class TestSqueeze(TestCase):
-    def test_squeeze_common_shape_format(self, device):
-        def cpu_op_exec(input):
-            output = torch.squeeze(input)
-            output = output.numpy()
-            return output
-    
-        def npu_op_exec(input):
-            output = torch.squeeze(input)
-            output = output.to("cpu")
-            output = output.numpy() 
-            return output  
-        
-        shape_format = [
-                [torch.float16, (2, 1, 2, 1, 2)],
-                [torch.float32, (2, 1, 2, 1, 2)]
-        ]
-        for shape in shape_format:
-            cpu_input = torch.zeros(shape[1],dtype=shape[0])
-            npu_input = torch.zeros(shape[1],dtype=shape[0]).npu()
-            cpu_output = cpu_op_exec(cpu_input)
-            npu_output = npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestSqueeze, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+class TestSqueeze(TestCase):
+    def test_squeeze_common_shape_format(self, device):
+        def cpu_op_exec(input):
+            output = torch.squeeze(input)
+            output = output.numpy()
+            return output
+    
+        def npu_op_exec(input):
+            output = torch.squeeze(input)
+            output = output.to("cpu")
+            output = output.numpy() 
+            return output  
+        
+        shape_format = [
+                [torch.float16, (2, 1, 2, 1, 2)],
+                [torch.float32, (2, 1, 2, 1, 2)]
+        ]
+        for shape in shape_format:
+            cpu_input = torch.zeros(shape[1],dtype=shape[0])
+            npu_input = torch.zeros(shape[1],dtype=shape[0]).npu()
+            cpu_output = cpu_op_exec(cpu_input)
+            npu_output = npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestSqueeze, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_stack.py b/test/test_npu/test_network_ops/test_stack.py
old mode 100644
new mode 100755
index 118951970b4840771f162cfc206b1632b837ac78..4e64baff1d7086b84a1eb68797ca605c924582a7
--- a/test/test_npu/test_network_ops/test_stack.py
+++ b/test/test_npu/test_network_ops/test_stack.py
@@ -1,158 +1,158 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestStack(TestCase):
-    def cpu_op_exec(self, input1, input2, dim):
-        cpu_output = torch.stack((input1, input2), dim)
-        cpu_output = cpu_output.numpy()
-        return cpu_output
-
-    def npu_op_exec(self, input1, input2, dim):
-        output = torch.stack((input1, input2), dim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_out(self, input1, input2, dim, input3):    
-        torch.stack((input1, input2), dim, out=input3)
-        output = input3.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, dim, input3):    
-        torch.stack((input1, input2), dim, out=input3)
-        output = input3.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def npu_output_size(self, inputs = [], dim = 0):
-        shape = []
-        for i in range(dim):
-            shape.append(inputs[0].size(i))
-        shape.append(len(inputs))
-        for i in range(dim, inputs[0].dim()):
-            shape.append(inputs[0].size(i))
-
-        return shape
-
-    def stack_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
-            shape = self.npu_output_size([npu_input1,npu_input2], item[1])
-            npu_input3 = torch.ones(shape, dtype = cpu_input1.dtype).npu()
-            cpu_input3 = torch.ones(shape, dtype = cpu_input1.dtype)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_input2 = cpu_input2.to(torch.float32)
-                cpu_input3 = cpu_input3.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, item[1])
-            npu_output = self.npu_op_exec(npu_input1, npu_input2, item[1])
-            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, item[1], cpu_input3)
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, item[1], npu_input3)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-
-    def test_stack_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1)] for i in format_list]
-        self.stack_result(shape_format)
-
-    def test_stack_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float16, i, [5, 256]], np.random.randint(0, 2)] for i in format_list]
-        self.stack_result(shape_format)
-
-    def test_stack_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float16, i, [32, 3, 3]], np.random.randint(0, 3)] for i in format_list]
-        self.stack_result(shape_format)
-    
-    def test_stack_shape_format_fp16_4d(self, device):
-        format_list = [0, 29]
-        shape_format = [[[np.float16, i, [32, 32, 3, 3]], np.random.randint(0, 4)] for i in format_list]
-        self.stack_result(shape_format)
-
-    def test_stack_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1)] for i in format_list]
-        self.stack_result(shape_format)
-
-    def test_stack_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float32, i, [5, 256]], np.random.randint(0, 2)] for i in format_list]
-        self.stack_result(shape_format)
-
-    def test_stack_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[[np.float32, i, [32, 3, 3]], np.random.randint(0, 3)] for i in format_list]
-        self.stack_result(shape_format)
-    
-    def test_stack_shape_format_fp32_4d(self, device):
-        format_list = [0, 29]
-        shape_format = [[[np.float32, i, [32, 32, 3, 3]], np.random.randint(0, 4)] for i in format_list]
-        self.stack_result(shape_format)
-
-    def test_stack_shape_format_int32_1d(self, device):
-        format_list = [0]
-        shape_format = [[[np.int32, i, [18]], np.random.randint(0, 1)] for i in format_list]
-        self.stack_result(shape_format)
-
-    def test_stack_shape_format_int32_2d(self, device):
-        format_list = [0]
-        shape_format = [[[np.int32, i, [5, 256]], np.random.randint(0, 2)] for i in format_list]
-        self.stack_result(shape_format)
-
-    def test_stack_shape_format_int32_3d(self, device):
-        format_list = [0]
-        shape_format = [[[np.int32, i, [32, 3, 3]], np.random.randint(0, 3)] for i in format_list]
-        self.stack_result(shape_format)
-    
-    def test_stack_shape_format_int32_4d(self, device):
-        format_list = [-1]
-        shape_format = [[[np.int32, i, [32, 32, 3, 3]], np.random.randint(0, 4)] for i in format_list]
-        self.stack_result(shape_format)
-
-    def test_stack_size_dim(self, device):
-        def cpu_op_exec(input1):
-            output = torch.stack((input1, input1, input1, input1, input1, input1, input1, input1, input1))
-            return output.numpy()
-
-        def npu_op_exec(input1):        
-            output = torch.stack((input1, input1, input1, input1, input1, input1, input1, input1, input1))
-            output = output.to("cpu")
-            return output.numpy()
-        shape_format = [
-                [[np.int32, 0, ()]],
-                [[np.float32, 0, ()]],
-                [[np.float16, 0, ()]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = cpu_op_exec(cpu_input1)
-            npu_output = npu_op_exec(npu_input1)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestStack, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestStack(TestCase):
+    def cpu_op_exec(self, input1, input2, dim):
+        cpu_output = torch.stack((input1, input2), dim)
+        cpu_output = cpu_output.numpy()
+        return cpu_output
+
+    def npu_op_exec(self, input1, input2, dim):
+        output = torch.stack((input1, input2), dim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_out(self, input1, input2, dim, input3):    
+        torch.stack((input1, input2), dim, out=input3)
+        output = input3.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, dim, input3):    
+        torch.stack((input1, input2), dim, out=input3)
+        output = input3.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def npu_output_size(self, inputs = [], dim = 0):
+        shape = []
+        for i in range(dim):
+            shape.append(inputs[0].size(i))
+        shape.append(len(inputs))
+        for i in range(dim, inputs[0].dim()):
+            shape.append(inputs[0].size(i))
+
+        return shape
+
+    def stack_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[0], 0, 100)
+            shape = self.npu_output_size([npu_input1,npu_input2], item[1])
+            npu_input3 = torch.ones(shape, dtype = cpu_input1.dtype).npu()
+            cpu_input3 = torch.ones(shape, dtype = cpu_input1.dtype)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_input2 = cpu_input2.to(torch.float32)
+                cpu_input3 = cpu_input3.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2, item[1])
+            npu_output = self.npu_op_exec(npu_input1, npu_input2, item[1])
+            cpu_output_out = self.cpu_op_exec_out(cpu_input1, cpu_input2, item[1], cpu_input3)
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, item[1], npu_input3)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+
+    def test_stack_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[[np.float16, i, [18]], np.random.randint(0, 1)] for i in format_list]
+        self.stack_result(shape_format)
+
+    def test_stack_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float16, i, [5, 256]], np.random.randint(0, 2)] for i in format_list]
+        self.stack_result(shape_format)
+
+    def test_stack_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3]], np.random.randint(0, 3)] for i in format_list]
+        self.stack_result(shape_format)
+    
+    def test_stack_shape_format_fp16_4d(self, device):
+        format_list = [0, 29]
+        shape_format = [[[np.float16, i, [32, 32, 3, 3]], np.random.randint(0, 4)] for i in format_list]
+        self.stack_result(shape_format)
+
+    def test_stack_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[[np.float32, i, [18]], np.random.randint(0, 1)] for i in format_list]
+        self.stack_result(shape_format)
+
+    def test_stack_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float32, i, [5, 256]], np.random.randint(0, 2)] for i in format_list]
+        self.stack_result(shape_format)
+
+    def test_stack_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[[np.float32, i, [32, 3, 3]], np.random.randint(0, 3)] for i in format_list]
+        self.stack_result(shape_format)
+    
+    def test_stack_shape_format_fp32_4d(self, device):
+        format_list = [0, 29]
+        shape_format = [[[np.float32, i, [32, 32, 3, 3]], np.random.randint(0, 4)] for i in format_list]
+        self.stack_result(shape_format)
+
+    def test_stack_shape_format_int32_1d(self, device):
+        format_list = [0]
+        shape_format = [[[np.int32, i, [18]], np.random.randint(0, 1)] for i in format_list]
+        self.stack_result(shape_format)
+
+    def test_stack_shape_format_int32_2d(self, device):
+        format_list = [0]
+        shape_format = [[[np.int32, i, [5, 256]], np.random.randint(0, 2)] for i in format_list]
+        self.stack_result(shape_format)
+
+    def test_stack_shape_format_int32_3d(self, device):
+        format_list = [0]
+        shape_format = [[[np.int32, i, [32, 3, 3]], np.random.randint(0, 3)] for i in format_list]
+        self.stack_result(shape_format)
+    
+    def test_stack_shape_format_int32_4d(self, device):
+        format_list = [-1]
+        shape_format = [[[np.int32, i, [32, 32, 3, 3]], np.random.randint(0, 4)] for i in format_list]
+        self.stack_result(shape_format)
+
+    def test_stack_size_dim(self, device):
+        def cpu_op_exec(input1):
+            output = torch.stack((input1, input1, input1, input1, input1, input1, input1, input1, input1))
+            return output.numpy()
+
+        def npu_op_exec(input1):        
+            output = torch.stack((input1, input1, input1, input1, input1, input1, input1, input1, input1))
+            output = output.to("cpu")
+            return output.numpy()
+        shape_format = [
+                [[np.int32, 0, ()]],
+                [[np.float32, 0, ()]],
+                [[np.float16, 0, ()]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_output = cpu_op_exec(cpu_input1)
+            npu_output = npu_op_exec(npu_input1)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestStack, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_std.py b/test/test_npu/test_network_ops/test_std.py
index 179532e2b55e726ab3faca09404862d8603551a2..728f769b2a20f216e4161f991ff4cd9657b70e94 100644
--- a/test/test_npu/test_network_ops/test_std.py
+++ b/test/test_npu/test_network_ops/test_std.py
@@ -1,301 +1,301 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import random
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestStd(TestCase):
-    def cpu_op_exec(self, input, unbiased=True):
-        output = torch.std(input, unbiased=unbiased)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input, unbiased=True):
-        output = torch.std(input, unbiased=unbiased)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_dim_exec(self, input, dim, unbiased=True, keepdim=False):
-        output = torch.std(input, dim, unbiased=unbiased, keepdim=keepdim)
-        output = output.numpy()
-        return output
-
-    def npu_op_dim_exec(self, input, dim, unbiased=True, keepdim=False):
-        output = torch.std(input, dim, unbiased=unbiased, keepdim=keepdim)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_dim_out_exec(self, input, dim, output1, unbiased=True, keepdim=False):
-        torch.std(input, dim, unbiased=unbiased, keepdim=keepdim,out=output1)
-        output1 = output1.numpy()
-        return output1
-
-    def npu_op_dim_out_exec(self, input, dim, output1, unbiased=True, keepdim=False):
-        torch.std(input, dim, unbiased=unbiased, keepdim=keepdim,out=output1)
-        output1 = output1.to("cpu")
-        output1 = output1.numpy()
-        return output1
-    
-    def output_shape(self, inputshape, dim, unbiased=True, keepdim=False):
-        shape = list(inputshape)
-        if dim < len(inputshape) and keepdim == True:
-            shape[dim] = 1
-        elif dim < len(inputshape) and keepdim == False:
-            shape.pop(dim)
-        return shape
-    
-    def create_output_tensor(self, minvalue,maxvalue,shape,npuformat,dtype):
-        input1 = np.random.uniform(minvalue, maxvalue, shape).astype(dtype)
-        cpu_input = torch.from_numpy(input1)
-        npu_input = torch.from_numpy(input1).npu()
-        if npuformat != -1:
-            npu_input = npu_input.npu_format_cast(npuformat)
-        return cpu_input, npu_input
-        
-    def test_std_shape_format_fp16(self, device):
-        format_list = [0]
-        shape_list = [[16], [32, 1024], [32, 8, 1024], [128, 32, 8, 1023]]
-        unbiased_list = [True, False]
-        shape_format = [
-            [np.float16, i, j, k] for i in format_list for j in shape_list for k in unbiased_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output1 = self.cpu_op_exec(cpu_input1, item[3])
-            cpu_output1 = cpu_output1.astype(np.float16)
-            npu_output1 = self.npu_op_exec(npu_input1, item[3])
-            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.1)
-
-    def test_std_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
-        unbiased_list = [True, False]
-        shape_format = [
-            [np.float32, i, j, k] for i in format_list for j in shape_list for k in unbiased_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[3])
-            npu_output = self.npu_op_exec(npu_input1, item[3])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_std_dim_shape_format_fp16(self, device):
-        format_list = [0]
-        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1023]]
-        dim_list = [0]
-        unbiased_list = [True, False]
-        keepdim_list = [True, False]
-        shape_format = [
-            [np.float16, i, j, k, l, m] for i in format_list for j in shape_list 
-            for k in dim_list for l in unbiased_list for m in keepdim_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
-            cpu_output1 = cpu_output1.astype(np.float16)
-            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
-            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.003)
-
-    def test_std_dim_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1023]]
-        dim_list = [0]
-        unbiased_list = [True, False]
-        keepdim_list = [True, False]
-        shape_format = [
-            [np.float32, i, j, k, l, m] for i in format_list for j in shape_list
-            for k in dim_list for l in unbiased_list for m in keepdim_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
-            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
-            self.assertRtolEqual(cpu_output1, npu_output1)
-    
-    def test_std_dim_out_shape_format_fp16(self, device):
-        format_list = [0]
-        shape_list = [[1024], [32, 24], [32, 8, 24], [12, 32, 8, 24]]
-        dim_list = [0]
-        unbiased_list = [True, False]
-        keepdim_list = [True, False]
-        shape_format = [
-            [np.float16, i, j, k, l, m] for i in format_list for j in shape_list
-            for k in dim_list for l in unbiased_list for m in keepdim_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            outputshape = self.output_shape(item[2],item[3],item[4],item[5])
-            cpu_output,npu_output = self.create_output_tensor(0,1,outputshape,item[1],item[0])
-            if item[0] == np.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_output = cpu_output.to(torch.float32)
-            cpu_output1 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output, item[4], item[5])
-            npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
-            if item[0] == np.float16:
-                cpu_output1 = cpu_output1.astype(np.float16)
-            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.002)
-
-            random_outputshape = [random.randint(1, 100)]
-            cpu_output, npu_output = self.create_output_tensor(0, 1, random_outputshape,item[1],item[0])
-            if item[0] == np.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_output = cpu_output.to(torch.float32)
-            cpu_output1 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output, item[4], item[5])
-            npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
-            if item[0] == np.float16:
-                cpu_output1 = cpu_output1.astype(np.float16)
-            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.002)
-
-    def test_std_dim_out_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list = [[1024], [32, 24], [32, 8, 24], [12, 32, 8, 24]]
-        dim_list = [0]
-        unbiased_list = [True, False]
-        keepdim_list = [True, False]
-        shape_format = [
-            [np.float32, i, j, k, l, m] for i in format_list for j in shape_list
-            for k in dim_list for l in unbiased_list for m in keepdim_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            outputshape = self.output_shape(item[2],item[3],item[4],item[5])
-            cpu_output,npu_output = self.create_output_tensor(0,1,outputshape,item[1],item[0])
-            cpu_output1 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output, item[4], item[5])
-            npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
-            self.assertRtolEqual(cpu_output1, npu_output1)
-
-            random_outputshape = [random.randint(1, 100)]
-            cpu_output, npu_output = self.create_output_tensor(0, 1, random_outputshape, item[1], item[0])
-            cpu_output2 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output.clone(), item[4], item[5])
-            npu_output2 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
-            self.assertRtolEqual(cpu_output2, npu_output2)
-
-    def test_std_dim_name_fp16(self, device):
-        shape = (1024, 8, 32)
-        cpu_input = torch.rand(shape, dtype=torch.float32)
-        npu_input = cpu_input.npu().to(torch.float16)
-        cpu_input.names = ['N','C','H']
-        npu_input.names = ['N','C','H']
-        dim = np.random.choice(['N', 'C', 'H'])
-        cpu_output = torch.std(cpu_input, dim=dim)
-        npu_output = torch.std(npu_input, dim=dim)
-        self.assertRtolEqual(cpu_output.to(torch.float16).numpy(), npu_output.cpu().numpy())
-
-    def test_std_dim_name_fp32(self, device):
-        shape = (1024, 8, 32)
-        cpu_input = torch.rand(shape, dtype=torch.float32, names=('N', 'C', 'H'))
-        npu_input = cpu_input.npu()
-        dim = np.random.choice(['N', 'C', 'H'])
-        cpu_output = torch.std(cpu_input, dim=dim)
-        npu_output = torch.std(npu_input, dim=dim)
-        self.assertRtolEqual(cpu_output.numpy(), npu_output.cpu().numpy())
-
-    def test_std_dim_out_name_fp16(self, device):
-        shape = (1024, 8, 32)
-        dimlist = ['N', 'C', 'H']
-        cpu_input = torch.rand(shape, dtype=torch.float32)
-        npu_input = cpu_input.npu()
-        dim = np.random.choice(dimlist)
-        dims = dimlist.index(dim)
-        outputshape = self.output_shape(shape, dims)
-        cpu_output,npu_output = self.create_output_tensor(0, 1, outputshape, -1, np.float32)
-        npu_input = npu_input.to(torch.float16)
-        npu_output = npu_output.to(torch.float16)
-        cpu_input.names = ['N','C','H']
-        npu_input.names = ['N','C','H']
-
-        cpu_output = torch.std(cpu_input, dim=dim,out=cpu_output)
-        npu_output = torch.std(npu_input, dim=dim,out=npu_output)
-        cpu_output = cpu_output.to(torch.float16)
-        self.assertRtolEqual(cpu_output.numpy(), npu_output.cpu().numpy())
-
-    def test_std_dim_out_name_fp32(self, device):
-        shape = (1024, 8, 32)
-        dimlist = ['N', 'C', 'H']
-        cpu_input = torch.rand(shape, dtype=torch.float32, names=('N', 'C', 'H'))
-        npu_input = cpu_input.npu()
-        dim = np.random.choice(dimlist)
-        dims = dimlist.index(dim)
-        outputshape = self.output_shape(shape, dims)
-        cpu_output,npu_output = self.create_output_tensor(0, 1, outputshape, -1, np.float32)
-        cpu_output = torch.std(cpu_input, dim=dim,out=cpu_output)
-        npu_output = torch.std(npu_input, dim=dim,out=npu_output)
-        self.assertRtolEqual(cpu_output.numpy(), npu_output.cpu().numpy())
-    
-    def test_std_n_dim_shape_format_fp16(self, device):
-        format_list = [0]
-        shape_list = [[128, 32, 8, 1023]]
-        dim_list = [(3, 1)]
-        unbiased_list = [True, False]
-        keepdim_list = [True, False]
-        shape_format = [
-            [np.float16, i, j, k, l, m] for i in format_list for j in shape_list 
-            for k in dim_list for l in unbiased_list for m in keepdim_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
-            cpu_output1 = cpu_output1.astype(np.float16)
-            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
-            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.003)
-
-    def test_std_n_dim_shape_format_fp32(self, device):
-        format_list = [0]
-        shape_list = [[128, 32, 8, 1023]]
-        dim_list = [(3, 1)]
-        unbiased_list = [True, False]
-        keepdim_list = [True, False]
-        shape_format = [
-            [np.float32, i, j, k, l, m] for i in format_list for j in shape_list
-            for k in dim_list for l in unbiased_list for m in keepdim_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
-            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
-            self.assertRtolEqual(cpu_output1, npu_output1)
-    
-    def test_std_dim_shape_format_5d_fp16(self, device):
-        format_list = [-1]
-        shape_list = [[2, 94, 4, 52, 192]]
-        dim_list = [0]
-        unbiased_list = [True, False]
-        keepdim_list = [True, False]
-        shape_format = [
-            [np.float16, i, j, k, l, m] for i in format_list for j in shape_list 
-            for k in dim_list for l in unbiased_list for m in keepdim_list
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
-            cpu_output1 = cpu_output1.astype(np.float16)
-            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
-            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.006)
-    
-instantiate_device_type_tests(TestStd, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import random
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestStd(TestCase):
+    def cpu_op_exec(self, input, unbiased=True):
+        output = torch.std(input, unbiased=unbiased)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input, unbiased=True):
+        output = torch.std(input, unbiased=unbiased)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_dim_exec(self, input, dim, unbiased=True, keepdim=False):
+        output = torch.std(input, dim, unbiased=unbiased, keepdim=keepdim)
+        output = output.numpy()
+        return output
+
+    def npu_op_dim_exec(self, input, dim, unbiased=True, keepdim=False):
+        output = torch.std(input, dim, unbiased=unbiased, keepdim=keepdim)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_dim_out_exec(self, input, dim, output1, unbiased=True, keepdim=False):
+        torch.std(input, dim, unbiased=unbiased, keepdim=keepdim,out=output1)
+        output1 = output1.numpy()
+        return output1
+
+    def npu_op_dim_out_exec(self, input, dim, output1, unbiased=True, keepdim=False):
+        torch.std(input, dim, unbiased=unbiased, keepdim=keepdim,out=output1)
+        output1 = output1.to("cpu")
+        output1 = output1.numpy()
+        return output1
+    
+    def output_shape(self, inputshape, dim, unbiased=True, keepdim=False):
+        shape = list(inputshape)
+        if dim < len(inputshape) and keepdim == True:
+            shape[dim] = 1
+        elif dim < len(inputshape) and keepdim == False:
+            shape.pop(dim)
+        return shape
+    
+    def create_output_tensor(self, minvalue,maxvalue,shape,npuformat,dtype):
+        input1 = np.random.uniform(minvalue, maxvalue, shape).astype(dtype)
+        cpu_input = torch.from_numpy(input1)
+        npu_input = torch.from_numpy(input1).npu()
+        if npuformat != -1:
+            npu_input = npu_input.npu_format_cast(npuformat)
+        return cpu_input, npu_input
+        
+    def test_std_shape_format_fp16(self, device):
+        format_list = [0]
+        shape_list = [[16], [32, 1024], [32, 8, 1024], [128, 32, 8, 1023]]
+        unbiased_list = [True, False]
+        shape_format = [
+            [np.float16, i, j, k] for i in format_list for j in shape_list for k in unbiased_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output1 = self.cpu_op_exec(cpu_input1, item[3])
+            cpu_output1 = cpu_output1.astype(np.float16)
+            npu_output1 = self.npu_op_exec(npu_input1, item[3])
+            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.1)
+
+    def test_std_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1024]]
+        unbiased_list = [True, False]
+        shape_format = [
+            [np.float32, i, j, k] for i in format_list for j in shape_list for k in unbiased_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[3])
+            npu_output = self.npu_op_exec(npu_input1, item[3])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_std_dim_shape_format_fp16(self, device):
+        format_list = [0]
+        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1023]]
+        dim_list = [0]
+        unbiased_list = [True, False]
+        keepdim_list = [True, False]
+        shape_format = [
+            [np.float16, i, j, k, l, m] for i in format_list for j in shape_list 
+            for k in dim_list for l in unbiased_list for m in keepdim_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
+            cpu_output1 = cpu_output1.astype(np.float16)
+            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
+            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.003)
+
+    def test_std_dim_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list = [[1024], [32, 1024], [32, 8, 1024], [128, 32, 8, 1023]]
+        dim_list = [0]
+        unbiased_list = [True, False]
+        keepdim_list = [True, False]
+        shape_format = [
+            [np.float32, i, j, k, l, m] for i in format_list for j in shape_list
+            for k in dim_list for l in unbiased_list for m in keepdim_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
+            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
+            self.assertRtolEqual(cpu_output1, npu_output1)
+    
+    def test_std_dim_out_shape_format_fp16(self, device):
+        format_list = [0]
+        shape_list = [[1024], [32, 24], [32, 8, 24], [12, 32, 8, 24]]
+        dim_list = [0]
+        unbiased_list = [True, False]
+        keepdim_list = [True, False]
+        shape_format = [
+            [np.float16, i, j, k, l, m] for i in format_list for j in shape_list
+            for k in dim_list for l in unbiased_list for m in keepdim_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            outputshape = self.output_shape(item[2],item[3],item[4],item[5])
+            cpu_output,npu_output = self.create_output_tensor(0,1,outputshape,item[1],item[0])
+            if item[0] == np.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_output1 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output, item[4], item[5])
+            npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
+            if item[0] == np.float16:
+                cpu_output1 = cpu_output1.astype(np.float16)
+            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.002)
+
+            random_outputshape = [random.randint(1, 100)]
+            cpu_output, npu_output = self.create_output_tensor(0, 1, random_outputshape,item[1],item[0])
+            if item[0] == np.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_output = cpu_output.to(torch.float32)
+            cpu_output1 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output, item[4], item[5])
+            npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
+            if item[0] == np.float16:
+                cpu_output1 = cpu_output1.astype(np.float16)
+            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.002)
+
+    def test_std_dim_out_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list = [[1024], [32, 24], [32, 8, 24], [12, 32, 8, 24]]
+        dim_list = [0]
+        unbiased_list = [True, False]
+        keepdim_list = [True, False]
+        shape_format = [
+            [np.float32, i, j, k, l, m] for i in format_list for j in shape_list
+            for k in dim_list for l in unbiased_list for m in keepdim_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            outputshape = self.output_shape(item[2],item[3],item[4],item[5])
+            cpu_output,npu_output = self.create_output_tensor(0,1,outputshape,item[1],item[0])
+            cpu_output1 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output, item[4], item[5])
+            npu_output1 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
+            self.assertRtolEqual(cpu_output1, npu_output1)
+
+            random_outputshape = [random.randint(1, 100)]
+            cpu_output, npu_output = self.create_output_tensor(0, 1, random_outputshape, item[1], item[0])
+            cpu_output2 = self.cpu_op_dim_out_exec(cpu_input1, item[3], cpu_output.clone(), item[4], item[5])
+            npu_output2 = self.npu_op_dim_out_exec(npu_input1, item[3], npu_output, item[4], item[5])
+            self.assertRtolEqual(cpu_output2, npu_output2)
+
+    def test_std_dim_name_fp16(self, device):
+        shape = (1024, 8, 32)
+        cpu_input = torch.rand(shape, dtype=torch.float32)
+        npu_input = cpu_input.npu().to(torch.float16)
+        cpu_input.names = ['N','C','H']
+        npu_input.names = ['N','C','H']
+        dim = np.random.choice(['N', 'C', 'H'])
+        cpu_output = torch.std(cpu_input, dim=dim)
+        npu_output = torch.std(npu_input, dim=dim)
+        self.assertRtolEqual(cpu_output.to(torch.float16).numpy(), npu_output.cpu().numpy())
+
+    def test_std_dim_name_fp32(self, device):
+        shape = (1024, 8, 32)
+        cpu_input = torch.rand(shape, dtype=torch.float32, names=('N', 'C', 'H'))
+        npu_input = cpu_input.npu()
+        dim = np.random.choice(['N', 'C', 'H'])
+        cpu_output = torch.std(cpu_input, dim=dim)
+        npu_output = torch.std(npu_input, dim=dim)
+        self.assertRtolEqual(cpu_output.numpy(), npu_output.cpu().numpy())
+
+    def test_std_dim_out_name_fp16(self, device):
+        shape = (1024, 8, 32)
+        dimlist = ['N', 'C', 'H']
+        cpu_input = torch.rand(shape, dtype=torch.float32)
+        npu_input = cpu_input.npu()
+        dim = np.random.choice(dimlist)
+        dims = dimlist.index(dim)
+        outputshape = self.output_shape(shape, dims)
+        cpu_output,npu_output = self.create_output_tensor(0, 1, outputshape, -1, np.float32)
+        npu_input = npu_input.to(torch.float16)
+        npu_output = npu_output.to(torch.float16)
+        cpu_input.names = ['N','C','H']
+        npu_input.names = ['N','C','H']
+
+        cpu_output = torch.std(cpu_input, dim=dim,out=cpu_output)
+        npu_output = torch.std(npu_input, dim=dim,out=npu_output)
+        cpu_output = cpu_output.to(torch.float16)
+        self.assertRtolEqual(cpu_output.numpy(), npu_output.cpu().numpy())
+
+    def test_std_dim_out_name_fp32(self, device):
+        shape = (1024, 8, 32)
+        dimlist = ['N', 'C', 'H']
+        cpu_input = torch.rand(shape, dtype=torch.float32, names=('N', 'C', 'H'))
+        npu_input = cpu_input.npu()
+        dim = np.random.choice(dimlist)
+        dims = dimlist.index(dim)
+        outputshape = self.output_shape(shape, dims)
+        cpu_output,npu_output = self.create_output_tensor(0, 1, outputshape, -1, np.float32)
+        cpu_output = torch.std(cpu_input, dim=dim,out=cpu_output)
+        npu_output = torch.std(npu_input, dim=dim,out=npu_output)
+        self.assertRtolEqual(cpu_output.numpy(), npu_output.cpu().numpy())
+    
+    def test_std_n_dim_shape_format_fp16(self, device):
+        format_list = [0]
+        shape_list = [[128, 32, 8, 1023]]
+        dim_list = [(3, 1)]
+        unbiased_list = [True, False]
+        keepdim_list = [True, False]
+        shape_format = [
+            [np.float16, i, j, k, l, m] for i in format_list for j in shape_list 
+            for k in dim_list for l in unbiased_list for m in keepdim_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
+            cpu_output1 = cpu_output1.astype(np.float16)
+            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
+            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.003)
+
+    def test_std_n_dim_shape_format_fp32(self, device):
+        format_list = [0]
+        shape_list = [[128, 32, 8, 1023]]
+        dim_list = [(3, 1)]
+        unbiased_list = [True, False]
+        keepdim_list = [True, False]
+        shape_format = [
+            [np.float32, i, j, k, l, m] for i in format_list for j in shape_list
+            for k in dim_list for l in unbiased_list for m in keepdim_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
+            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
+            self.assertRtolEqual(cpu_output1, npu_output1)
+    
+    def test_std_dim_shape_format_5d_fp16(self, device):
+        format_list = [-1]
+        shape_list = [[2, 94, 4, 52, 192]]
+        dim_list = [0]
+        unbiased_list = [True, False]
+        keepdim_list = [True, False]
+        shape_format = [
+            [np.float16, i, j, k, l, m] for i in format_list for j in shape_list 
+            for k in dim_list for l in unbiased_list for m in keepdim_list
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_output1 = self.cpu_op_dim_exec(cpu_input1, item[3], item[4], item[5])
+            cpu_output1 = cpu_output1.astype(np.float16)
+            npu_output1 = self.npu_op_dim_exec(npu_input1, item[3], item[4], item[5])
+            self.assertRtolEqual(cpu_output1, npu_output1, prec16=0.006)
+    
+instantiate_device_type_tests(TestStd, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_stride.py b/test/test_npu/test_network_ops/test_stride.py
index 72ed2f7d248610c3e0875f453cdf9ab75832a46e..a849d5d1ba2a6f401b683d6429315d998e6a0816 100644
--- a/test/test_npu/test_network_ops/test_stride.py
+++ b/test/test_npu/test_network_ops/test_stride.py
@@ -1,42 +1,42 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
- 
-class TestStride(TestCase):
-    def test_stride_common_shape_format(self, device):
-        def op_exec(input):
-            output = input.stride()
-            output = np.array(output, dtype=np.int32)
-            return output
-        
-        shape_format = [
-                [[np.float16, 0, (64)]],
-                [[np.float32, 4, (32, 1, 3, 3)]],
-                [[np.float32, 3, (10, 128)]]
-        ]
-        for shape in shape_format:
-            cpu_input, npu_input = create_common_tensor(shape[0], -100, 100)
-            cpu_output = op_exec(cpu_input)
-            npu_output = op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-    
-
-instantiate_device_type_tests(TestStride, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+ 
+class TestStride(TestCase):
+    def test_stride_common_shape_format(self, device):
+        def op_exec(input):
+            output = input.stride()
+            output = np.array(output, dtype=np.int32)
+            return output
+        
+        shape_format = [
+                [[np.float16, 0, (64)]],
+                [[np.float32, 4, (32, 1, 3, 3)]],
+                [[np.float32, 3, (10, 128)]]
+        ]
+        for shape in shape_format:
+            cpu_input, npu_input = create_common_tensor(shape[0], -100, 100)
+            cpu_output = op_exec(cpu_input)
+            npu_output = op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+    
+
+instantiate_device_type_tests(TestStride, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_strideadd.py b/test/test_npu/test_network_ops/test_strideadd.py
index 99959f52afa400c6e87a22a543e8db94208671b2..19b5719759551ed0909c193fb8e5fa3f87cb7c47 100644
--- a/test/test_npu/test_network_ops/test_strideadd.py
+++ b/test/test_npu/test_network_ops/test_strideadd.py
@@ -1,39 +1,39 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-from common_utils import TestCase, run_tests
-
-
-class TestStrideAdd(TestCase):
-   def npu_op_exec(self, input1, input2, offset1, offset2, c1_len):
-        output = torch.npu_stride_add(input1, input2, offset1, offset2, c1_len)
-        output = output.to("cpu")
-        output = output.numpy()
-
-        return output
-
-   def test_StrideAdd(self, device):
-        input1  = torch.tensor([[[[[1.]]]]]).npu()
-        input2  = input1
-        exoutput = torch.tensor([[[[[2.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],
-		 [[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]]]])
-        output  = self.npu_op_exec(input1, input2, 0, 0, 1) 
-        self.assertRtolEqual(exoutput.numpy(), output)
-
-instantiate_device_type_tests(TestStrideAdd, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+from common_utils import TestCase, run_tests
+
+
+class TestStrideAdd(TestCase):
+   def npu_op_exec(self, input1, input2, offset1, offset2, c1_len):
+        output = torch.npu_stride_add(input1, input2, offset1, offset2, c1_len)
+        output = output.to("cpu")
+        output = output.numpy()
+
+        return output
+
+   def test_StrideAdd(self, device):
+        input1  = torch.tensor([[[[[1.]]]]]).npu()
+        input2  = input1
+        exoutput = torch.tensor([[[[[2.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],
+		 [[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]],[[[0.]]]]])
+        output  = self.npu_op_exec(input1, input2, 0, 0, 1) 
+        self.assertRtolEqual(exoutput.numpy(), output)
+
+instantiate_device_type_tests(TestStrideAdd, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_sub.py b/test/test_npu/test_network_ops/test_sub.py
old mode 100644
new mode 100755
index 7fa1d7aced19bf0813a2fe1c8b6a4a971d3bff81..6731ffefa5ed0dadf7b0aefcc3e36f103c4643e5
--- a/test/test_npu/test_network_ops/test_sub.py
+++ b/test/test_npu/test_network_ops/test_sub.py
@@ -1,236 +1,236 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestSub(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = input1 - input2
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = input1 - input2
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_t(self, input1, input2):
-        output = torch.sub(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_t_out(self, input1, input2, input3):
-        torch.sub(input1, input2, out=input3)
-        output = input3.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_tensor(self, input1, input2):
-        output = input1.sub(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_inp_tensor(self, input1, input2):
-        input1.sub_(input2)
-        output = input1.numpy()
-        return output
-
-    def npu_op_exec_inp_tensor(self, input1, input2):
-        input1.sub_(input2)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def sub_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
-            npu_input4 = torch.randn(6).to("npu").to(npu_input3.dtype)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-
-            if type(item[1]) == list:
-                cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
-                if cpu_input2.dtype == torch.float16:
-                    cpu_input2 = cpu_input2.to(torch.float32)
-            else:
-                cpu_input2 = item[1]
-                npu_input2 = item[1]
-
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-
-            npu_output_t = self.npu_op_exec_t(npu_input1, npu_input2)
-            npu_output_t_out = self.npu_op_exec_t_out(npu_input1, npu_input2, npu_input3)
-            npu_output_tensor = self.npu_op_exec_tensor(npu_input1, npu_input2)
-            npu_output_t_out_chk = self.npu_op_exec_t_out(npu_input1, npu_input2, npu_input4)#out tensor shape not shape as self
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output_t)
-            self.assertRtolEqual(cpu_output, npu_output_t_out)
-            self.assertRtolEqual(cpu_output, npu_output_tensor)
-            self.assertRtolEqual(cpu_output, npu_output_t_out_chk)
-
-            # test for tensor
-            cpu_input1_tensor, npu_input1_tensor = create_common_tensor(item[0], 0, 100)
-            if cpu_input1_tensor.dtype == torch.float16:
-                cpu_input1_tensor = cpu_input1_tensor.to(torch.float32)
-
-            if type(item[1]) == list:
-                cpu_input2_tensor, npu_input2_tensor = create_common_tensor(item[1], 0, 100)
-                if cpu_input2_tensor.dtype == torch.float16:
-                    cpu_input2_tensor = cpu_input2_tensor.to(torch.float32)
-            else:
-                cpu_input2_tensor = item[1]
-                npu_input2_tensor = item[1]
-
-            cpu_output_inp_tensor = self.cpu_op_exec_inp_tensor(cpu_input1_tensor, cpu_input2_tensor)
-            npu_output_inp_tensor = self.npu_op_exec_inp_tensor(npu_input1_tensor, npu_input2_tensor)
-            cpu_output_inp_tensor = cpu_output_inp_tensor.astype(npu_output_inp_tensor.dtype)
-            self.assertRtolEqual(cpu_output_inp_tensor, npu_output_inp_tensor)
-
-    def test_sub_scalar_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float32, i, [448]], np.random.uniform(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp32_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [1000, 1280]], np.random.uniform(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp32_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [32, 3, 3]], np.random.uniform(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp32_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [256, 480, 14, 14]], np.random.uniform(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_int32_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [448]], np.random.randint(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_int32_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [64, 7]], np.random.randint(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_int32_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [64, 7, 58]], np.random.randint(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_int32_4d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [256, 480, 14, 14]], np.random.randint(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float16, i, [448]], [np.float16, i, [448]]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_fp16_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [1000, 1280]], [np.float16, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_fp16_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [32, 3, 3]], [np.float16, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_fp16_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float16, i, [256, 480, 14, 14]], [np.float16, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_fp32_1d(self, device):
-        format_list = [-1, 0, 3]
-        shape_format = [[[np.float32, i, [448]], [np.float32, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_fp32_2d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [1000, 1280]], [np.float32, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_fp32_3d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [32, 3, 3]], [np.float32, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_fp32_4d(self, device):
-        format_list = [-1, 0, 3, 29]
-        shape_format = [[[np.float32, i, [256, 480, 14, 14]], [np.float32, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_int32_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [448]], [np.int32, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_int32_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [64, 7]], [np.int32, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_int32_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [64, 7, 58]], [np.int32, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_shape_format_int32_4d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[[np.int32, i, [256, 480, 14, 14]], [np.int32, i, []]] for i in format_list]
-        self.sub_result(shape_format)
-'''
-    # unsupport
-    def test_sub_scalar_shape_format_fp16_1d(self, device):
-        format_list = [-1, 0, 3, 4]
-        shape_format = [[[np.float16, i, [448]], np.random.uniform(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp16_2d(self, device):
-        format_list = [-1, 0, 3, 4, 29]
-        shape_format = [[[np.float16, i, [1000, 1280]], np.random.uniform(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp16_3d(self, device):
-        format_list = [-1, 0, 3, 4, 29]
-        shape_format = [[[np.float16, i, [32, 3, 3]], np.random.uniform(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-
-    def test_sub_scalar_shape_format_fp16_4d(self, device):
-        format_list = [-1, 0, 3, 4, 29]
-        shape_format = [[[np.float16, i, [256, 480, 14, 14]], np.random.uniform(0, 100)] for i in format_list]
-        self.sub_result(shape_format)
-'''
-instantiate_device_type_tests(TestSub, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestSub(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = input1 - input2
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = input1 - input2
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_t(self, input1, input2):
+        output = torch.sub(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_t_out(self, input1, input2, input3):
+        torch.sub(input1, input2, out=input3)
+        output = input3.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_tensor(self, input1, input2):
+        output = input1.sub(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_inp_tensor(self, input1, input2):
+        input1.sub_(input2)
+        output = input1.numpy()
+        return output
+
+    def npu_op_exec_inp_tensor(self, input1, input2):
+        input1.sub_(input2)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def sub_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            npu_input3 = copy.deepcopy(cpu_input1).to("npu")
+            npu_input4 = torch.randn(6).to("npu").to(npu_input3.dtype)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+
+            if type(item[1]) == list:
+                cpu_input2, npu_input2 = create_common_tensor(item[1], 0, 100)
+                if cpu_input2.dtype == torch.float16:
+                    cpu_input2 = cpu_input2.to(torch.float32)
+            else:
+                cpu_input2 = item[1]
+                npu_input2 = item[1]
+
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+
+            npu_output_t = self.npu_op_exec_t(npu_input1, npu_input2)
+            npu_output_t_out = self.npu_op_exec_t_out(npu_input1, npu_input2, npu_input3)
+            npu_output_tensor = self.npu_op_exec_tensor(npu_input1, npu_input2)
+            npu_output_t_out_chk = self.npu_op_exec_t_out(npu_input1, npu_input2, npu_input4)#out tensor shape not shape as self
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output_t)
+            self.assertRtolEqual(cpu_output, npu_output_t_out)
+            self.assertRtolEqual(cpu_output, npu_output_tensor)
+            self.assertRtolEqual(cpu_output, npu_output_t_out_chk)
+
+            # test for tensor
+            cpu_input1_tensor, npu_input1_tensor = create_common_tensor(item[0], 0, 100)
+            if cpu_input1_tensor.dtype == torch.float16:
+                cpu_input1_tensor = cpu_input1_tensor.to(torch.float32)
+
+            if type(item[1]) == list:
+                cpu_input2_tensor, npu_input2_tensor = create_common_tensor(item[1], 0, 100)
+                if cpu_input2_tensor.dtype == torch.float16:
+                    cpu_input2_tensor = cpu_input2_tensor.to(torch.float32)
+            else:
+                cpu_input2_tensor = item[1]
+                npu_input2_tensor = item[1]
+
+            cpu_output_inp_tensor = self.cpu_op_exec_inp_tensor(cpu_input1_tensor, cpu_input2_tensor)
+            npu_output_inp_tensor = self.npu_op_exec_inp_tensor(npu_input1_tensor, npu_input2_tensor)
+            cpu_output_inp_tensor = cpu_output_inp_tensor.astype(npu_output_inp_tensor.dtype)
+            self.assertRtolEqual(cpu_output_inp_tensor, npu_output_inp_tensor)
+
+    def test_sub_scalar_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float32, i, [448]], np.random.uniform(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [1000, 1280]], np.random.uniform(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [32, 3, 3]], np.random.uniform(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [256, 480, 14, 14]], np.random.uniform(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_int32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [448]], np.random.randint(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_int32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [64, 7]], np.random.randint(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_int32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [64, 7, 58]], np.random.randint(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_int32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [256, 480, 14, 14]], np.random.randint(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float16, i, [448]], [np.float16, i, [448]]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [1000, 1280]], [np.float16, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3]], [np.float16, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float16, i, [256, 480, 14, 14]], [np.float16, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_fp32_1d(self, device):
+        format_list = [-1, 0, 3]
+        shape_format = [[[np.float32, i, [448]], [np.float32, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_fp32_2d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [1000, 1280]], [np.float32, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_fp32_3d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [32, 3, 3]], [np.float32, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_fp32_4d(self, device):
+        format_list = [-1, 0, 3, 29]
+        shape_format = [[[np.float32, i, [256, 480, 14, 14]], [np.float32, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_int32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [448]], [np.int32, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_int32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [64, 7]], [np.int32, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_int32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [64, 7, 58]], [np.int32, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_shape_format_int32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[[np.int32, i, [256, 480, 14, 14]], [np.int32, i, []]] for i in format_list]
+        self.sub_result(shape_format)
+'''
+    # unsupport
+    def test_sub_scalar_shape_format_fp16_1d(self, device):
+        format_list = [-1, 0, 3, 4]
+        shape_format = [[[np.float16, i, [448]], np.random.uniform(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp16_2d(self, device):
+        format_list = [-1, 0, 3, 4, 29]
+        shape_format = [[[np.float16, i, [1000, 1280]], np.random.uniform(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp16_3d(self, device):
+        format_list = [-1, 0, 3, 4, 29]
+        shape_format = [[[np.float16, i, [32, 3, 3]], np.random.uniform(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+
+    def test_sub_scalar_shape_format_fp16_4d(self, device):
+        format_list = [-1, 0, 3, 4, 29]
+        shape_format = [[[np.float16, i, [256, 480, 14, 14]], np.random.uniform(0, 100)] for i in format_list]
+        self.sub_result(shape_format)
+'''
+instantiate_device_type_tests(TestSub, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_sub_sample.py b/test/test_npu/test_network_ops/test_sub_sample.py
index fa71597d1a0d784c22fe57be6b1f96662ab3c51e..197de22ace0103534e426807bcba43fa6b03c4d5 100644
--- a/test/test_npu/test_network_ops/test_sub_sample.py
+++ b/test/test_npu/test_network_ops/test_sub_sample.py
@@ -1,75 +1,75 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import torch
-import numpy as np
-import torch.nn as nn
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestSubSample(TestCase):
-    def get_num(self, input1, output):
-        input_num1 = 0
-        input_num0 = 0
-        output_num1 = 0
-        output_num0 = 0
-        for i in range(input1.size()[0]):
-            if input1[i] == 1:
-                input_num1 = input_num1 + 1
-            if input1[i] == 0:
-                input_num0 = input_num0 + 1
-        for i in range(output.size()[0]):
-            if output[i] == 1:
-                output_num1 = output_num1 + 1
-            if output[i] == 0:
-                output_num0 = output_num0 + 1
-        return input_num1, input_num0, output_num1, output_num0
-    
-    def numless_equal(self, input_num1, input_num0, output_num1, output_num0, size, fraction):
-        error_name = "result error"
-        if input_num1 < size * fraction:
-            if output_num1 != input_num1:
-                self.fail(error_name)
-            if input_num0 < size - input_num1 and output_num0 != input_num0:
-                self.fail(error_name)
-            if input_num0 >= size - input_num1 and output_num0 != size - input_num1:
-                self.fail(error_name) 
-    
-    def nummore_equal(self, input_num1, input_num0, output_num1, output_num0, size, fraction):
-        error_name = "result error"
-        if input_num1 >=size * fraction :
-            if output_num1 != size * fraction:
-                self.fail(error_name)
-            if input_num0 < size - size * fraction and output_num0 != input_num0:
-                self.fail(error_name)
-            if input_num0 >= size - size * fraction and output_num0 != size - size * fraction:
-                self.fail(error_name)     
-
-    def test_subsample(self, device):
-        for _ in range(20):
-            input1 = np.random.randint(-1,2,size = (10))
-            npu_input = torch.from_numpy(input1).to("npu")
-            #input only suport int32
-            npu_input = npu_input.to(torch.int32)     
-            npu_output1 = torch.npu_sub_sample(npu_input,5,0.6)
-            input_num1, input_num0, output_num1, output_num0 = self.get_num(npu_input, npu_output1)
-            self.numless_equal(input_num1, input_num0, output_num1, output_num0,5,0.6)
-            self.nummore_equal(input_num1, input_num0, output_num1, output_num0,5,0.6)
-            
-
-instantiate_device_type_tests(TestSubSample, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+import numpy as np
+import torch.nn as nn
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestSubSample(TestCase):
+    def get_num(self, input1, output):
+        input_num1 = 0
+        input_num0 = 0
+        output_num1 = 0
+        output_num0 = 0
+        for i in range(input1.size()[0]):
+            if input1[i] == 1:
+                input_num1 = input_num1 + 1
+            if input1[i] == 0:
+                input_num0 = input_num0 + 1
+        for i in range(output.size()[0]):
+            if output[i] == 1:
+                output_num1 = output_num1 + 1
+            if output[i] == 0:
+                output_num0 = output_num0 + 1
+        return input_num1, input_num0, output_num1, output_num0
+    
+    def numless_equal(self, input_num1, input_num0, output_num1, output_num0, size, fraction):
+        error_name = "result error"
+        if input_num1 < size * fraction:
+            if output_num1 != input_num1:
+                self.fail(error_name)
+            if input_num0 < size - input_num1 and output_num0 != input_num0:
+                self.fail(error_name)
+            if input_num0 >= size - input_num1 and output_num0 != size - input_num1:
+                self.fail(error_name) 
+    
+    def nummore_equal(self, input_num1, input_num0, output_num1, output_num0, size, fraction):
+        error_name = "result error"
+        if input_num1 >=size * fraction :
+            if output_num1 != size * fraction:
+                self.fail(error_name)
+            if input_num0 < size - size * fraction and output_num0 != input_num0:
+                self.fail(error_name)
+            if input_num0 >= size - size * fraction and output_num0 != size - size * fraction:
+                self.fail(error_name)     
+
+    def test_subsample(self, device):
+        for _ in range(20):
+            input1 = np.random.randint(-1,2,size = (10))
+            npu_input = torch.from_numpy(input1).to("npu")
+            #input only suport int32
+            npu_input = npu_input.to(torch.int32)     
+            npu_output1 = torch.npu_sub_sample(npu_input,5,0.6)
+            input_num1, input_num0, output_num1, output_num0 = self.get_num(npu_input, npu_output1)
+            self.numless_equal(input_num1, input_num0, output_num1, output_num0,5,0.6)
+            self.nummore_equal(input_num1, input_num0, output_num1, output_num0,5,0.6)
+            
+
+instantiate_device_type_tests(TestSubSample, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_sum.py b/test/test_npu/test_network_ops/test_sum.py
old mode 100644
new mode 100755
index b513ce3a702014f9889c90bf2bd5086d9c735cdc..9b1310e89e2783b177567e45e0ccb05192f11fd5
--- a/test/test_npu/test_network_ops/test_sum.py
+++ b/test/test_npu/test_network_ops/test_sum.py
@@ -1,219 +1,219 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append('..')
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestSum(TestCase):
-    def cpu_op_exec(self, input1):
-        output = input1.sum()
-        output = output.numpy()
-        return output
-                  
-    def npu_op_exec(self, input1):
-        output = input1.sum()
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-        
-    def cpu_op_exec_dim(self, input1, dim, dtype):
-        output = torch.sum(input1, dim, keepdim=True, dtype=dtype)
-        output = output.numpy()
-        return output
-                  
-    def npu_op_exec_dim(self, input1, dim, dtype):
-        output = torch.sum(input1, dim, keepdim=True, dtype=dtype)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output 
-
-    def cpu_op_dim_exec_out(self, input1, dim, keepdim):
-        out = torch.tensor(0).to(input1.dtype)
-        torch.sum(input1, dim=dim, keepdim=keepdim, out=out)
-        out = out.numpy()
-        return out
-
-    def npu_op_dim_exec_out(self, input1, dim, keepdim):
-        out = torch.tensor(0).to(input1.dtype).npu()
-        torch.sum(input1, dim=dim, keepdim=keepdim, out=out)
-        out = out.to("cpu").numpy()
-        output = torch.sum(input1, dim=dim, keepdim=keepdim)
-        output = output.to("cpu").numpy()
-        return out, output
-   
-    def sum_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, -1, 1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            
-    def sum_dim_result(self, shape_format):
-        for item in shape_format:
-            dim = np.random.randint(0, len(item[2]))
-            cpu_input1, npu_input1 = create_common_tensor(item, -1, 1)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-
-            cpu_out_dim = self.cpu_op_dim_exec_out(cpu_input1, dim=[0], keepdim=True)
-            npu_out_dim, npu_output_dim = self.npu_op_dim_exec_out(npu_input1, dim=[0], keepdim=True)
-            cpu_out_dim = cpu_out_dim.astype(npu_out_dim.dtype)
-            if npu_out_dim.dtype != np.float16:
-                self.assertRtolEqual(npu_out_dim, cpu_out_dim)
-            else:
-                self.assertRtolEqual(npu_out_dim, npu_output_dim)
-                                
-            cpu_output_dim = self.cpu_op_exec_dim(cpu_input1, dim, cpu_input1.dtype)
-            npu_output_dim = self.npu_op_exec_dim(npu_input1, dim, npu_input1.dtype)
-            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
-            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
-            
-    def test_sum_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [18]] for i in format_list 
-        ]
-        self.sum_result(shape_format)
-        
-    def test_sum_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [18]] for i in format_list 
-        ]
-        self.sum_result(shape_format)
-        
-    def test_sum_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [256, 1000]] for i in format_list 
-        ]
-        self.sum_result(shape_format)
-        
-    def test_sum_shape_format_fp32_2d(self, device):
-        format_list = [0, 3,  29]
-        shape_format = [
-            [np.float32, i, [256, 1000]] for i in format_list 
-        ]
-        self.sum_result(shape_format)
-        
-    def test_sum_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [
-            [np.float16, i, [32, 48, 64]] for i in format_list 
-        ]
-        self.sum_result(shape_format)
-        
-    def test_sum_shape_format_fp32_3d(self, device):
-        format_list = [0, 3,  29]
-        shape_format = [
-            [np.float32, i, [32, 48, 64]] for i in format_list 
-        ]
-        self.sum_result(shape_format)
-        
-    def test_sum_shape_format_fp16_4d(self, device):
-        format_list = [0,  4, 29]
-        shape_format = [
-            [np.float16, i, [32, 24, 18, 18]] for i in format_list 
-        ]
-        self.sum_result(shape_format)
-        
-    def test_sum_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [
-            [np.float32, i, [32, 24, 18, 18]] for i in format_list 
-        ]
-        self.sum_result(shape_format)
-        
-        # --------sum dim---------------------
-        
-    def test_sum_dim_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [18]] for i in format_list 
-        ]
-        self.sum_dim_result(shape_format)
-        
-    def test_sum_dim_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float32, i, [18]] for i in format_list 
-        ]
-        self.sum_dim_result(shape_format)
-        
-    def test_sum_dim_shape_format_fp16_2d(self, device):
-        format_list = [0, 3,  29]
-        shape_format = [
-            [np.float16, i, [256, 1000]] for i in format_list 
-        ]
-        self.sum_dim_result(shape_format)
-        
-    def test_sum_dim_shape_format_fp32_2d(self, device):
-        format_list = [0, 3,  29]
-        shape_format = [
-            [np.float32, i, [256, 1000]] for i in format_list 
-        ]
-        self.sum_dim_result(shape_format)
-        
-    def test_sum_dim_shape_format_fp16_3d(self, device):
-        # TODO(ascend): Insufficient precision
-        #format=29精度不满足 format_list = [0, 3,  29]
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [32, 48, 64]] for i in format_list 
-        ]
-        self.sum_dim_result(shape_format)
-        
-    def test_sum_dim_shape_format_fp32_3d(self, device):
-        format_list = [0, 3,  29]
-        shape_format = [
-            [np.float32, i, [32, 48, 64]] for i in format_list 
-        ]
-        self.sum_dim_result(shape_format)
-        
-    def test_sum_dim_shape_format_fp16_4d(self, device):
-        format_list = [0, 3]
-        shape_format = [
-            [np.float16, i, [16, 16, 9, 9]] for i in format_list 
-        ]
-        self.sum_dim_result(shape_format)
-        
-    def test_sum_dim_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 4]
-        shape_format = [
-            [np.float32, i, [32, 24, 18, 18]] for i in format_list 
-        ]
-        self.sum_dim_result(shape_format)
-
-    def test_sum_dim_with_zero_shape_format(self, device):
-        format_list = [0, 3, 4]
-        shape_format = [
-            [np.float32, i, [2, 0, 3]] for i in format_list 
-        ]
-        self.sum_dim_result(shape_format)
-        self.sum_result(shape_format)
-
-
-instantiate_device_type_tests(TestSum, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+sys.path.append('..')
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestSum(TestCase):
+    def cpu_op_exec(self, input1):
+        output = input1.sum()
+        output = output.numpy()
+        return output
+                  
+    def npu_op_exec(self, input1):
+        output = input1.sum()
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+        
+    def cpu_op_exec_dim(self, input1, dim, dtype):
+        output = torch.sum(input1, dim, keepdim=True, dtype=dtype)
+        output = output.numpy()
+        return output
+                  
+    def npu_op_exec_dim(self, input1, dim, dtype):
+        output = torch.sum(input1, dim, keepdim=True, dtype=dtype)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output 
+
+    def cpu_op_dim_exec_out(self, input1, dim, keepdim):
+        out = torch.tensor(0).to(input1.dtype)
+        torch.sum(input1, dim=dim, keepdim=keepdim, out=out)
+        out = out.numpy()
+        return out
+
+    def npu_op_dim_exec_out(self, input1, dim, keepdim):
+        out = torch.tensor(0).to(input1.dtype).npu()
+        torch.sum(input1, dim=dim, keepdim=keepdim, out=out)
+        out = out.to("cpu").numpy()
+        output = torch.sum(input1, dim=dim, keepdim=keepdim)
+        output = output.to("cpu").numpy()
+        return out, output
+   
+    def sum_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, -1, 1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+            
+    def sum_dim_result(self, shape_format):
+        for item in shape_format:
+            dim = np.random.randint(0, len(item[2]))
+            cpu_input1, npu_input1 = create_common_tensor(item, -1, 1)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+
+            cpu_out_dim = self.cpu_op_dim_exec_out(cpu_input1, dim=[0], keepdim=True)
+            npu_out_dim, npu_output_dim = self.npu_op_dim_exec_out(npu_input1, dim=[0], keepdim=True)
+            cpu_out_dim = cpu_out_dim.astype(npu_out_dim.dtype)
+            if npu_out_dim.dtype != np.float16:
+                self.assertRtolEqual(npu_out_dim, cpu_out_dim)
+            else:
+                self.assertRtolEqual(npu_out_dim, npu_output_dim)
+                                
+            cpu_output_dim = self.cpu_op_exec_dim(cpu_input1, dim, cpu_input1.dtype)
+            npu_output_dim = self.npu_op_exec_dim(npu_input1, dim, npu_input1.dtype)
+            cpu_output_dim = cpu_output_dim.astype(npu_output_dim.dtype)
+            self.assertRtolEqual(cpu_output_dim, npu_output_dim)
+            
+    def test_sum_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [18]] for i in format_list 
+        ]
+        self.sum_result(shape_format)
+        
+    def test_sum_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [18]] for i in format_list 
+        ]
+        self.sum_result(shape_format)
+        
+    def test_sum_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [256, 1000]] for i in format_list 
+        ]
+        self.sum_result(shape_format)
+        
+    def test_sum_shape_format_fp32_2d(self, device):
+        format_list = [0, 3,  29]
+        shape_format = [
+            [np.float32, i, [256, 1000]] for i in format_list 
+        ]
+        self.sum_result(shape_format)
+        
+    def test_sum_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [
+            [np.float16, i, [32, 48, 64]] for i in format_list 
+        ]
+        self.sum_result(shape_format)
+        
+    def test_sum_shape_format_fp32_3d(self, device):
+        format_list = [0, 3,  29]
+        shape_format = [
+            [np.float32, i, [32, 48, 64]] for i in format_list 
+        ]
+        self.sum_result(shape_format)
+        
+    def test_sum_shape_format_fp16_4d(self, device):
+        format_list = [0,  4, 29]
+        shape_format = [
+            [np.float16, i, [32, 24, 18, 18]] for i in format_list 
+        ]
+        self.sum_result(shape_format)
+        
+    def test_sum_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [
+            [np.float32, i, [32, 24, 18, 18]] for i in format_list 
+        ]
+        self.sum_result(shape_format)
+        
+        # --------sum dim---------------------
+        
+    def test_sum_dim_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [18]] for i in format_list 
+        ]
+        self.sum_dim_result(shape_format)
+        
+    def test_sum_dim_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float32, i, [18]] for i in format_list 
+        ]
+        self.sum_dim_result(shape_format)
+        
+    def test_sum_dim_shape_format_fp16_2d(self, device):
+        format_list = [0, 3,  29]
+        shape_format = [
+            [np.float16, i, [256, 1000]] for i in format_list 
+        ]
+        self.sum_dim_result(shape_format)
+        
+    def test_sum_dim_shape_format_fp32_2d(self, device):
+        format_list = [0, 3,  29]
+        shape_format = [
+            [np.float32, i, [256, 1000]] for i in format_list 
+        ]
+        self.sum_dim_result(shape_format)
+        
+    def test_sum_dim_shape_format_fp16_3d(self, device):
+        # TODO(ascend): Insufficient precision
+        #format=29精度不满足 format_list = [0, 3,  29]
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [32, 48, 64]] for i in format_list 
+        ]
+        self.sum_dim_result(shape_format)
+        
+    def test_sum_dim_shape_format_fp32_3d(self, device):
+        format_list = [0, 3,  29]
+        shape_format = [
+            [np.float32, i, [32, 48, 64]] for i in format_list 
+        ]
+        self.sum_dim_result(shape_format)
+        
+    def test_sum_dim_shape_format_fp16_4d(self, device):
+        format_list = [0, 3]
+        shape_format = [
+            [np.float16, i, [16, 16, 9, 9]] for i in format_list 
+        ]
+        self.sum_dim_result(shape_format)
+        
+    def test_sum_dim_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 4]
+        shape_format = [
+            [np.float32, i, [32, 24, 18, 18]] for i in format_list 
+        ]
+        self.sum_dim_result(shape_format)
+
+    def test_sum_dim_with_zero_shape_format(self, device):
+        format_list = [0, 3, 4]
+        shape_format = [
+            [np.float32, i, [2, 0, 3]] for i in format_list 
+        ]
+        self.sum_dim_result(shape_format)
+        self.sum_result(shape_format)
+
+
+instantiate_device_type_tests(TestSum, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_tensor_equal.py b/test/test_npu/test_network_ops/test_tensor_equal.py
index 830c6ed93ff5b6cb51ff46647a78c54d1c314979..d43db04421d4c98794c7f0744f93633070b60b24 100644
--- a/test/test_npu/test_network_ops/test_tensor_equal.py
+++ b/test/test_npu/test_network_ops/test_tensor_equal.py
@@ -1,76 +1,76 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestEqual(TestCase):
-    def cpu_op_exec(self, input1, input2):
-        output = torch.equal(input1, input2)
-        output = np.array(output, dtype=np.int32)
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        output = torch.equal(input1, input2)
-        output = np.array(output, dtype=np.int32)
-        return output
-
-    def test_equal_shape_format_fp16(self, device):
-        shape_format = [
-            [[np.float16, 0, [5]],          [np.float16, 0, [5]]],
-            [[np.float16, 0, [2, 4]],       [np.float16, 0, [2, 4, 4]]],
-            [[np.float16, 0, [2, 2, 4]],    [np.float16, 0, [2, 3, 4]]],
-            [[np.float16, 0, [2, 3, 3, 4]], [np.float16, 0, [2, 3, 3, 4]]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
-
-            cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_input2 = cpu_input2.to(torch.float32)
-
-            cpu_output1 = self.cpu_op_exec(cpu_input1, cpu_input1)
-            npu_output1 = self.npu_op_exec(npu_input1, npu_input1)
-            self.assertRtolEqual(cpu_output1, npu_output1)
-
-            cpu_output0 = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output0 = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output0, npu_output0)
-
-    def test_equal_shape_format_fp32(self, device):
-        shape_format = [
-            [[np.float32, 0, [5]],          [np.float32, 0, [5]]],
-            [[np.float32, 0, [2, 4]],       [np.float32, 0, [2, 4, 4]]],
-            [[np.float32, 0, [2, 2, 4]],    [np.float32, 0, [2, 3, 4]]],
-            [[np.float32, 0, [2, 3, 3, 4]], [np.float32, 0, [2, 3, 3, 4]]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
-
-            cpu_output1 = self.cpu_op_exec(cpu_input1, cpu_input1)
-            npu_output1 = self.npu_op_exec(npu_input1, npu_input1)
-            self.assertRtolEqual(cpu_output1, npu_output1)
-
-            cpu_output0 = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output0 = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output0, npu_output0)
-
-
-instantiate_device_type_tests(TestEqual, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestEqual(TestCase):
+    def cpu_op_exec(self, input1, input2):
+        output = torch.equal(input1, input2)
+        output = np.array(output, dtype=np.int32)
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        output = torch.equal(input1, input2)
+        output = np.array(output, dtype=np.int32)
+        return output
+
+    def test_equal_shape_format_fp16(self, device):
+        shape_format = [
+            [[np.float16, 0, [5]],          [np.float16, 0, [5]]],
+            [[np.float16, 0, [2, 4]],       [np.float16, 0, [2, 4, 4]]],
+            [[np.float16, 0, [2, 2, 4]],    [np.float16, 0, [2, 3, 4]]],
+            [[np.float16, 0, [2, 3, 3, 4]], [np.float16, 0, [2, 3, 3, 4]]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
+
+            cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_input2 = cpu_input2.to(torch.float32)
+
+            cpu_output1 = self.cpu_op_exec(cpu_input1, cpu_input1)
+            npu_output1 = self.npu_op_exec(npu_input1, npu_input1)
+            self.assertRtolEqual(cpu_output1, npu_output1)
+
+            cpu_output0 = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output0 = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output0, npu_output0)
+
+    def test_equal_shape_format_fp32(self, device):
+        shape_format = [
+            [[np.float32, 0, [5]],          [np.float32, 0, [5]]],
+            [[np.float32, 0, [2, 4]],       [np.float32, 0, [2, 4, 4]]],
+            [[np.float32, 0, [2, 2, 4]],    [np.float32, 0, [2, 3, 4]]],
+            [[np.float32, 0, [2, 3, 3, 4]], [np.float32, 0, [2, 3, 3, 4]]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], -100, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], -100, 100)
+
+            cpu_output1 = self.cpu_op_exec(cpu_input1, cpu_input1)
+            npu_output1 = self.npu_op_exec(npu_input1, npu_input1)
+            self.assertRtolEqual(cpu_output1, npu_output1)
+
+            cpu_output0 = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output0 = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output0, npu_output0)
+
+
+instantiate_device_type_tests(TestEqual, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_tensor_npu.py b/test/test_npu/test_network_ops/test_tensor_npu.py
index e0fdf11d358bf61bc29cd61188829017c8155b7f..c679316e5083b281b8f86fec51d15a795c9b7988 100644
--- a/test/test_npu/test_network_ops/test_tensor_npu.py
+++ b/test/test_npu/test_network_ops/test_tensor_npu.py
@@ -1,75 +1,75 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestTensorNpu(TestCase):
-    
-    def cpu_op_exec(self, input):
-        output = input.to("cpu")
-        return output
-
-    def npu_op_exec(self, input):
-        output = input.npu()
-        output = output.to("cpu")
-        return output
-
-    def cpu_type_exec(self, input):
-        output = input.to("cpu")
-        output = output.is_npu
-        return output
-
-    def npu_type_exec(self, input):
-        output = input.npu()
-        output = output.is_npu
-        return output
-
-    def test_tensor_npu_shape_format(self, device):
-        shape_format = [
-                [np.float32, 0, 1],
-                [np.float32, 0, (64, 10)],
-                [np.float32, 3, (256, 2048, 7, 7)],
-                [np.float32, 4, (32, 1, 3, 3)],
-                [np.float32, 29, (10, 128)]
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output.cpu())
-
-    def test_is_npu_shape_format(self, device):
-        shape_format = [
-                [np.float32, 0, 1],
-                [np.float32, 0, (64, 10)],
-                [np.float32, 3, (256, 2048, 7, 7)],
-                [np.float32, 4, (32, 1, 3, 3)],
-                [np.float32, 29, (10, 128)]
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 100)
-            cpu_output = self.cpu_type_exec(cpu_input)
-            npu_output = self.npu_type_exec(npu_input)
-            self.assertEqual(cpu_output, False)
-            self.assertEqual(npu_output, True)
-
-instantiate_device_type_tests(TestTensorNpu, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestTensorNpu(TestCase):
+    
+    def cpu_op_exec(self, input):
+        output = input.to("cpu")
+        return output
+
+    def npu_op_exec(self, input):
+        output = input.npu()
+        output = output.to("cpu")
+        return output
+
+    def cpu_type_exec(self, input):
+        output = input.to("cpu")
+        output = output.is_npu
+        return output
+
+    def npu_type_exec(self, input):
+        output = input.npu()
+        output = output.is_npu
+        return output
+
+    def test_tensor_npu_shape_format(self, device):
+        shape_format = [
+                [np.float32, 0, 1],
+                [np.float32, 0, (64, 10)],
+                [np.float32, 3, (256, 2048, 7, 7)],
+                [np.float32, 4, (32, 1, 3, 3)],
+                [np.float32, 29, (10, 128)]
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output.cpu())
+
+    def test_is_npu_shape_format(self, device):
+        shape_format = [
+                [np.float32, 0, 1],
+                [np.float32, 0, (64, 10)],
+                [np.float32, 3, (256, 2048, 7, 7)],
+                [np.float32, 4, (32, 1, 3, 3)],
+                [np.float32, 29, (10, 128)]
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 100)
+            cpu_output = self.cpu_type_exec(cpu_input)
+            npu_output = self.npu_type_exec(npu_input)
+            self.assertEqual(cpu_output, False)
+            self.assertEqual(npu_output, True)
+
+instantiate_device_type_tests(TestTensorNpu, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_threshold.py b/test/test_npu/test_network_ops/test_threshold.py
index e773cbaa4db06942d81e45126ce654c43bb7e379..5289993f086acbb7cf2f3339c55c944b98bc4dd5 100644
--- a/test/test_npu/test_network_ops/test_threshold.py
+++ b/test/test_npu/test_network_ops/test_threshold.py
@@ -1,75 +1,75 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestThreshold(TestCase):
-
-    def cpu_op_exec(self,input1, threshold, value):
-        output = torch.nn.functional.threshold(input1, threshold, value)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self,input1, threshold, value):
-        output = torch.nn.functional.threshold(input1, threshold, value)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_threshold_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32, 0, (1,5)], [1.0], [20.0]],
-                [[np.int32, 0, (1,5)], [2], [20]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 3)
-            cpu_threshold = npu_threshold = item[1][0]
-            cpu_value = npu_value = item[2][0]
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_threshold, cpu_value)
-            npu_output = self.npu_op_exec(npu_input1, npu_threshold, npu_value)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_threshold_inplace_common_shape_format(self, device):
-        def cpu_op_inplace_exec(input1, threshold, value):
-            torch.nn.functional.threshold_(input1, threshold, value)
-            output = input1.numpy()
-            return output
-
-        def npu_op_inplace_exec(input1, threshold, value):
-            torch.nn.functional.threshold_(input1, threshold, value)
-            output = input1.to("cpu")
-            output = output.numpy()
-            return output
-
-        shape_format = [
-                [[np.float32, 0, (1,5)], [1.0], [20.0]],
-                [[np.int32, 0, (1,5)], [2], [20]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 3)
-            cpu_threshold = npu_threshold = item[1][0]
-            cpu_value = npu_value = item[2][0]
-            cpu_output = cpu_op_inplace_exec(cpu_input1, cpu_threshold, cpu_value)
-            npu_output = npu_op_inplace_exec(npu_input1, npu_threshold, npu_value)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestThreshold, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestThreshold(TestCase):
+
+    def cpu_op_exec(self,input1, threshold, value):
+        output = torch.nn.functional.threshold(input1, threshold, value)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self,input1, threshold, value):
+        output = torch.nn.functional.threshold(input1, threshold, value)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_threshold_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32, 0, (1,5)], [1.0], [20.0]],
+                [[np.int32, 0, (1,5)], [2], [20]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 3)
+            cpu_threshold = npu_threshold = item[1][0]
+            cpu_value = npu_value = item[2][0]
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_threshold, cpu_value)
+            npu_output = self.npu_op_exec(npu_input1, npu_threshold, npu_value)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_threshold_inplace_common_shape_format(self, device):
+        def cpu_op_inplace_exec(input1, threshold, value):
+            torch.nn.functional.threshold_(input1, threshold, value)
+            output = input1.numpy()
+            return output
+
+        def npu_op_inplace_exec(input1, threshold, value):
+            torch.nn.functional.threshold_(input1, threshold, value)
+            output = input1.to("cpu")
+            output = output.numpy()
+            return output
+
+        shape_format = [
+                [[np.float32, 0, (1,5)], [1.0], [20.0]],
+                [[np.int32, 0, (1,5)], [2], [20]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 3)
+            cpu_threshold = npu_threshold = item[1][0]
+            cpu_value = npu_value = item[2][0]
+            cpu_output = cpu_op_inplace_exec(cpu_input1, cpu_threshold, cpu_value)
+            npu_output = npu_op_inplace_exec(npu_input1, npu_threshold, npu_value)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestThreshold, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_threshold_backward.py b/test/test_npu/test_network_ops/test_threshold_backward.py
index d191f8e9ce9d970954c409e80286623bae22b219..3f66ad31a1524da22f1c06fa86d8073c64856673 100644
--- a/test/test_npu/test_network_ops/test_threshold_backward.py
+++ b/test/test_npu/test_network_ops/test_threshold_backward.py
@@ -1,68 +1,68 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestThresholdBackward(TestCase):
-
-    def cpu_op_exec(self, input1, threshold, value):
-        input1.requires_grad_()
-        output = torch.nn.functional.threshold(input1, threshold, value)
-        w = torch.ones_like(output)
-        output.backward(w)
-        out = input1.grad
-        output = output.detach()
-        return output.numpy(), out.numpy()
-
-    def npu_op_exec(self, input1, threshold, value):
-        input1.requires_grad_()
-        output = torch.nn.functional.threshold(input1, threshold, value)
-        w = torch.ones_like(output)
-        output.backward(w)
-        out = input1.grad.to("cpu")
-        output = output.detach().to("cpu")
-        return output.numpy(), out.numpy()
-
-    def test_threshold_backward_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32, 0, (1,5)], [1.0], [20.0]],
-                [[np.float32, 0, (2,3,5)], [2.0], [20.0]],
-                [[np.float32, 0, (2,3,4,5)], [0], [0]],
-                [[np.float32, 3, (1,5)], [1.0], [20.0]],
-                [[np.float32, 3, (2,3,5)], [2.0], [20.0]],
-                [[np.float32, 3, (2,3,4,5)], [0], [0]],
-                [[np.float16, 0, (1,5)], [1.0], [20.0]],
-                [[np.float16, 0, (2,3,5)], [2.0], [20.0]],
-                [[np.float16, 3, (2,3,4,5)], [0], [0]],
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 3)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-            cpu_threshold = npu_threshold = item[1][0]
-            cpu_value = npu_value = item[2][0]
-            cpu_output1, cpu_output2 = self.cpu_op_exec(cpu_input1, cpu_threshold, cpu_value)
-            npu_output1, npu_output2 = self.npu_op_exec(npu_input1, npu_threshold, npu_value)
-            self.assertRtolEqual(npu_output1.astype(np.float32), cpu_output1)
-            self.assertRtolEqual(npu_output2.astype(np.float32), cpu_output2)
-
-instantiate_device_type_tests(TestThresholdBackward, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestThresholdBackward(TestCase):
+
+    def cpu_op_exec(self, input1, threshold, value):
+        input1.requires_grad_()
+        output = torch.nn.functional.threshold(input1, threshold, value)
+        w = torch.ones_like(output)
+        output.backward(w)
+        out = input1.grad
+        output = output.detach()
+        return output.numpy(), out.numpy()
+
+    def npu_op_exec(self, input1, threshold, value):
+        input1.requires_grad_()
+        output = torch.nn.functional.threshold(input1, threshold, value)
+        w = torch.ones_like(output)
+        output.backward(w)
+        out = input1.grad.to("cpu")
+        output = output.detach().to("cpu")
+        return output.numpy(), out.numpy()
+
+    def test_threshold_backward_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32, 0, (1,5)], [1.0], [20.0]],
+                [[np.float32, 0, (2,3,5)], [2.0], [20.0]],
+                [[np.float32, 0, (2,3,4,5)], [0], [0]],
+                [[np.float32, 3, (1,5)], [1.0], [20.0]],
+                [[np.float32, 3, (2,3,5)], [2.0], [20.0]],
+                [[np.float32, 3, (2,3,4,5)], [0], [0]],
+                [[np.float16, 0, (1,5)], [1.0], [20.0]],
+                [[np.float16, 0, (2,3,5)], [2.0], [20.0]],
+                [[np.float16, 3, (2,3,4,5)], [0], [0]],
+        ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 3)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+            cpu_threshold = npu_threshold = item[1][0]
+            cpu_value = npu_value = item[2][0]
+            cpu_output1, cpu_output2 = self.cpu_op_exec(cpu_input1, cpu_threshold, cpu_value)
+            npu_output1, npu_output2 = self.npu_op_exec(npu_input1, npu_threshold, npu_value)
+            self.assertRtolEqual(npu_output1.astype(np.float32), cpu_output1)
+            self.assertRtolEqual(npu_output2.astype(np.float32), cpu_output2)
+
+instantiate_device_type_tests(TestThresholdBackward, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_to.py b/test/test_npu/test_network_ops/test_to.py
index 1aca41a22203d327a2117dc57658e393817c60e5..b5d16df049d63db06d4bf37a1e6ef2d9350caecc 100644
--- a/test/test_npu/test_network_ops/test_to.py
+++ b/test/test_npu/test_network_ops/test_to.py
@@ -1,55 +1,55 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestTo(TestCase):
-    def cpu_op_exec(self, input1, target):
-        output = input1.to(target)
-        output = output.cpu().numpy()
-        return output
-
-    def npu_op_exec(self,input1, target):
-        output = input1.to(target)
-        output = output.cpu().numpy()
-        return output
-
-    def test_to(self, device):
-        shape_format = [
-                [np.float32, 0, [3, 3]],
-                [np.float16, 0, [4, 3]],
-                [np.int32, 0, [3, 5]],
-        ]
-       
-        targets = [torch.float16, torch.float32, torch.int32, 'cpu', 'npu']
-        for item in shape_format:
-            for target in targets:
-                cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
-                cpu_output = self.cpu_op_exec(cpu_input1, target)
-                npu_output = self.npu_op_exec(npu_input1, target)
-                self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestTo, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestTo(TestCase):
+    def cpu_op_exec(self, input1, target):
+        output = input1.to(target)
+        output = output.cpu().numpy()
+        return output
+
+    def npu_op_exec(self,input1, target):
+        output = input1.to(target)
+        output = output.cpu().numpy()
+        return output
+
+    def test_to(self, device):
+        shape_format = [
+                [np.float32, 0, [3, 3]],
+                [np.float16, 0, [4, 3]],
+                [np.int32, 0, [3, 5]],
+        ]
+       
+        targets = [torch.float16, torch.float32, torch.int32, 'cpu', 'npu']
+        for item in shape_format:
+            for target in targets:
+                cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
+                cpu_output = self.cpu_op_exec(cpu_input1, target)
+                npu_output = self.npu_op_exec(npu_input1, target)
+                self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestTo, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_tril.py b/test/test_npu/test_network_ops/test_tril.py
index 79134acaa833a965bf2edf94b06847d2f288049a..620623edb86505648e7d833b2e7d9770d990fc3c 100644
--- a/test/test_npu/test_network_ops/test_tril.py
+++ b/test/test_npu/test_network_ops/test_tril.py
@@ -1,77 +1,77 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestTril(TestCase):
-    def test_tril(self, device):
-        dtype_list = [np.float32, np.float16]
-        format_list = [0, 3, 4]
-        shape_list = [[5, 5],[4, 5, 6]]
-        diagonal_list = [-1, 0, 1]
-        shape_format = [
-            [i, j, k, l] for i in dtype_list for j in format_list for k in shape_list for l in diagonal_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[:-1], 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input, item[-1])
-            npu_output = self.npu_op_exec(npu_input, item[-1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_tril_inplace(self, device):
-        dtype_list = [np.float32, np.float16]
-        format_list = [0, 3, 4]
-        shape_list = [[5, 5], [4, 5, 6]]
-        diagonal_list = [-1, 0, 1]
-        shape_format = [
-            [i, j, k, l] for i in dtype_list for j in format_list for k in shape_list for l in diagonal_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[:-1], 0, 100)
-            cpu_output = self.cpu_op_inplace_exec(cpu_input, item[-1])
-            npu_output = self.npu_op_inplace_exec(npu_input, item[-1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def cpu_op_exec(self, input, diagonal=0):
-        output = torch.tril(input, diagonal)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input, diagonal=0):
-        output = torch.tril(input, diagonal)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec(self, input, diagonal=0):
-        output = input.tril_(diagonal)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_exec(self, input, diagonal=0):
-        output = input.tril_(diagonal)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-instantiate_device_type_tests(TestTril, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestTril(TestCase):
+    def test_tril(self, device):
+        dtype_list = [np.float32, np.float16]
+        format_list = [0, 3, 4]
+        shape_list = [[5, 5],[4, 5, 6]]
+        diagonal_list = [-1, 0, 1]
+        shape_format = [
+            [i, j, k, l] for i in dtype_list for j in format_list for k in shape_list for l in diagonal_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[:-1], 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, item[-1])
+            npu_output = self.npu_op_exec(npu_input, item[-1])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_tril_inplace(self, device):
+        dtype_list = [np.float32, np.float16]
+        format_list = [0, 3, 4]
+        shape_list = [[5, 5], [4, 5, 6]]
+        diagonal_list = [-1, 0, 1]
+        shape_format = [
+            [i, j, k, l] for i in dtype_list for j in format_list for k in shape_list for l in diagonal_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[:-1], 0, 100)
+            cpu_output = self.cpu_op_inplace_exec(cpu_input, item[-1])
+            npu_output = self.npu_op_inplace_exec(npu_input, item[-1])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def cpu_op_exec(self, input, diagonal=0):
+        output = torch.tril(input, diagonal)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input, diagonal=0):
+        output = torch.tril(input, diagonal)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec(self, input, diagonal=0):
+        output = input.tril_(diagonal)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec(self, input, diagonal=0):
+        output = input.tril_(diagonal)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+instantiate_device_type_tests(TestTril, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_triu.py b/test/test_npu/test_network_ops/test_triu.py
index 50cadc56ef55e6145dda9e9055e6ef2daa1cc5f1..c95dc291f53623439b7ad12d02afb9d94d3b9647 100644
--- a/test/test_npu/test_network_ops/test_triu.py
+++ b/test/test_npu/test_network_ops/test_triu.py
@@ -1,79 +1,79 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestTriu(TestCase):
-    def test_triu(self, device):
-        dtype_list = [np.float32, np.float16]
-        format_list = [0, 3]
-        shape_list = [[5, 5]]
-        shape_format = [
-            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            print(cpu_input)
-            print(npu_input)
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-            print(cpu_output)
-            print(npu_output)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_triu_inplace(self, device):
-        dtype_list = [np.float32, np.float16]
-        format_list = [0, 3]
-        shape_list = [[5, 5]]
-        shape_format = [
-            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_op_inplace_exec(cpu_input)
-            npu_output = self.npu_op_inplace_exec(npu_input)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def cpu_op_exec(self, input):
-        output = torch.triu(input, 1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input):
-        output = torch.triu(input, 1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_inplace_exec(self, input):
-        output = input.triu_(1)
-        output = output.numpy()
-        return output
-
-    def npu_op_inplace_exec(self, input):
-        output = input.triu_(1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-instantiate_device_type_tests(TestTriu, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestTriu(TestCase):
+    def test_triu(self, device):
+        dtype_list = [np.float32, np.float16]
+        format_list = [0, 3]
+        shape_list = [[5, 5]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            print(cpu_input)
+            print(npu_input)
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+            print(cpu_output)
+            print(npu_output)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_triu_inplace(self, device):
+        dtype_list = [np.float32, np.float16]
+        format_list = [0, 3]
+        shape_list = [[5, 5]]
+        shape_format = [
+            [i, j, k] for i in dtype_list for j in format_list for k in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_inplace_exec(cpu_input)
+            npu_output = self.npu_op_inplace_exec(npu_input)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def cpu_op_exec(self, input):
+        output = torch.triu(input, 1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input):
+        output = torch.triu(input, 1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_inplace_exec(self, input):
+        output = input.triu_(1)
+        output = output.numpy()
+        return output
+
+    def npu_op_inplace_exec(self, input):
+        output = input.triu_(1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+instantiate_device_type_tests(TestTriu, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_unbind.py b/test/test_npu/test_network_ops/test_unbind.py
index dbd889508fbf8688e1c855117a80bfd65f394eae..ff22e7acae8be4c5459b25bd3f6d5236cf8c0166 100644
--- a/test/test_npu/test_network_ops/test_unbind.py
+++ b/test/test_npu/test_network_ops/test_unbind.py
@@ -1,42 +1,42 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-from util_test import create_common_tensor
- 
-class TestUnbind(TestCase):
-    def test_unbind_common_shape_format(self, device):
-        def op_exec(input):
-            output = torch.unbind(input,0)
-            return output
-        
-        shape_format = [
-                [[np.float16, 0, (64, 10)]],
-                [[np.float32, 4, (32, 1, 3, 3)]],
-                [[np.float32, 29, (10, 128)]]
-        ]
-        for shape in shape_format:
-            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
-            cpu_output = op_exec(cpu_input)
-            npu_output = op_exec(npu_input)
-            self.assertRtolEqual(cpu_output[0], npu_output[0].cpu())
-            self.assertRtolEqual(cpu_output[-1], npu_output[-1].cpu())
-    
-
-instantiate_device_type_tests(TestUnbind, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+from util_test import create_common_tensor
+ 
+class TestUnbind(TestCase):
+    def test_unbind_common_shape_format(self, device):
+        def op_exec(input):
+            output = torch.unbind(input,0)
+            return output
+        
+        shape_format = [
+                [[np.float16, 0, (64, 10)]],
+                [[np.float32, 4, (32, 1, 3, 3)]],
+                [[np.float32, 29, (10, 128)]]
+        ]
+        for shape in shape_format:
+            cpu_input, npu_input = create_common_tensor(shape[0], -1, 1)
+            cpu_output = op_exec(cpu_input)
+            npu_output = op_exec(npu_input)
+            self.assertRtolEqual(cpu_output[0], npu_output[0].cpu())
+            self.assertRtolEqual(cpu_output[-1], npu_output[-1].cpu())
+    
+
+instantiate_device_type_tests(TestUnbind, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_uniform_.py b/test/test_npu/test_network_ops/test_uniform_.py
index 723e02791cc13839681087c11582644680734254..38ec32e098bafeda54e706f6d9b2fd265b21999d 100644
--- a/test/test_npu/test_network_ops/test_uniform_.py
+++ b/test/test_npu/test_network_ops/test_uniform_.py
@@ -1,46 +1,46 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-from common_utils import TestCase, run_tests
-from common_device_type import instantiate_device_type_tests
-
-class TestUniform(TestCase):
-    def test_uniform(self, device):
-        shape_format = [
-           [(20,300), -100, 100, torch.float32],
-           [(20,300), -100, 100, torch.float16]
-        ]
-
-        for item in shape_format:
-            input1 = torch.zeros(item[0], dtype=item[3]).npu()
-            input1.uniform_(item[1], item[2])
-            self.assertTrue(item[1] <= input1.min())
-            self.assertTrue(item[2] >= input1.max())
-    
-    def test_uniform_trans(self, device):
-        shape_format = [
-           [(20,300), -100, 100, torch.float32],
-        ]
-
-        for item in shape_format:
-            input1 = torch.zeros(item[0], dtype=item[3]).npu()
-            input1.npu_format_cast(3)
-            input1.uniform_(item[1], item[2])
-            self.assertTrue(item[1] <= input1.min())
-            self.assertTrue(item[2] >= input1.max())
-
-
-instantiate_device_type_tests(TestUniform, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from common_utils import TestCase, run_tests
+from common_device_type import instantiate_device_type_tests
+
+class TestUniform(TestCase):
+    def test_uniform(self, device):
+        shape_format = [
+           [(20,300), -100, 100, torch.float32],
+           [(20,300), -100, 100, torch.float16]
+        ]
+
+        for item in shape_format:
+            input1 = torch.zeros(item[0], dtype=item[3]).npu()
+            input1.uniform_(item[1], item[2])
+            self.assertTrue(item[1] <= input1.min())
+            self.assertTrue(item[2] >= input1.max())
+    
+    def test_uniform_trans(self, device):
+        shape_format = [
+           [(20,300), -100, 100, torch.float32],
+        ]
+
+        for item in shape_format:
+            input1 = torch.zeros(item[0], dtype=item[3]).npu()
+            input1.npu_format_cast(3)
+            input1.uniform_(item[1], item[2])
+            self.assertTrue(item[1] <= input1.min())
+            self.assertTrue(item[2] >= input1.max())
+
+
+instantiate_device_type_tests(TestUniform, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_unique2.py b/test/test_npu/test_network_ops/test_unique2.py
index ac6b18611cf3e66e03d9b8ae424ae44dba300ad9..993779a1a302c00ad1afd617200bbaf7211b4aa6 100644
--- a/test/test_npu/test_network_ops/test_unique2.py
+++ b/test/test_npu/test_network_ops/test_unique2.py
@@ -1,58 +1,58 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestUnique2(TestCase):
-    def test_unique2(self, device):
-        shape_format = [
-                        [[np.uint8, (2, 3)], True, True, True],
-                        [[np.int8, (2, 3)], True, True, True],
-                        [[np.int16, (2, 3)], True, True, True],
-                        [[np.int32, (2, 3)], True, True, True],
-                        [[np.long, (2, 3)], True, True, False],
-                        [[np.long, (5, 3)], True, False, True],
-                        [[np.long, (2, 3, 4)], True, False, False],
-                        [[np.long, (3, 3)], False, True, True],
-                        [[np.long, (2, 3)], False, False, False],
-                        [[np.float32, (2, 3)], True, False, False],
-                        [[np.bool, (2, 3)], True, True, True],
-                        [[np.float16, (2, 3)], True, True, True]
-        ]
-
-        for item in shape_format:
-            input1 = np.random.uniform(-10, 10, item[0][1]).astype(item[0][0])
-            cpu_input1 = torch.from_numpy(input1)
-            if item[0][0] == np.float16:
-                cpu_input1 = torch.from_numpy(input1.astype(np.float32))
-            npu_input1 = torch.from_numpy(input1).npu()
-
-            cpu_output_y, cpu_yInverse, cpu_yCounts = torch._unique2(cpu_input1, item[1], item[2], item[3])
-            npu_output_y, npu_yInverse, npu_yCounts = torch._unique2(npu_input1, item[1], item[2], item[3])
-          
-            self.assertRtolEqual(cpu_output_y.numpy().astype(np.float32), npu_output_y.cpu().numpy().astype(np.float32))
-            self.assertRtolEqual(cpu_yInverse.numpy().astype(np.float32), npu_yInverse.cpu().numpy().astype(np.float32))
-            self.assertRtolEqual(cpu_yCounts.numpy().astype(np.float32), npu_yCounts.cpu().numpy().astype(np.float32))
-
-instantiate_device_type_tests(TestUnique2, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestUnique2(TestCase):
+    def test_unique2(self, device):
+        shape_format = [
+                        [[np.uint8, (2, 3)], True, True, True],
+                        [[np.int8, (2, 3)], True, True, True],
+                        [[np.int16, (2, 3)], True, True, True],
+                        [[np.int32, (2, 3)], True, True, True],
+                        [[np.long, (2, 3)], True, True, False],
+                        [[np.long, (5, 3)], True, False, True],
+                        [[np.long, (2, 3, 4)], True, False, False],
+                        [[np.long, (3, 3)], False, True, True],
+                        [[np.long, (2, 3)], False, False, False],
+                        [[np.float32, (2, 3)], True, False, False],
+                        [[np.bool, (2, 3)], True, True, True],
+                        [[np.float16, (2, 3)], True, True, True]
+        ]
+
+        for item in shape_format:
+            input1 = np.random.uniform(-10, 10, item[0][1]).astype(item[0][0])
+            cpu_input1 = torch.from_numpy(input1)
+            if item[0][0] == np.float16:
+                cpu_input1 = torch.from_numpy(input1.astype(np.float32))
+            npu_input1 = torch.from_numpy(input1).npu()
+
+            cpu_output_y, cpu_yInverse, cpu_yCounts = torch._unique2(cpu_input1, item[1], item[2], item[3])
+            npu_output_y, npu_yInverse, npu_yCounts = torch._unique2(npu_input1, item[1], item[2], item[3])
+          
+            self.assertRtolEqual(cpu_output_y.numpy().astype(np.float32), npu_output_y.cpu().numpy().astype(np.float32))
+            self.assertRtolEqual(cpu_yInverse.numpy().astype(np.float32), npu_yInverse.cpu().numpy().astype(np.float32))
+            self.assertRtolEqual(cpu_yCounts.numpy().astype(np.float32), npu_yCounts.cpu().numpy().astype(np.float32))
+
+instantiate_device_type_tests(TestUnique2, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests() 
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_upsample_bicubic2d.py b/test/test_npu/test_network_ops/test_upsample_bicubic2d.py
index 5d7eeec4ecd51d0074a664b4578d726d314f7184..95a0acbb1060296fab750020c8bfa99b632e41a9 100644
--- a/test/test_npu/test_network_ops/test_upsample_bicubic2d.py
+++ b/test/test_npu/test_network_ops/test_upsample_bicubic2d.py
@@ -1,126 +1,126 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestUpsampleBicubic2d(TestCase):
-
-    def cpu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w): 
-        output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w): 
-        output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
-        output = output.to("cpu") 
-        output = output.numpy() 
-        return output 
-
-    # float32 [0.0002, 0.0001]
-    #pylint: disable=unused-argument
-    def test_upsample_bicubic2d_common_shape_format(self, device):
-        shape_format = [
-                        # same size
-                        [[np.float32, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255],  # case 1
-                        [[np.float32, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255],  # case 2
-                        [[np.float32, -1, (65535, 2, 4, 8)], (4, 8), True, 0, 0, 0, 255],  # case 3
-                        [[np.float32, -1, (2, 4, 65535, 2)], (65535, 2), True, 0, 0, 0, 255],  # case 4
-                        [[np.float32, -1, (1, 31, 149, 2)], (149, 2), True, 0, 0, 0, 255],  # case 5
-                        [[np.float32, -1, (10, 10, 786432, 8)], (786432, 8), True, 0, 0, 0, 255],  # case 6
-                        [[np.float32, -1, (32, 32, 32, 32)], (32, 32), True, 0, 0, 0, 3402823500.0],  # case 7
-                        [[np.float32, -1, (10, 10, 786432, 8)], (786432, 8), False, 0, 0, 0, 255],  # case 8
-                        [[np.float32, -1, (32, 32, 32, 32)], (32, 32), False, 0, 0, 0, 3402823500.0],  # case 9
-                        
-                        # align_corners = True
-                        [[np.float32, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255],  # case 10
-                        [[np.float32, -1, (1, 1, 2, 2)], (4, 4), True, 0, 0, 0, 255],  # case 11
-                        [[np.float32, -1, (2, 2, 1, 1)], (2, 2), True, 0, 0, 0, 255],  # case 12
-                        [[np.float32, -1, (2, 2, 2, 2)], (10, 10), True, 0, 0, 0, 255],  # case 13
-                        [[np.float32, -1, (1, 31, 149, 2)], (2, 149), True, 0, 0, 0, 255],  # case 14
-                        #[[np.float32, -1, (32, 32, 32, 32)], (64, 64), True, 0, 0, 0, 255],  # case 15
-                        #[[np.float32, -1, (32, 32, 32, 32)], (64, 64), True, 0, 0, 0, 3402823500.0],  # case 16
-                        
-                        # align_corners = False
-                        [[np.float32, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],  # case 17
-                        [[np.float32, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255],  # case 18
-                        [[np.float32, -1, (2, 2, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],  # case 19
-                        [[np.float32, -1, (2, 2, 2, 2)], (10, 10), False, 0.5, 0.5, 0, 255],  # case 20
-                        [[np.float32, -1, (1, 31, 149, 2)], (2, 149), False, 0.5, 0.5, 0, 255],  # case 21
-                        [[np.float32, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 255],  # case 22
-                        [[np.float32, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 3402823500.0]  # case 23
-                       ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6])
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2], item[3], item[4])
-            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4])
-            self.assertRtolEqual(cpu_output, npu_output)
-  
-    # float16 [0.002, 0.001]
-    #pylint: disable=unused-argument
-    def test_upsample_bicubic2d_float16_shape_format(self, device):
-        def cpu_op_exec_fp16(input1, output_size, align_corners, scale_h, scale_w):
-            input1 = input1.to(torch.float32)
-            output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-    
-        shape_format = [
-                        # same size
-                        [[np.float16, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255],  # case 24
-                        [[np.float16, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255],  # case 25
-                        [[np.float16, -1, (65535, 2, 4, 8)], (4, 8), True, 0, 0, 0, 255],  # case 26
-                        [[np.float16, -1, (2, 4, 65535, 2)], (65535, 2), True, 0, 0, 0, 255],  # case 27
-                        [[np.float16, -1, (1, 31, 149, 2)], (149, 2), True, 0, 0, 0, 255],  # case 28
-                        [[np.float16, -1, (10, 10, 786432, 8)], (786432, 8), True, 0, 0, 0, 255],  # case 29
-                        [[np.float16, -1, (32, 32, 32, 32)], (32, 32), True, 0, 0, 0, 6550.0],  # case 30
-                        [[np.float16, -1, (10, 10, 786432, 8)], (786432, 8), False, 0, 0, 0, 255],  # case 31
-                        [[np.float16, -1, (32, 32, 32, 32)], (32, 32), False, 0, 0, 0, 6550.0],  # case 32
-    
-                        # align_corners = True
-                        [[np.float16, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255],  # case 33
-                        [[np.float16, -1, (1, 1, 2, 2)], (4, 4), True, 0, 0, 0, 255],  # case 34
-                        [[np.float16, -1, (2, 2, 1, 1)], (2, 2), True, 0, 0, 0, 255],  # case 35
-                        [[np.float16, -1, (2, 2, 2, 2)], (10, 10), True, 0, 0, 0, 255],  # case 36
-                        [[np.float16, -1, (1, 31, 149, 2)], (2, 149), True, 0, 0, 0, 255],  # case 37
-                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), True, 0, 0, 0, 255],  # case 38
-                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), True, 0, 0, 0, 6550.0],  # case 39
-                        
-                        # align_corners = False
-                        [[np.float16, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],  # case 40
-                        [[np.float16, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255],  # case 41
-                        [[np.float16, -1, (2, 2, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],  # case 42
-                        [[np.float16, -1, (2, 2, 2, 2)], (10, 10), False, 0.5, 0.5, 0, 255],  # case 43
-                        [[np.float16, -1, (1, 31, 149, 2)], (2, 149), False, 0.5, 0.5, 0, 255],  # case 44
-                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 255],  # case 45
-                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 6550.0]  # case 46
-                       ]
-        
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6])
-            cpu_output = cpu_op_exec_fp16(cpu_input1, item[1], item[2], item[3], item[4])
-            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestUpsampleBicubic2d, globals(), except_for='cpu')
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestUpsampleBicubic2d(TestCase):
+
+    def cpu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w): 
+        output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w): 
+        output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
+        output = output.to("cpu") 
+        output = output.numpy() 
+        return output 
+
+    # float32 [0.0002, 0.0001]
+    #pylint: disable=unused-argument
+    def test_upsample_bicubic2d_common_shape_format(self, device):
+        shape_format = [
+                        # same size
+                        [[np.float32, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255],  # case 1
+                        [[np.float32, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255],  # case 2
+                        [[np.float32, -1, (65535, 2, 4, 8)], (4, 8), True, 0, 0, 0, 255],  # case 3
+                        [[np.float32, -1, (2, 4, 65535, 2)], (65535, 2), True, 0, 0, 0, 255],  # case 4
+                        [[np.float32, -1, (1, 31, 149, 2)], (149, 2), True, 0, 0, 0, 255],  # case 5
+                        [[np.float32, -1, (10, 10, 786432, 8)], (786432, 8), True, 0, 0, 0, 255],  # case 6
+                        [[np.float32, -1, (32, 32, 32, 32)], (32, 32), True, 0, 0, 0, 3402823500.0],  # case 7
+                        [[np.float32, -1, (10, 10, 786432, 8)], (786432, 8), False, 0, 0, 0, 255],  # case 8
+                        [[np.float32, -1, (32, 32, 32, 32)], (32, 32), False, 0, 0, 0, 3402823500.0],  # case 9
+                        
+                        # align_corners = True
+                        [[np.float32, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255],  # case 10
+                        [[np.float32, -1, (1, 1, 2, 2)], (4, 4), True, 0, 0, 0, 255],  # case 11
+                        [[np.float32, -1, (2, 2, 1, 1)], (2, 2), True, 0, 0, 0, 255],  # case 12
+                        [[np.float32, -1, (2, 2, 2, 2)], (10, 10), True, 0, 0, 0, 255],  # case 13
+                        [[np.float32, -1, (1, 31, 149, 2)], (2, 149), True, 0, 0, 0, 255],  # case 14
+                        #[[np.float32, -1, (32, 32, 32, 32)], (64, 64), True, 0, 0, 0, 255],  # case 15
+                        #[[np.float32, -1, (32, 32, 32, 32)], (64, 64), True, 0, 0, 0, 3402823500.0],  # case 16
+                        
+                        # align_corners = False
+                        [[np.float32, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],  # case 17
+                        [[np.float32, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255],  # case 18
+                        [[np.float32, -1, (2, 2, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],  # case 19
+                        [[np.float32, -1, (2, 2, 2, 2)], (10, 10), False, 0.5, 0.5, 0, 255],  # case 20
+                        [[np.float32, -1, (1, 31, 149, 2)], (2, 149), False, 0.5, 0.5, 0, 255],  # case 21
+                        [[np.float32, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 255],  # case 22
+                        [[np.float32, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 3402823500.0]  # case 23
+                       ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6])
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2], item[3], item[4])
+            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4])
+            self.assertRtolEqual(cpu_output, npu_output)
+  
+    # float16 [0.002, 0.001]
+    #pylint: disable=unused-argument
+    def test_upsample_bicubic2d_float16_shape_format(self, device):
+        def cpu_op_exec_fp16(input1, output_size, align_corners, scale_h, scale_w):
+            input1 = input1.to(torch.float32)
+            output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+    
+        shape_format = [
+                        # same size
+                        [[np.float16, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255],  # case 24
+                        [[np.float16, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255],  # case 25
+                        [[np.float16, -1, (65535, 2, 4, 8)], (4, 8), True, 0, 0, 0, 255],  # case 26
+                        [[np.float16, -1, (2, 4, 65535, 2)], (65535, 2), True, 0, 0, 0, 255],  # case 27
+                        [[np.float16, -1, (1, 31, 149, 2)], (149, 2), True, 0, 0, 0, 255],  # case 28
+                        [[np.float16, -1, (10, 10, 786432, 8)], (786432, 8), True, 0, 0, 0, 255],  # case 29
+                        [[np.float16, -1, (32, 32, 32, 32)], (32, 32), True, 0, 0, 0, 6550.0],  # case 30
+                        [[np.float16, -1, (10, 10, 786432, 8)], (786432, 8), False, 0, 0, 0, 255],  # case 31
+                        [[np.float16, -1, (32, 32, 32, 32)], (32, 32), False, 0, 0, 0, 6550.0],  # case 32
+    
+                        # align_corners = True
+                        [[np.float16, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255],  # case 33
+                        [[np.float16, -1, (1, 1, 2, 2)], (4, 4), True, 0, 0, 0, 255],  # case 34
+                        [[np.float16, -1, (2, 2, 1, 1)], (2, 2), True, 0, 0, 0, 255],  # case 35
+                        [[np.float16, -1, (2, 2, 2, 2)], (10, 10), True, 0, 0, 0, 255],  # case 36
+                        [[np.float16, -1, (1, 31, 149, 2)], (2, 149), True, 0, 0, 0, 255],  # case 37
+                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), True, 0, 0, 0, 255],  # case 38
+                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), True, 0, 0, 0, 6550.0],  # case 39
+                        
+                        # align_corners = False
+                        [[np.float16, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],  # case 40
+                        [[np.float16, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255],  # case 41
+                        [[np.float16, -1, (2, 2, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],  # case 42
+                        [[np.float16, -1, (2, 2, 2, 2)], (10, 10), False, 0.5, 0.5, 0, 255],  # case 43
+                        [[np.float16, -1, (1, 31, 149, 2)], (2, 149), False, 0.5, 0.5, 0, 255],  # case 44
+                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 255],  # case 45
+                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 6550.0]  # case 46
+                       ]
+        
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6])
+            cpu_output = cpu_op_exec_fp16(cpu_input1, item[1], item[2], item[3], item[4])
+            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestUpsampleBicubic2d, globals(), except_for='cpu')
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py b/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py
index f75f627040a738f9c4ee208c98458fb6f2966ba2..0d3f04813775d77fba8238666fd02c1c61423385 100644
--- a/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py
+++ b/test/test_npu/test_network_ops/test_upsample_bicubic2d_backward.py
@@ -1,91 +1,91 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestUpsampleBicubic2dBackward(TestCase):
-
-    def cpu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w):
-        input1.requires_grad = True
-        output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
-        output.backward(torch.ones_like(output))
-        output_grad = input1.grad
-        output_grad = output_grad.detach().numpy()
-        return output_grad
-
-    def npu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w):
-        input1.requires_grad = True
-        output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
-        output.backward(torch.ones_like(output))
-        output_grad = input1.grad
-        output_grad = output_grad.to("cpu").detach().numpy()
-        return output_grad
-
-
-    def test_upsample_bicubic2d_common_shape_format(self, device):
-        shape_format = [
-                        [[np.float32, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255],
-                        [[np.float32, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255],
-                        [[np.float32, -1, (10, 10, 786432, 8)], (786432, 8), False, 0, 0, 0, 255],
-                        [[np.float32, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255],
-                        [[np.float32, -1, (1, 1, 2, 2)], (4, 4), True, 0, 0, 0, 255],
-                        [[np.float32, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],
-                        [[np.float32, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255],
-                        [[np.float32, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 3402823500.0]
-                       ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6])
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2], item[3], item[4])
-            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-    def test_upsample_bicubic2d_float16_shape_format(self, device):
-        def cpu_op_exec_fp16(input1, output_size, align_corners, scale_h, scale_w):
-            input1 = input1.to(torch.float32)
-            input1.requires_grad = True
-            output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
-            output.backward(torch.ones_like(output))
-            output_grad = input1.grad
-            output_grad = output_grad.detach().numpy()
-            output_grad = output_grad.astype(np.float16)
-            return output_grad
-    
-        shape_format = [
-                        [[np.float16, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255],
-                        [[np.float16, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255],
-                        [[np.float16, -1, (32, 32, 32, 32)], (32, 32), False, 0, 0, 0, 6550.0],
-                        [[np.float16, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255],
-                        [[np.float16, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],
-                        [[np.float16, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255],
-                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 6550.0]
-                       ]
-        
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6])
-            cpu_output = cpu_op_exec_fp16(cpu_input1, item[1], item[2], item[3], item[4])
-            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestUpsampleBicubic2dBackward, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestUpsampleBicubic2dBackward(TestCase):
+
+    def cpu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w):
+        input1.requires_grad = True
+        output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
+        output.backward(torch.ones_like(output))
+        output_grad = input1.grad
+        output_grad = output_grad.detach().numpy()
+        return output_grad
+
+    def npu_op_exec(self, input1, output_size, align_corners, scale_h, scale_w):
+        input1.requires_grad = True
+        output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
+        output.backward(torch.ones_like(output))
+        output_grad = input1.grad
+        output_grad = output_grad.to("cpu").detach().numpy()
+        return output_grad
+
+
+    def test_upsample_bicubic2d_common_shape_format(self, device):
+        shape_format = [
+                        [[np.float32, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255],
+                        [[np.float32, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255],
+                        [[np.float32, -1, (10, 10, 786432, 8)], (786432, 8), False, 0, 0, 0, 255],
+                        [[np.float32, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255],
+                        [[np.float32, -1, (1, 1, 2, 2)], (4, 4), True, 0, 0, 0, 255],
+                        [[np.float32, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],
+                        [[np.float32, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255],
+                        [[np.float32, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 3402823500.0]
+                       ]
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6])
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1], item[2], item[3], item[4])
+            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+    def test_upsample_bicubic2d_float16_shape_format(self, device):
+        def cpu_op_exec_fp16(input1, output_size, align_corners, scale_h, scale_w):
+            input1 = input1.to(torch.float32)
+            input1.requires_grad = True
+            output = torch._C._nn.upsample_bicubic2d(input1, output_size, align_corners, scale_h, scale_w)
+            output.backward(torch.ones_like(output))
+            output_grad = input1.grad
+            output_grad = output_grad.detach().numpy()
+            output_grad = output_grad.astype(np.float16)
+            return output_grad
+    
+        shape_format = [
+                        [[np.float16, -1, (1, 1, 1, 1)], (1, 1), True, 0, 0, 0, 255],
+                        [[np.float16, -1, (2, 65535, 2, 2)], (2, 2), True, 0, 0, 0, 255],
+                        [[np.float16, -1, (32, 32, 32, 32)], (32, 32), False, 0, 0, 0, 6550.0],
+                        [[np.float16, -1, (1, 1, 1, 1)], (2, 2), True, 0, 0, 0, 255],
+                        [[np.float16, -1, (1, 1, 1, 1)], (2, 2), False, 0.5, 0.5, 0, 255],
+                        [[np.float16, -1, (1, 1, 2, 2)], (4, 4), False, 0.5, 0.5, 0, 255],
+                        [[np.float16, -1, (32, 32, 32, 32)], (64, 64), False, 0.5, 0.5, 0, 6550.0]
+                       ]
+        
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], item[5], item[6])
+            cpu_output = cpu_op_exec_fp16(cpu_input1, item[1], item[2], item[3], item[4])
+            npu_output = self.npu_op_exec(npu_input1, item[1], item[2], item[3], item[4])
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestUpsampleBicubic2dBackward, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_upsample_bilinear.py b/test/test_npu/test_network_ops/test_upsample_bilinear.py
index 9bac334778b37b6ab773a24fe23d67689479e601..8ac02b7cab5f8a5971dfc65ed63da18516bd54af 100644
--- a/test/test_npu/test_network_ops/test_upsample_bilinear.py
+++ b/test/test_npu/test_network_ops/test_upsample_bilinear.py
@@ -1,76 +1,76 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-import torch.nn.functional as F
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestUpsampleBilinear(TestCase):
-    def cpu_op_exec(self, input1, size):
-        input1.requires_grad_(True)
-        output = F.interpolate(input1, size, mode="bilinear")
-        w = torch.ones_like(output)
-        output.backward(w)
-        res = input1.grad
-        res = res.numpy()
-        output = output.detach().numpy()
-        return output, res
-
-    def npu_op_exec(self, input1, size):
-        input1.requires_grad_(True)
-        output = F.interpolate(input1, size, mode="bilinear")
-        w = torch.ones_like(output)
-        w = w.to("npu")
-        output.backward(w)
-        output = output.to("cpu").detach().numpy()
-        res = input1.grad
-        res = res.to("cpu").numpy()
-        return output, res
-
-    def upsample_bilinear_backward_result(self, shape_format):
-        for item in shape_format:
-            input_cpu, input_npu = create_common_tensor(item[0], 0, 100)
-            if input_cpu.dtype == torch.float16:
-                input_cpu = input_cpu.to(torch.float32)
-
-            cpu_output, cpu_grad = self.cpu_op_exec(input_cpu, item[1])
-            npu_output, npu_grad = self.npu_op_exec(input_npu, item[1])
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_grad = cpu_grad.astype(npu_grad.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_grad, npu_grad)
-
-    def test_upsample_bilinear_backward_shape_format_aicpu(self, device):
-        format_list = [0, 3]
-        size_list = [[10001, 2]]
-        shape_format = [[[np.float32, i, [2, 10020, 100, 1]], s] for i in format_list for s in size_list]
-
-        self.upsample_bilinear_backward_result(shape_format)
-        
-    def test_upsample_bilinear_backward_shape_format_aicore(self, device):
-        format_list = [0, 3]
-        size_list = [[100, 2]]
-        shape_format = [[[np.float32, i, [2, 10020, 100, 1]], s] for i in format_list for s in size_list]
-
-        self.upsample_bilinear_backward_result(shape_format)
-
-instantiate_device_type_tests(TestUpsampleBilinear, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestUpsampleBilinear(TestCase):
+    def cpu_op_exec(self, input1, size):
+        input1.requires_grad_(True)
+        output = F.interpolate(input1, size, mode="bilinear")
+        w = torch.ones_like(output)
+        output.backward(w)
+        res = input1.grad
+        res = res.numpy()
+        output = output.detach().numpy()
+        return output, res
+
+    def npu_op_exec(self, input1, size):
+        input1.requires_grad_(True)
+        output = F.interpolate(input1, size, mode="bilinear")
+        w = torch.ones_like(output)
+        w = w.to("npu")
+        output.backward(w)
+        output = output.to("cpu").detach().numpy()
+        res = input1.grad
+        res = res.to("cpu").numpy()
+        return output, res
+
+    def upsample_bilinear_backward_result(self, shape_format):
+        for item in shape_format:
+            input_cpu, input_npu = create_common_tensor(item[0], 0, 100)
+            if input_cpu.dtype == torch.float16:
+                input_cpu = input_cpu.to(torch.float32)
+
+            cpu_output, cpu_grad = self.cpu_op_exec(input_cpu, item[1])
+            npu_output, npu_grad = self.npu_op_exec(input_npu, item[1])
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_grad = cpu_grad.astype(npu_grad.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_grad, npu_grad)
+
+    def test_upsample_bilinear_backward_shape_format_aicpu(self, device):
+        format_list = [0, 3]
+        size_list = [[10001, 2]]
+        shape_format = [[[np.float32, i, [2, 10020, 100, 1]], s] for i in format_list for s in size_list]
+
+        self.upsample_bilinear_backward_result(shape_format)
+        
+    def test_upsample_bilinear_backward_shape_format_aicore(self, device):
+        format_list = [0, 3]
+        size_list = [[100, 2]]
+        shape_format = [[[np.float32, i, [2, 10020, 100, 1]], s] for i in format_list for s in size_list]
+
+        self.upsample_bilinear_backward_result(shape_format)
+
+instantiate_device_type_tests(TestUpsampleBilinear, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_upsample_bilinear_backward.py b/test/test_npu/test_network_ops/test_upsample_bilinear_backward.py
old mode 100644
new mode 100755
index 2d1ae7e4e3679bf22d27acfed7e7ac6c621a5b51..ce01e8f55d48fb1fd33fc9f063a48c5af3ab2d22
--- a/test/test_npu/test_network_ops/test_upsample_bilinear_backward.py
+++ b/test/test_npu/test_network_ops/test_upsample_bilinear_backward.py
@@ -1,79 +1,79 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-import torch.nn as nn
-import torch.nn.functional as F
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestUpsampleBilinear(TestCase):
-    def cpu_op_exec(self, input1, size):
-        input1.requires_grad_(True)
-        output = F.interpolate(input1, size, mode="bilinear")
-        w = torch.ones_like(output)
-        output.backward(w)
-        res = input1.grad
-        res = res.numpy()
-        output = output.detach().numpy()
-        return output, res
-
-    def npu_op_exec(self, input1, size):
-        input1.requires_grad_(True)
-        output = F.interpolate(input1, size, mode="bilinear")
-        w = torch.ones_like(output)
-        w = w.to("npu")
-        output.backward(w)
-        output = output.to("cpu").detach().numpy()
-        res = input1.grad
-        res = res.to("cpu").numpy()
-        return output, res
-
-    def upsample_bilinear_backward_result(self, shape_format):
-        for item in shape_format:
-            input_cpu, input_npu = create_common_tensor(item[0], 0, 100)
-            if input_cpu.dtype == torch.float16:
-                input_cpu = input_cpu.to(torch.float32)
-
-            cpu_output, cpu_grad = self.cpu_op_exec(input_cpu, item[1])
-            npu_output, npu_grad = self.npu_op_exec(input_npu, item[1])
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_grad = cpu_grad.astype(npu_grad.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_grad, npu_grad)
-
-    def test_upsample_bilinear_backward_shape_format_fp16(self, device):
-        format_list = [0, 3, 29]
-        size_list = [[2, 2], [3, 3], [6, 6]]
-        shape_format = [[[np.float16, i, [24, 56, 18, 18]], s] for i in format_list for s in size_list]
-
-        self.upsample_bilinear_backward_result(shape_format)
-
-    def test_upsample_bilinear_backward_shape_format_fp32(self, device):
-        type_list = [np.float16, np.float32]
-        format_list = [0, 3, 29]
-        size_list = [[2, 2], [3, 3], [6, 6]]
-        shape_format = [[[np.float32, i, [24, 56, 18, 18]], s] for i in format_list for s in size_list]
-
-        self.upsample_bilinear_backward_result(shape_format)
-
-
-instantiate_device_type_tests(TestUpsampleBilinear, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+import torch.nn as nn
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestUpsampleBilinear(TestCase):
+    def cpu_op_exec(self, input1, size):
+        input1.requires_grad_(True)
+        output = F.interpolate(input1, size, mode="bilinear")
+        w = torch.ones_like(output)
+        output.backward(w)
+        res = input1.grad
+        res = res.numpy()
+        output = output.detach().numpy()
+        return output, res
+
+    def npu_op_exec(self, input1, size):
+        input1.requires_grad_(True)
+        output = F.interpolate(input1, size, mode="bilinear")
+        w = torch.ones_like(output)
+        w = w.to("npu")
+        output.backward(w)
+        output = output.to("cpu").detach().numpy()
+        res = input1.grad
+        res = res.to("cpu").numpy()
+        return output, res
+
+    def upsample_bilinear_backward_result(self, shape_format):
+        for item in shape_format:
+            input_cpu, input_npu = create_common_tensor(item[0], 0, 100)
+            if input_cpu.dtype == torch.float16:
+                input_cpu = input_cpu.to(torch.float32)
+
+            cpu_output, cpu_grad = self.cpu_op_exec(input_cpu, item[1])
+            npu_output, npu_grad = self.npu_op_exec(input_npu, item[1])
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_grad = cpu_grad.astype(npu_grad.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_grad, npu_grad)
+
+    def test_upsample_bilinear_backward_shape_format_fp16(self, device):
+        format_list = [0, 3, 29]
+        size_list = [[2, 2], [3, 3], [6, 6]]
+        shape_format = [[[np.float16, i, [24, 56, 18, 18]], s] for i in format_list for s in size_list]
+
+        self.upsample_bilinear_backward_result(shape_format)
+
+    def test_upsample_bilinear_backward_shape_format_fp32(self, device):
+        type_list = [np.float16, np.float32]
+        format_list = [0, 3, 29]
+        size_list = [[2, 2], [3, 3], [6, 6]]
+        shape_format = [[[np.float32, i, [24, 56, 18, 18]], s] for i in format_list for s in size_list]
+
+        self.upsample_bilinear_backward_result(shape_format)
+
+
+instantiate_device_type_tests(TestUpsampleBilinear, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_upsample_linear1d_backward.py b/test/test_npu/test_network_ops/test_upsample_linear1d_backward.py
index 9b492e355d9d55c09e30b94b91ce18842090cb85..f18187e0887d5cb8556dfbba46a6e69839c1a330 100644
--- a/test/test_npu/test_network_ops/test_upsample_linear1d_backward.py
+++ b/test/test_npu/test_network_ops/test_upsample_linear1d_backward.py
@@ -1,129 +1,129 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestUpsampleLinear1DBackward(TestCase):
-    def cpu_op_exec(self, input, grads, size, align_corners):
-        input.requires_grad_(True)
-        output = torch._C._nn.upsample_linear1d(input, size, align_corners=align_corners)
-        output.backward(grads)
-        gradcpu = input.grad
-        return output.detach().numpy(), gradcpu.detach().numpy()
-
-    def npu_op_exec(self, input, grads, size, align_corners):
-        input.requires_grad_(True)
-        output = torch._C._nn.upsample_linear1d(input, size, align_corners=align_corners)
-        output = output.to("npu")
-        output.backward(grads)  
-        gradnpu = input.grad
-        gradnpu = gradnpu.to("cpu")
-        output = output.to("cpu")
-        return output.detach().numpy(), gradnpu.detach().numpy()
-
-    def test_upsample_linear1d_backward_shape_format(self, device):
-        test_cases = [
-            [[np.float16, 0, (1, 1, 1, 2)], [4, ], True],
-            [[np.float16, 0, (2, 1, 1, 4)], [8, ], True],
-            [[np.float16, 0, (2, 2, 1, 3)], [1, ], True],
-            [[np.float16, 0, (2, 1, 1, 1)], [4, ], False],
-            [[np.float16, 0, (4, 1, 1, 2)], [4, ], False],
-            [[np.float16, 0, (1, 1, 1, 1)], [1, ], False],
-
-            [[np.float32, 0, (1, 1, 1, 2)], [4, ], True],
-            [[np.float32, 0, (2, 1, 1, 2)], [4, ], True],
-            [[np.float32, 0, (2, 2, 1, 3)], [1, ], True],
-            [[np.float32, 0, (3, 1, 1, 1)], [2, ], False],
-            [[np.float32, 0, (4, 1, 1, 1)], [2, ], False],
-            [[np.float32, 0, (1, 1, 1, 1)], [1, ], False],
-
-            [[np.float32, 0, (9, 7, 1, 2)], [15, ], True],
-            [[np.float16, 0, (8, 7, 1, 1)], [2, ], True],
-            [[np.float16, 0, (17, 2, 1, 3)], [1, ], True],
-            [[np.float16, 0, (6, 4, 1, 1)], [3, ], False],
-            [[np.float16, 0, (8, 7, 1, 2)], [4, ], False],
-            [[np.float16, 0, (2, 7, 1, 7)], [1, ], False],
-
-            [[np.float32, 0, (9, 7, 1, 2)], [7, ], True],
-            [[np.float32, 0, (8, 3, 1, 1)], [2, ], True],
-            [[np.float32, 0, (8, 3, 1, 1)], [2, ], True],
-            [[np.float32, 0, (17, 2, 1, 3)], [1, ], True],
-            [[np.float32, 0, (9, 7, 1, 2)], [7, ], False],
-            [[np.float32, 0, (8, 3, 1, 3)], [2, ], False],
-            [[np.float32, 0, (2, 7, 1, 7)], [1, ], False],
-
-            [[np.float16, 0, (9, 7, 1, 2)], [17, ], True],
-            [[np.float16, 0, (17, 13, 1, 15)], [16, ], True],
-            [[np.float16, 0, (61, 41, 1, 1)], [7, ], False],
-            [[np.float16, 0, (38, 7, 1, 7)], [16, ], False],
-            [[np.float32, 0, (997, 3, 1, 1)], [32, ], True],
-            [[np.float32, 0, (627, 2, 1, 3)], [17, ], False],
-            [[np.float32, 0, (78, 73, 1, 1)], [48, ], False],
-            [[np.float32, 0, (6553, 2, 1, 2)], [4, ], False],
-            [[np.float16, 0, (6553, 2, 1, 2)], [4, ], False],
-            [[np.float32, 0, (1008, 3, 1, 2)], [4, ], False],
-            [[np.float16, 0, (1008, 3, 1, 2)], [4, ], False]
-        ]
-        for item in test_cases:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-
-            size = list(item[0][2])
-            size[3] = item[1][0]
-
-            grad_item = []
-            grad_item.append(item[0][0])
-            grad_item.append(item[0][1])
-            grad_item.append(size)
-            cpu_grads, npu_grads = create_common_tensor(grad_item, 0, 100)
-
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-
-            if cpu_grads.dtype == torch.float16:
-                cpu_grads = cpu_grads.to(torch.float32)
-
-            if cpu_input.dim() == 4:
-                cpu_input = cpu_input.squeeze(2)
-
-            if npu_input.dim() == 4:
-                npu_input = npu_input.squeeze(2)
-
-            if cpu_grads.dim() == 4:
-                cpu_grads = cpu_grads.squeeze(2)
-
-            if npu_grads.dim() == 4:
-                npu_grads = npu_grads.squeeze(2)
-
-            size = item[1]
-            align_corners = item[2]
-
-            cpu_output, cpu_grad = self.cpu_op_exec(cpu_input, cpu_grads, size, align_corners)
-            npu_output, npu_grad = self.npu_op_exec(npu_input, npu_grads, size, align_corners)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_grad = cpu_grad.astype(npu_grad.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_grad, npu_grad)
-
-
-
-instantiate_device_type_tests(TestUpsampleLinear1DBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestUpsampleLinear1DBackward(TestCase):
+    def cpu_op_exec(self, input, grads, size, align_corners):
+        input.requires_grad_(True)
+        output = torch._C._nn.upsample_linear1d(input, size, align_corners=align_corners)
+        output.backward(grads)
+        gradcpu = input.grad
+        return output.detach().numpy(), gradcpu.detach().numpy()
+
+    def npu_op_exec(self, input, grads, size, align_corners):
+        input.requires_grad_(True)
+        output = torch._C._nn.upsample_linear1d(input, size, align_corners=align_corners)
+        output = output.to("npu")
+        output.backward(grads)  
+        gradnpu = input.grad
+        gradnpu = gradnpu.to("cpu")
+        output = output.to("cpu")
+        return output.detach().numpy(), gradnpu.detach().numpy()
+
+    def test_upsample_linear1d_backward_shape_format(self, device):
+        test_cases = [
+            [[np.float16, 0, (1, 1, 1, 2)], [4, ], True],
+            [[np.float16, 0, (2, 1, 1, 4)], [8, ], True],
+            [[np.float16, 0, (2, 2, 1, 3)], [1, ], True],
+            [[np.float16, 0, (2, 1, 1, 1)], [4, ], False],
+            [[np.float16, 0, (4, 1, 1, 2)], [4, ], False],
+            [[np.float16, 0, (1, 1, 1, 1)], [1, ], False],
+
+            [[np.float32, 0, (1, 1, 1, 2)], [4, ], True],
+            [[np.float32, 0, (2, 1, 1, 2)], [4, ], True],
+            [[np.float32, 0, (2, 2, 1, 3)], [1, ], True],
+            [[np.float32, 0, (3, 1, 1, 1)], [2, ], False],
+            [[np.float32, 0, (4, 1, 1, 1)], [2, ], False],
+            [[np.float32, 0, (1, 1, 1, 1)], [1, ], False],
+
+            [[np.float32, 0, (9, 7, 1, 2)], [15, ], True],
+            [[np.float16, 0, (8, 7, 1, 1)], [2, ], True],
+            [[np.float16, 0, (17, 2, 1, 3)], [1, ], True],
+            [[np.float16, 0, (6, 4, 1, 1)], [3, ], False],
+            [[np.float16, 0, (8, 7, 1, 2)], [4, ], False],
+            [[np.float16, 0, (2, 7, 1, 7)], [1, ], False],
+
+            [[np.float32, 0, (9, 7, 1, 2)], [7, ], True],
+            [[np.float32, 0, (8, 3, 1, 1)], [2, ], True],
+            [[np.float32, 0, (8, 3, 1, 1)], [2, ], True],
+            [[np.float32, 0, (17, 2, 1, 3)], [1, ], True],
+            [[np.float32, 0, (9, 7, 1, 2)], [7, ], False],
+            [[np.float32, 0, (8, 3, 1, 3)], [2, ], False],
+            [[np.float32, 0, (2, 7, 1, 7)], [1, ], False],
+
+            [[np.float16, 0, (9, 7, 1, 2)], [17, ], True],
+            [[np.float16, 0, (17, 13, 1, 15)], [16, ], True],
+            [[np.float16, 0, (61, 41, 1, 1)], [7, ], False],
+            [[np.float16, 0, (38, 7, 1, 7)], [16, ], False],
+            [[np.float32, 0, (997, 3, 1, 1)], [32, ], True],
+            [[np.float32, 0, (627, 2, 1, 3)], [17, ], False],
+            [[np.float32, 0, (78, 73, 1, 1)], [48, ], False],
+            [[np.float32, 0, (6553, 2, 1, 2)], [4, ], False],
+            [[np.float16, 0, (6553, 2, 1, 2)], [4, ], False],
+            [[np.float32, 0, (1008, 3, 1, 2)], [4, ], False],
+            [[np.float16, 0, (1008, 3, 1, 2)], [4, ], False]
+        ]
+        for item in test_cases:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+
+            size = list(item[0][2])
+            size[3] = item[1][0]
+
+            grad_item = []
+            grad_item.append(item[0][0])
+            grad_item.append(item[0][1])
+            grad_item.append(size)
+            cpu_grads, npu_grads = create_common_tensor(grad_item, 0, 100)
+
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+
+            if cpu_grads.dtype == torch.float16:
+                cpu_grads = cpu_grads.to(torch.float32)
+
+            if cpu_input.dim() == 4:
+                cpu_input = cpu_input.squeeze(2)
+
+            if npu_input.dim() == 4:
+                npu_input = npu_input.squeeze(2)
+
+            if cpu_grads.dim() == 4:
+                cpu_grads = cpu_grads.squeeze(2)
+
+            if npu_grads.dim() == 4:
+                npu_grads = npu_grads.squeeze(2)
+
+            size = item[1]
+            align_corners = item[2]
+
+            cpu_output, cpu_grad = self.cpu_op_exec(cpu_input, cpu_grads, size, align_corners)
+            npu_output, npu_grad = self.npu_op_exec(npu_input, npu_grads, size, align_corners)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_grad = cpu_grad.astype(npu_grad.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_grad, npu_grad)
+
+
+
+instantiate_device_type_tests(TestUpsampleLinear1DBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_upsample_nearest1d.py b/test/test_npu/test_network_ops/test_upsample_nearest1d.py
index bb4c76edb7660e668dfee98d224ff13920683e1a..724ddc688c68ac14fe7d53586ecff34bd06d701d 100644
--- a/test/test_npu/test_network_ops/test_upsample_nearest1d.py
+++ b/test/test_npu/test_network_ops/test_upsample_nearest1d.py
@@ -1,90 +1,90 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import torch.nn.functional as F
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestUpsampleNearest1DBackward(TestCase):
-    def cpu_op_exec(self, input, size):
-        output = F.interpolate(input, size, mode="nearest")
-        return output.detach().numpy()
-
-    def cpu_op_scale_exec(self, input, scale):
-        output = F.interpolate(input, scale_factor = scale, mode="nearest")
-        return output.detach().numpy()
-
-    def npu_op_exec(self, input, size):
-        output = F.interpolate(input, size, mode="nearest")
-        output = output.cpu()
-        return output.detach().numpy()
-
-    def npu_op_scale_exec(self, input, scale):
-        output = F.interpolate(input, scale_factor = scale, mode="nearest")
-        output = output.cpu()
-        return output.detach().numpy()
-
-    def test_upsample_nearest1d_backward_shape_format(self, device):
-        test_cases = [
-            [[np.float16, 0, (1, 1, 2)], [4, ]],
-            [[np.float16, 0, (2, 1, 4)], [8, ]],
-            [[np.float32, 3, (2, 2, 3)], [1, ]],
-            [[np.float32, 0, (2, 1, 1)], [4, ]],
-            [[np.float32, 0, (4, 1, 2)], [4, ]],
-            [[np.float32, 0, (1, 1, 1)], [1, ]]
-        ]
-        for item in test_cases:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            size = list(item[0][2])
-            size[2] = item[1][0]
-
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input, item[1])
-            npu_output = self.npu_op_exec(npu_input, item[1])
-            cpu_output = cpu_output.astype(npu_output.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_upsample_nearest1d_backward_shape_format_scale(self, device):
-        test_cases = [
-            [[np.float16, 0, (1, 1, 2)], 2],
-            [[np.float16, 0, (2, 1, 4)], 2.2],
-            [[np.float32, 3, (2, 2, 3)], 0.4],
-            [[np.float32, 0, (2, 1, 1)], 4],
-            [[np.float32, 0, (4, 1, 2)], 2],
-            [[np.float32, 0, (1, 1, 1)], 1]
-        ]
-        for item in test_cases:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-
-            cpu_output = self.cpu_op_scale_exec(cpu_input, item[1])
-            npu_output = self.npu_op_scale_exec(npu_input, item[1])
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestUpsampleNearest1DBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
-
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestUpsampleNearest1DBackward(TestCase):
+    def cpu_op_exec(self, input, size):
+        output = F.interpolate(input, size, mode="nearest")
+        return output.detach().numpy()
+
+    def cpu_op_scale_exec(self, input, scale):
+        output = F.interpolate(input, scale_factor = scale, mode="nearest")
+        return output.detach().numpy()
+
+    def npu_op_exec(self, input, size):
+        output = F.interpolate(input, size, mode="nearest")
+        output = output.cpu()
+        return output.detach().numpy()
+
+    def npu_op_scale_exec(self, input, scale):
+        output = F.interpolate(input, scale_factor = scale, mode="nearest")
+        output = output.cpu()
+        return output.detach().numpy()
+
+    def test_upsample_nearest1d_backward_shape_format(self, device):
+        test_cases = [
+            [[np.float16, 0, (1, 1, 2)], [4, ]],
+            [[np.float16, 0, (2, 1, 4)], [8, ]],
+            [[np.float32, 3, (2, 2, 3)], [1, ]],
+            [[np.float32, 0, (2, 1, 1)], [4, ]],
+            [[np.float32, 0, (4, 1, 2)], [4, ]],
+            [[np.float32, 0, (1, 1, 1)], [1, ]]
+        ]
+        for item in test_cases:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            size = list(item[0][2])
+            size[2] = item[1][0]
+
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input, item[1])
+            npu_output = self.npu_op_exec(npu_input, item[1])
+            cpu_output = cpu_output.astype(npu_output.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_upsample_nearest1d_backward_shape_format_scale(self, device):
+        test_cases = [
+            [[np.float16, 0, (1, 1, 2)], 2],
+            [[np.float16, 0, (2, 1, 4)], 2.2],
+            [[np.float32, 3, (2, 2, 3)], 0.4],
+            [[np.float32, 0, (2, 1, 1)], 4],
+            [[np.float32, 0, (4, 1, 2)], 2],
+            [[np.float32, 0, (1, 1, 1)], 1]
+        ]
+        for item in test_cases:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+
+            cpu_output = self.cpu_op_scale_exec(cpu_input, item[1])
+            npu_output = self.npu_op_scale_exec(npu_input, item[1])
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestUpsampleNearest1DBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
+
diff --git a/test/test_npu/test_network_ops/test_upsample_nearest1d_backward.py b/test/test_npu/test_network_ops/test_upsample_nearest1d_backward.py
index 05f55517dadef4afa2a7ae0431d4e780a2e16387..0741f7dfcc6cc19915315e296b63bd1975dca38c 100644
--- a/test/test_npu/test_network_ops/test_upsample_nearest1d_backward.py
+++ b/test/test_npu/test_network_ops/test_upsample_nearest1d_backward.py
@@ -1,131 +1,131 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import math
-import numpy as np
-import torch.nn.functional as F
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestUpsampleNearest1DBackward(TestCase):
-    def cpu_op_exec(self, input, grads, size):
-        input.requires_grad_(True)
-        output = F.interpolate(input, size, mode="nearest")
-        output.backward(grads)
-        gradcpu = input.grad
-        return output.detach().numpy(), gradcpu.detach().numpy()
-
-    def cpu_op_scale_exec(self, input, grads, scale):
-        input.requires_grad_(True)
-        output = F.interpolate(input, scale_factor = scale, mode="nearest")
-        output.backward(grads)
-        gradcpu = input.grad
-        return output.detach().numpy(), gradcpu.detach().numpy()
-
-    def npu_op_exec(self, input, grads, size):
-        input.requires_grad_(True)
-        output = F.interpolate(input, size, mode="nearest")
-        output.backward(grads)  
-        gradnpu = input.grad
-        gradnpu = gradnpu.to("cpu")
-        output = output.to("cpu")
-        return output.detach().numpy(), gradnpu.detach().numpy()
-
-    def npu_op_scale_exec(self, input, grads, scale):
-        input.requires_grad_(True)
-        output = F.interpolate(input, scale_factor = scale, mode="nearest")
-        output.backward(grads)  
-        gradnpu = input.grad
-        gradnpu = gradnpu.to("cpu")
-        output = output.to("cpu")
-        return output.detach().numpy(), gradnpu.detach().numpy()
-
-    def test_upsample_nearest1d_backward_shape_format(self, device):
-        test_cases = [
-            [[np.float16, 0, (1, 1, 2)], [4, ]],
-            [[np.float16, 0, (2, 1, 4)], [8, ]],
-            [[np.float32, 3, (2, 2, 3)], [1, ]],
-            [[np.float32, 0, (2, 1, 1)], [4, ]],
-            [[np.float32, 0, (4, 1, 2)], [4, ]],
-            [[np.float32, 0, (1, 1, 1)], [1, ]]
-        ]
-        for item in test_cases:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-            size = list(item[0][2])
-            size[2] = item[1][0]
-
-            grad_item = []
-            grad_item.append(item[0][0])
-            grad_item.append(item[0][1])
-            grad_item.append(size)
-            cpu_grads, npu_grads = create_common_tensor(grad_item, 0, 100)
-
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-
-            if cpu_grads.dtype == torch.float16:
-                cpu_grads = cpu_grads.to(torch.float32)
-
-            cpu_output, cpu_grad = self.cpu_op_exec(cpu_input, cpu_grads, item[1])
-            npu_output, npu_grad = self.npu_op_exec(npu_input, npu_grads, item[1])
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_grad = cpu_grad.astype(npu_grad.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_grad, npu_grad)
-
-    def test_upsample_nearest1d_backward_shape_format_scale(self, device):
-        test_cases = [
-            [[np.float16, 0, (1, 1, 2)], 2],
-            [[np.float16, 0, (2, 1, 4)], 2.2],
-            [[np.float32, 3, (2, 2, 3)], 0.4],
-            [[np.float32, 0, (2, 1, 1)], 4],
-            [[np.float32, 0, (4, 1, 2)], 2],
-            [[np.float32, 0, (1, 1, 1)], 1]
-        ]
-        for item in test_cases:
-            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
-
-            size = list(item[0][2])
-            size[2] = item[1] * item[0][2][2]
-            size[2] = math.floor(size[2])
-
-            grad_item = []
-            grad_item.append(item[0][0])
-            grad_item.append(item[0][1])
-            grad_item.append(size)
-            cpu_grads, npu_grads = create_common_tensor(grad_item, 0, 100)
-
-            if cpu_input.dtype == torch.float16:
-                cpu_input = cpu_input.to(torch.float32)
-
-            if cpu_grads.dtype == torch.float16:
-                cpu_grads = cpu_grads.to(torch.float32)
-
-            cpu_output, cpu_grad = self.cpu_op_scale_exec(cpu_input, cpu_grads, item[1])
-            npu_output, npu_grad = self.npu_op_scale_exec(npu_input, npu_grads, item[1])
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            cpu_grad = cpu_grad.astype(npu_grad.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_grad, npu_grad)
-
-instantiate_device_type_tests(TestUpsampleNearest1DBackward, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import math
+import numpy as np
+import torch.nn.functional as F
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestUpsampleNearest1DBackward(TestCase):
+    def cpu_op_exec(self, input, grads, size):
+        input.requires_grad_(True)
+        output = F.interpolate(input, size, mode="nearest")
+        output.backward(grads)
+        gradcpu = input.grad
+        return output.detach().numpy(), gradcpu.detach().numpy()
+
+    def cpu_op_scale_exec(self, input, grads, scale):
+        input.requires_grad_(True)
+        output = F.interpolate(input, scale_factor = scale, mode="nearest")
+        output.backward(grads)
+        gradcpu = input.grad
+        return output.detach().numpy(), gradcpu.detach().numpy()
+
+    def npu_op_exec(self, input, grads, size):
+        input.requires_grad_(True)
+        output = F.interpolate(input, size, mode="nearest")
+        output.backward(grads)  
+        gradnpu = input.grad
+        gradnpu = gradnpu.to("cpu")
+        output = output.to("cpu")
+        return output.detach().numpy(), gradnpu.detach().numpy()
+
+    def npu_op_scale_exec(self, input, grads, scale):
+        input.requires_grad_(True)
+        output = F.interpolate(input, scale_factor = scale, mode="nearest")
+        output.backward(grads)  
+        gradnpu = input.grad
+        gradnpu = gradnpu.to("cpu")
+        output = output.to("cpu")
+        return output.detach().numpy(), gradnpu.detach().numpy()
+
+    def test_upsample_nearest1d_backward_shape_format(self, device):
+        test_cases = [
+            [[np.float16, 0, (1, 1, 2)], [4, ]],
+            [[np.float16, 0, (2, 1, 4)], [8, ]],
+            [[np.float32, 3, (2, 2, 3)], [1, ]],
+            [[np.float32, 0, (2, 1, 1)], [4, ]],
+            [[np.float32, 0, (4, 1, 2)], [4, ]],
+            [[np.float32, 0, (1, 1, 1)], [1, ]]
+        ]
+        for item in test_cases:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+            size = list(item[0][2])
+            size[2] = item[1][0]
+
+            grad_item = []
+            grad_item.append(item[0][0])
+            grad_item.append(item[0][1])
+            grad_item.append(size)
+            cpu_grads, npu_grads = create_common_tensor(grad_item, 0, 100)
+
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+
+            if cpu_grads.dtype == torch.float16:
+                cpu_grads = cpu_grads.to(torch.float32)
+
+            cpu_output, cpu_grad = self.cpu_op_exec(cpu_input, cpu_grads, item[1])
+            npu_output, npu_grad = self.npu_op_exec(npu_input, npu_grads, item[1])
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_grad = cpu_grad.astype(npu_grad.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_grad, npu_grad)
+
+    def test_upsample_nearest1d_backward_shape_format_scale(self, device):
+        test_cases = [
+            [[np.float16, 0, (1, 1, 2)], 2],
+            [[np.float16, 0, (2, 1, 4)], 2.2],
+            [[np.float32, 3, (2, 2, 3)], 0.4],
+            [[np.float32, 0, (2, 1, 1)], 4],
+            [[np.float32, 0, (4, 1, 2)], 2],
+            [[np.float32, 0, (1, 1, 1)], 1]
+        ]
+        for item in test_cases:
+            cpu_input, npu_input = create_common_tensor(item[0], 0, 100)
+
+            size = list(item[0][2])
+            size[2] = item[1] * item[0][2][2]
+            size[2] = math.floor(size[2])
+
+            grad_item = []
+            grad_item.append(item[0][0])
+            grad_item.append(item[0][1])
+            grad_item.append(size)
+            cpu_grads, npu_grads = create_common_tensor(grad_item, 0, 100)
+
+            if cpu_input.dtype == torch.float16:
+                cpu_input = cpu_input.to(torch.float32)
+
+            if cpu_grads.dtype == torch.float16:
+                cpu_grads = cpu_grads.to(torch.float32)
+
+            cpu_output, cpu_grad = self.cpu_op_scale_exec(cpu_input, cpu_grads, item[1])
+            npu_output, npu_grad = self.npu_op_scale_exec(npu_input, npu_grads, item[1])
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            cpu_grad = cpu_grad.astype(npu_grad.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_grad, npu_grad)
+
+instantiate_device_type_tests(TestUpsampleNearest1DBackward, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_where.py b/test/test_npu/test_network_ops/test_where.py
old mode 100644
new mode 100755
index 971735f457b6ef16a2747cdd58bbc80227da5708..e9c21dadc9c8f1fd1bf25001f8d4f152d8288231
--- a/test/test_npu/test_network_ops/test_where.py
+++ b/test/test_npu/test_network_ops/test_where.py
@@ -1,128 +1,128 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestWhere(TestCase):
-    def cpu_op_exec(self, input1):
-        output = torch.where(input1)
-        output = list(output)
-        for i in range(len(output)):
-            output[i] = output[i].numpy().astype(np.int32)
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.where(input1)
-        output = list(output)
-        for i in range(len(output)):
-            output[i] = output[i].to("cpu").numpy().astype(np.int32)
-        return output
-
-    def cpu_op_exec_condition(self, input1, ones):
-        output = torch.where(input1 > 0, input1, ones)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_condition(self, input1, ones):
-        output = torch.where(input1 > 0, input1, ones)
-        output = output.to("cpu").numpy()
-        return output
-
-    def cpu_op_exec_s(self, input1, ones):
-        output = torch._s_where(input1 > 0, input1, ones)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_s(self, input1, ones):
-        output = torch._s_where(input1 > 0, input1, ones)
-        output = output.to("cpu").numpy()
-        return output
-
-    def where_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
-            cpu_ones = torch.ones_like(cpu_input1)
-            npu_ones = cpu_ones.to("npu")
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-                cpu_ones = cpu_ones.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-
-            cpu_output_cond = self.cpu_op_exec_condition(cpu_input1, cpu_ones)
-            npu_output_cond = self.npu_op_exec_condition(npu_input1, npu_ones)
-            cpu_output_cond = cpu_output_cond.astype(npu_output_cond.dtype)
-
-            cpu_output_s = self.cpu_op_exec_s(cpu_input1, cpu_ones)
-            npu_output_s = self.npu_op_exec_s(npu_input1, npu_ones)
-            cpu_output_s = cpu_output_s.astype(npu_output_s.dtype)
-
-            for i in range(len(cpu_output)):
-                cpu_output[i] = cpu_output[i].astype(npu_output[i].dtype)
-                self.assertRtolEqual(cpu_output[i], npu_output[i])
-
-            self.assertRtolEqual(cpu_output_cond, npu_output_cond)
-            self.assertRtolEqual(cpu_output_s, npu_output_s)
-
-    def test_where_shape_format_fp32_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float32, i, [18]] for i in format_list]
-        self.where_result(shape_format)
-
-    def test_where_shape_format_fp32_2d(self, device):
-        format_list = [0]
-        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
-        self.where_result(shape_format)
-
-    def test_where_shape_format_fp32_3d(self, device):
-        format_list = [0]
-        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
-        self.where_result(shape_format)
-
-    def test_where_shape_format_fp32_4d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
-        self.where_result(shape_format)
-
-    def test_where_shape_format_fp16_1d(self, device):
-        format_list = [0, 3]
-        shape_format = [[np.float16, i, [18]] for i in format_list]
-        self.where_result(shape_format)
-
-    def test_where_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
-        self.where_result(shape_format)
-        
-    def test_where_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
-        self.where_result(shape_format)
-        
-    def test_where_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 4, 29]
-        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
-        self.where_result(shape_format)
-
-
-
-instantiate_device_type_tests(TestWhere, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestWhere(TestCase):
+    def cpu_op_exec(self, input1):
+        output = torch.where(input1)
+        output = list(output)
+        for i in range(len(output)):
+            output[i] = output[i].numpy().astype(np.int32)
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.where(input1)
+        output = list(output)
+        for i in range(len(output)):
+            output[i] = output[i].to("cpu").numpy().astype(np.int32)
+        return output
+
+    def cpu_op_exec_condition(self, input1, ones):
+        output = torch.where(input1 > 0, input1, ones)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_condition(self, input1, ones):
+        output = torch.where(input1 > 0, input1, ones)
+        output = output.to("cpu").numpy()
+        return output
+
+    def cpu_op_exec_s(self, input1, ones):
+        output = torch._s_where(input1 > 0, input1, ones)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_s(self, input1, ones):
+        output = torch._s_where(input1 > 0, input1, ones)
+        output = output.to("cpu").numpy()
+        return output
+
+    def where_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, -100, 100)
+            cpu_ones = torch.ones_like(cpu_input1)
+            npu_ones = cpu_ones.to("npu")
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+                cpu_ones = cpu_ones.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+
+            cpu_output_cond = self.cpu_op_exec_condition(cpu_input1, cpu_ones)
+            npu_output_cond = self.npu_op_exec_condition(npu_input1, npu_ones)
+            cpu_output_cond = cpu_output_cond.astype(npu_output_cond.dtype)
+
+            cpu_output_s = self.cpu_op_exec_s(cpu_input1, cpu_ones)
+            npu_output_s = self.npu_op_exec_s(npu_input1, npu_ones)
+            cpu_output_s = cpu_output_s.astype(npu_output_s.dtype)
+
+            for i in range(len(cpu_output)):
+                cpu_output[i] = cpu_output[i].astype(npu_output[i].dtype)
+                self.assertRtolEqual(cpu_output[i], npu_output[i])
+
+            self.assertRtolEqual(cpu_output_cond, npu_output_cond)
+            self.assertRtolEqual(cpu_output_s, npu_output_s)
+
+    def test_where_shape_format_fp32_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float32, i, [18]] for i in format_list]
+        self.where_result(shape_format)
+
+    def test_where_shape_format_fp32_2d(self, device):
+        format_list = [0]
+        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
+        self.where_result(shape_format)
+
+    def test_where_shape_format_fp32_3d(self, device):
+        format_list = [0]
+        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
+        self.where_result(shape_format)
+
+    def test_where_shape_format_fp32_4d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
+        self.where_result(shape_format)
+
+    def test_where_shape_format_fp16_1d(self, device):
+        format_list = [0, 3]
+        shape_format = [[np.float16, i, [18]] for i in format_list]
+        self.where_result(shape_format)
+
+    def test_where_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
+        self.where_result(shape_format)
+        
+    def test_where_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
+        self.where_result(shape_format)
+        
+    def test_where_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 4, 29]
+        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
+        self.where_result(shape_format)
+
+
+
+instantiate_device_type_tests(TestWhere, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_xor.py b/test/test_npu/test_network_ops/test_xor.py
index 438f0a486321d2efb26b0147a36ee34e74295e5e..aea632524b7d1b63b60069895f35507b4ab74517 100644
--- a/test/test_npu/test_network_ops/test_xor.py
+++ b/test/test_npu/test_network_ops/test_xor.py
@@ -1,174 +1,174 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestXor(TestCase):
-
-    def generate_bool_data(self, shape):
-        input1 = np.random.uniform(0, 1, shape)
-        input2 = np.random.uniform(0, 1, shape)
-        input1 = input1.reshape(-1)
-        input2 = input2.reshape(-1)
-        for i in range(len(input1)):
-            if input1[i] < 0.5:
-                input1[i] = 0
-        for i in range(len(input2)):
-            if input2[i] < 0.5:
-                input2[i] = 0
-        input1 = input1.astype(np.bool)
-        input2 = input2.astype(np.bool)
-        input1 = input1.reshape(shape)
-        input2 = input2.reshape(shape)
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-
-        return npu_input1, npu_input2
-
-    def generate_single_bool_data(self, shape):
-        input1 = np.random.uniform(0, 1, shape)
-        input1 = input1.reshape(-1)
-        for i in range(len(input1)):
-            if input1[i] < 0.5:
-                input1[i] = 0
-        input1 = input1.astype(np.bool)
-        input1 = input1.reshape(shape)
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-
-        return npu_input1, npu_input2
-
-    def generate_single_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-
-        return npu_input1
-
-    def cpu_op_exec(self, input1, input2):
-        output = input1 ^ input2
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = input1.__xor__(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_scalar(self, input1, input2):
-        input1 = input1.to("npu")
-        output = input1.__xor__(input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_xor_tensor_int32(self, device):
-        npu_input1 = self.generate_single_data(0, 100, (10, 10), np.int32)
-        npu_input2 = self.generate_single_data(0, 100, (10, 10), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_tensor_int16(self, device):
-        npu_input1 = self.generate_single_data(0, 100, (10, 10), np.int16)
-        npu_input2 = self.generate_single_data(0, 100, (10, 10), np.int16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_tensor_int8(self, device):
-        npu_input1 = self.generate_single_data(0, 100, (10, 10), np.int8)
-        npu_input2 = self.generate_single_data(0, 100, (10, 10), np.int8)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_scalar_int32(self, device):
-        npu_input = self.generate_single_data(0, 100, (1, 10), np.int32)
-        npu_input_scalr = np.random.randint(0, 100)
-        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
-        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_scalar_int16(self, device):
-        npu_input = self.generate_single_data(0, 100, (10, 20), np.int16)
-        npu_input_scalr = np.random.randint(0, 100)
-        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
-        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_scalar_int8(self, device):
-        npu_input = self.generate_single_data(0, 100, (20, 10), np.int8)
-        npu_input_scalr = np.random.randint(0, 100)
-        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
-        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_tensor_uint8(self, device):
-        npu_input1 = self.generate_single_data(0, 100, (10, 10), np.uint8)
-        npu_input2 = self.generate_single_data(0, 100, (10, 10), np.uint8)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_scalar_uint8(self, device):
-        npu_input = self.generate_single_data(0, 100, (5, 10), np.uint8)
-        npu_input_scalr = np.random.randint(0, 100)
-        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
-        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_scalar_bool1(self, device):
-        npu_input = self.generate_single_bool_data((10, 10))
-        npu_input_scalr = True
-        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
-        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_scalar_bool2(self, device):
-        npu_input = self.generate_single_bool_data((10, 10))
-        npu_input_scalr = False
-        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
-        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
-        self.assertEqual(cpu_output, npu_output)
-
-    def test_xor_tensor_bool(self, device):
-        npu_input1, npu_input2 = self.generate_bool_data((10, 10))
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestXor, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestXor(TestCase):
+
+    def generate_bool_data(self, shape):
+        input1 = np.random.uniform(0, 1, shape)
+        input2 = np.random.uniform(0, 1, shape)
+        input1 = input1.reshape(-1)
+        input2 = input2.reshape(-1)
+        for i in range(len(input1)):
+            if input1[i] < 0.5:
+                input1[i] = 0
+        for i in range(len(input2)):
+            if input2[i] < 0.5:
+                input2[i] = 0
+        input1 = input1.astype(np.bool)
+        input2 = input2.astype(np.bool)
+        input1 = input1.reshape(shape)
+        input2 = input2.reshape(shape)
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+
+        return npu_input1, npu_input2
+
+    def generate_single_bool_data(self, shape):
+        input1 = np.random.uniform(0, 1, shape)
+        input1 = input1.reshape(-1)
+        for i in range(len(input1)):
+            if input1[i] < 0.5:
+                input1[i] = 0
+        input1 = input1.astype(np.bool)
+        input1 = input1.reshape(shape)
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+
+        return npu_input1, npu_input2
+
+    def generate_single_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+
+        return npu_input1
+
+    def cpu_op_exec(self, input1, input2):
+        output = input1 ^ input2
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = input1.__xor__(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_scalar(self, input1, input2):
+        input1 = input1.to("npu")
+        output = input1.__xor__(input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_xor_tensor_int32(self, device):
+        npu_input1 = self.generate_single_data(0, 100, (10, 10), np.int32)
+        npu_input2 = self.generate_single_data(0, 100, (10, 10), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_tensor_int16(self, device):
+        npu_input1 = self.generate_single_data(0, 100, (10, 10), np.int16)
+        npu_input2 = self.generate_single_data(0, 100, (10, 10), np.int16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_tensor_int8(self, device):
+        npu_input1 = self.generate_single_data(0, 100, (10, 10), np.int8)
+        npu_input2 = self.generate_single_data(0, 100, (10, 10), np.int8)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_scalar_int32(self, device):
+        npu_input = self.generate_single_data(0, 100, (1, 10), np.int32)
+        npu_input_scalr = np.random.randint(0, 100)
+        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
+        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_scalar_int16(self, device):
+        npu_input = self.generate_single_data(0, 100, (10, 20), np.int16)
+        npu_input_scalr = np.random.randint(0, 100)
+        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
+        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_scalar_int8(self, device):
+        npu_input = self.generate_single_data(0, 100, (20, 10), np.int8)
+        npu_input_scalr = np.random.randint(0, 100)
+        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
+        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_tensor_uint8(self, device):
+        npu_input1 = self.generate_single_data(0, 100, (10, 10), np.uint8)
+        npu_input2 = self.generate_single_data(0, 100, (10, 10), np.uint8)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_scalar_uint8(self, device):
+        npu_input = self.generate_single_data(0, 100, (5, 10), np.uint8)
+        npu_input_scalr = np.random.randint(0, 100)
+        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
+        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_scalar_bool1(self, device):
+        npu_input = self.generate_single_bool_data((10, 10))
+        npu_input_scalr = True
+        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
+        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_scalar_bool2(self, device):
+        npu_input = self.generate_single_bool_data((10, 10))
+        npu_input_scalr = False
+        cpu_output = self.cpu_op_exec(npu_input, npu_input_scalr)
+        npu_output = self.npu_op_exec_scalar(npu_input, npu_input_scalr)
+        self.assertEqual(cpu_output, npu_output)
+
+    def test_xor_tensor_bool(self, device):
+        npu_input1, npu_input2 = self.generate_bool_data((10, 10))
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestXor, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_yolo_boxes_encode.py b/test/test_npu/test_network_ops/test_yolo_boxes_encode.py
index 57a0a7b30e40b0704b79fffcee376465eed89aca..71dce425b6c8258c817056810bf6a521797308c7 100644
--- a/test/test_npu/test_network_ops/test_yolo_boxes_encode.py
+++ b/test/test_npu/test_network_ops/test_yolo_boxes_encode.py
@@ -1,39 +1,39 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import numpy as np
-import copy
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-class TestYoloBoxesEncode(TestCase):
-    def npu_op_exec(self, anchor_boxes, gt_bboxes, stride, impl_mode=False):
-        out = torch.npu_yolo_boxes_encode(anchor_boxes, gt_bboxes, stride, impl_mode)
-        out = out.to("cpu")
-        return out.detach().numpy()
-        
-    def test_yolo_boxes_encode(self, device):
-        anchor_boxes = torch.rand((2, 4), dtype=torch.float32).to("npu")
-        gt_bboxes = torch.rand((2, 4), dtype=torch.float32).to("npu")
-        stride = torch.tensor([2, 2], dtype=torch.int32).to("npu")
-        expect_cpu = torch.tensor([[0.7921727, 0.5314963, -0.74224466, -13.815511],
-                                   [0.7360072, 0.58343244, 4.3334002, -0.51378196]], dtype=torch.float32)
-        npu_output = self.npu_op_exec(anchor_boxes, gt_bboxes, stride, False)
-        self.assertRtolEqual(expect_cpu.numpy(), npu_output)
-
-
-instantiate_device_type_tests(TestYoloBoxesEncode, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import numpy as np
+import copy
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+class TestYoloBoxesEncode(TestCase):
+    def npu_op_exec(self, anchor_boxes, gt_bboxes, stride, impl_mode=False):
+        out = torch.npu_yolo_boxes_encode(anchor_boxes, gt_bboxes, stride, impl_mode)
+        out = out.to("cpu")
+        return out.detach().numpy()
+        
+    def test_yolo_boxes_encode(self, device):
+        anchor_boxes = torch.rand((2, 4), dtype=torch.float32).to("npu")
+        gt_bboxes = torch.rand((2, 4), dtype=torch.float32).to("npu")
+        stride = torch.tensor([2, 2], dtype=torch.int32).to("npu")
+        expect_cpu = torch.tensor([[0.7921727, 0.5314963, -0.74224466, -13.815511],
+                                   [0.7360072, 0.58343244, 4.3334002, -0.51378196]], dtype=torch.float32)
+        npu_output = self.npu_op_exec(anchor_boxes, gt_bboxes, stride, False)
+        self.assertRtolEqual(expect_cpu.numpy(), npu_output)
+
+
+instantiate_device_type_tests(TestYoloBoxesEncode, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_zero.py b/test/test_npu/test_network_ops/test_zero.py
old mode 100644
new mode 100755
index 3326b938aabb77dde1df548fc404822bc98c04ef..0781c411d300cd5ff1a5928cf5edc20b51d7608c
--- a/test/test_npu/test_network_ops/test_zero.py
+++ b/test/test_npu/test_network_ops/test_zero.py
@@ -1,109 +1,109 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestZero(TestCase):
-    def cpu_op_exec(self, input1):
-        torch.zero_(input1)
-        output = input1.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        torch.zero_(input1)
-        output = input1.to("cpu")
-        output = output.numpy()
-        return output
-
-    def zero_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input1)
-            npu_output = self.npu_op_exec(npu_input1)
-
-            cpu_output = cpu_output.astype(npu_output.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_zero_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [18]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [18]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_int32_1d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.int32, i, [18]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_int32_2d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.int32, i, [5, 256]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_int32_3d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.int32, i, [32, 3, 3]] for i in format_list]
-        self.zero_result(shape_format)
-
-    def test_zero_shape_format_int32_4d(self, device):
-        format_list = [-1, 0]
-        shape_format = [[np.int32, i, [64, 112, 7, 7]] for i in format_list]
-        self.zero_result(shape_format)
-
-
-instantiate_device_type_tests(TestZero, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestZero(TestCase):
+    def cpu_op_exec(self, input1):
+        torch.zero_(input1)
+        output = input1.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        torch.zero_(input1)
+        output = input1.to("cpu")
+        output = output.numpy()
+        return output
+
+    def zero_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item, 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input1)
+            npu_output = self.npu_op_exec(npu_input1)
+
+            cpu_output = cpu_output.astype(npu_output.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_zero_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [18]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [5, 256]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [32, 3, 3]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float16, i, [64, 112, 7, 7]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [18]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [5, 256]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [32, 3, 3]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        shape_format = [[np.float32, i, [64, 112, 7, 7]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_int32_1d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.int32, i, [18]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_int32_2d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.int32, i, [5, 256]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_int32_3d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.int32, i, [32, 3, 3]] for i in format_list]
+        self.zero_result(shape_format)
+
+    def test_zero_shape_format_int32_4d(self, device):
+        format_list = [-1, 0]
+        shape_format = [[np.int32, i, [64, 112, 7, 7]] for i in format_list]
+        self.zero_result(shape_format)
+
+
+instantiate_device_type_tests(TestZero, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_zeros.py b/test/test_npu/test_network_ops/test_zeros.py
old mode 100644
new mode 100755
index 66cdbf95d0494fa365dec3bf513c8cd3a7a88e35..bab4366186c6abb37b992914ea5cac04c259d158
--- a/test/test_npu/test_network_ops/test_zeros.py
+++ b/test/test_npu/test_network_ops/test_zeros.py
@@ -1,151 +1,151 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import torch
-import torch.nn as nn
-import numpy as np
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestZeros(TestCase):
-    def cpu_op_exec(self, input1, dtype):
-        output = torch.zeros(input1.size(), dtype=dtype, device="cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, dtype):
-        output = torch.zeros(input1.size(), dtype=dtype, device="npu")
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_out(self, input1, input2, dtype):
-        torch.zeros(input1.size(), dtype=dtype, device="npu", out=input2)
-        output = input2.to("cpu")
-        output = output.numpy()
-        return output
-
-    def zeros_result(self, shape_format):
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            npu_input2 = copy.deepcopy(cpu_input1)
-            npu_input2 = npu_input2.to(item[1]).to('npu')
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, item[1])
-            cpu_output = cpu_output.astype(npu_output.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-            self.assertRtolEqual(cpu_output, npu_output_out)
-
-    def test_zeros_shape_format_names(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.float32, i, [18, 24, 8, 8]], j] for i in format_list for j in dtype_list]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
-            if cpu_input1.dtype == torch.float16:
-                cpu_input1 = cpu_input1.to(torch.float32)
-
-            cpu_output = torch.zeros(cpu_input1.size(), names=('N', 'C', 'H', 'W'), dtype=item[1], device="cpu")
-            cpu_output = cpu_output.numpy()
-            npu_output = torch.zeros(cpu_input1.size(), names=('N', 'C', 'H', 'W'), dtype=item[1], device="npu")
-            npu_output = npu_output.to("cpu")
-            npu_output = npu_output.numpy()
-            cpu_output = cpu_output.astype(npu_output.dtype)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_zeros_shape_format_fp16_1d(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.float16, i, [18]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_fp16_2d(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.float16, i, [5, 256]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_fp16_3d(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.float16, i, [32, 3, 3]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_fp16_4d(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.float16, i, [64, 112, 7, 7]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_fp32_1d(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.float32, i, [18]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_fp32_2d(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.float32, i, [5, 256]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_fp32_3d(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.float32, i, [32, 3, 3]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_fp32_4d(self, device):
-        format_list = [0, 3, 29]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.float32, i, [64, 112, 7, 7]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_int32_1d(self, device):
-        format_list = [-1, 0]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.int32, i, [18]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_int32_2d(self, device):
-        format_list = [-1, 0]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.int32, i, [5, 256]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_int32_3d(self, device):
-        format_list = [-1, 0]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.int32, i, [32, 3, 3]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-    def test_zeros_shape_format_int32_4d(self, device):
-        format_list = [-1, 0]
-        dtype_list = [torch.float16, torch.float32, torch.int32]
-        shape_format = [[[np.int32, i, [64, 112, 7, 7]], j] for i in format_list for j in dtype_list]
-        self.zeros_result(shape_format)
-
-
-instantiate_device_type_tests(TestZeros, globals(), except_for="cpu")
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import numpy as np
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestZeros(TestCase):
+    def cpu_op_exec(self, input1, dtype):
+        output = torch.zeros(input1.size(), dtype=dtype, device="cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, dtype):
+        output = torch.zeros(input1.size(), dtype=dtype, device="npu")
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_out(self, input1, input2, dtype):
+        torch.zeros(input1.size(), dtype=dtype, device="npu", out=input2)
+        output = input2.to("cpu")
+        output = output.numpy()
+        return output
+
+    def zeros_result(self, shape_format):
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            npu_input2 = copy.deepcopy(cpu_input1)
+            npu_input2 = npu_input2.to(item[1]).to('npu')
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
+            npu_output = self.npu_op_exec(npu_input1, item[1])
+            npu_output_out = self.npu_op_exec_out(npu_input1, npu_input2, item[1])
+            cpu_output = cpu_output.astype(npu_output.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+            self.assertRtolEqual(cpu_output, npu_output_out)
+
+    def test_zeros_shape_format_names(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.float32, i, [18, 24, 8, 8]], j] for i in format_list for j in dtype_list]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 0, 100)
+            if cpu_input1.dtype == torch.float16:
+                cpu_input1 = cpu_input1.to(torch.float32)
+
+            cpu_output = torch.zeros(cpu_input1.size(), names=('N', 'C', 'H', 'W'), dtype=item[1], device="cpu")
+            cpu_output = cpu_output.numpy()
+            npu_output = torch.zeros(cpu_input1.size(), names=('N', 'C', 'H', 'W'), dtype=item[1], device="npu")
+            npu_output = npu_output.to("cpu")
+            npu_output = npu_output.numpy()
+            cpu_output = cpu_output.astype(npu_output.dtype)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_zeros_shape_format_fp16_1d(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.float16, i, [18]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_fp16_2d(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.float16, i, [5, 256]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_fp16_3d(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.float16, i, [32, 3, 3]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_fp16_4d(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.float16, i, [64, 112, 7, 7]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_fp32_1d(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.float32, i, [18]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_fp32_2d(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.float32, i, [5, 256]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_fp32_3d(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.float32, i, [32, 3, 3]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_fp32_4d(self, device):
+        format_list = [0, 3, 29]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.float32, i, [64, 112, 7, 7]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_int32_1d(self, device):
+        format_list = [-1, 0]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.int32, i, [18]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_int32_2d(self, device):
+        format_list = [-1, 0]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.int32, i, [5, 256]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_int32_3d(self, device):
+        format_list = [-1, 0]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.int32, i, [32, 3, 3]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+    def test_zeros_shape_format_int32_4d(self, device):
+        format_list = [-1, 0]
+        dtype_list = [torch.float16, torch.float32, torch.int32]
+        shape_format = [[[np.int32, i, [64, 112, 7, 7]], j] for i in format_list for j in dtype_list]
+        self.zeros_result(shape_format)
+
+
+instantiate_device_type_tests(TestZeros, globals(), except_for="cpu")
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/test_zeroslike.py b/test/test_npu/test_network_ops/test_zeroslike.py
old mode 100644
new mode 100755
index 30a7b2a0777c65700b1316bef432e11833d0ab34..3dcd1c3c826d4a24cd696cd8e50f64133eb3b310
--- a/test/test_npu/test_network_ops/test_zeroslike.py
+++ b/test/test_npu/test_network_ops/test_zeroslike.py
@@ -1,65 +1,65 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestZerosLike(TestCase):
-    def cpu_op_exec(self, input1, dtype):
-        output = torch.zeros_like(input1, dtype=dtype)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, dtype):
-        output = torch.zeros_like(input1, dtype=dtype)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_zeroslike_fp32(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)]
-        shape_format = [
-            [np.float32, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_output = self.cpu_op_exec(cpu_input, torch.float32)
-            npu_output = self.npu_op_exec(npu_input, torch.float32)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_zeroslike_fp16(self, device):
-        format_list = [0, 3, 29]
-        shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)]
-        shape_format = [
-            [np.float16, i, j] for i in format_list for j in shape_list
-        ]
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 0, 100)
-            cpu_input = cpu_input.to(torch.float32)
-            cpu_output = self.cpu_op_exec(cpu_input, torch.float16)
-            npu_output = self.npu_op_exec(npu_input, torch.float16)
-            cpu_output = cpu_output.astype(np.float16)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestZerosLike, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestZerosLike(TestCase):
+    def cpu_op_exec(self, input1, dtype):
+        output = torch.zeros_like(input1, dtype=dtype)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, dtype):
+        output = torch.zeros_like(input1, dtype=dtype)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_zeroslike_fp32(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)]
+        shape_format = [
+            [np.float32, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_output = self.cpu_op_exec(cpu_input, torch.float32)
+            npu_output = self.npu_op_exec(npu_input, torch.float32)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_zeroslike_fp16(self, device):
+        format_list = [0, 3, 29]
+        shape_list = [1, (1000, 1280), (32, 3, 3), (32, 144, 1, 1)]
+        shape_format = [
+            [np.float16, i, j] for i in format_list for j in shape_list
+        ]
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 0, 100)
+            cpu_input = cpu_input.to(torch.float32)
+            cpu_output = self.cpu_op_exec(cpu_input, torch.float16)
+            npu_output = self.npu_op_exec(npu_input, torch.float16)
+            cpu_output = cpu_output.astype(np.float16)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestZerosLike, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_network_ops/util_test.py b/test/test_npu/test_network_ops/util_test.py
old mode 100644
new mode 100755
index fd0682982f94a94558efc1d331217e5467978aa3..e60fe1997e3affac44a6540a97d62acc79d95c98
--- a/test/test_npu/test_network_ops/util_test.py
+++ b/test/test_npu/test_network_ops/util_test.py
@@ -1,22 +1,22 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION.
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-common_path = os.path.dirname("../common/")
-if common_path not in sys.path:
-    sys.path.append(common_path)
-from util_test_new import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
-
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION.
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+common_path = os.path.dirname("../common/")
+if common_path not in sys.path:
+    sys.path.append(common_path)
+from util_test_new import create_common_tensor, test_2args_broadcast, create_dtype_tensor, UT_FAST_MODE
+
diff --git a/test/test_npu/test_ones_like.py b/test/test_npu/test_ones_like.py
index 512b0ff03116ecbbfa2490a86b31c8bef5a16fbc..6b075e0fcee26aa11514964ae5ae4c3b39b101a8 100644
--- a/test/test_npu/test_ones_like.py
+++ b/test/test_npu/test_ones_like.py
@@ -1,81 +1,81 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from util_test import create_common_tensor
-
-class TestOnesLike(TestCase):
-
-    def cpu_op_exec(self, input1):
-        output = torch.ones_like(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        output = torch.ones_like(input1)
-        output = output.to('cpu')
-        output = output.numpy()
-        return output
-
-    def test_ones_like_shape_format(self, device):
-        shape_format = [
-            [np.float32, -1, (3, )],
-            [np.float32, -1, (2, 4)],
-            [np.float32, -1, (3, 6, 9)],
-            [np.int8, -1, (3,)],
-            [np.int8, -1, (2, 4)],
-            [np.int8, -1, (3, 6, 9)],
-            [np.int32, -1, (3,)],
-            [np.int32, -1, (2, 4)],
-            [np.int32, -1, (3, 6, 9)],
-            [np.uint8, -1, (3,)],
-            [np.uint8, -1, (2, 4)],
-            [np.uint8, -1, (3, 6, 9)]
-        ]
-
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 100)
-
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-
-
-    def test_ones_like_float16_shape_format(self, device):
-        shape_format = [
-            [np.float16, -1, (3, )],
-            [np.float16, -1, (2, 4)],
-            [np.float16, -1, (3, 6, 9)],
-            [np.float16, -1, (3, 4, 5, 12)]
-        ]
-
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item, 1, 100)
-
-            cpu_input = cpu_input.to(torch.float32)
-
-            cpu_output = self.cpu_op_exec(cpu_input)
-            npu_output = self.npu_op_exec(npu_input)
-
-            cpu_output = cpu_output.astype(np.float16)
-
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestOnesLike, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from util_test import create_common_tensor
+
+class TestOnesLike(TestCase):
+
+    def cpu_op_exec(self, input1):
+        output = torch.ones_like(input1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = torch.ones_like(input1)
+        output = output.to('cpu')
+        output = output.numpy()
+        return output
+
+    def test_ones_like_shape_format(self, device):
+        shape_format = [
+            [np.float32, -1, (3, )],
+            [np.float32, -1, (2, 4)],
+            [np.float32, -1, (3, 6, 9)],
+            [np.int8, -1, (3,)],
+            [np.int8, -1, (2, 4)],
+            [np.int8, -1, (3, 6, 9)],
+            [np.int32, -1, (3,)],
+            [np.int32, -1, (2, 4)],
+            [np.int32, -1, (3, 6, 9)],
+            [np.uint8, -1, (3,)],
+            [np.uint8, -1, (2, 4)],
+            [np.uint8, -1, (3, 6, 9)]
+        ]
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 100)
+
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+
+
+    def test_ones_like_float16_shape_format(self, device):
+        shape_format = [
+            [np.float16, -1, (3, )],
+            [np.float16, -1, (2, 4)],
+            [np.float16, -1, (3, 6, 9)],
+            [np.float16, -1, (3, 4, 5, 12)]
+        ]
+
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item, 1, 100)
+
+            cpu_input = cpu_input.to(torch.float32)
+
+            cpu_output = self.cpu_op_exec(cpu_input)
+            npu_output = self.npu_op_exec(npu_input)
+
+            cpu_output = cpu_output.astype(np.float16)
+
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestOnesLike, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:5")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_onnx/torch.onnx/eval/onnx/cp_onnx_eval.py b/test/test_npu/test_onnx/torch.onnx/eval/onnx/cp_onnx_eval.py
old mode 100644
new mode 100755
diff --git a/test/test_npu/test_onnx/torch.onnx/eval/onnxrt/onnxrt_eval.py b/test/test_npu/test_onnx/torch.onnx/eval/onnxrt/onnxrt_eval.py
old mode 100644
new mode 100755
diff --git a/test/test_npu/test_onnx/torch.onnx/export/cp_parser.py b/test/test_npu/test_onnx/torch.onnx/export/cp_parser.py
old mode 100644
new mode 100755
diff --git a/test/test_npu/test_onnx/torch.onnx/export/export_onnx.py b/test/test_npu/test_onnx/torch.onnx/export/export_onnx.py
old mode 100644
new mode 100755
index f7d79c7d9ee2293a7ffb1d260d3e444f2d2708b4..4c3a0e64506aba2d06dfc0ff355d84ef417f7376
--- a/test/test_npu/test_onnx/torch.onnx/export/export_onnx.py
+++ b/test/test_npu/test_onnx/torch.onnx/export/export_onnx.py
@@ -1,90 +1,90 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torchvision
-from export.cp_parser import *
-
-def getDeviceStr(deviceStr, DeviceNo):
-    #print("cp_getDeviceId test device : ","(", deviceStr,"  ", DeviceNo, ")")
-    if DeviceNo == None:
-        return deviceStr
-    if deviceStr == 'cpu':
-        return deviceStr
-    elif deviceStr == 'npu' or deviceStr == 'cuda':
-        loc = '{}:{}'.format(deviceStr, DeviceNo)
-        return loc
-    else: 
-        return deviceStr
-
-
-def cp2onnx(model,cpfile,onnxfile, input_data, ispth=False,device="cpu",dno=None):
-    if os.path.isfile(cpfile):
-        #model = torchvision.models.resnet50(pretrained=False)
-        model = cp_load(model,cpfile,ispth=ispth,device=device,dno=dno)
-    else :
-        print("warning : \"",cpfile,"\"not exist!")
-        model.state_dict()
-    deviceStr = getDeviceStr(device,dno)
-    print("cp2onnx device: ",deviceStr,"(",device," ",dno,")")
-    #torch.npu.set_device("npu:0")
-    #dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
-    dummy_input = input_data.to(deviceStr)
-
-    # Providing input and output names sets the display names for values
-    # within the model's graph. Setting these does not change the semantics
-    # of the graph; it is only for readability.
-    #
-    # The inputs to the network consist of the flat list of inputs (i.e.
-    # the values you would pass to the forward() method) followed by the
-    # flat list of parameters. You can partially specify names, i.e. provide
-    # a list here shorter than the number of inputs to the model, and we will
-    # only set that subset of names, starting from the beginning.
-    input_names = [ "actual_input_1" ] #+ [ "learned_%d" % i for i in range(16) ]
-    output_names = [ "output1" ]
-    model = model.to(deviceStr)
-    torch.onnx.export(model, dummy_input, onnxfile, verbose=True, input_names=input_names, output_names=output_names,opset_version=11)
-
-
-def cp2onnx_dynamic_axes(model,cpfile,onnxfile,device="cuda",dno=None):
-    if os.path.isfile(cpfile):
-        #model = torchvision.models.resnet50(pretrained=False)
-        model = cp_load(model,cpfile)
-    else :
-        print("warning : \"",cpfile,"\"not exist!")
-        model.state_dict()
-    deviceStr = getDeviceStr(device,dno)
-    #torch.npu.set_device("npu:0")
-    #dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
-    dummy_input = torch.randn(10, 3, 224, 224)
-    dummy_input = dummy_input.to(deviceStr)
-
-    # Providing input and output names sets the display names for values
-    # within the model's graph. Setting these does not change the semantics
-    # of the graph; it is only for readability.
-    #
-    # The inputs to the network consist of the flat list of inputs (i.e.
-    # the values you would pass to the forward() method) followed by the
-    # flat list of parameters. You can partially specify names, i.e. provide
-    # a list here shorter than the number of inputs to the model, and we will
-    # only set that subset of names, starting from the beginning.
-    input_names = [ "actual_input_1" ] #+ [ "learned_%d" % i for i in range(16) ]
-    output_names = [ "output1" ]
-    model = model.to(deviceStr)
-    dynamic_axes = {'actual_input_1': {0: '-1'}, 'output1': {0: '-1'}}
-    torch.onnx.export(model, dummy_input, onnxfile, verbose=True, input_names=input_names, output_names=output_names,dynamic_axes=dynamic_axes,opset_version=11)
-
-
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torchvision
+from export.cp_parser import *
+
+def getDeviceStr(deviceStr, DeviceNo):
+    #print("cp_getDeviceId test device : ","(", deviceStr,"  ", DeviceNo, ")")
+    if DeviceNo == None:
+        return deviceStr
+    if deviceStr == 'cpu':
+        return deviceStr
+    elif deviceStr == 'npu' or deviceStr == 'cuda':
+        loc = '{}:{}'.format(deviceStr, DeviceNo)
+        return loc
+    else: 
+        return deviceStr
+
+
+def cp2onnx(model,cpfile,onnxfile, input_data, ispth=False,device="cpu",dno=None):
+    if os.path.isfile(cpfile):
+        #model = torchvision.models.resnet50(pretrained=False)
+        model = cp_load(model,cpfile,ispth=ispth,device=device,dno=dno)
+    else :
+        print("warning : \"",cpfile,"\"not exist!")
+        model.state_dict()
+    deviceStr = getDeviceStr(device,dno)
+    print("cp2onnx device: ",deviceStr,"(",device," ",dno,")")
+    #torch.npu.set_device("npu:0")
+    #dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
+    dummy_input = input_data.to(deviceStr)
+
+    # Providing input and output names sets the display names for values
+    # within the model's graph. Setting these does not change the semantics
+    # of the graph; it is only for readability.
+    #
+    # The inputs to the network consist of the flat list of inputs (i.e.
+    # the values you would pass to the forward() method) followed by the
+    # flat list of parameters. You can partially specify names, i.e. provide
+    # a list here shorter than the number of inputs to the model, and we will
+    # only set that subset of names, starting from the beginning.
+    input_names = [ "actual_input_1" ] #+ [ "learned_%d" % i for i in range(16) ]
+    output_names = [ "output1" ]
+    model = model.to(deviceStr)
+    torch.onnx.export(model, dummy_input, onnxfile, verbose=True, input_names=input_names, output_names=output_names,opset_version=11)
+
+
+def cp2onnx_dynamic_axes(model,cpfile,onnxfile,device="cuda",dno=None):
+    if os.path.isfile(cpfile):
+        #model = torchvision.models.resnet50(pretrained=False)
+        model = cp_load(model,cpfile)
+    else :
+        print("warning : \"",cpfile,"\"not exist!")
+        model.state_dict()
+    deviceStr = getDeviceStr(device,dno)
+    #torch.npu.set_device("npu:0")
+    #dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
+    dummy_input = torch.randn(10, 3, 224, 224)
+    dummy_input = dummy_input.to(deviceStr)
+
+    # Providing input and output names sets the display names for values
+    # within the model's graph. Setting these does not change the semantics
+    # of the graph; it is only for readability.
+    #
+    # The inputs to the network consist of the flat list of inputs (i.e.
+    # the values you would pass to the forward() method) followed by the
+    # flat list of parameters. You can partially specify names, i.e. provide
+    # a list here shorter than the number of inputs to the model, and we will
+    # only set that subset of names, starting from the beginning.
+    input_names = [ "actual_input_1" ] #+ [ "learned_%d" % i for i in range(16) ]
+    output_names = [ "output1" ]
+    model = model.to(deviceStr)
+    dynamic_axes = {'actual_input_1': {0: '-1'}, 'output1': {0: '-1'}}
+    torch.onnx.export(model, dummy_input, onnxfile, verbose=True, input_names=input_names, output_names=output_names,dynamic_axes=dynamic_axes,opset_version=11)
+
+
diff --git a/test/test_npu/test_onnx/torch.onnx/export/model_export-cpu.py b/test/test_npu/test_onnx/torch.onnx/export/model_export-cpu.py
old mode 100644
new mode 100755
index 85a422688d14aab2d9870c32c957ee3ddd26d509..c6049ec810867bdb0b18d36089a7ccd58f41793e
--- a/test/test_npu/test_onnx/torch.onnx/export/model_export-cpu.py
+++ b/test/test_npu/test_onnx/torch.onnx/export/model_export-cpu.py
@@ -1,48 +1,48 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torchvision
-# torch.cpu.set_device("cpu:0")
-#dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
-dummy_input = torch.randn(10, 3, 224, 224)
-dummy_input = dummy_input.to("cpu")
-model = torchvision.models.resnet50(pretrained=False)
-
-# Providing input and output names sets the display names for values
-# within the model's graph. Setting these does not change the semantics
-# of the graph; it is only for readability.
-#
-# The inputs to the network consist of the flat list of inputs (i.e.
-# the values you would pass to the forward() method) followed by the
-# flat list of parameters. You can partially specify names, i.e. provide
-# a list here shorter than the number of inputs to the model, and we will
-# only set that subset of names, starting from the beginning.
-input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
-output_names = [ "output1" ]
-model = model.to("cpu")
-torch.onnx.export(model, dummy_input, "resnet50.onnx", verbose=True, input_names=input_names, output_names=output_names)
-
-
-
-# 有坑 会提示下载不下来 修改下resnet.py，手动下载下来，然后放到 D:/Pytorch/models 目录下。
-def _resnet(arch, block, layers, pretrained, progress, **kwargs):
-    model = ResNet(block, layers, **kwargs)
-    if pretrained:
-        state_dict = load_state_dict_from_url(model_urls[arch], model_dir="D:/Pytorch/models",
-                                              progress=progress)
-        model.load_state_dict(state_dict)
-    return model
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torchvision
+# torch.cpu.set_device("cpu:0")
+#dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
+dummy_input = torch.randn(10, 3, 224, 224)
+dummy_input = dummy_input.to("cpu")
+model = torchvision.models.resnet50(pretrained=False)
+
+# Providing input and output names sets the display names for values
+# within the model's graph. Setting these does not change the semantics
+# of the graph; it is only for readability.
+#
+# The inputs to the network consist of the flat list of inputs (i.e.
+# the values you would pass to the forward() method) followed by the
+# flat list of parameters. You can partially specify names, i.e. provide
+# a list here shorter than the number of inputs to the model, and we will
+# only set that subset of names, starting from the beginning.
+input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
+output_names = [ "output1" ]
+model = model.to("cpu")
+torch.onnx.export(model, dummy_input, "resnet50.onnx", verbose=True, input_names=input_names, output_names=output_names)
+
+
+
+# 有坑 会提示下载不下来 修改下resnet.py，手动下载下来，然后放到 D:/Pytorch/models 目录下。
+def _resnet(arch, block, layers, pretrained, progress, **kwargs):
+    model = ResNet(block, layers, **kwargs)
+    if pretrained:
+        state_dict = load_state_dict_from_url(model_urls[arch], model_dir="D:/Pytorch/models",
+                                              progress=progress)
+        model.load_state_dict(state_dict)
+    return model
diff --git a/test/test_npu/test_onnx/torch.onnx/export/model_export-gpu.py b/test/test_npu/test_onnx/torch.onnx/export/model_export-gpu.py
old mode 100644
new mode 100755
index 9e2e11bcb02493a2a3f12ea544e53cc1e40d1d51..31e2f9c7ddc2003a4f160ec54fd40eea35036176
--- a/test/test_npu/test_onnx/torch.onnx/export/model_export-gpu.py
+++ b/test/test_npu/test_onnx/torch.onnx/export/model_export-gpu.py
@@ -1,48 +1,48 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torchvision
-#torch.npu.set_device("npu:0")
-#dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
-dummy_input = torch.randn(10, 3, 224, 224)
-dummy_input = dummy_input.to("cuda")
-model = torchvision.models.resnet50(pretrained=False)
-
-# Providing input and output names sets the display names for values
-# within the model's graph. Setting these does not change the semantics
-# of the graph; it is only for readability.
-#
-# The inputs to the network consist of the flat list of inputs (i.e.
-# the values you would pass to the forward() method) followed by the
-# flat list of parameters. You can partially specify names, i.e. provide
-# a list here shorter than the number of inputs to the model, and we will
-# only set that subset of names, starting from the beginning.
-input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
-output_names = [ "output1" ]
-model = model.to("cuda")
-torch.onnx.export(model, dummy_input, "resnet50.onnx", verbose=True, input_names=input_names, output_names=output_names)
-
-
-
-# 有坑 会提示下载不下来 修改下resnet.py，手动下载下来，然后放到 D:/Pytorch/models 目录下。
-def _resnet(arch, block, layers, pretrained, progress, **kwargs):
-    model = ResNet(block, layers, **kwargs)
-    if pretrained:
-        state_dict = load_state_dict_from_url(model_urls[arch], model_dir="D:/Pytorch/models",
-                                              progress=progress)
-        model.load_state_dict(state_dict)
-    return model
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torchvision
+#torch.npu.set_device("npu:0")
+#dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
+dummy_input = torch.randn(10, 3, 224, 224)
+dummy_input = dummy_input.to("cuda")
+model = torchvision.models.resnet50(pretrained=False)
+
+# Providing input and output names sets the display names for values
+# within the model's graph. Setting these does not change the semantics
+# of the graph; it is only for readability.
+#
+# The inputs to the network consist of the flat list of inputs (i.e.
+# the values you would pass to the forward() method) followed by the
+# flat list of parameters. You can partially specify names, i.e. provide
+# a list here shorter than the number of inputs to the model, and we will
+# only set that subset of names, starting from the beginning.
+input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
+output_names = [ "output1" ]
+model = model.to("cuda")
+torch.onnx.export(model, dummy_input, "resnet50.onnx", verbose=True, input_names=input_names, output_names=output_names)
+
+
+
+# 有坑 会提示下载不下来 修改下resnet.py，手动下载下来，然后放到 D:/Pytorch/models 目录下。
+def _resnet(arch, block, layers, pretrained, progress, **kwargs):
+    model = ResNet(block, layers, **kwargs)
+    if pretrained:
+        state_dict = load_state_dict_from_url(model_urls[arch], model_dir="D:/Pytorch/models",
+                                              progress=progress)
+        model.load_state_dict(state_dict)
+    return model
diff --git a/test/test_npu/test_onnx/torch.onnx/export/model_export-npu.py b/test/test_npu/test_onnx/torch.onnx/export/model_export-npu.py
old mode 100644
new mode 100755
index 030d7492613c47f64bdf61f624767bed123687d5..aeac7133b6496daf43023b190c452779c574a393
--- a/test/test_npu/test_onnx/torch.onnx/export/model_export-npu.py
+++ b/test/test_npu/test_onnx/torch.onnx/export/model_export-npu.py
@@ -1,48 +1,48 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torchvision
-torch.npu.set_device("npu:0")
-#dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
-dummy_input = torch.randn(10, 3, 224, 224)
-dummy_input = dummy_input.to("npu")
-model = torchvision.models.resnet50(pretrained=False)
-
-# Providing input and output names sets the display names for values
-# within the model's graph. Setting these does not change the semantics
-# of the graph; it is only for readability.
-#
-# The inputs to the network consist of the flat list of inputs (i.e.
-# the values you would pass to the forward() method) followed by the
-# flat list of parameters. You can partially specify names, i.e. provide
-# a list here shorter than the number of inputs to the model, and we will
-# only set that subset of names, starting from the beginning.
-input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
-output_names = [ "output1" ]
-model = model.to("npu")
-torch.onnx.export(model, dummy_input, "resnet50.onnx", verbose=True, input_names=input_names, output_names=output_names)
-
-
-
-# 有坑 会提示下载不下来 修改下resnet.py，手动下载下来，然后放到 D:/Pytorch/models 目录下。
-def _resnet(arch, block, layers, pretrained, progress, **kwargs):
-    model = ResNet(block, layers, **kwargs)
-    if pretrained:
-        state_dict = load_state_dict_from_url(model_urls[arch], model_dir="D:/Pytorch/models",
-                                              progress=progress)
-        model.load_state_dict(state_dict)
-    return model
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torchvision
+torch.npu.set_device("npu:0")
+#dummy_input = torch.randn(10, 3, 224, 224, device='npu:0')
+dummy_input = torch.randn(10, 3, 224, 224)
+dummy_input = dummy_input.to("npu")
+model = torchvision.models.resnet50(pretrained=False)
+
+# Providing input and output names sets the display names for values
+# within the model's graph. Setting these does not change the semantics
+# of the graph; it is only for readability.
+#
+# The inputs to the network consist of the flat list of inputs (i.e.
+# the values you would pass to the forward() method) followed by the
+# flat list of parameters. You can partially specify names, i.e. provide
+# a list here shorter than the number of inputs to the model, and we will
+# only set that subset of names, starting from the beginning.
+input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
+output_names = [ "output1" ]
+model = model.to("npu")
+torch.onnx.export(model, dummy_input, "resnet50.onnx", verbose=True, input_names=input_names, output_names=output_names)
+
+
+
+# 有坑 会提示下载不下来 修改下resnet.py，手动下载下来，然后放到 D:/Pytorch/models 目录下。
+def _resnet(arch, block, layers, pretrained, progress, **kwargs):
+    model = ResNet(block, layers, **kwargs)
+    if pretrained:
+        state_dict = load_state_dict_from_url(model_urls[arch], model_dir="D:/Pytorch/models",
+                                              progress=progress)
+        model.load_state_dict(state_dict)
+    return model
diff --git a/test/test_npu/test_onnx/torch.onnx/export/model_export.py b/test/test_npu/test_onnx/torch.onnx/export/model_export.py
old mode 100644
new mode 100755
index a5b1b6b3d3503c3b6aed180e065d6388bee9960e..66101388d186896b7c0f13473eb7ceafdaeb0e7e
--- a/test/test_npu/test_onnx/torch.onnx/export/model_export.py
+++ b/test/test_npu/test_onnx/torch.onnx/export/model_export.py
@@ -1,48 +1,48 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torchvision
-
-dummy_input = torch.randn(10, 3, 224, 224, device='cpu')
-#dummy_input = torch.randn(10, 3, 224, 224)
-#dummy_input = dummy_input.to("npu")
-model = torchvision.models.resnet50(pretrained=False)
-
-# Providing input and output names sets the display names for values
-# within the model's graph. Setting these does not change the semantics
-# of the graph; it is only for readability.
-#
-# The inputs to the network consist of the flat list of inputs (i.e.
-# the values you would pass to the forward() method) followed by the
-# flat list of parameters. You can partially specify names, i.e. provide
-# a list here shorter than the number of inputs to the model, and we will
-# only set that subset of names, starting from the beginning.
-input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
-output_names = [ "output1" ]
-
-torch.onnx.export(model, dummy_input, "resnet50.onnx", verbose=True, input_names=input_names, output_names=output_names)
-
-
-
-# 有坑 会提示下载不下来 修改下resnet.py，手动下载下来，然后放到 D:/Pytorch/models 目录下。
-def _resnet(arch, block, layers, pretrained, progress, **kwargs):
-    model = ResNet(block, layers, **kwargs)
-    if pretrained:
-        state_dict = load_state_dict_from_url(model_urls[arch], model_dir="D:/Pytorch/models",
-                                              progress=progress)
-        model.load_state_dict(state_dict)
-    return model
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torchvision
+
+dummy_input = torch.randn(10, 3, 224, 224, device='cpu')
+#dummy_input = torch.randn(10, 3, 224, 224)
+#dummy_input = dummy_input.to("npu")
+model = torchvision.models.resnet50(pretrained=False)
+
+# Providing input and output names sets the display names for values
+# within the model's graph. Setting these does not change the semantics
+# of the graph; it is only for readability.
+#
+# The inputs to the network consist of the flat list of inputs (i.e.
+# the values you would pass to the forward() method) followed by the
+# flat list of parameters. You can partially specify names, i.e. provide
+# a list here shorter than the number of inputs to the model, and we will
+# only set that subset of names, starting from the beginning.
+input_names = [ "actual_input_1" ] + [ "learned_%d" % i for i in range(16) ]
+output_names = [ "output1" ]
+
+torch.onnx.export(model, dummy_input, "resnet50.onnx", verbose=True, input_names=input_names, output_names=output_names)
+
+
+
+# 有坑 会提示下载不下来 修改下resnet.py，手动下载下来，然后放到 D:/Pytorch/models 目录下。
+def _resnet(arch, block, layers, pretrained, progress, **kwargs):
+    model = ResNet(block, layers, **kwargs)
+    if pretrained:
+        state_dict = load_state_dict_from_url(model_urls[arch], model_dir="D:/Pytorch/models",
+                                              progress=progress)
+        model.load_state_dict(state_dict)
+    return model
diff --git a/test/test_npu/test_onnx/torch.onnx/export/onnx_parser.py b/test/test_npu/test_onnx/torch.onnx/export/onnx_parser.py
old mode 100644
new mode 100755
diff --git a/test/test_npu/test_onnx/torch.onnx/main.py b/test/test_npu/test_onnx/torch.onnx/main.py
old mode 100644
new mode 100755
index 28317d74ef5ff0f9b4bc602667261a045f95422e..b340984068d8050af9225149ef09ab96a7fa5bbf
--- a/test/test_npu/test_onnx/torch.onnx/main.py
+++ b/test/test_npu/test_onnx/torch.onnx/main.py
@@ -1,41 +1,41 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# coding: utf8
-
-import logging
-
-logger_level_relations = {
-        'debug':logging.DEBUG,
-        'info':logging.INFO,
-        'warning':logging.WARNING,
-        'error':logging.ERROR,
-        'crit':logging.CRITICAL
-    }#��־�����ϵӳ��
-
-loggerScreanHander = logging.StreamHandler()
-if 0 :
-    loggerScreanHander.setFormatter(logging.Formatter('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s') )
-else:
-    loggerScreanHander.setFormatter(logging.Formatter('%(message)s') )
-
-logger = logging.getLogger('torch.onnx.export.test.main')
-logger.addHandler(loggerScreanHander)
-logger.setLevel(logger_level_relations.get('debug'))
-logger.debug('__file__={0:<35} | __name__={1:<20} | __package__={2:<20}'.format(__file__,__name__,str(__package__)))
-
-import export.cp2cp
-import export.model2onnx
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf8
+
+import logging
+
+logger_level_relations = {
+        'debug':logging.DEBUG,
+        'info':logging.INFO,
+        'warning':logging.WARNING,
+        'error':logging.ERROR,
+        'crit':logging.CRITICAL
+    }#��־�����ϵӳ��
+
+loggerScreanHander = logging.StreamHandler()
+if 0 :
+    loggerScreanHander.setFormatter(logging.Formatter('%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s') )
+else:
+    loggerScreanHander.setFormatter(logging.Formatter('%(message)s') )
+
+logger = logging.getLogger('torch.onnx.export.test.main')
+logger.addHandler(loggerScreanHander)
+logger.setLevel(logger_level_relations.get('debug'))
+logger.debug('__file__={0:<35} | __name__={1:<20} | __package__={2:<20}'.format(__file__,__name__,str(__package__)))
+
+import export.cp2cp
+import export.model2onnx
diff --git a/test/test_npu/test_pairwise_distance.py b/test/test_npu/test_pairwise_distance.py
index bfdf729e6c711c756183591cc13825a93fe88c17..6acc37211646412f92a07a26456729fac283c4b1 100644
--- a/test/test_npu/test_pairwise_distance.py
+++ b/test/test_npu/test_pairwise_distance.py
@@ -1,160 +1,160 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestPairwiseDistance(TestCase):
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-        return npu_input1, npu_input2
-
-    def cpu_op_exec_default(self, input1, input2):
-        stype = input1.dtype
-        if stype == torch.float16:
-            input1 = input1.float()
-            input2 = input2.float()       
-        pdist = torch.nn.PairwiseDistance()
-        output = pdist(input1, input2)
-        if stype == torch.float16:
-            output = output.half()       
-        output = output.numpy()
-        return output
-        
-    def npu_op_exec_default(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        pdist = torch.nn.PairwiseDistance()
-        output = pdist(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec(self, input1, input2, p, eps, keepdim):
-        stype = input1.dtype
-        if stype == torch.float16:
-            input1 = input1.float()  
-            input2 = input2.float()                  
-        pdist = torch.nn.PairwiseDistance(p=p, eps=eps, keepdim=keepdim)
-        output = pdist(input1, input2)
-        if stype == torch.float16:
-            output = output.half()       
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2, p, eps, keepdim):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        pdist = torch.nn.PairwiseDistance(p=p, eps=eps, keepdim=keepdim)
-        output = pdist(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
- 
-    def test_pairwise_distance_5_360_float16(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (5, 360), np.float16)
-        cpu_output = self.cpu_op_exec_default(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec_default(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_pairwise_distance_10_3600_30_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (10, 3600, 30), np.float32)
-        cpu_output = self.cpu_op_exec_default(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec_default(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)         
-
-    def test_pairwise_distance_505_10_30_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 10, 30), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, 1e-6, True)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, 1e-6, True)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_pairwise_distance_505_10_30_23_float16(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 10, 30, 23), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, 1e-5, True)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, 1e-5, True)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_pairwise_distance_505_1_30_23_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 1, 30, 23), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, 0, False)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, 0, False)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-
-    def test_pairwise_distance_55_10_30_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (55, 10, 30), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, -1e-6, False)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, -1e-6, False)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_pairwise_distance_30_23_float16(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (30, 23), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, 5, False)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, 5, False)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_pairwise_distance_505_1_23_19_float16(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 1, 23, 19), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, 10, True)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, 10, True)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_pairwise_distance_30_23_19_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (30, 23, 19), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, -10, True)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, -10, True)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_pairwise_distance_505_1_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 1), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, -1e-4, False)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, -1e-4, False)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_pairwise_distance_1_520_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (1, 520), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, 1e-4, False)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, 1e-4, False)
-        self.assertRtolEqual(cpu_output, npu_output) 
- 
-    def test_pairwise_distance_1_1_float32(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (1, 1), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 1,  2, True)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 1,  2, True)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-    #can't pass test
-    def test_pairwise_distance_505_12_float16(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 12), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, -1e-4, False)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, -1e-4, False)
-        self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_pairwise_distance_509_35_float16(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, (509, 35), np.float16)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0.0, -1, False)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.0, -1, False)
-        self.assertRtolEqual(cpu_output, npu_output) 
-instantiate_device_type_tests(TestPairwiseDistance, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestPairwiseDistance(TestCase):
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+        return npu_input1, npu_input2
+
+    def cpu_op_exec_default(self, input1, input2):
+        stype = input1.dtype
+        if stype == torch.float16:
+            input1 = input1.float()
+            input2 = input2.float()       
+        pdist = torch.nn.PairwiseDistance()
+        output = pdist(input1, input2)
+        if stype == torch.float16:
+            output = output.half()       
+        output = output.numpy()
+        return output
+        
+    def npu_op_exec_default(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        pdist = torch.nn.PairwiseDistance()
+        output = pdist(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec(self, input1, input2, p, eps, keepdim):
+        stype = input1.dtype
+        if stype == torch.float16:
+            input1 = input1.float()  
+            input2 = input2.float()                  
+        pdist = torch.nn.PairwiseDistance(p=p, eps=eps, keepdim=keepdim)
+        output = pdist(input1, input2)
+        if stype == torch.float16:
+            output = output.half()       
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2, p, eps, keepdim):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        pdist = torch.nn.PairwiseDistance(p=p, eps=eps, keepdim=keepdim)
+        output = pdist(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+ 
+    def test_pairwise_distance_5_360_float16(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (5, 360), np.float16)
+        cpu_output = self.cpu_op_exec_default(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec_default(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_pairwise_distance_10_3600_30_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (10, 3600, 30), np.float32)
+        cpu_output = self.cpu_op_exec_default(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec_default(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)         
+
+    def test_pairwise_distance_505_10_30_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 10, 30), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, 1e-6, True)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, 1e-6, True)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_pairwise_distance_505_10_30_23_float16(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 10, 30, 23), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, 1e-5, True)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, 1e-5, True)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_pairwise_distance_505_1_30_23_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 1, 30, 23), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, 0, False)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, 0, False)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+
+    def test_pairwise_distance_55_10_30_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (55, 10, 30), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, -1e-6, False)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, -1e-6, False)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_pairwise_distance_30_23_float16(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (30, 23), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, 5, False)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, 5, False)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_pairwise_distance_505_1_23_19_float16(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 1, 23, 19), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, 10, True)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, 10, True)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_pairwise_distance_30_23_19_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (30, 23, 19), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, -10, True)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, -10, True)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_pairwise_distance_505_1_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 1), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, -1e-4, False)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, -1e-4, False)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_pairwise_distance_1_520_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (1, 520), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 10, 1e-4, False)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 10, 1e-4, False)
+        self.assertRtolEqual(cpu_output, npu_output) 
+ 
+    def test_pairwise_distance_1_1_float32(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (1, 1), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 1,  2, True)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 1,  2, True)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+    #can't pass test
+    def test_pairwise_distance_505_12_float16(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (505, 12), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0, -1e-4, False)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0, -1e-4, False)
+        self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_pairwise_distance_509_35_float16(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, (509, 35), np.float16)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, 0.0, -1, False)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2, 0.0, -1, False)
+        self.assertRtolEqual(cpu_output, npu_output) 
+instantiate_device_type_tests(TestPairwiseDistance, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:2")
+    run_tests()
diff --git a/test/test_npu/test_reflection_pad2d.py b/test/test_npu/test_reflection_pad2d.py
deleted file mode 100644
index d150c4c955b8d3670c033e7056f6d71810d6baef..0000000000000000000000000000000000000000
--- a/test/test_npu/test_reflection_pad2d.py
+++ /dev/null
@@ -1,238 +0,0 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestReflectionPad2d(TestCase):
-    def cpu_op_out_exec(self, input1, pad, output):
-        m = torch._C._nn.reflection_pad2d(input1, pad, out=output)
-        m = m.numpy()
-        return m
-
-    def npu_op_out_exec(self, input1, pad, output):
-        m_n = torch._C._nn.reflection_pad2d(input1, pad, out=output)
-        m_n = m_n.to("cpu")
-        m_n = m_n.numpy()
-        return m_n
-
-    def cpu_op_exec(self, input1, pad):
-        m = torch.nn.ReflectionPad2d(pad)
-        output = m(input1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, pad):
-        m = torch.nn.ReflectionPad2d(pad).to("npu")
-        output = m(input1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_reflectionPad2d_out_shape_format(self, device):
-        shape_format = [
-            [[np.float32, 0, (1, 1, 3, 3)], [2, 2, 2, 2]],
-            [[np.float32, 3, (1, 1, 4, 3)], 2]
-        ]
-        for item in shape_format:
-            cpuout = torch.randn(1, 1, 3, 3)
-            npuout = cpuout.npu()
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_out_exec(cpu_input1, item[1], cpuout)
-            npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_reflectionPad2d_out_shape_format_fp16(self, device):
-        shape_format = [
-            [[np.float16, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
-            [[np.float16, 3, (1, 1, 4, 3)], 2]
-        ]
-
-        def cpu_op_out_exec_fp16(input1, pad, output):
-            input1 = input1.to(torch.float32)
-            m = torch._C._nn.reflection_pad2d(input1, pad, out=output)
-            m = m.numpy()
-            m = m.astype(np.float16)
-            return m
-
-        for item in shape_format:
-            cpuout = torch.randn(1, 1, 3, 3)
-            npuout = cpuout.npu()
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = cpu_op_out_exec_fp16(cpu_input1, item[1], cpuout)
-            npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_reflectionPad2d_out_shape_format_int8(self, device):
-        shape_format = [
-            [[np.int8, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
-            [[np.int8, 0, (1, 1, 5, 3)], 2]
-        ]
-
-        def cpu_op_out_exec_int8(input1, pad, output):
-            input1 = input1.to(torch.float32)
-            m = torch._C._nn.reflection_pad2d(input1, pad, out=output)
-            m = m.numpy()
-            m = m.astype(np.int8)
-            return m
-
-        for item in shape_format:
-            cpuout = torch.randn(1, 1, 3, 3)
-            npuout = cpuout.npu()
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = cpu_op_out_exec_int8(cpu_input1, item[1], cpuout)
-            npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_reflectionPad2d_out_shape_format_uint8(self, device):
-        shape_format = [
-            [[np.uint8, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
-            [[np.uint8, 0, (1, 1, 4, 9)], 3]
-        ]
-
-        def cpu_op_out_exec_uint8(input1, pad, output):
-            input1 = input1.to(torch.float32)
-            m = torch._C._nn.reflection_pad2d(input1, pad, out=output)
-            m = m.numpy()
-            m = m.astype(np.uint8)
-            return m
-
-        for item in shape_format:
-            cpuout = torch.randn(1, 1, 3, 3)
-            npuout = cpuout.npu()
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = cpu_op_out_exec_uint8(cpu_input1, item[1], cpuout)
-            npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_reflectionPad2d_out_shape_format_int32(self, device):
-        shape_format = [
-            [[np.int32, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
-            [[np.int32, 0, (1, 1, 4, 9)], 2]
-        ]
-
-        def cpu_op_out_exec_int32(input1, pad, output):
-            input1 = input1.to(torch.float32)
-            m = torch._C._nn.reflection_pad2d(input1, pad, out=output)
-            m = m.numpy()
-            m = m.astype(np.int32)
-            return m
-
-        for item in shape_format:
-            cpuout = torch.randn(1, 1, 3, 3)
-            npuout = cpuout.npu()
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = cpu_op_out_exec_int32(cpu_input1, item[1], cpuout)
-            npu_output = self.npu_op_out_exec(npu_input1, item[1], npuout)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_reflectionPad2d_shape_format(self, device):
-        shape_format = [
-            [[np.float32, 0, (1, 1, 3, 3)], [2, 2, 2, 2]],
-            [[np.float32, 3, (1, 1, 4, 3)], 2]
-        ]
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_reflectionPad2d_shape_format_fp16(self, device):
-        shape_format = [
-            [[np.float16, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
-            [[np.float16, 3, (1, 1, 4, 3)], 2]
-        ]
-
-        def cpu_op_exec_fp16(input1, pad):
-            input1 = input1.to(torch.float32)
-            m = torch.nn.ReflectionPad2d(pad)
-            output = m(input1)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = cpu_op_exec_fp16(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_reflectionPad2d_shape_format_int8(self, device):
-        shape_format = [
-            [[np.int8, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
-            [[np.int8, 0, (1, 1, 5, 3)], 2]
-        ]
-
-        def cpu_op_exec_int8(input1, pad):
-            input1 = input1.to(torch.float32)
-            m = torch.nn.ReflectionPad2d(pad)
-            output = m(input1)
-            output = output.numpy()
-            output = output.astype(np.int8)
-            return output
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = cpu_op_exec_int8(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_reflectionPad2d_shape_format_uint8(self, device):
-        shape_format = [
-            [[np.uint8, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
-            [[np.uint8, 0, (1, 1, 4, 9)], 3]
-        ]
-
-        def cpu_op_exec_uint8(input1, pad):
-            input1 = input1.to(torch.float32)
-            m = torch.nn.ReflectionPad2d(pad)
-            output = m(input1)
-            output = output.numpy()
-            output = output.astype(np.uint8)
-            return output
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = cpu_op_exec_uint8(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_reflectionPad2d_shape_format_int32(self, device):
-        shape_format = [
-            [[np.int32, 0, (1, 1, 4, 3)], [2, 2, 2, 2]],
-            [[np.int32, 0, (1, 1, 4, 9)], 2]
-        ]
-
-        def cpu_op_exec_int32(input1, pad):
-            input1 = input1.to(torch.float32)
-            m = torch.nn.ReflectionPad2d(pad)
-            output = m(input1)
-            output = output.numpy()
-            output = output.astype(np.int32)
-            return output
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = cpu_op_exec_int32(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestReflectionPad2d, globals(), except_for="cpu")
-if __name__ == "__main__":
-    torch.npu.set_device("npu:2")
-    run_tests()
diff --git a/test/test_npu/test_scatter_dim_update.py b/test/test_npu/test_scatter_dim_update.py
index 630dbe48e41a2b019994a19b5d014a29d6936fa3..0fab3b7d9b9919bf0479602e17b14c0fcb3d344e 100644
--- a/test/test_npu/test_scatter_dim_update.py
+++ b/test/test_npu/test_scatter_dim_update.py
@@ -1,96 +1,96 @@
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestScatterDimUpdate(TestCase):
-
-    def generate_data(self, min, max, shape_var, shape_indices, shape_updates, dtype_var,
-                      dtype_indices, dtype_updates, dim):
-        var = np.random.uniform(min, max, shape_var).astype(dtype_var)
-        updates = np.random.uniform(min, max, shape_updates).astype(dtype_updates)
-        indices = np.random.randint(0, shape_var[dim], shape_indices).astype(dtype_indices)
-
-        #modify from numpy.ndarray to torch.tensor
-        var = torch.from_numpy(var)
-        indices = torch.from_numpy(indices)
-        updates = torch.from_numpy(updates)
-
-        return var, indices, updates, dim
-
-    def cpu_op_exec(self, var, indices, updates, dim):
-        output = var.scatter(dim=dim, index=indices.long(), src=updates)
-        return output.numpy()
-
-    def npu_op_exec(self, var, indices, updates, dim):
-        var = var.to("npu")
-        indices = indices.to("npu")
-        updates = updates.to("npu")
-        output = torch.scatter(var, dim, indices, updates)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_scatter_dim_update_32_float32(self, device):
-        var, indices, updates, dim = self.generate_data(-2, 2, (32, ), (32, ), (32, ),
-                                                   "float32", "int32", "float32", 0)
-        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
-        npu_output = self.npu_op_exec(var, indices, updates, dim)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_scatter_dim_update_32_32_float16(self, device):
-        var, indices, updates, dim = self.generate_data(-2, 2, (32, 32), (32, 32), (32, 32),
-                                                   "float16", "int32", "float16", 0)
-        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
-        npu_output = self.npu_op_exec(var, indices, updates, dim)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_scatter_dim_update_32_32_float32(self, device):
-        var, indices, updates, dim = self.generate_data(-2, 2, (32, 32), (24, 24), (24, 24),
-                                                   "float32", "int32", "float32", 1)
-        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
-        npu_output = self.npu_op_exec(var, indices, updates, dim)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_scatter_dim_update_32_32_32_int8(self, device):
-        var, indices, updates, dim = self.generate_data(-2, 2, (32, 32, 32), (24, 24, 24), (32, 32, 32),
-                                                   "int8", "int32", "int8", 1)
-        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
-        npu_output = self.npu_op_exec(var, indices, updates, dim)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_scatter_dim_update_16_16_16_16_float16(self, device):
-        var, indices, updates, dim = self.generate_data(-2, 2, (16, 16, 16, 16), (8, 8, 8, 8), (12, 12, 12, 12),
-                                                   "float16", "int32", "float16", 2)
-        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
-        npu_output = self.npu_op_exec(var, indices, updates, dim)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_scatter_dim_update_8_8_8_8_8_floa32(self, device):
-        var, indices, updates, dim = self.generate_data(-2, 2, (8, 8, 8, 8, 8), (3, 3, 3, 3, 3), (8, 8, 8, 8, 8),
-                                                   "float32", "int32", "float32", 3)
-        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
-        npu_output = self.npu_op_exec(var, indices, updates, dim)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestScatterDimUpdate, globals(), except_for='cpu')
-if __name__ == '__main__':
-    torch.npu.set_device("npu:2")
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestScatterDimUpdate(TestCase):
+
+    def generate_data(self, min, max, shape_var, shape_indices, shape_updates, dtype_var,
+                      dtype_indices, dtype_updates, dim):
+        var = np.random.uniform(min, max, shape_var).astype(dtype_var)
+        updates = np.random.uniform(min, max, shape_updates).astype(dtype_updates)
+        indices = np.random.randint(0, shape_var[dim], shape_indices).astype(dtype_indices)
+
+        #modify from numpy.ndarray to torch.tensor
+        var = torch.from_numpy(var)
+        indices = torch.from_numpy(indices)
+        updates = torch.from_numpy(updates)
+
+        return var, indices, updates, dim
+
+    def cpu_op_exec(self, var, indices, updates, dim):
+        output = var.scatter(dim=dim, index=indices.long(), src=updates)
+        return output.numpy()
+
+    def npu_op_exec(self, var, indices, updates, dim):
+        var = var.to("npu")
+        indices = indices.to("npu")
+        updates = updates.to("npu")
+        output = torch.scatter(var, dim, indices, updates)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_scatter_dim_update_32_float32(self, device):
+        var, indices, updates, dim = self.generate_data(-2, 2, (32, ), (32, ), (32, ),
+                                                   "float32", "int32", "float32", 0)
+        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
+        npu_output = self.npu_op_exec(var, indices, updates, dim)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_scatter_dim_update_32_32_float16(self, device):
+        var, indices, updates, dim = self.generate_data(-2, 2, (32, 32), (32, 32), (32, 32),
+                                                   "float16", "int32", "float16", 0)
+        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
+        npu_output = self.npu_op_exec(var, indices, updates, dim)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_scatter_dim_update_32_32_float32(self, device):
+        var, indices, updates, dim = self.generate_data(-2, 2, (32, 32), (24, 24), (24, 24),
+                                                   "float32", "int32", "float32", 1)
+        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
+        npu_output = self.npu_op_exec(var, indices, updates, dim)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_scatter_dim_update_32_32_32_int8(self, device):
+        var, indices, updates, dim = self.generate_data(-2, 2, (32, 32, 32), (24, 24, 24), (32, 32, 32),
+                                                   "int8", "int32", "int8", 1)
+        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
+        npu_output = self.npu_op_exec(var, indices, updates, dim)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_scatter_dim_update_16_16_16_16_float16(self, device):
+        var, indices, updates, dim = self.generate_data(-2, 2, (16, 16, 16, 16), (8, 8, 8, 8), (12, 12, 12, 12),
+                                                   "float16", "int32", "float16", 2)
+        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
+        npu_output = self.npu_op_exec(var, indices, updates, dim)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_scatter_dim_update_8_8_8_8_8_floa32(self, device):
+        var, indices, updates, dim = self.generate_data(-2, 2, (8, 8, 8, 8, 8), (3, 3, 3, 3, 3), (8, 8, 8, 8, 8),
+                                                   "float32", "int32", "float32", 3)
+        cpu_output = self.cpu_op_exec(var, indices, updates, dim)
+        npu_output = self.npu_op_exec(var, indices, updates, dim)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestScatterDimUpdate, globals(), except_for='cpu')
+if __name__ == '__main__':
+    torch.npu.set_device("npu:2")
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_sign.py b/test/test_npu/test_sign.py
index 2b4d7a42749bddcb36fe89f047682141498d93df..205a24e9b9d5f47b1cabfbfce964a878fa7cbe19 100644
--- a/test/test_npu/test_sign.py
+++ b/test/test_npu/test_sign.py
@@ -1,64 +1,64 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestSign(TestCase):
-
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input= np.random.uniform(min_d, max_d, shape).astype(dtype)
-        npu_input = torch.from_numpy(input)
-        return npu_input
-
-    def cpu_op_exec(self, input_x):
-        output = torch.sign(input_x)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input_x):
-        input = input_x.to("npu")
-        output= torch.sign(input)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_sign_float16(self, device):
-        def cpu_op_exec_fp16(input):
-            input = input.to(torch.float32)
-            output = torch.sign(input)
-            output = output.numpy()
-            output = output.astype(np.float16)
-            return output
-
-        npu_input = self.generate_data(0, 100, (5,3), np.float16)
-        cpu_output = cpu_op_exec_fp16(npu_input)
-        npu_output = self.npu_op_exec(npu_input)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sign_float32(self, device):
-        npu_input = self.generate_data(0, 100, (4,3), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input)
-        npu_output = self.npu_op_exec(npu_input)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestSign, globals(), except_for='cpu')
-if __name__ == "__main__":
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestSign(TestCase):
+
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input= np.random.uniform(min_d, max_d, shape).astype(dtype)
+        npu_input = torch.from_numpy(input)
+        return npu_input
+
+    def cpu_op_exec(self, input_x):
+        output = torch.sign(input_x)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input_x):
+        input = input_x.to("npu")
+        output= torch.sign(input)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_sign_float16(self, device):
+        def cpu_op_exec_fp16(input):
+            input = input.to(torch.float32)
+            output = torch.sign(input)
+            output = output.numpy()
+            output = output.astype(np.float16)
+            return output
+
+        npu_input = self.generate_data(0, 100, (5,3), np.float16)
+        cpu_output = cpu_op_exec_fp16(npu_input)
+        npu_output = self.npu_op_exec(npu_input)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sign_float32(self, device):
+        npu_input = self.generate_data(0, 100, (4,3), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input)
+        npu_output = self.npu_op_exec(npu_input)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestSign, globals(), except_for='cpu')
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/test_npu/test_solve.py b/test/test_npu/test_solve.py
index 53a986db75a89ce07c10fdebe1e45461cae93860..0f0e6ade83c6cec2648d204d7ad21b971deb9063 100644
--- a/test/test_npu/test_solve.py
+++ b/test/test_npu/test_solve.py
@@ -1,89 +1,89 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import random
-import copy
-from torch.autograd import Variable
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestSolve(TestCase):
-    def generate_data(self, min, max, shape, dtype):
-        input = np.random.uniform(min, max, shape).astype(dtype)
-        npu_input = torch.from_numpy(input)
-        return npu_input
-        
-    def cpu_op_exec(self, input1, input2):
-        X, LU = torch.solve(input2, input1) 
-        return X
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        X, LU = torch.solve(input2, input1) 
-        X = X.to("cpu")
-        return X 
-
-    def test_solve_float16_2(self, device):
-        def cpu_op_exec_float16_2(input1, input2):
-            input1 = input1.to(torch.float32)
-            input2 = input2.to(torch.float32)
-            X, LU = torch.solve(input2, input1)
-            X = X.numpy()
-            X = X.astype(np.float16)
-            return X
-        npu_input1 = self.generate_data(0, 100, (2, 2), np.float16)
-        npu_input2 = self.generate_data(0, 100, (2, 1), np.float16)
-        cpu_output = cpu_op_exec_float16_2(npu_input1, npu_input2)
-        # npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        #self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_solve_float16_1(self, device):
-        def cpu_op_exec_float16_1(input1, input2):
-            input1 = input1.to(torch.float32)
-            input2 = input2.to(torch.float32)
-            X, LU = torch.solve(input2, input1)
-            X = X.numpy()
-            X = X.astype(np.float16)
-            return X
-        npu_input1 = self.generate_data(0, 100, (5, 5), np.float16)
-        npu_input2 = self.generate_data(0, 100, (5, 5), np.float16)
-        cpu_output = cpu_op_exec_float16_1(npu_input1, npu_input2)
-        # npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        #self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_solve_float32_1(self, device):
-        npu_input1 = self.generate_data(0, 100, (2, 3, 2, 2), np.float32)
-        npu_input2 = self.generate_data(0, 100, (2, 1, 2, 1), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        # npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        # self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_solve_float32_2(self, device):
-        npu_input1 = self.generate_data(0, 100, (3, 3, 3), np.float32)
-        npu_input2 = self.generate_data(0, 100, (3, 3, 2), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        # npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        # self.assertRtolEqual(cpu_output, npu_output)
-
-  
-instantiate_device_type_tests(TestSolve, globals(), except_for='cpu')
-if __name__ == '__main__':
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import random
+import copy
+from torch.autograd import Variable
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestSolve(TestCase):
+    def generate_data(self, min, max, shape, dtype):
+        input = np.random.uniform(min, max, shape).astype(dtype)
+        npu_input = torch.from_numpy(input)
+        return npu_input
+        
+    def cpu_op_exec(self, input1, input2):
+        X, LU = torch.solve(input2, input1) 
+        return X
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        X, LU = torch.solve(input2, input1) 
+        X = X.to("cpu")
+        return X 
+
+    def test_solve_float16_2(self, device):
+        def cpu_op_exec_float16_2(input1, input2):
+            input1 = input1.to(torch.float32)
+            input2 = input2.to(torch.float32)
+            X, LU = torch.solve(input2, input1)
+            X = X.numpy()
+            X = X.astype(np.float16)
+            return X
+        npu_input1 = self.generate_data(0, 100, (2, 2), np.float16)
+        npu_input2 = self.generate_data(0, 100, (2, 1), np.float16)
+        cpu_output = cpu_op_exec_float16_2(npu_input1, npu_input2)
+        # npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        #self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_solve_float16_1(self, device):
+        def cpu_op_exec_float16_1(input1, input2):
+            input1 = input1.to(torch.float32)
+            input2 = input2.to(torch.float32)
+            X, LU = torch.solve(input2, input1)
+            X = X.numpy()
+            X = X.astype(np.float16)
+            return X
+        npu_input1 = self.generate_data(0, 100, (5, 5), np.float16)
+        npu_input2 = self.generate_data(0, 100, (5, 5), np.float16)
+        cpu_output = cpu_op_exec_float16_1(npu_input1, npu_input2)
+        # npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        #self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_solve_float32_1(self, device):
+        npu_input1 = self.generate_data(0, 100, (2, 3, 2, 2), np.float32)
+        npu_input2 = self.generate_data(0, 100, (2, 1, 2, 1), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        # npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        # self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_solve_float32_2(self, device):
+        npu_input1 = self.generate_data(0, 100, (3, 3, 3), np.float32)
+        npu_input2 = self.generate_data(0, 100, (3, 3, 2), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        # npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        # self.assertRtolEqual(cpu_output, npu_output)
+
+  
+instantiate_device_type_tests(TestSolve, globals(), except_for='cpu')
+if __name__ == '__main__':
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_sub.py b/test/test_npu/test_sub.py
index 41580f85bdc2a2d822058e79cc6ab85cf724e4b5..b2a17b4eb7fa2c16cd9a4e70d34a286ad1e85578 100644
--- a/test/test_npu/test_sub.py
+++ b/test/test_npu/test_sub.py
@@ -1,193 +1,193 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestSub(TestCase):
-    def generate_data(self, min_d, max_d, shape, dtype):
-        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
-
-        # modify from numpy.ndarray to torch.tensor
-        npu_input1 = torch.from_numpy(input1)
-        npu_input2 = torch.from_numpy(input2)
-        return npu_input1, npu_input2
-
-    def cpu_op_exec(self, input1, input2):
-        output = torch.sub(input1, input2)
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_fp16(self, input1, input2):
-        input1 = input1.to(torch.float32)
-        output = torch.sub(input1, input2)
-        output = output.numpy()
-        output = output.astype(np.float16)
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        input1 = input1.to("npu")
-        input2 = input2.to("npu")
-        output = torch.sub(input1, input2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_sub_int32_1(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_int32_2(self, device):
-        npu_input1, npu_input2 = self.generate_data(2147483647, 2147483648, (2, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_int32_3(self, device):
-        npu_input1, npu_input2 = self.generate_data(-100, 0, (2, 3), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_int32_4(self, device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (500, 500), np.int32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float16_1(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, ((65535, 1, 1, 1)), np.float16)
-        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float16_2(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, ((1, 1, 1, 8192)), np.float16)
-        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float16_3(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, ((1, 1, 1, 65535)), np.float16)
-        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float16_4(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, ((1, 1, 1, 524288)), np.float16)
-        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float16_5(self, device):
-        npu_input1, npu_input2 = self.generate_data(-2, 2, ((1, 1, 1, 786432)), np.float16)
-        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float16_6(self, device):
-        npu_input1, npu_input2 = self.generate_data(-5, 5, ((1, 1, 1, 786432)), np.float16)
-        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float32_1(self, device):
-        npu_input1, npu_input2 = self.generate_data(-1.1754943508e-38, -1.1754943508e-38, ((1, 31, 149, 2)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float32_2(self, device):
-        npu_input1, npu_input2 = self.generate_data(-0.000030517578125, 0.000030517578125, ((2, 32, 149, 31)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float32_3(self, device):
-        npu_input1, npu_input2 = self.generate_data(-9.313225746154785e-10, 9.313225746154785e-10, ((184965, 1)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float32_4(self, device):
-        npu_input1, npu_input2 = self.generate_data(-3, 3, ((1, 31, 149, 2)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float32_5(self, device):
-        npu_input1, npu_input2 = self.generate_data(-9.313225746154785e-10, 9.313225746154785e-10, ((1, 31, 149, 2)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float32_6(self, device):
-        npu_input1, npu_input2 = self.generate_data(-0.000000000000000000000000000000000000011754943508,
-                                          0.000000000000000000000000000000000000011754943508, ((2, 31, 149, 2)),
-                                          np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float32_7(self, device):
-        npu_input1, npu_input2 = self.generate_data(0.000000000000000000000000000000000000011754943508,
-                                          0.000000000000000000000000000000000000011754943508, ((4, 31, 149, 2)),
-                                          np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float32_8(self, device):
-        npu_input1, npu_input2 = self.generate_data(-0.000000000000000000000000000000000000011754943508,
-                                          -0.000000000000000000000000000000000000011754943508, ((2048, 31, 1, 2)),
-                                          np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_float32_9(self, device):
-        npu_input1, npu_input2 = self.generate_data(-0.000000000000000000000000000000000000011754943508,
-                                          0.000000000000000000000000000000000000011754943508, ((8, 7, 149)), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
-        npu_output = self.npu_op_exec(npu_input1, npu_input2)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-    def test_sub_mix_dtype_1(self,device):
-        npu_input1, npu_input2= self.generate_data(0, 100, (2, 3), np.int32)
-        npu_input3, npu_input4 = self.generate_data(0, 100, (2, 3), np.float32)
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
-        npu_output = self.npu_op_exec(npu_input1, npu_input3)
-        self.assertRtolEqual(cpu_output, npu_output)
-        
-    def test_sub_mix_dtype_2(self,device):
-        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 3), np.float32)
-        npu_input3 = torch.tensor(3).int()
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
-        npu_output = self.npu_op_exec(npu_input1, npu_input3)
-        self.assertRtolEqual(cpu_output, npu_output)
-
-
-instantiate_device_type_tests(TestSub, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestSub(TestCase):
+    def generate_data(self, min_d, max_d, shape, dtype):
+        input1 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+        input2 = np.random.uniform(min_d, max_d, shape).astype(dtype)
+
+        # modify from numpy.ndarray to torch.tensor
+        npu_input1 = torch.from_numpy(input1)
+        npu_input2 = torch.from_numpy(input2)
+        return npu_input1, npu_input2
+
+    def cpu_op_exec(self, input1, input2):
+        output = torch.sub(input1, input2)
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_fp16(self, input1, input2):
+        input1 = input1.to(torch.float32)
+        output = torch.sub(input1, input2)
+        output = output.numpy()
+        output = output.astype(np.float16)
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        input1 = input1.to("npu")
+        input2 = input2.to("npu")
+        output = torch.sub(input1, input2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_sub_int32_1(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_int32_2(self, device):
+        npu_input1, npu_input2 = self.generate_data(2147483647, 2147483648, (2, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_int32_3(self, device):
+        npu_input1, npu_input2 = self.generate_data(-100, 0, (2, 3), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_int32_4(self, device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (500, 500), np.int32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float16_1(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, ((65535, 1, 1, 1)), np.float16)
+        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float16_2(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, ((1, 1, 1, 8192)), np.float16)
+        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float16_3(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, ((1, 1, 1, 65535)), np.float16)
+        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float16_4(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, ((1, 1, 1, 524288)), np.float16)
+        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float16_5(self, device):
+        npu_input1, npu_input2 = self.generate_data(-2, 2, ((1, 1, 1, 786432)), np.float16)
+        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float16_6(self, device):
+        npu_input1, npu_input2 = self.generate_data(-5, 5, ((1, 1, 1, 786432)), np.float16)
+        cpu_output = self.cpu_op_exec_fp16(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float32_1(self, device):
+        npu_input1, npu_input2 = self.generate_data(-1.1754943508e-38, -1.1754943508e-38, ((1, 31, 149, 2)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float32_2(self, device):
+        npu_input1, npu_input2 = self.generate_data(-0.000030517578125, 0.000030517578125, ((2, 32, 149, 31)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float32_3(self, device):
+        npu_input1, npu_input2 = self.generate_data(-9.313225746154785e-10, 9.313225746154785e-10, ((184965, 1)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float32_4(self, device):
+        npu_input1, npu_input2 = self.generate_data(-3, 3, ((1, 31, 149, 2)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float32_5(self, device):
+        npu_input1, npu_input2 = self.generate_data(-9.313225746154785e-10, 9.313225746154785e-10, ((1, 31, 149, 2)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float32_6(self, device):
+        npu_input1, npu_input2 = self.generate_data(-0.000000000000000000000000000000000000011754943508,
+                                          0.000000000000000000000000000000000000011754943508, ((2, 31, 149, 2)),
+                                          np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float32_7(self, device):
+        npu_input1, npu_input2 = self.generate_data(0.000000000000000000000000000000000000011754943508,
+                                          0.000000000000000000000000000000000000011754943508, ((4, 31, 149, 2)),
+                                          np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float32_8(self, device):
+        npu_input1, npu_input2 = self.generate_data(-0.000000000000000000000000000000000000011754943508,
+                                          -0.000000000000000000000000000000000000011754943508, ((2048, 31, 1, 2)),
+                                          np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_float32_9(self, device):
+        npu_input1, npu_input2 = self.generate_data(-0.000000000000000000000000000000000000011754943508,
+                                          0.000000000000000000000000000000000000011754943508, ((8, 7, 149)), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2)
+        npu_output = self.npu_op_exec(npu_input1, npu_input2)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+    def test_sub_mix_dtype_1(self,device):
+        npu_input1, npu_input2= self.generate_data(0, 100, (2, 3), np.int32)
+        npu_input3, npu_input4 = self.generate_data(0, 100, (2, 3), np.float32)
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input1, npu_input3)
+        self.assertRtolEqual(cpu_output, npu_output)
+        
+    def test_sub_mix_dtype_2(self,device):
+        npu_input1, npu_input2 = self.generate_data(0, 100, (2, 3), np.float32)
+        npu_input3 = torch.tensor(3).int()
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input3)
+        npu_output = self.npu_op_exec(npu_input1, npu_input3)
+        self.assertRtolEqual(cpu_output, npu_output)
+
+
+instantiate_device_type_tests(TestSub, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:5")
+    run_tests()
diff --git a/test/test_npu/test_sum.py b/test/test_npu/test_sum.py
index 3daa2d4dbda88a70773f43df9d4395c993ce3c6a..abbc85d2482f327d494630b29c585331e5bea983 100644
--- a/test/test_npu/test_sum.py
+++ b/test/test_npu/test_sum.py
@@ -1,62 +1,62 @@
-# Copyright (c) 2020 Huawei Technologies Co., Ltd
-# Copyright (c) 2019, Facebook CORPORATION. 
-# All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import torch.nn as nn
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestSum(TestCase):
-    def cpu_op_exec(self, input1):
-        output = input1.sum()
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1):
-        output = input1.sum()
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def cpu_op_exec_dim(self, input1, dim, dtype):
-        output = torch.sum(input1, dim, keepdim=True, dtype=dtype)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec_dim(self, input1, dim, dtype):
-        output = torch.sum(input1, dim, keepdim=True, dtype=dtype)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-
-    def test_sum_shape_format(self, device):
-        shape_format = [
-                [[np.float32, 0, [256]], [0]],
-                [[np.float32, 0, [256, 1000]], [0]],
-                [[np.int32, 0, [5, 256]], [0]],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 2, 100)
-            cpu_output = self.cpu_op_exec_dim(cpu_input1, item[1], cpu_input1.dtype)
-            npu_output = self.npu_op_exec_dim(npu_input1, item[1], cpu_input1.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-
-instantiate_device_type_tests(TestSum, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020 Huawei Technologies Co., Ltd
+# Copyright (c) 2019, Facebook CORPORATION. 
+# All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestSum(TestCase):
+    def cpu_op_exec(self, input1):
+        output = input1.sum()
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1):
+        output = input1.sum()
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def cpu_op_exec_dim(self, input1, dim, dtype):
+        output = torch.sum(input1, dim, keepdim=True, dtype=dtype)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec_dim(self, input1, dim, dtype):
+        output = torch.sum(input1, dim, keepdim=True, dtype=dtype)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+
+    def test_sum_shape_format(self, device):
+        shape_format = [
+                [[np.float32, 0, [256]], [0]],
+                [[np.float32, 0, [256, 1000]], [0]],
+                [[np.int32, 0, [5, 256]], [0]],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 2, 100)
+            cpu_output = self.cpu_op_exec_dim(cpu_input1, item[1], cpu_input1.dtype)
+            npu_output = self.npu_op_exec_dim(npu_input1, item[1], cpu_input1.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+
+instantiate_device_type_tests(TestSum, globals(), except_for="cpu")
+if __name__ == "__main__":
 	run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_topk.py b/test/test_npu/test_topk.py
index 6713a511ac053c183df14f138d6b13d38ee38339..8055ac38ef3f3ccccf06774af3174cd5dee8655b 100644
--- a/test/test_npu/test_topk.py
+++ b/test/test_npu/test_topk.py
@@ -1,97 +1,97 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-import time
-
-class TestTopk(TestCase):
-    def cpu_op_exec(self, input1, k, dim, largest, sorted1):
-        output, index = torch.topk(input1, k, dim, largest, sorted1)
-        output = output.numpy()
-        index = index.numpy()
-        return output, index
-    
-    def npu_op_exec(self, input1, k, dim, largest, sorted1):
-        output, index = torch.topk(input1, k, dim, largest, sorted1)
-        output = output.to("cpu")
-        index = index.to("cpu")
-        output = output.numpy()
-        index = index.numpy()
-        return output, index
-    
-    def test_topk_shape_format(self, device):
-        np.random.seed(0)
-        shape_format = [
-            # [k, dim, [input_type, input_format, input_shape, min, max], largest, sorted]
-            # dim
-            [3, 0, [np.float32, 0, [8, 10], 0, 100], True, True],
-            [3, 1, [np.float32, 0, [8, 10], 0, 100], True, True],
-            [5, 2, [np.float32, 0, [8, 9, 10], 0, 1000], True, True],
-            [5, 3, [np.float32, 0, [8, 9, 10, 11], 0, 1000], True, True],
-            # dtype
-            [3, 0, [np.int32, 0, [8, 10], 0, 100], True, True],
-            [5, 2, [np.int32, 0, [8, 9, 10], 0, 1000], True, True],
-            # random
-            [5, 0, [np.float32, 0, [31, 47], 0, 100], True, True],
-            [5, 1, [np.float32, 0, [42, 53, 7], 0, 100], True, True],
-            # largest
-            [3, 0, [np.float32, 0, [8, 10], 0, 100], False, True],
-        ]
-
-        cnt = 0
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[2], item[2][3], item[2][4])
-            cpu_output, cpu_index = self.cpu_op_exec(cpu_input, item[0], item[1], item[3], item[4])
-            npu_output, npu_index = self.npu_op_exec(npu_input, item[0], item[1], item[3], item[4])
-            # 目前只支持fp16,fp32降低阈值判断
-            self.assertRtolEqual(cpu_output, npu_output, prec=1.e-1)
-            #self.assertRtolEqual(cpu_index, npu_index)
-            cnt += 1
-    
-    def test_topk_float16_shape_format(self, device):
-        def cpu_op_exec_fp16(input1, k, dim, largest, sorted1):
-            input1 = input1.to(torch.float32)
-            output, index = torch.topk(input1, k, dim, largest, sorted1)
-            output = output.numpy().astype(np.float16)
-            index = index.numpy().astype(np.int32)
-            return output, index
-
-        np.random.seed(0)
-        shape_format = [
-            # [k, dim, [input_type, input_format, input_shape, min, max], largest, sorted]
-            # dim
-            [3, 0, [np.float16, 0, [8, 10], 0, 100], True, True],
-            [3, 1, [np.float16, 0, [8, 10], 0, 100], True, True],
-            [5, 2, [np.float16, 0, [8, 9, 10], 0, 1000], True, True],
-            [5, 3, [np.float16, 0, [8, 9, 10, 11], 0, 1000], True, True],
-        ]
-
-        cnt = 0
-        for item in shape_format:
-            cpu_input, npu_input = create_common_tensor(item[2], item[2][3], item[2][4])
-            cpu_output, cpu_index = cpu_op_exec_fp16(cpu_input, item[0], item[1], item[3], item[4])
-            npu_output, npu_index = self.npu_op_exec(npu_input, item[0], item[1], item[3], item[4])
-            cpu_index = cpu_index.astype(npu_index.dtype)
-            self.assertRtolEqual(cpu_output, npu_output)
-            #self.assertRtolEqual(cpu_index, npu_index)
-            cnt += 1
-
-instantiate_device_type_tests(TestTopk, globals(), except_for="cpu")
-if __name__ == "__main__":
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+import time
+
+class TestTopk(TestCase):
+    def cpu_op_exec(self, input1, k, dim, largest, sorted1):
+        output, index = torch.topk(input1, k, dim, largest, sorted1)
+        output = output.numpy()
+        index = index.numpy()
+        return output, index
+    
+    def npu_op_exec(self, input1, k, dim, largest, sorted1):
+        output, index = torch.topk(input1, k, dim, largest, sorted1)
+        output = output.to("cpu")
+        index = index.to("cpu")
+        output = output.numpy()
+        index = index.numpy()
+        return output, index
+    
+    def test_topk_shape_format(self, device):
+        np.random.seed(0)
+        shape_format = [
+            # [k, dim, [input_type, input_format, input_shape, min, max], largest, sorted]
+            # dim
+            [3, 0, [np.float32, 0, [8, 10], 0, 100], True, True],
+            [3, 1, [np.float32, 0, [8, 10], 0, 100], True, True],
+            [5, 2, [np.float32, 0, [8, 9, 10], 0, 1000], True, True],
+            [5, 3, [np.float32, 0, [8, 9, 10, 11], 0, 1000], True, True],
+            # dtype
+            [3, 0, [np.int32, 0, [8, 10], 0, 100], True, True],
+            [5, 2, [np.int32, 0, [8, 9, 10], 0, 1000], True, True],
+            # random
+            [5, 0, [np.float32, 0, [31, 47], 0, 100], True, True],
+            [5, 1, [np.float32, 0, [42, 53, 7], 0, 100], True, True],
+            # largest
+            [3, 0, [np.float32, 0, [8, 10], 0, 100], False, True],
+        ]
+
+        cnt = 0
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[2], item[2][3], item[2][4])
+            cpu_output, cpu_index = self.cpu_op_exec(cpu_input, item[0], item[1], item[3], item[4])
+            npu_output, npu_index = self.npu_op_exec(npu_input, item[0], item[1], item[3], item[4])
+            # 目前只支持fp16,fp32降低阈值判断
+            self.assertRtolEqual(cpu_output, npu_output, prec=1.e-1)
+            #self.assertRtolEqual(cpu_index, npu_index)
+            cnt += 1
+    
+    def test_topk_float16_shape_format(self, device):
+        def cpu_op_exec_fp16(input1, k, dim, largest, sorted1):
+            input1 = input1.to(torch.float32)
+            output, index = torch.topk(input1, k, dim, largest, sorted1)
+            output = output.numpy().astype(np.float16)
+            index = index.numpy().astype(np.int32)
+            return output, index
+
+        np.random.seed(0)
+        shape_format = [
+            # [k, dim, [input_type, input_format, input_shape, min, max], largest, sorted]
+            # dim
+            [3, 0, [np.float16, 0, [8, 10], 0, 100], True, True],
+            [3, 1, [np.float16, 0, [8, 10], 0, 100], True, True],
+            [5, 2, [np.float16, 0, [8, 9, 10], 0, 1000], True, True],
+            [5, 3, [np.float16, 0, [8, 9, 10, 11], 0, 1000], True, True],
+        ]
+
+        cnt = 0
+        for item in shape_format:
+            cpu_input, npu_input = create_common_tensor(item[2], item[2][3], item[2][4])
+            cpu_output, cpu_index = cpu_op_exec_fp16(cpu_input, item[0], item[1], item[3], item[4])
+            npu_output, npu_index = self.npu_op_exec(npu_input, item[0], item[1], item[3], item[4])
+            cpu_index = cpu_index.astype(npu_index.dtype)
+            self.assertRtolEqual(cpu_output, npu_output)
+            #self.assertRtolEqual(cpu_index, npu_index)
+            cnt += 1
+
+instantiate_device_type_tests(TestTopk, globals(), except_for="cpu")
+if __name__ == "__main__":
     run_tests()
\ No newline at end of file
diff --git a/test/test_npu/test_triangular_solve.py b/test/test_npu/test_triangular_solve.py
index 19c29815db5e30f915c77de92c8ca4fd10a94afa..1cb5e4b8f8219a53d189090ee902cae8d4f28c6d 100644
--- a/test/test_npu/test_triangular_solve.py
+++ b/test/test_npu/test_triangular_solve.py
@@ -1,84 +1,84 @@
-#  Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#  Licensed under the BSD 3-Clause License  (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#  https://opensource.org/licenses/BSD-3-Clause
-#
-#  Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestTriangularSolve(TestCase):
-    def generate_data(self, min, max, shape, dtype): 
-        input1 = np.random.uniform(min, max, shape).astype(dtype)
-        npu_input1 = torch.from_numpy(input1) 
-        return npu_input1
-
-    def cpu_op_exec(self, input1, input2, input3, input4, input5): 
-        output = input1.triangular_solve(input2,upper=input3,transpose=input4,unitriangular=input5)
-        return output 
-
-    def cpu_op_exec_float16(self, input1, input2, input3, input4, input5): 
-        input1 = input1.to(torch.float32)
-        input2 = input2.to(torch.float32)
-        output = input1.triangular_solve(input2,upper=input3,transpose=input4,unitriangular=input5)
-        return output 
-
-    def npu_op_exec(self, input1, input2, input3, input4, input5): 
-        input1 = input1.to("npu") 
-        input2 = input2.to("npu") 
-        output = input1.triangular_solve(input2,upper=input3,transpose=input4,unitriangular=input5)
-        output = output.to("cpu") 
-        return output        
-
-    def test_triangular_solve_float32(self, device): 
-        npu_input1 = self.generate_data(0, 100, (2,3) , np.float32) 
-        npu_input2 = self.generate_data(0, 100, (2,2) , np.float32) 
-        npu_true = True 
-        npu_false = False
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_false) 
-        #npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_false) 
-        #self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_triangular_solve_float32_zhuanzhi(self, device): 
-        npu_input1 = self.generate_data(0, 100, (2,3) , np.float32) 
-        npu_input2 = self.generate_data(0, 100, (2,2) , np.float32) 
-        npu_true = True 
-        npu_false = False
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, npu_true, npu_true, npu_false) 
-        #npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_true, npu_true, npu_false) 
-        #self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_triangular_solve_float32_danwei(self, device): 
-        npu_input1 = self.generate_data(0, 100, (2,3) , np.float32) 
-        npu_input2 = self.generate_data(0, 100, (2,2) , np.float32) 
-        npu_true = True 
-        npu_false = False
-        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_true) 
-        #npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_true) 
-        #self.assertRtolEqual(cpu_output, npu_output) 
-
-    def test_triangular_solve_float16(self, device): 
-        npu_input1 = self.generate_data(0, 100, (2,3) , np.float16) 
-        npu_input2 = self.generate_data(0, 100, (2,2) , np.float16) 
-        npu_true = True 
-        npu_false = False
-        cpu_output = self.cpu_op_exec_float16(npu_input1, npu_input2, npu_true, npu_false, npu_true) 
-        #npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_true) 
-        #self.assertRtolEqual(cpu_output, npu_output) 
-
-instantiate_device_type_tests(TestTriangularSolve, globals(), except_for='cpu')
-if __name__ == '__main__':
-    torch.npu.set_device("npu:2") 
-    run_tests()
+#  Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#  Licensed under the BSD 3-Clause License  (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#  https://opensource.org/licenses/BSD-3-Clause
+#
+#  Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestTriangularSolve(TestCase):
+    def generate_data(self, min, max, shape, dtype): 
+        input1 = np.random.uniform(min, max, shape).astype(dtype)
+        npu_input1 = torch.from_numpy(input1) 
+        return npu_input1
+
+    def cpu_op_exec(self, input1, input2, input3, input4, input5): 
+        output = input1.triangular_solve(input2,upper=input3,transpose=input4,unitriangular=input5)
+        return output 
+
+    def cpu_op_exec_float16(self, input1, input2, input3, input4, input5): 
+        input1 = input1.to(torch.float32)
+        input2 = input2.to(torch.float32)
+        output = input1.triangular_solve(input2,upper=input3,transpose=input4,unitriangular=input5)
+        return output 
+
+    def npu_op_exec(self, input1, input2, input3, input4, input5): 
+        input1 = input1.to("npu") 
+        input2 = input2.to("npu") 
+        output = input1.triangular_solve(input2,upper=input3,transpose=input4,unitriangular=input5)
+        output = output.to("cpu") 
+        return output        
+
+    def test_triangular_solve_float32(self, device): 
+        npu_input1 = self.generate_data(0, 100, (2,3) , np.float32) 
+        npu_input2 = self.generate_data(0, 100, (2,2) , np.float32) 
+        npu_true = True 
+        npu_false = False
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_false) 
+        #npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_false) 
+        #self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_triangular_solve_float32_zhuanzhi(self, device): 
+        npu_input1 = self.generate_data(0, 100, (2,3) , np.float32) 
+        npu_input2 = self.generate_data(0, 100, (2,2) , np.float32) 
+        npu_true = True 
+        npu_false = False
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, npu_true, npu_true, npu_false) 
+        #npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_true, npu_true, npu_false) 
+        #self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_triangular_solve_float32_danwei(self, device): 
+        npu_input1 = self.generate_data(0, 100, (2,3) , np.float32) 
+        npu_input2 = self.generate_data(0, 100, (2,2) , np.float32) 
+        npu_true = True 
+        npu_false = False
+        cpu_output = self.cpu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_true) 
+        #npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_true) 
+        #self.assertRtolEqual(cpu_output, npu_output) 
+
+    def test_triangular_solve_float16(self, device): 
+        npu_input1 = self.generate_data(0, 100, (2,3) , np.float16) 
+        npu_input2 = self.generate_data(0, 100, (2,2) , np.float16) 
+        npu_true = True 
+        npu_false = False
+        cpu_output = self.cpu_op_exec_float16(npu_input1, npu_input2, npu_true, npu_false, npu_true) 
+        #npu_output = self.npu_op_exec(npu_input1, npu_input2, npu_true, npu_false, npu_true) 
+        #self.assertRtolEqual(cpu_output, npu_output) 
+
+instantiate_device_type_tests(TestTriangularSolve, globals(), except_for='cpu')
+if __name__ == '__main__':
+    torch.npu.set_device("npu:2") 
+    run_tests()
diff --git a/test/test_npu/test_type_as.py b/test/test_npu/test_type_as.py
index 67f9ad01c81909650ca6a753d2270778e84b15db..65e990117d38a071f9ddd0ab6c24148a62556724 100644
--- a/test/test_npu/test_type_as.py
+++ b/test/test_npu/test_type_as.py
@@ -1,70 +1,70 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import torch
-import numpy as np
-import sys
-import copy
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-
-class TestTypeAs(TestCase):        
-    def cpu_op_exec(self, input1, input2):
-        tensor1 = input1
-        tensor2 = input2
-        output = tensor1.type_as(tensor2)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, input2):
-        tensor1 = input1
-        tensor2 = input2
-        output = tensor1.type_as(tensor2)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output 
-
-    def test_type_as_int32_shape_format(self, device):
-        shape_format = [
-                [[np.float32, -1, (4, 3)],    [np.int32, -1, (4, 3)]],
-                [[np.float32, -1, (4, 3, 1)], [np.int32, -1, (4, 3, 1)]],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output, npu_output)   
-
-    def test_type_as_float32_shape_format(self, device):
-        shape_format = [
-                [[np.int32, -1, (8, 5)],    [np.float32, -1, (8, 5)]],
-                [[np.int32, -1, (9, 4, 2)], [np.float32, -1, (9, 4, 2)]],
-        ]
-
-        for item in shape_format:
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
-            npu_output = self.npu_op_exec(npu_input1, npu_input2)
-            self.assertRtolEqual(cpu_output, npu_output)  
-
-   
-instantiate_device_type_tests(TestTypeAs, globals(), except_for='cpu')
-if __name__ == "__main__":
-    torch.npu.set_device("npu:5")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import numpy as np
+import sys
+import copy
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+
+class TestTypeAs(TestCase):        
+    def cpu_op_exec(self, input1, input2):
+        tensor1 = input1
+        tensor2 = input2
+        output = tensor1.type_as(tensor2)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, input2):
+        tensor1 = input1
+        tensor2 = input2
+        output = tensor1.type_as(tensor2)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output 
+
+    def test_type_as_int32_shape_format(self, device):
+        shape_format = [
+                [[np.float32, -1, (4, 3)],    [np.int32, -1, (4, 3)]],
+                [[np.float32, -1, (4, 3, 1)], [np.int32, -1, (4, 3, 1)]],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output, npu_output)   
+
+    def test_type_as_float32_shape_format(self, device):
+        shape_format = [
+                [[np.int32, -1, (8, 5)],    [np.float32, -1, (8, 5)]],
+                [[np.int32, -1, (9, 4, 2)], [np.float32, -1, (9, 4, 2)]],
+        ]
+
+        for item in shape_format:
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_input2, npu_input2 = create_common_tensor(item[1], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, cpu_input2)
+            npu_output = self.npu_op_exec(npu_input1, npu_input2)
+            self.assertRtolEqual(cpu_output, npu_output)  
+
+   
+instantiate_device_type_tests(TestTypeAs, globals(), except_for='cpu')
+if __name__ == "__main__":
+    torch.npu.set_device("npu:5")
+    run_tests()
diff --git a/test/test_npu/test_unbind.py b/test/test_npu/test_unbind.py
index a843bec1dada8fc2e57b6d016e18cbd2051f6d94..19b5e7977da576c7c96cda166427736ba76ab7eb 100644
--- a/test/test_npu/test_unbind.py
+++ b/test/test_npu/test_unbind.py
@@ -1,84 +1,84 @@
-# Copyright (c) 2020, Huawei Technologies.All rights reserved.
-#
-# Licensed under the BSD 3-Clause License  (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# https://opensource.org/licenses/BSD-3-Clause
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# coding: utf-8
-
-import torch
-import numpy as np
-from common_utils import TestCase, run_tests
-from common_device_type import dtypes, instantiate_device_type_tests
-from util_test import create_common_tensor
-
-class TestUnbind(TestCase):
-
-    def cpu_op_exec(self, input1, dim):
-        output_tuple= torch.unbind(input1, dim=dim)
-        listtuple1 = []
-        for i in range(len(output_tuple)):
-            listtuple1 += list(output_tuple[i].contiguous().view(-1))
-        output = torch.tensor(listtuple1)
-        output = output.numpy()
-        return output
-
-    def npu_op_exec(self, input1, dim):
-        output_tuple = torch.unbind(input1, dim=dim)
-        listtuple1 = []
-        for i in range(len(output_tuple)):
-            listtuple1 += list(output_tuple[i].contiguous().view(-1))
-        output = torch.tensor(listtuple1)
-        output = output.to("cpu")
-        output = output.numpy()
-        return output
-    
-    def test_unbind_common_shape_format(self, device):
-        shape_format = [
-                [[np.float32, 0 , (1, 4, 2, 3)], 1],
-                [[np.float32, 0, (1, 3, 2, 3)], 2],
-                [[np.float32, 0, (3, 2, 3)], 2],
-                [[np.float32, 0, ( 2, 3)], 0],
-                [[np.float16, 0 , (1, 4, 2, 3)], 1],
-                [[np.float16, 0, (1, 3, 2, 3)], 3],
-                [[np.float16, 0, (3, 2, 3)], 2],
-                [[np.float16, 0, ( 2, 3)], 0],
-                [[np.int32, 0 , (1, 4, 2, 3)], 1],
-                [[np.int32, 0, (1, 3, 2, 3)], 3],
-                [[np.int32, 0, (3, 2, 3)], 2],
-                [[np.int32, 0, ( 2, 3)], 0],
-                [[np.int16, 0 , (1, 4, 2, 3)], 1],
-                [[np.int16, 0, (1, 3, 2, 3)], 3],
-                [[np.int16, 0, (3, 2, 3)], 2],
-                [[np.int16, 0, ( 2, 3)], 0],
-                [[np.int8, 0 , (1, 4, 2, 3)], 1],
-                [[np.int8, 0, (1, 3, 2, 3)], 3],
-                [[np.int8, 0, (3, 2, 3)], 2],
-                [[np.int8, 0, ( 2, 3)], 0],
-                [[np.uint8, 0 , (1, 4, 2, 3)], 1],
-                [[np.uint8, 0, (1, 3, 2, 3)], 3],
-                [[np.uint8, 0, (3, 2, 3)], 2],
-                [[np.uint8, 0, ( 2, 3)], 0],
-                [[np.int64, 0 , (1, 4, 2, 3)], 1],
-                [[np.int64, 0, (1, 3, 2, 3)], 3],
-                [[np.int64, 0, (3, 2, 3)], 2],
-                [[np.int64, 0, ( 2, 3)], 0]
-                ]
-        for item in shape_format:            
-            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
-            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
-            npu_output = self.npu_op_exec(npu_input1, item[1])
-            self.assertRtolEqual(cpu_output, npu_output)       
-    
-instantiate_device_type_tests(TestUnbind, globals(), except_for="cpu")
-if __name__ == "__main__":
-    torch.npu.set_device("npu:6")
-    run_tests()
+# Copyright (c) 2020, Huawei Technologies.All rights reserved.
+#
+# Licensed under the BSD 3-Clause License  (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# coding: utf-8
+
+import torch
+import numpy as np
+from common_utils import TestCase, run_tests
+from common_device_type import dtypes, instantiate_device_type_tests
+from util_test import create_common_tensor
+
+class TestUnbind(TestCase):
+
+    def cpu_op_exec(self, input1, dim):
+        output_tuple= torch.unbind(input1, dim=dim)
+        listtuple1 = []
+        for i in range(len(output_tuple)):
+            listtuple1 += list(output_tuple[i].contiguous().view(-1))
+        output = torch.tensor(listtuple1)
+        output = output.numpy()
+        return output
+
+    def npu_op_exec(self, input1, dim):
+        output_tuple = torch.unbind(input1, dim=dim)
+        listtuple1 = []
+        for i in range(len(output_tuple)):
+            listtuple1 += list(output_tuple[i].contiguous().view(-1))
+        output = torch.tensor(listtuple1)
+        output = output.to("cpu")
+        output = output.numpy()
+        return output
+    
+    def test_unbind_common_shape_format(self, device):
+        shape_format = [
+                [[np.float32, 0 , (1, 4, 2, 3)], 1],
+                [[np.float32, 0, (1, 3, 2, 3)], 2],
+                [[np.float32, 0, (3, 2, 3)], 2],
+                [[np.float32, 0, ( 2, 3)], 0],
+                [[np.float16, 0 , (1, 4, 2, 3)], 1],
+                [[np.float16, 0, (1, 3, 2, 3)], 3],
+                [[np.float16, 0, (3, 2, 3)], 2],
+                [[np.float16, 0, ( 2, 3)], 0],
+                [[np.int32, 0 , (1, 4, 2, 3)], 1],
+                [[np.int32, 0, (1, 3, 2, 3)], 3],
+                [[np.int32, 0, (3, 2, 3)], 2],
+                [[np.int32, 0, ( 2, 3)], 0],
+                [[np.int16, 0 , (1, 4, 2, 3)], 1],
+                [[np.int16, 0, (1, 3, 2, 3)], 3],
+                [[np.int16, 0, (3, 2, 3)], 2],
+                [[np.int16, 0, ( 2, 3)], 0],
+                [[np.int8, 0 , (1, 4, 2, 3)], 1],
+                [[np.int8, 0, (1, 3, 2, 3)], 3],
+                [[np.int8, 0, (3, 2, 3)], 2],
+                [[np.int8, 0, ( 2, 3)], 0],
+                [[np.uint8, 0 , (1, 4, 2, 3)], 1],
+                [[np.uint8, 0, (1, 3, 2, 3)], 3],
+                [[np.uint8, 0, (3, 2, 3)], 2],
+                [[np.uint8, 0, ( 2, 3)], 0],
+                [[np.int64, 0 , (1, 4, 2, 3)], 1],
+                [[np.int64, 0, (1, 3, 2, 3)], 3],
+                [[np.int64, 0, (3, 2, 3)], 2],
+                [[np.int64, 0, ( 2, 3)], 0]
+                ]
+        for item in shape_format:            
+            cpu_input1, npu_input1 = create_common_tensor(item[0], 1, 100)
+            cpu_output = self.cpu_op_exec(cpu_input1, item[1])
+            npu_output = self.npu_op_exec(npu_input1, item[1])
+            self.assertRtolEqual(cpu_output, npu_output)       
+    
+instantiate_device_type_tests(TestUnbind, globals(), except_for="cpu")
+if __name__ == "__main__":
+    torch.npu.set_device("npu:6")
+    run_tests()