From 3b82a40bcb897238a887a60632922d4055e4f5d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Tue, 18 Mar 2025 17:18:39 +0800 Subject: [PATCH 1/7] feat: unique --- include/csrc/functions.h | 1 + kernels/op_host/unique.cpp | 100 +++++ kernels/op_host/unique_tiling.h | 18 + kernels/op_kernel/unique.cpp | 26 ++ kernels/op_kernel/unique.h | 745 ++++++++++++++++++++++++++++++++ mx_driving/__init__.py | 2 + mx_driving/csrc/Unique.cpp | 32 ++ mx_driving/csrc/pybind.cpp | 3 + mx_driving/fused.py | 1 + mx_driving/ops/npu_unique.py | 19 + 10 files changed, 947 insertions(+) create mode 100644 kernels/op_host/unique.cpp create mode 100644 kernels/op_host/unique_tiling.h create mode 100644 kernels/op_kernel/unique.cpp create mode 100644 kernels/op_kernel/unique.h create mode 100644 mx_driving/csrc/Unique.cpp create mode 100644 mx_driving/ops/npu_unique.py diff --git a/include/csrc/functions.h b/include/csrc/functions.h index 1df7ed71..2508cf26 100644 --- a/include/csrc/functions.h +++ b/include/csrc/functions.h @@ -288,4 +288,5 @@ std::tuple calc_poly_start_end_sl(const at:: at::Tensor npu_subm_sparse_conv3d_with_key(const at::Tensor& ouidx_offset, const at::Tensor& valid_indices, const at::Tensor& weight, const at::Tensor& feature, int indices_number, at::IntArrayRef kernel_size); +at::Tensor npu_unique(const at::Tensor& input); #endif // CSRC_FUNCTIONS_H_ diff --git a/kernels/op_host/unique.cpp b/kernels/op_host/unique.cpp new file mode 100644 index 00000000..bbc0dfbf --- /dev/null +++ b/kernels/op_host/unique.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + * + */ +#include "unique_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" +constexpr size_t SYS_RSVD_WS_SIZE = 16 * 1024 * 1024; + + +namespace optiling { +static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context) { + UniqueTilingData tiling; + + constexpr uint16_t tileLength = 8192; + const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); + const gert::StorageShape* inputShape = context->GetInputShape(0); + uint32_t totalLength = 1; + for (int i = 0; i < dimNum; i++) { + totalLength *= inputShape->GetStorageShape().GetDim(i); + } + const uint32_t tileNum = (totalLength + tileLength - 1) / tileLength; + const uint16_t tailLength = totalLength % tileLength; + const auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + const uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); + const uint8_t blockNum = tileNum >= aivNum? aivNum: tileNum; + const uint32_t shortBlockTileNum = tileNum / blockNum; + const uint8_t longBlockNum = tileNum % blockNum; + const uint8_t shortBlockNum = blockNum - longBlockNum; + + tiling.set_totalLength(totalLength); + tiling.set_tileNum(tileNum); + tiling.set_shortBlockTileNum(shortBlockTileNum); + tiling.set_tailLength(tailLength); + tiling.set_blockNum(blockNum); + tiling.set_shortBlockNum(shortBlockNum); + + context->SetBlockDim(blockNum); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + // Workspace for IBSet/IBWait up to 8 times, and 2 times full data. 
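+    // usrSize below = a small flag area intended for the IBSet/IBWait inter-core handshakes and the
+    // per-block unique counters, plus two ping-pong copies of the tile-aligned data in the 8-byte
+    // {value, payload} records that Sort32/MrgSort operate on.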
+ uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + auto&& currentWorkspace = context->GetWorkspaceSizes(1); + if (currentWorkspace == nullptr) { + return ge::GRAPH_FAILED; + } + size_t usrSize = (aivNum * 8 + 1) * 8 * sizeof(uint32_t) + (tileNum * tileLength) * 2 * sizeof(float) * 2; + currentWorkspace[0] = usrSize + sysWorkspaceSize; + return ge::GRAPH_SUCCESS; +} +} + + +namespace ge { +static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context) { + const gert::Shape* x1_shape = context->GetInputShape(0); + gert::Shape* y_shape = context->GetOutputShape(0); + *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} + +static ge::graphStatus UniqueInferDtype(gert::InferDataTypeContext* context) +{ + auto inputDtype = context->GetInputDataType(0); + context->SetOutputDataType(0, inputDtype); + return ge::GRAPH_SUCCESS; +} +} + + +namespace ops { +class Unique : public OpDef { +public: + explicit Unique(const char* name) : OpDef(name) { + this->Input("input") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .IgnoreContiguous(); + this->Output("output") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("uniqueCnt") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->SetInferShape(ge::UniqueInferShape); + this->SetInferDataType(ge::UniqueInferDtype); + + this->AICore() + .SetTiling(optiling::UniqueTilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910_93"); + } +}; + +OP_ADD(Unique); +} diff --git a/kernels/op_host/unique_tiling.h b/kernels/op_host/unique_tiling.h new file mode 100644 index 00000000..f42483c4 --- /dev/null +++ b/kernels/op_host/unique_tiling.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + * + */ +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(UniqueTilingData) + TILING_DATA_FIELD_DEF(uint32_t, totalLength); + TILING_DATA_FIELD_DEF(uint32_t, tileNum); + TILING_DATA_FIELD_DEF(uint32_t, shortBlockTileNum); + TILING_DATA_FIELD_DEF(uint16_t, tailLength); + TILING_DATA_FIELD_DEF(uint8_t, blockNum); + TILING_DATA_FIELD_DEF(uint8_t, shortBlockNum); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(Unique, UniqueTilingData) +} diff --git a/kernels/op_kernel/unique.cpp b/kernels/op_kernel/unique.cpp new file mode 100644 index 00000000..e9cedcf5 --- /dev/null +++ b/kernels/op_kernel/unique.cpp @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. 
+ */ + +#include "kernel_operator.h" +#include "unique.h" + + +extern "C" __global__ __aicore__ void unique( + GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, GM_ADDR tiling) { + GM_ADDR usrWorkspace = AscendC::GetUserWorkspace(workspace); + GET_TILING_DATA(tiling_data, tiling); + TPipe pipe; + KernelUnique op(pipe); + op.Init(input, + output, + uniqueCnt, + usrWorkspace, + tiling_data.totalLength, + tiling_data.tileNum, + tiling_data.shortBlockTileNum, + tiling_data.tailLength, + tiling_data.blockNum, + tiling_data.shortBlockNum); + op.Process(); +} diff --git a/kernels/op_kernel/unique.h b/kernels/op_kernel/unique.h new file mode 100644 index 00000000..0960db13 --- /dev/null +++ b/kernels/op_kernel/unique.h @@ -0,0 +1,745 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + */ + +#include "kernel_operator.h" +using namespace AscendC; + + +namespace AscendC { +template +__aicore__ inline static Ta min(const Ta a, const Tb b) { + if (a > b) { + return b; + } + return a; +} + +template +__aicore__ inline static Ta max(const Ta a, const Tb b) { + if (a < b) { + return b; + } + return a; +} + +template +class KernelUnique { +public: + __aicore__ inline KernelUnique(TPipe &pipe) : pipe(pipe) {} + // Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. + __aicore__ inline size_t GetGlobalOffset(const uint32_t blockIdx); + __aicore__ inline void Init(GM_ADDR input, + GM_ADDR output, + GM_ADDR uniqueCnt, + GM_ADDR workspace, + const uint32_t totalLength, + const uint32_t totalTileNum, + const uint32_t shortBlockTileNum, + const uint16_t tailLength, + const uint8_t blockNum, + const uint8_t shortBlockNum); + __aicore__ inline void Process(); + +private: + __aicore__ inline void CopyIn(const int32_t progress); + __aicore__ inline void Elem32Sort(const int32_t progress); + __aicore__ inline void TileSort(const int32_t progress); + template + __aicore__ inline static void DataCopyGM2GM(const GlobalTensor &dst, + const GlobalTensor &src, + const LocalTensor &tmpLocal, + const int elemLength, + const int bufByteLength); + using GMSSrcList = GlobalTensor (&)[4]; + struct GMSParams { + int (&GMSLengths)[4]; + uint8_t &queNum; + LocalTensor (&&buffLocal)[5]; + }; + __aicore__ inline static void MrgSortGM(GlobalTensor &&dstGlobal, + GMSSrcList &srcList, + GMSParams ¶ms); + __aicore__ inline void BlockSortV2(); + __aicore__ inline void GlobalSortV2(); + __aicore__ inline static void ConsecutiveUnique(const LocalTensor &dstVal, + const LocalTensor &srcLocal, + const LocalTensor &shiftedLocal, + const LocalTensor &bitMask16, + const uint16_t elemLength, + uint64_t &tileUniqueCnt); + __aicore__ inline void TileUnique(const int32_t progress); + __aicore__ inline void CopyOut(); + +private: + static constexpr int32_t TILE_LENGTH = 8192; + // INF to fill the tail blank, so that tail is automatically removed by Compare in Unique. + static constexpr float FLOAT_INF = 3e+99; + // Indicates the factor converting float to data structure used by Sort32&MrgSort. 
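+    // Each sorted element is an 8-byte {float32 value, uint32 payload} record produced by Sort32,
+    // which is why element counts are scaled by SORT_DATATYPE_SIZE_FACTOR and byte sizes by SORT_DATATYPE_SIZE.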
+ static constexpr int16_t SORT_DATATYPE_SIZE = sizeof(float) + sizeof(uint32_t); // 8 + static constexpr int16_t SORT_DATATYPE_SIZE_FACTOR = SORT_DATATYPE_SIZE / sizeof(float); // 2 + static constexpr int32_t TILE_LEN_BYTE = TILE_LENGTH * SORT_DATATYPE_SIZE; // 8192 * 8 = 65536 + static constexpr int32_t TILE_LEN_ELEM = TILE_LENGTH * SORT_DATATYPE_SIZE_FACTOR; // 8192 * 2 = 16384 + static constexpr uint16_t VALID_QUE[5] = {0, 0, 0b11, 0b111, 0b1111}; // Converts queue number to validBit of MrgSort. + + TPipe &pipe; + TQue calcBuf[3]; + + GlobalTensor srcGlobal; + GlobalTensor srcGlobalAsUint; + GlobalTensor dstGlobal1; + GlobalTensor dstGlobal1As32; + GlobalTensor uniqueCntGlobal; + + GlobalTensor sortedBlock1; + GlobalTensor sortedBlock1AsInt; + GlobalTensor sortedBlock2; + GlobalTensor sortedBlock2AsInt; + GlobalTensor sortedGlobal1; + GlobalTensor sortedGlobal2; + + GlobalTensor IBSyncGlobal; + GlobalTensor blockUniqueCntGlobal; + + uint16_t syncWorkspaceSize; + uint8_t eventID {0}; + uint64_t accUniqueCnt {0}; + float lastTileUniqueVal; + + uint32_t totalLength; + uint32_t alignedTotalLength; + uint32_t tileNum; + uint32_t shortBlockTileNum; + uint16_t tailLength; + uint8_t blockNum; + uint8_t shortBlockNum; + + size_t globalOffset; // Offset of data for current block. + size_t blockLength; // Length of current block. + bool hasInfFlag {false}; +}; + +// Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. +template +__aicore__ inline size_t KernelUnique::GetGlobalOffset(const uint32_t blockIdx) { + // (shortBlockTileNum + 1) indicates longBlockTileNum. + const size_t offset = (this->shortBlockTileNum * min(this->shortBlockNum, blockIdx) + + (this->shortBlockTileNum + 1) * + (this->shortBlockNum >= blockIdx? 0: blockIdx - this->shortBlockNum)) * + TILE_LENGTH; + return offset; +} + +template +__aicore__ inline void KernelUnique::Init(GM_ADDR input, + GM_ADDR output, + GM_ADDR uniqueCnt, + GM_ADDR workspace, + const uint32_t totalLength, + const uint32_t totalTileNum, + const uint32_t shortBlockTileNum, + const uint16_t tailLength, + const uint8_t blockNum, + const uint8_t shortBlockNum) { + this->totalLength = totalLength; + this->alignedTotalLength = totalTileNum * TILE_LENGTH; + this->shortBlockTileNum = shortBlockTileNum; + this->tailLength = tailLength; + this->blockNum = blockNum; + this->shortBlockNum = shortBlockNum; + + const bool isShortBlock = this->shortBlockNum > GetBlockIdx(); + // (shortBlockTileNum + 1) indicates longBlockTileNum. + this->tileNum = isShortBlock? shortBlockTileNum: shortBlockTileNum + 1; + this->blockLength = this->tileNum * TILE_LENGTH; + this->globalOffset = GetGlobalOffset(GetBlockIdx()); + + srcGlobal.SetGlobalBuffer((__gm__ T *)input + globalOffset, this->blockLength); + srcGlobalAsUint.SetGlobalBuffer((__gm__ uint32_t *)input + globalOffset * sizeof(T) / sizeof(uint32_t), + this->blockLength * sizeof(T) / sizeof(uint32_t)); + dstGlobal1.SetGlobalBuffer((__gm__ T *)output, this->alignedTotalLength); + dstGlobal1As32.SetGlobalBuffer((__gm__ int32_t *)output, + this->alignedTotalLength * sizeof(T) / sizeof(int32_t)); + uniqueCntGlobal.SetGlobalBuffer((__gm__ int32_t *)uniqueCnt, 1); + + // sortedBlock is offsetted, and could only see the data that this block should process. 
+ sortedBlock1.SetGlobalBuffer((__gm__ float *)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock1AsInt.SetGlobalBuffer((__gm__ int32_t *)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2.SetGlobalBuffer((__gm__ float *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2AsInt.SetGlobalBuffer((__gm__ int32_t *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + // sortedGlobal could see all data in the workspace. + sortedGlobal1.SetGlobalBuffer((__gm__ float *)workspace, alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + sortedGlobal2.SetGlobalBuffer((__gm__ float *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR, + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + + // Buff size for syncronizing according to document of IBWait&IBSet. + this->syncWorkspaceSize = (blockNum * 32 + 1) * 8; + IBSyncGlobal.SetGlobalBuffer((__gm__ int32_t *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR * 2, + syncWorkspaceSize); + blockUniqueCntGlobal.SetGlobalBuffer((__gm__ uint32_t *)workspace + alignedTotalLength * 4 + syncWorkspaceSize, + (blockNum + 7) / 8 * 8); // Length aligned up to 32B. + + pipe.InitBuffer(calcBuf[0], 1, TILE_LEN_BYTE); + pipe.InitBuffer(calcBuf[1], 1, TILE_LEN_BYTE); + pipe.InitBuffer(calcBuf[2], 1, TILE_LEN_BYTE); +} + +template +__aicore__ inline void KernelUnique::Process() { + LocalTensor IBSyncLocal; + // Initialize sync buff. + if (GetBlockIdx() == 0) { + IBSyncLocal = calcBuf[0].AllocTensor(); + Duplicate(IBSyncLocal, 0, syncWorkspaceSize); + PipeBarrier(); + DataCopy(IBSyncGlobal, IBSyncLocal, syncWorkspaceSize); + PipeBarrier(); + calcBuf[0].FreeTensor(IBSyncLocal); + } // Initialize sync buff. + + // Sort within each tile. + for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) { + CopyIn(tileIdx); + Elem32Sort(tileIdx); + TileSort(tileIdx); + } + + if (GetBlockNum() > 1) { + if (this->tileNum > 1) { + BlockSortV2(); // Sort within each block. + } + + GlobalSortV2(); // Sort globally. + + PipeBarrier(); + SyncAll(); + PipeBarrier(); + } + + // Check if an inf value exists. If do, inf will be append to the result in TileUnique(). + if ((IsSameType::value || IsSameType::value || IsSameType::value) && + GetBlockIdx() == blockNum - 1) { + PipeBarrier(); + const uint32_t totalLength32BAligned = (totalLength * sizeof(T) + 31) / 32 * 32 / sizeof(float); + if (sortedGlobal1.GetValue((totalLength32BAligned - 1) * 2) == -FLOAT_INF) { + hasInfFlag = true; + } + } + + // Do unique in each block based on tiles. + for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) { + TileUnique(tileIdx); + } + + if (this->blockNum > 1) { + // Each block waits for its former block to upload blockUniqueCnt. + IBSyncLocal = calcBuf[0].AllocTensor(); + if (GetBlockIdx() != 0) { + PipeBarrier(); + IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx() - 1, eventID); + PipeBarrier(); + } + PipeBarrier(); + IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID); + PipeBarrier(); + calcBuf[0].FreeTensor(IBSyncLocal); + } + + // Gather result from every block. 
+ CopyOut(); +} + +template +__aicore__ inline void KernelUnique::CopyIn(const int32_t progress) { + LocalTensor srcLocal = calcBuf[0].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[2].AllocTensor(); + + // To process tail, fill the whole tile with INF, then cover it with tail. + int32_t castLen; // Valid length of the last block. + if ((progress != tileNum - 1) || (GetBlockIdx() != blockNum - 1) || tailLength == 0) { + // Must determine during compilation, otherwise we get a compilation error. + if constexpr (!IsSameType::value) { + DataCopy(srcLocal, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH); + } else { + DataCopy(sortedLocal2, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH); + } + castLen = TILE_LENGTH; + } else { + // Process tail. + LocalTensor srcAsUint = srcLocal.template ReinterpretCast(); + Duplicate(sortedLocal2, FLOAT_INF, TILE_LENGTH); + PipeBarrier(); + if constexpr (IsSameType::value) { + DataCopyPad(sortedLocal2, + srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, + {false, 0, 0, 0}); + } else if constexpr (sizeof(T) >= sizeof(float)) { + DataCopyPad(srcAsUint, + srcGlobalAsUint[progress * TILE_LENGTH * sizeof(T) / sizeof(uint32_t)], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, + {false, 0, 0, 0}); + } else { + DataCopyPad(srcLocal, + srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, + {false, 0, 0, 0}); + } + castLen = tailLength; + } + PipeBarrier(); + if constexpr (!IsSameType::value) { + if constexpr (sizeof(T) >= sizeof(float)) { + Cast(sortedLocal2, srcLocal, RoundMode::CAST_ROUND, castLen); + } else { + Cast(sortedLocal2, srcLocal, RoundMode::CAST_NONE, castLen); + } + PipeBarrier(); + } + Muls(sortedLocal2, sortedLocal2, (float)-1, TILE_LENGTH); + calcBuf[0].EnQue(srcLocal); + calcBuf[2].EnQue(sortedLocal2); +} + +template +__aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) { + LocalTensor srcLocal = calcBuf[0].DeQue(); + LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[2].DeQue(); + LocalTensor arithLocal = srcLocal.template ReinterpretCast()[TILE_LENGTH]; + + int32_t baseOffset = progress * TILE_LENGTH + this->globalOffset; // calc tileOffset + Duplicate(arithLocal, baseOffset, TILE_LENGTH); + PipeBarrier(); + + LocalTensor uidArray = arithLocal.template ReinterpretCast(); + // Max repeatTime of Sort32 is 255, which is exceeded because TILE_LENGTH is 8192. + constexpr uint8_t sort32BatchSize = 32; + constexpr uint8_t sort32RepeatLimit = 255; + int instrRepeatTime = 0; + int restLen = TILE_LENGTH; + while (restLen) { + int repTime = min(restLen / sort32BatchSize, sort32RepeatLimit); + Sort32(sortedLocal1[sort32BatchSize * sort32RepeatLimit * SORT_DATATYPE_SIZE_FACTOR * instrRepeatTime], + sortedLocal2[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], + uidArray[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], + repTime); + restLen -= repTime * sort32BatchSize; + instrRepeatTime++; + } + PipeBarrier(); + calcBuf[0].FreeTensor(srcLocal); + calcBuf[1].EnQue(sortedLocal1); + calcBuf[2].EnQue(sortedLocal2); +} + +template +__aicore__ inline void KernelUnique::TileSort(const int32_t progress) { + LocalTensor sortedLocal1 = calcBuf[1].DeQue(); + LocalTensor sortedLocal2 = calcBuf[2].DeQue(); + LocalTensor sortedQue[2] = {sortedLocal1, sortedLocal2}; + uint16_t currentQueLength = 32; // Initial queue length is 32 because data is from Sort32. 
+ uint16_t currentQueNum = TILE_LENGTH / currentQueLength; + bool switchFlag = false; + // Multiple MrgSort until we have one generally sorted tile. + while (currentQueLength < TILE_LENGTH) { + const uint16_t elementLengths[4] = { + currentQueLength, currentQueLength, currentQueLength, currentQueLength}; + const uint16_t fullMrgSortTime = currentQueNum / 4; + if (fullMrgSortTime > 0) { + MrgSort4Info params = {elementLengths, false, 0b1111, fullMrgSortTime}; + MrgSort(sortedQue[!switchFlag], + {sortedQue[switchFlag][0], + sortedQue[switchFlag][currentQueLength * 1 * 2], + sortedQue[switchFlag][currentQueLength * 2 * 2], + sortedQue[switchFlag][currentQueLength * 3 * 2]}, + params); + PipeBarrier(); + switchFlag = !switchFlag; + } + currentQueNum = fullMrgSortTime; + currentQueLength *= 4; + } + DataCopy(sortedBlock1[progress * TILE_LEN_ELEM], sortedQue[switchFlag], TILE_LEN_ELEM); + PipeBarrier(); + calcBuf[1].FreeTensor(sortedLocal1); + calcBuf[2].FreeTensor(sortedLocal2); +} + +template +template +__aicore__ inline void KernelUnique::DataCopyGM2GM(const GlobalTensor &dst, + const GlobalTensor &src, + const LocalTensor &tmpLocal, + const int elemLength, + const int bufByteLength) { + // Max byte size of DataCopyPad in one repeat is 65535. + int bufElemLength = min(bufByteLength, 65535) / sizeof(T1); + int restLen = elemLength; + while (restLen > 0) { + int copyLen = min(restLen, bufElemLength); + DataCopyPad(tmpLocal, + src[elemLength - restLen], + {1, static_cast(sizeof(T1) * copyLen), 0, 0}, + {false, 0, 0, 0}); + PipeBarrier(); + DataCopyPad(dst[elemLength - restLen], tmpLocal, {1, static_cast(sizeof(T1) * copyLen), 0, 0}); + PipeBarrier(); + restLen -= copyLen; + } +} + +template +__aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGlobal, + GMSSrcList &srcList, + GMSParams ¶ms) { + int restLen[4] {params.GMSLengths[0], params.GMSLengths[1], params.GMSLengths[2], params.GMSLengths[3]}; + int currentHead[4] {}; + int totalMrgLen {}; + uint8_t queNum = params.queNum; + // limited by MrgSort api constraint and mrgLocal size, we set different buffer length due to diffent queNum. + // mrgLocal contains 8192 elems, and MrgSort limits max 4095 elems per queue. 
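+    // BUFFER_LEN[queNum]: per-queue chunk size so that queNum * BUFFER_LEN fits the 8192-element
+    // mrgLocal buffer while staying within the 4095-element-per-queue MrgSort limit
+    // (2 queues -> 4095, 3 -> 2730, 4 -> 2048).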
+ constexpr int BUFFER_LEN[5] {0, 0, 4095, 2730, 2048}; + uint16_t sortedLen[4]; + uint16_t mrgLen[4] {}; + while (queNum > 1) { + int currentBufferLen = BUFFER_LEN[queNum]; + for (int i = 0; i < queNum; i++) { + mrgLen[i] = min(restLen[i], currentBufferLen); + } + // CopyIn + for (int i = 0; i < queNum; i++) { + DataCopyPad(params.buffLocal[i], + srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], + {1, static_cast(sizeof(float) * mrgLen[i] * SORT_DATATYPE_SIZE_FACTOR), 0, 0}, + {false, 0, 0, 0}); + } + PipeBarrier(); + // MrgSort + MrgSort4Info localParams {mrgLen, true, VALID_QUE[queNum], 1}; + MrgSort(params.buffLocal[4], + {params.buffLocal[0], params.buffLocal[1], params.buffLocal[2], params.buffLocal[3]}, + localParams); + PipeBarrier(); + GetMrgSortResult(sortedLen[0], sortedLen[1], sortedLen[2], sortedLen[3]); + const uint16_t localMrgLen = sortedLen[0] + sortedLen[1] + sortedLen[2] + sortedLen[3]; + // CopyOut + DataCopyPad(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], + params.buffLocal[4], + {1, static_cast(sizeof(float) * localMrgLen * SORT_DATATYPE_SIZE_FACTOR), 0, 0}); + PipeBarrier(); + // renew currentHead, restLen + totalMrgLen += localMrgLen; + for (int i = 0; i < queNum; i++) { + restLen[i] -= sortedLen[i]; + currentHead[i] += sortedLen[i]; + } + // Switch empty to tail + for (int i = 0; i < queNum; i++) { + if (restLen[i] == 0) { + for (int j = i; j < 3; j++) { + restLen[j] = restLen[j + 1]; + currentHead[j] = currentHead[j + 1]; + srcList[j] = srcList[j + 1]; + } + restLen[3] = 0; + queNum--; + break; // because ifExhaustedSuspension == true, there is 0 or 1 empty que. + } + } + } + // Process tail + for (int i = 0; i < params.queNum; i++) { + if (restLen[i] > 0) { + DataCopyGM2GM(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], + srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], + params.buffLocal[4], + restLen[i] * SORT_DATATYPE_SIZE_FACTOR, + TILE_LEN_BYTE); + break; + } + } +}; + +template +__aicore__ inline void KernelUnique::BlockSortV2() { + LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); + LocalTensor mrgLocal = calcBuf[2].AllocTensor(); + GlobalTensor sortedBlock[2] = {sortedBlock1, sortedBlock2}; + + // Each time merge 4 queues into 1 queue. 
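+    // bindTile is how many tiles each input queue spans; it grows x4 per pass, so after roughly
+    // ceil(log4(tileNum)) passes the whole block is one sorted run, ping-ponging between
+    // sortedBlock1 and sortedBlock2 in GM.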
+ constexpr uint8_t PREFIX_QUE_NUM = 4; + bool switchFlag = false; + GlobalTensor srcGlobal[4]; + LocalTensor buffLocal[5]; + int lengths[4]; + for (int bindTile = 1; bindTile < tileNum; bindTile *= PREFIX_QUE_NUM) { + for (int tileIdx = 0; tileIdx < tileNum; tileIdx += bindTile * PREFIX_QUE_NUM) { + int mrgTileNum = min(tileNum - tileIdx, bindTile * PREFIX_QUE_NUM); + uint8_t queNum = (mrgTileNum + bindTile - 1) / bindTile; + uint8_t lastQueTileNum = mrgTileNum % bindTile; + if (lastQueTileNum == 0) { + lastQueTileNum = bindTile; + } + // Init GMSSrcList, GMSParams + for (int i = 0; i < queNum; i++) { + srcGlobal[i] = sortedBlock[switchFlag][TILE_LEN_ELEM * (tileIdx + bindTile * i)]; + } + for (int i = 0; i < queNum - 1; i++) { + lengths[i] = TILE_LENGTH * bindTile; + } + lengths[queNum - 1] = TILE_LENGTH * lastQueTileNum; + GMSSrcList srcList {srcGlobal}; + GMSParams params {lengths, queNum, {sortedLocal1, sortedLocal1[TILE_LENGTH], + sortedLocal2, sortedLocal2[TILE_LENGTH], + mrgLocal}}; + MrgSortGM(sortedBlock[!switchFlag][TILE_LEN_ELEM * tileIdx], srcList, params); + } + switchFlag = !switchFlag; + } + if (switchFlag) { + DataCopyGM2GM(sortedBlock1, + sortedBlock2, + sortedLocal1, + blockLength * SORT_DATATYPE_SIZE_FACTOR, + TILE_LEN_BYTE); + } + calcBuf[0].FreeTensor(sortedLocal1); + calcBuf[1].FreeTensor(sortedLocal2); + calcBuf[2].FreeTensor(mrgLocal); +} + +template +__aicore__ inline void KernelUnique::GlobalSortV2() { + LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); + LocalTensor mrgLocal = calcBuf[2].AllocTensor(); + LocalTensor IBSyncLocal = sortedLocal2.ReinterpretCast(); + GlobalTensor sortedGlobal[2] = {sortedGlobal1, sortedGlobal2}; + + // Each time merge up to 4 queues into 1 queue. + constexpr uint8_t PREFIX_QUE_NUM = 4; + bool switchFlag = false; + GlobalTensor srcGlobal[4]; + int lengths[4]; + for (int bindBlock = 1; bindBlock < blockNum; bindBlock *= PREFIX_QUE_NUM, eventID++) { + for (int blockIdx = 0; blockIdx < blockNum; blockIdx += bindBlock * PREFIX_QUE_NUM) { + if ((GetBlockIdx() == blockIdx + bindBlock) || + (GetBlockIdx() == blockIdx + bindBlock * 2) || + (GetBlockIdx() == blockIdx + bindBlock * 3)) { + PipeBarrier(); + IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID); + PipeBarrier(); + } else if (GetBlockIdx() == blockIdx) { + int mrgBlockNum = min(blockNum - blockIdx, bindBlock * PREFIX_QUE_NUM); + uint8_t queNum = (mrgBlockNum + bindBlock - 1) / bindBlock; + for (int i = 1; i < queNum; i++) { + PipeBarrier(); + IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)blockIdx + (bindBlock * i), eventID); + PipeBarrier(); + } + // 判断最后一个队列包含了多少个block的数据. 
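+                // i.e. determine how many blocks' worth of data the last (possibly partial) queue holds.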
+ uint8_t lastQueBlockNum = mrgBlockNum % bindBlock; + if (lastQueBlockNum == 0) { + lastQueBlockNum = bindBlock; + } + // Init GMSSrcList, GMSParams + for (int i = 0; i < queNum; i++) { + srcGlobal[i] = sortedGlobal[switchFlag][GetGlobalOffset(blockIdx + bindBlock * i) * + SORT_DATATYPE_SIZE_FACTOR]; + } + for (int i = 0; i < queNum - 1; i++) { + lengths[i] = GetGlobalOffset(blockIdx + (bindBlock * (i + 1))) - + GetGlobalOffset(blockIdx + (bindBlock * i)); + } + lengths[queNum - 1] = GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)) + lastQueBlockNum) - + GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1))); + GMSSrcList srcList {srcGlobal}; + GMSParams params {lengths, queNum, {sortedLocal1, sortedLocal1[TILE_LENGTH], + sortedLocal2, sortedLocal2[TILE_LENGTH], + mrgLocal}}; + MrgSortGM(sortedGlobal[!switchFlag][GetGlobalOffset(blockIdx) * SORT_DATATYPE_SIZE_FACTOR], + srcList, + params); + } + } + switchFlag = !switchFlag; + } + // Switch valid workspace pointer. + if (switchFlag) { + GlobalTensor tmpGlobal = sortedGlobal1; + sortedGlobal1 = sortedGlobal2; + sortedGlobal2 = tmpGlobal; + + GlobalTensor tmpGlobal1 = sortedBlock1; + sortedBlock1 = sortedBlock2; + sortedBlock2 = tmpGlobal1; + + GlobalTensor tmpGlobal2 = sortedBlock1AsInt; + sortedBlock1AsInt = sortedBlock2AsInt; + sortedBlock2AsInt = tmpGlobal2; + } + calcBuf[0].FreeTensor(sortedLocal1); + calcBuf[1].FreeTensor(sortedLocal2); + calcBuf[2].FreeTensor(mrgLocal); +} + +template +__aicore__ inline void KernelUnique::ConsecutiveUnique(const LocalTensor &dstVal, + const LocalTensor &srcLocal, + const LocalTensor &shiftedLocal, + const LocalTensor &bitMask32, + const uint16_t elemLength, + uint64_t &tileUniqueCnt) { + LocalTensor bitMask16 = bitMask32.ReinterpretCast(); + uint64_t rsvdCnt = 0; + // Seperate Val and Idx. + GatherMask(dstVal, srcLocal, 1, false, 0, {1, static_cast((elemLength * 2 + 63) / 64), 8, 0}, rsvdCnt); + PipeBarrier(); + + // Gen bitMask to calc shifted array. + Duplicate(bitMask16, (uint16_t)0b1111111111111111, elemLength / 16); + PipeBarrier(); + bitMask16.SetValue(0, 0b1111111111111110); + + // Calc shifted array. + GatherMask(shiftedLocal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, rsvdCnt); + PipeBarrier(); + // Set the last val as INF in order to avoid dropping the last unique val. + shiftedLocal.SetValue(elemLength - 1, -FLOAT_INF); + + // Generate bitMask which represents unique numbers. + Compare(bitMask16, dstVal, shiftedLocal, CMPMODE::NE, (elemLength + 63) / 64 * 64); + PipeBarrier(); + + // Gather unique numbers and their idx. + GatherMask(dstVal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, tileUniqueCnt); + PipeBarrier(); +} + +template +__aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { + LocalTensor bitMask32 = calcBuf[0].AllocTensor(); + LocalTensor shiftedLocal = bitMask32[TILE_LENGTH].ReinterpretCast(); + LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[2].AllocTensor(); + LocalTensor uniqueCntLocal = shiftedLocal.ReinterpretCast(); + uint64_t tileUniqueCnt; + uint64_t tmpRsvdCnt; + + DataCopy(sortedLocal1, sortedBlock1[progress * TILE_LEN_ELEM], TILE_LEN_ELEM); + PipeBarrier(); + + ConsecutiveUnique(sortedLocal2, sortedLocal1, shiftedLocal, bitMask32, TILE_LENGTH, tileUniqueCnt); + // If has inf, append. 
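+    // A genuine +inf in the input is negated to -FLOAT_INF and therefore coincides with the tail
+    // padding, so ConsecutiveUnique strips it together with the padding; re-append one copy here.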
+ if ((progress == tileNum - 1) && hasInfFlag) { + sortedLocal2.SetValue(tileUniqueCnt, -FLOAT_INF); + tileUniqueCnt++; + } + + if (tileUniqueCnt != 0) { + accUniqueCnt += tileUniqueCnt; + if (progress != 0 && lastTileUniqueVal == sortedLocal2.GetValue(0)) { + accUniqueCnt--; + } + DataCopyPad(sortedBlock1[accUniqueCnt - tileUniqueCnt], + sortedLocal2, + {1, static_cast(sizeof(float) * tileUniqueCnt), 0, 0}); + PipeBarrier(); + lastTileUniqueVal = sortedLocal2.GetValue(tileUniqueCnt - 1); + } + + // upload uniqueCnt. + if (progress == tileNum - 1) { + uniqueCntLocal.SetValue(0, accUniqueCnt); + DataCopyPad(blockUniqueCntGlobal[GetBlockIdx()], + uniqueCntLocal, + {1, static_cast(sizeof(uint32_t) * 1), 0, 0}); + PipeBarrier(); + } + calcBuf[0].FreeTensor(shiftedLocal); + calcBuf[1].FreeTensor(sortedLocal1); + calcBuf[2].FreeTensor(sortedLocal2); +} + +template +__aicore__ inline void KernelUnique::CopyOut() { + LocalTensor copyLocal0 = calcBuf[0].AllocTensor(); + LocalTensor copyLocal1 = calcBuf[1].AllocTensor(); + LocalTensor IBSyncLocal = copyLocal1.ReinterpretCast(); + LocalTensor copyLocal2 = calcBuf[2].AllocTensor(); + + uint64_t lastAccUniqueCnt = 0; + // Get every blockUniqueCnt before current block. Calc accumulate uniqueCnt. + for (int i = 0; i < GetBlockIdx(); i++) { + uint64_t lastUniqueCnt = blockUniqueCntGlobal.GetValue(i); + lastAccUniqueCnt += lastUniqueCnt; + // If the first val of (i+1)th block equals to the last val of (i)th block, then they should be placed in + // the same position, accUniqueCnt--. + if (sortedGlobal1[GetGlobalOffset(i + 1) * SORT_DATATYPE_SIZE_FACTOR].GetValue(0) == + sortedGlobal1[GetGlobalOffset(i) * SORT_DATATYPE_SIZE_FACTOR].GetValue(lastUniqueCnt - 1)) { + lastAccUniqueCnt--; + } + } + uint64_t thisUniqueCnt = blockUniqueCntGlobal.GetValue(GetBlockIdx()); + + uint64_t restLen = thisUniqueCnt; + // max(Ta a, Tb b) function does not support compilation period calc. + constexpr uint64_t bottleneckTypeSize = sizeof(T) > sizeof(float)? sizeof(T): sizeof(float); + LocalTensor copyVal32 = copyLocal0.template ReinterpretCast(); + LocalTensor uniqueVal32 = copyLocal1.ReinterpretCast(); + // Copy unique values (and counts) from Workspace to dst. + while (restLen > 0) { + // DataCopyPad could copy up to 65535B in one cycle. And one tile may contain up to 65536B. So we should + // process multiple cycles. + uint64_t copyLen = min(restLen, TILE_LEN_BYTE / bottleneckTypeSize); + copyLen = min(copyLen, 65535 / bottleneckTypeSize); + if constexpr (!IsSameType::value) { + DataCopyPad(copyLocal1, + sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, + {false, 0, 0, 0}); + PipeBarrier(); + Muls(copyLocal1, copyLocal1, (float)-1, copyLen); + PipeBarrier(); + Cast(copyLocal0, copyLocal1, RoundMode::CAST_RINT, copyLen); + PipeBarrier(); + } else { + DataCopyPad(copyLocal0, + sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, + {false, 0, 0, 0}); + PipeBarrier(); + Muls(copyLocal0, copyLocal0, (float)-1, copyLen); + PipeBarrier(); + } + // DataCopyPad does not support int64_t. Copy them as uint32_t. 
+ if constexpr (sizeof(T) > 4) { + DataCopyPad(dstGlobal1As32[(lastAccUniqueCnt + thisUniqueCnt - restLen) * sizeof(T) / sizeof(uint32_t)], + copyVal32, + {1, static_cast(sizeof(T) * copyLen), 0, 0}); + } else { + DataCopyPad(dstGlobal1[lastAccUniqueCnt + thisUniqueCnt - restLen], + copyLocal0, + {1, static_cast(sizeof(T) * copyLen), 0, 0}); + } + PipeBarrier(); + restLen -= copyLen; + } + // Return unique count. + if (GetBlockIdx() == blockNum - 1) { + uniqueVal32.SetValue(0, lastAccUniqueCnt + thisUniqueCnt); + DataCopyPad(uniqueCntGlobal, uniqueVal32, {1, static_cast(sizeof(uint32_t) * 1), 0, 0}); + PipeBarrier(); + } + calcBuf[0].FreeTensor(copyLocal0); + calcBuf[1].FreeTensor(copyLocal1); +} +} // namespace AscendC diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py index f4cc2fa5..9f708370 100644 --- a/mx_driving/__init__.py +++ b/mx_driving/__init__.py @@ -61,6 +61,7 @@ __all__ = [ "diff_iou_rotated_2d", "nms3d_on_sight", "cartesian_to_frenet", + "npu_unique", ] import os @@ -124,6 +125,7 @@ from .ops.npu_batch_matmul import npu_batch_matmul from .ops.nms3d_on_sight import nms3d_on_sight from .ops.cartesian_to_frenet import cartesian_to_frenet from .patcher import default_patcher_builder, patch_mmcv_version +from .ops.npu_unique import npu_unique def _set_env(): diff --git a/mx_driving/csrc/Unique.cpp b/mx_driving/csrc/Unique.cpp new file mode 100644 index 00000000..ee7aa6a7 --- /dev/null +++ b/mx_driving/csrc/Unique.cpp @@ -0,0 +1,32 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
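+
+// Host-side wrapper: allocate a full-size output and a one-element count tensor, run the Unique
+// kernel synchronously, then narrow the output to the returned unique count; inputs with fewer
+// than two elements are simply cloned.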
+ +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" + + +at::Tensor npu_unique(const at::Tensor& input) { + if (input.numel() < 2) { + at::Tensor output = at::Tensor(input).clone(); + return output; + } else { + at::Tensor output = at::empty({input.numel()}, at::TensorOptions().dtype(input.dtype()).device(input.device())); + at::Tensor uniqueCnt = at::empty({1}, at::TensorOptions().dtype(at::ScalarType::Int).device(input.device())); + EXEC_NPU_CMD_SYNC(aclnnUnique, input, output, uniqueCnt); + int uniqueCount = uniqueCnt.item(); + return output.narrow(0, 0, uniqueCount); + } +} diff --git a/mx_driving/csrc/pybind.cpp b/mx_driving/csrc/pybind.cpp index fea4904f..41f5a35c 100644 --- a/mx_driving/csrc/pybind.cpp +++ b/mx_driving/csrc/pybind.cpp @@ -242,4 +242,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) // npu_subm_sparse_conv3d_with_key m.def("npu_subm_sparse_conv3d_with_key", &npu_subm_sparse_conv3d_with_key); + + // npu_unique + m.def("npu_unique", &npu_unique); } diff --git a/mx_driving/fused.py b/mx_driving/fused.py index 209f2046..123f751e 100644 --- a/mx_driving/fused.py +++ b/mx_driving/fused.py @@ -10,6 +10,7 @@ from .ops.multi_scale_deformable_attn import ( from .ops.npu_add_relu import npu_add_relu from .ops.npu_deformable_aggregation import npu_deformable_aggregation from .ops.npu_max_pool2d import npu_max_pool2d +from .ops.npu_unique import npu_unique warnings.warn( "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.", DeprecationWarning diff --git a/mx_driving/ops/npu_unique.py b/mx_driving/ops/npu_unique.py new file mode 100644 index 00000000..7ccde38a --- /dev/null +++ b/mx_driving/ops/npu_unique.py @@ -0,0 +1,19 @@ +""" +Copyright (c) OpenMMLab. All rights reserved. +Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. +Modification by: Huawei Developers +Modification date: 2024-06-04 +Modification Description: +Modification 1. Add support for Ascend NPU +""" +from torch.autograd import Function +import mx_driving._C + +class UniqueFunction(Function): + @staticmethod + # 'pylint: disable=too-many-arguments,huawei-too-many-arguments + def forward(ctx, input): + y = mx_driving._C.npu_unique(input) + return y + +npu_unique = UniqueFunction.apply -- Gitee From ae16b0bdf6bbef9f2f62d69843a784568d468769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Tue, 18 Mar 2025 17:33:21 +0800 Subject: [PATCH 2/7] bugfix: prevent updating deprecated api --- mx_driving/fused.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mx_driving/fused.py b/mx_driving/fused.py index 123f751e..209f2046 100644 --- a/mx_driving/fused.py +++ b/mx_driving/fused.py @@ -10,7 +10,6 @@ from .ops.multi_scale_deformable_attn import ( from .ops.npu_add_relu import npu_add_relu from .ops.npu_deformable_aggregation import npu_deformable_aggregation from .ops.npu_max_pool2d import npu_max_pool2d -from .ops.npu_unique import npu_unique warnings.warn( "This package is deprecated and will be removed in future. 
Please use `mx_driving.api` instead.", DeprecationWarning -- Gitee From 07871465f587b4a5112bc0b4f09b2b3f34e5b19c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Wed, 19 Mar 2025 10:45:14 +0800 Subject: [PATCH 3/7] feat: Add npu_unique tests --- tests/torch/test_npu_unique.py | 67 ++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 tests/torch/test_npu_unique.py diff --git a/tests/torch/test_npu_unique.py b/tests/torch/test_npu_unique.py new file mode 100644 index 00000000..8cc8f81c --- /dev/null +++ b/tests/torch/test_npu_unique.py @@ -0,0 +1,67 @@ +""" +Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +""" +import torch +import torch_npu +from data_cache import golden_data_cache +from torch_npu.testing.testcase import TestCase, run_tests + +from mx_driving import npu_unique +import random +import os + + +def gen_inputs(input_shape, dtype): + input_tensor = torch.randint(-256, 256, input_shape, dtype=dtype) + return input_tensor + +def gen_cpu_outputs(input_tensor): + cpu_result = torch.unique(input_tensor) + return cpu_result + +def gen_npu_outputs(input_tensor): + npu_result = npu_unique(input_tensor.npu()) + return npu_result.cpu() + + +class TestNpuUnique(TestCase): + def test_bfloat16(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.bfloat16) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().to(torch.float32).numpy(), + npu_result.cpu().detach().to(torch.float32).numpy()) + + def test_float16(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.float16) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_float32(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.float32) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_int16(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int16) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_int32(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int32) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_int64(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int64) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + +if __name__ == "__main__": + run_tests() -- Gitee From 2e3ae6a8db8d344a5356612fff174be66b6e883c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Thu, 20 Mar 2025 15:08:01 +0800 Subject: [PATCH 4/7] feat: unique doc --- docs/api/README.md | 6 +++++- docs/api/context/npu_unique.md | 37 ++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 docs/api/context/npu_unique.md diff 
--git a/docs/api/README.md b/docs/api/README.md index 071598bb..313c5029 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -54,7 +54,7 @@ Y - 采样 + 采样 roipoint_pool3d Y @@ -94,6 +94,10 @@ grid_sampler2d_v2 N + + npu_unique + N + 体素化 voxelization diff --git a/docs/api/context/npu_unique.md b/docs/api/context/npu_unique.md new file mode 100644 index 00000000..843be385 --- /dev/null +++ b/docs/api/context/npu_unique.md @@ -0,0 +1,37 @@ +## npu_unique[beta] + +### 接口原型 + +```python +mx_driving.npu_unique(Tensor input) -> Tensor +``` + +### 功能描述 + +从小到大排序并去重. 提供一个输入`tensor`, 对`tensor`的输入进行排序, 并去掉`tensor`中的重复元素. + +### 参数说明 + +- `input(Tensor)`:表示输入张量,数据类型支持 `float16`, `bfloat16`, `int16`, `float32`, `int32`, `int64`. shape 为 1 ~ 8 维的任意shape. + +### 返回值 + +- `output(Tensor)`:表示输出张量,数据类型支持 `float16`, `bfloat16`, `int16`, `float32`, `int32`, `int64`, 与输入张量`input`一致. shape 为 1 维。 + +### 约束说明 + +- int32, int64输入时, 每个元素的值须在[-16777216, 16777216] (±2^24)之间,否则会引入精度损失. + +### 支持的型号 + +- Atlas A2 训练系列产品 + +### 调用示例 + +```python +import torch, torch_npu +from mx_driving import npu_unique + +rand_tensor = torch.rand(559794, dtype=torch.int64) +output = npu_unique(rand_tensor.npu()) +``` -- Gitee From b83b41ba3dc2fec616195beefa17e299035934d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Mon, 24 Mar 2025 10:30:27 +0800 Subject: [PATCH 5/7] bugfix: cleancode --- kernels/op_host/unique.cpp | 88 +++---- kernels/op_host/unique_tiling.h | 12 +- kernels/op_kernel/unique.h | 427 ++++++++++++++------------------ mx_driving/ops/npu_unique.py | 6 +- tests/torch/test_npu_unique.py | 7 +- 5 files changed, 249 insertions(+), 291 deletions(-) diff --git a/kernels/op_host/unique.cpp b/kernels/op_host/unique.cpp index bbc0dfbf..f37169d9 100644 --- a/kernels/op_host/unique.cpp +++ b/kernels/op_host/unique.cpp @@ -2,57 +2,59 @@ * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. * */ -#include "unique_tiling.h" #include "register/op_def_registry.h" #include "tiling/platform/platform_ascendc.h" +#include "unique_tiling.h" constexpr size_t SYS_RSVD_WS_SIZE = 16 * 1024 * 1024; namespace optiling { -static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context) { - UniqueTilingData tiling; +static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context) +{ + UniqueTilingData tiling; - constexpr uint16_t tileLength = 8192; - const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); - const gert::StorageShape* inputShape = context->GetInputShape(0); - uint32_t totalLength = 1; - for (int i = 0; i < dimNum; i++) { - totalLength *= inputShape->GetStorageShape().GetDim(i); - } - const uint32_t tileNum = (totalLength + tileLength - 1) / tileLength; - const uint16_t tailLength = totalLength % tileLength; - const auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - const uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); - const uint8_t blockNum = tileNum >= aivNum? 
aivNum: tileNum; - const uint32_t shortBlockTileNum = tileNum / blockNum; - const uint8_t longBlockNum = tileNum % blockNum; - const uint8_t shortBlockNum = blockNum - longBlockNum; + constexpr uint16_t tileLength = 8192; + const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); + const gert::StorageShape* inputShape = context->GetInputShape(0); + uint32_t totalLength = 1; + for (int i = 0; i < dimNum; i++) { + totalLength *= inputShape->GetStorageShape().GetDim(i); + } + const uint32_t tileNum = (totalLength + tileLength - 1) / tileLength; + const uint16_t tailLength = totalLength % tileLength; + const auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + const uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); + const uint8_t blockNum = tileNum >= aivNum ? aivNum : tileNum; + const uint32_t shortBlockTileNum = tileNum / blockNum; + const uint8_t longBlockNum = tileNum % blockNum; + const uint8_t shortBlockNum = blockNum - longBlockNum; - tiling.set_totalLength(totalLength); - tiling.set_tileNum(tileNum); - tiling.set_shortBlockTileNum(shortBlockTileNum); - tiling.set_tailLength(tailLength); - tiling.set_blockNum(blockNum); - tiling.set_shortBlockNum(shortBlockNum); + tiling.set_totalLength(totalLength); + tiling.set_tileNum(tileNum); + tiling.set_shortBlockTileNum(shortBlockTileNum); + tiling.set_tailLength(tailLength); + tiling.set_blockNum(blockNum); + tiling.set_shortBlockNum(shortBlockNum); - context->SetBlockDim(blockNum); - tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); - context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); - // Workspace for IBSet/IBWait up to 8 times, and 2 times full data. - uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); - auto&& currentWorkspace = context->GetWorkspaceSizes(1); - if (currentWorkspace == nullptr) { - return ge::GRAPH_FAILED; - } - size_t usrSize = (aivNum * 8 + 1) * 8 * sizeof(uint32_t) + (tileNum * tileLength) * 2 * sizeof(float) * 2; - currentWorkspace[0] = usrSize + sysWorkspaceSize; - return ge::GRAPH_SUCCESS; -} + context->SetBlockDim(blockNum); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + // Workspace for IBSet/IBWait up to 8 times, and 2 times full data. 
+ uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + auto&& currentWorkspace = context->GetWorkspaceSizes(1); + if (currentWorkspace == nullptr) { + return ge::GRAPH_FAILED; + } + size_t usrSize = (aivNum * 8 + 1) * 8 * sizeof(uint32_t) + (tileNum * tileLength) * 2 * sizeof(float) * 2; + currentWorkspace[0] = usrSize + sysWorkspaceSize; + return ge::GRAPH_SUCCESS; } +} // namespace optiling namespace ge { -static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context) { +static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context) +{ const gert::Shape* x1_shape = context->GetInputShape(0); gert::Shape* y_shape = context->GetOutputShape(0); *y_shape = *x1_shape; @@ -65,13 +67,14 @@ static ge::graphStatus UniqueInferDtype(gert::InferDataTypeContext* context) context->SetOutputDataType(0, inputDtype); return ge::GRAPH_SUCCESS; } -} +} // namespace ge namespace ops { class Unique : public OpDef { public: - explicit Unique(const char* name) : OpDef(name) { + explicit Unique(const char* name) : OpDef(name) + { this->Input("input") .ParamType(REQUIRED) .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64}) @@ -89,12 +92,11 @@ public: this->SetInferShape(ge::UniqueInferShape); this->SetInferDataType(ge::UniqueInferDtype); - this->AICore() - .SetTiling(optiling::UniqueTilingFunc); + this->AICore().SetTiling(optiling::UniqueTilingFunc); this->AICore().AddConfig("ascend910b"); this->AICore().AddConfig("ascend910_93"); } }; OP_ADD(Unique); -} +} // namespace ops diff --git a/kernels/op_host/unique_tiling.h b/kernels/op_host/unique_tiling.h index f42483c4..aed40f6f 100644 --- a/kernels/op_host/unique_tiling.h +++ b/kernels/op_host/unique_tiling.h @@ -6,12 +6,12 @@ namespace optiling { BEGIN_TILING_DATA_DEF(UniqueTilingData) - TILING_DATA_FIELD_DEF(uint32_t, totalLength); - TILING_DATA_FIELD_DEF(uint32_t, tileNum); - TILING_DATA_FIELD_DEF(uint32_t, shortBlockTileNum); - TILING_DATA_FIELD_DEF(uint16_t, tailLength); - TILING_DATA_FIELD_DEF(uint8_t, blockNum); - TILING_DATA_FIELD_DEF(uint8_t, shortBlockNum); + TILING_DATA_FIELD_DEF(uint32_t, totalLength); + TILING_DATA_FIELD_DEF(uint32_t, tileNum); + TILING_DATA_FIELD_DEF(uint32_t, shortBlockTileNum); + TILING_DATA_FIELD_DEF(uint16_t, tailLength); + TILING_DATA_FIELD_DEF(uint8_t, blockNum); + TILING_DATA_FIELD_DEF(uint8_t, shortBlockNum); END_TILING_DATA_DEF; REGISTER_TILING_DATA_CLASS(Unique, UniqueTilingData) diff --git a/kernels/op_kernel/unique.h b/kernels/op_kernel/unique.h index 0960db13..9470d73b 100644 --- a/kernels/op_kernel/unique.h +++ b/kernels/op_kernel/unique.h @@ -8,7 +8,8 @@ using namespace AscendC; namespace AscendC { template -__aicore__ inline static Ta min(const Ta a, const Tb b) { +__aicore__ inline Ta min(const Ta a, const Tb b) +{ if (a > b) { return b; } @@ -16,131 +17,113 @@ __aicore__ inline static Ta min(const Ta a, const Tb b) { } template -__aicore__ inline static Ta max(const Ta a, const Tb b) { +__aicore__ inline Ta max(const Ta a, const Tb b) +{ if (a < b) { return b; } return a; } -template +template class KernelUnique { public: - __aicore__ inline KernelUnique(TPipe &pipe) : pipe(pipe) {} + __aicore__ inline KernelUnique(TPipe& pipe) : pipe(pipe) {} // Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. 
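    // Tiles are split as evenly as possible: the first shortBlockNum cores take shortBlockTileNum
    // tiles each, the remaining cores take one extra. E.g. tileNum=10, blockNum=4 gives 2/2/3/3
    // tiles and offsets of 0, 2, 4 and 7 tiles (x TILE_LENGTH elements).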
__aicore__ inline size_t GetGlobalOffset(const uint32_t blockIdx); - __aicore__ inline void Init(GM_ADDR input, - GM_ADDR output, - GM_ADDR uniqueCnt, - GM_ADDR workspace, - const uint32_t totalLength, - const uint32_t totalTileNum, - const uint32_t shortBlockTileNum, - const uint16_t tailLength, - const uint8_t blockNum, - const uint8_t shortBlockNum); + __aicore__ inline void Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, + const uint32_t totalLength, const uint32_t totalTileNum, const uint32_t shortBlockTileNum, + const uint16_t tailLength, const uint8_t blockNum, const uint8_t shortBlockNum); __aicore__ inline void Process(); private: __aicore__ inline void CopyIn(const int32_t progress); __aicore__ inline void Elem32Sort(const int32_t progress); __aicore__ inline void TileSort(const int32_t progress); - template - __aicore__ inline static void DataCopyGM2GM(const GlobalTensor &dst, - const GlobalTensor &src, - const LocalTensor &tmpLocal, - const int elemLength, - const int bufByteLength); + template + __aicore__ inline static void DataCopyGM2GM(const GlobalTensor& dst, const GlobalTensor& src, + const LocalTensor& tmpLocal, const int elemLength, const int bufByteLength); using GMSSrcList = GlobalTensor (&)[4]; struct GMSParams { int (&GMSLengths)[4]; - uint8_t &queNum; + uint8_t& queNum; LocalTensor (&&buffLocal)[5]; }; - __aicore__ inline static void MrgSortGM(GlobalTensor &&dstGlobal, - GMSSrcList &srcList, - GMSParams ¶ms); + __aicore__ inline static void MrgSortGM(GlobalTensor&& dstGlobal, GMSSrcList& srcList, GMSParams& params); __aicore__ inline void BlockSortV2(); __aicore__ inline void GlobalSortV2(); - __aicore__ inline static void ConsecutiveUnique(const LocalTensor &dstVal, - const LocalTensor &srcLocal, - const LocalTensor &shiftedLocal, - const LocalTensor &bitMask16, - const uint16_t elemLength, - uint64_t &tileUniqueCnt); + __aicore__ inline static void ConsecutiveUnique(const LocalTensor& dstVal, + const LocalTensor& srcLocal, const LocalTensor& shiftedLocal, + const LocalTensor& bitMask16, const uint16_t elemLength, uint64_t& tileUniqueCnt); __aicore__ inline void TileUnique(const int32_t progress); __aicore__ inline void CopyOut(); private: - static constexpr int32_t TILE_LENGTH = 8192; + static constexpr int32_t TILE_LENGTH = 8192; // INF to fill the tail blank, so that tail is automatically removed by Compare in Unique. - static constexpr float FLOAT_INF = 3e+99; + static constexpr float FLOAT_INF = 3e+99; // Indicates the factor converting float to data structure used by Sort32&MrgSort. - static constexpr int16_t SORT_DATATYPE_SIZE = sizeof(float) + sizeof(uint32_t); // 8 - static constexpr int16_t SORT_DATATYPE_SIZE_FACTOR = SORT_DATATYPE_SIZE / sizeof(float); // 2 - static constexpr int32_t TILE_LEN_BYTE = TILE_LENGTH * SORT_DATATYPE_SIZE; // 8192 * 8 = 65536 - static constexpr int32_t TILE_LEN_ELEM = TILE_LENGTH * SORT_DATATYPE_SIZE_FACTOR; // 8192 * 2 = 16384 - static constexpr uint16_t VALID_QUE[5] = {0, 0, 0b11, 0b111, 0b1111}; // Converts queue number to validBit of MrgSort. 
- - TPipe &pipe; - TQue calcBuf[3]; - - GlobalTensor srcGlobal; - GlobalTensor srcGlobalAsUint; - GlobalTensor dstGlobal1; - GlobalTensor dstGlobal1As32; - GlobalTensor uniqueCntGlobal; - - GlobalTensor sortedBlock1; - GlobalTensor sortedBlock1AsInt; - GlobalTensor sortedBlock2; - GlobalTensor sortedBlock2AsInt; - GlobalTensor sortedGlobal1; - GlobalTensor sortedGlobal2; - - GlobalTensor IBSyncGlobal; - GlobalTensor blockUniqueCntGlobal; - - uint16_t syncWorkspaceSize; - uint8_t eventID {0}; - uint64_t accUniqueCnt {0}; - float lastTileUniqueVal; - - uint32_t totalLength; - uint32_t alignedTotalLength; - uint32_t tileNum; - uint32_t shortBlockTileNum; - uint16_t tailLength; - uint8_t blockNum; - uint8_t shortBlockNum; - - size_t globalOffset; // Offset of data for current block. - size_t blockLength; // Length of current block. - bool hasInfFlag {false}; + static constexpr int16_t SORT_DATATYPE_SIZE = sizeof(float) + sizeof(uint32_t); // 8 + static constexpr int16_t SORT_DATATYPE_SIZE_FACTOR = SORT_DATATYPE_SIZE / sizeof(float); // 2 + static constexpr int32_t TILE_LEN_BYTE = TILE_LENGTH * SORT_DATATYPE_SIZE; // 8192 * 8 = 65536 + static constexpr int32_t TILE_LEN_ELEM = TILE_LENGTH * SORT_DATATYPE_SIZE_FACTOR; // 8192 * 2 = 16384 + static constexpr uint16_t VALID_QUE[5] = { + 0, 0, 0b11, 0b111, 0b1111}; // Converts queue number to validBit of MrgSort. + + TPipe& pipe; + TQue calcBuf[3]; + + GlobalTensor srcGlobal; + GlobalTensor srcGlobalAsUint; + GlobalTensor dstGlobal1; + GlobalTensor dstGlobal1As32; + GlobalTensor uniqueCntGlobal; + + GlobalTensor sortedBlock1; + GlobalTensor sortedBlock1AsInt; + GlobalTensor sortedBlock2; + GlobalTensor sortedBlock2AsInt; + GlobalTensor sortedGlobal1; + GlobalTensor sortedGlobal2; + + GlobalTensor IBSyncGlobal; + GlobalTensor blockUniqueCntGlobal; + + uint16_t syncWorkspaceSize; + uint8_t eventID {0}; + uint64_t accUniqueCnt {0}; + float lastTileUniqueVal; + + uint32_t totalLength; + uint32_t alignedTotalLength; + uint32_t tileNum; + uint32_t shortBlockTileNum; + uint16_t tailLength; + uint8_t blockNum; + uint8_t shortBlockNum; + + size_t globalOffset; // Offset of data for current block. + size_t blockLength; // Length of current block. + bool hasInfFlag {false}; }; // Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. -template -__aicore__ inline size_t KernelUnique::GetGlobalOffset(const uint32_t blockIdx) { +template +__aicore__ inline size_t KernelUnique::GetGlobalOffset(const uint32_t blockIdx) +{ // (shortBlockTileNum + 1) indicates longBlockTileNum. - const size_t offset = (this->shortBlockTileNum * min(this->shortBlockNum, blockIdx) + - (this->shortBlockTileNum + 1) * - (this->shortBlockNum >= blockIdx? 0: blockIdx - this->shortBlockNum)) * - TILE_LENGTH; + const size_t offset = + (this->shortBlockTileNum * min(this->shortBlockNum, blockIdx) + + (this->shortBlockTileNum + 1) * (this->shortBlockNum >= blockIdx ? 
0 : blockIdx - this->shortBlockNum)) * + TILE_LENGTH; return offset; } -template -__aicore__ inline void KernelUnique::Init(GM_ADDR input, - GM_ADDR output, - GM_ADDR uniqueCnt, - GM_ADDR workspace, - const uint32_t totalLength, - const uint32_t totalTileNum, - const uint32_t shortBlockTileNum, - const uint16_t tailLength, - const uint8_t blockNum, - const uint8_t shortBlockNum) { +template +__aicore__ inline void KernelUnique::Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, + const uint32_t totalLength, const uint32_t totalTileNum, const uint32_t shortBlockTileNum, + const uint16_t tailLength, const uint8_t blockNum, const uint8_t shortBlockNum) +{ this->totalLength = totalLength; this->alignedTotalLength = totalTileNum * TILE_LENGTH; this->shortBlockTileNum = shortBlockTileNum; @@ -150,48 +133,48 @@ __aicore__ inline void KernelUnique::Init(GM_ADDR input, const bool isShortBlock = this->shortBlockNum > GetBlockIdx(); // (shortBlockTileNum + 1) indicates longBlockTileNum. - this->tileNum = isShortBlock? shortBlockTileNum: shortBlockTileNum + 1; + this->tileNum = isShortBlock ? shortBlockTileNum : shortBlockTileNum + 1; this->blockLength = this->tileNum * TILE_LENGTH; this->globalOffset = GetGlobalOffset(GetBlockIdx()); - srcGlobal.SetGlobalBuffer((__gm__ T *)input + globalOffset, this->blockLength); - srcGlobalAsUint.SetGlobalBuffer((__gm__ uint32_t *)input + globalOffset * sizeof(T) / sizeof(uint32_t), - this->blockLength * sizeof(T) / sizeof(uint32_t)); - dstGlobal1.SetGlobalBuffer((__gm__ T *)output, this->alignedTotalLength); - dstGlobal1As32.SetGlobalBuffer((__gm__ int32_t *)output, - this->alignedTotalLength * sizeof(T) / sizeof(int32_t)); - uniqueCntGlobal.SetGlobalBuffer((__gm__ int32_t *)uniqueCnt, 1); + srcGlobal.SetGlobalBuffer((__gm__ T*)input + globalOffset, this->blockLength); + srcGlobalAsUint.SetGlobalBuffer((__gm__ uint32_t*)input + globalOffset * sizeof(T) / sizeof(uint32_t), + this->blockLength * sizeof(T) / sizeof(uint32_t)); + dstGlobal1.SetGlobalBuffer((__gm__ T*)output, this->alignedTotalLength); + dstGlobal1As32.SetGlobalBuffer((__gm__ int32_t*)output, this->alignedTotalLength * sizeof(T) / sizeof(int32_t)); + uniqueCntGlobal.SetGlobalBuffer((__gm__ int32_t*)uniqueCnt, 1); // sortedBlock is offsetted, and could only see the data that this block should process. 
- sortedBlock1.SetGlobalBuffer((__gm__ float *)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, - this->blockLength * SORT_DATATYPE_SIZE_FACTOR); - sortedBlock1AsInt.SetGlobalBuffer((__gm__ int32_t *)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, - this->blockLength * SORT_DATATYPE_SIZE_FACTOR); - sortedBlock2.SetGlobalBuffer((__gm__ float *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + - globalOffset * SORT_DATATYPE_SIZE_FACTOR, - this->blockLength * SORT_DATATYPE_SIZE_FACTOR); - sortedBlock2AsInt.SetGlobalBuffer((__gm__ int32_t *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + - globalOffset * SORT_DATATYPE_SIZE_FACTOR, - this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock1.SetGlobalBuffer((__gm__ float*)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock1AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2.SetGlobalBuffer((__gm__ float*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); // sortedGlobal could see all data in the workspace. - sortedGlobal1.SetGlobalBuffer((__gm__ float *)workspace, alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); - sortedGlobal2.SetGlobalBuffer((__gm__ float *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR, - alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + sortedGlobal1.SetGlobalBuffer((__gm__ float*)workspace, alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + sortedGlobal2.SetGlobalBuffer((__gm__ float*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR, + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); // Buff size for syncronizing according to document of IBWait&IBSet. this->syncWorkspaceSize = (blockNum * 32 + 1) * 8; - IBSyncGlobal.SetGlobalBuffer((__gm__ int32_t *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR * 2, - syncWorkspaceSize); - blockUniqueCntGlobal.SetGlobalBuffer((__gm__ uint32_t *)workspace + alignedTotalLength * 4 + syncWorkspaceSize, - (blockNum + 7) / 8 * 8); // Length aligned up to 32B. + IBSyncGlobal.SetGlobalBuffer( + (__gm__ int32_t*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR * 2, syncWorkspaceSize); + blockUniqueCntGlobal.SetGlobalBuffer((__gm__ uint32_t*)workspace + alignedTotalLength * 4 + syncWorkspaceSize, + (blockNum + 7) / 8 * 8); // Length aligned up to 32B. pipe.InitBuffer(calcBuf[0], 1, TILE_LEN_BYTE); pipe.InitBuffer(calcBuf[1], 1, TILE_LEN_BYTE); pipe.InitBuffer(calcBuf[2], 1, TILE_LEN_BYTE); } -template -__aicore__ inline void KernelUnique::Process() { +template +__aicore__ inline void KernelUnique::Process() +{ LocalTensor IBSyncLocal; // Initialize sync buff. if (GetBlockIdx() == 0) { @@ -201,7 +184,7 @@ __aicore__ inline void KernelUnique::Process() { DataCopy(IBSyncGlobal, IBSyncLocal, syncWorkspaceSize); PipeBarrier(); calcBuf[0].FreeTensor(IBSyncLocal); - } // Initialize sync buff. + } // Initialize sync buff. // Sort within each tile. 
for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) { @@ -212,10 +195,10 @@ __aicore__ inline void KernelUnique::Process() { if (GetBlockNum() > 1) { if (this->tileNum > 1) { - BlockSortV2(); // Sort within each block. + BlockSortV2(); // Sort within each block. } - GlobalSortV2(); // Sort globally. + GlobalSortV2(); // Sort globally. PipeBarrier(); SyncAll(); @@ -255,13 +238,14 @@ __aicore__ inline void KernelUnique::Process() { CopyOut(); } -template -__aicore__ inline void KernelUnique::CopyIn(const int32_t progress) { +template +__aicore__ inline void KernelUnique::CopyIn(const int32_t progress) +{ LocalTensor srcLocal = calcBuf[0].AllocTensor(); LocalTensor sortedLocal2 = calcBuf[2].AllocTensor(); // To process tail, fill the whole tile with INF, then cover it with tail. - int32_t castLen; // Valid length of the last block. + int32_t castLen; // Valid length of the last block. if ((progress != tileNum - 1) || (GetBlockIdx() != blockNum - 1) || tailLength == 0) { // Must determine during compilation, otherwise we get a compilation error. if constexpr (!IsSameType::value) { @@ -276,20 +260,14 @@ __aicore__ inline void KernelUnique::CopyIn(const int32_t progress) { Duplicate(sortedLocal2, FLOAT_INF, TILE_LENGTH); PipeBarrier(); if constexpr (IsSameType::value) { - DataCopyPad(sortedLocal2, - srcGlobal[progress * TILE_LENGTH], - {1, static_cast(sizeof(T) * tailLength), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(sortedLocal2, srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0}); } else if constexpr (sizeof(T) >= sizeof(float)) { - DataCopyPad(srcAsUint, - srcGlobalAsUint[progress * TILE_LENGTH * sizeof(T) / sizeof(uint32_t)], - {1, static_cast(sizeof(T) * tailLength), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(srcAsUint, srcGlobalAsUint[progress * TILE_LENGTH * sizeof(T) / sizeof(uint32_t)], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0}); } else { - DataCopyPad(srcLocal, - srcGlobal[progress * TILE_LENGTH], - {1, static_cast(sizeof(T) * tailLength), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(srcLocal, srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0}); } castLen = tailLength; } @@ -307,14 +285,15 @@ __aicore__ inline void KernelUnique::CopyIn(const int32_t progress) { calcBuf[2].EnQue(sortedLocal2); } -template -__aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) { +template +__aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) +{ LocalTensor srcLocal = calcBuf[0].DeQue(); LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); LocalTensor sortedLocal2 = calcBuf[2].DeQue(); LocalTensor arithLocal = srcLocal.template ReinterpretCast()[TILE_LENGTH]; - int32_t baseOffset = progress * TILE_LENGTH + this->globalOffset; // calc tileOffset + int32_t baseOffset = progress * TILE_LENGTH + this->globalOffset; // calc tileOffset Duplicate(arithLocal, baseOffset, TILE_LENGTH); PipeBarrier(); @@ -327,9 +306,8 @@ __aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) { while (restLen) { int repTime = min(restLen / sort32BatchSize, sort32RepeatLimit); Sort32(sortedLocal1[sort32BatchSize * sort32RepeatLimit * SORT_DATATYPE_SIZE_FACTOR * instrRepeatTime], - sortedLocal2[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], - uidArray[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], - repTime); + sortedLocal2[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], + uidArray[sort32BatchSize * sort32RepeatLimit * 
instrRepeatTime], repTime); restLen -= repTime * sort32BatchSize; instrRepeatTime++; } @@ -339,27 +317,25 @@ __aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) { calcBuf[2].EnQue(sortedLocal2); } -template -__aicore__ inline void KernelUnique::TileSort(const int32_t progress) { +template +__aicore__ inline void KernelUnique::TileSort(const int32_t progress) +{ LocalTensor sortedLocal1 = calcBuf[1].DeQue(); LocalTensor sortedLocal2 = calcBuf[2].DeQue(); LocalTensor sortedQue[2] = {sortedLocal1, sortedLocal2}; - uint16_t currentQueLength = 32; // Initial queue length is 32 because data is from Sort32. + uint16_t currentQueLength = 32; // Initial queue length is 32 because data is from Sort32. uint16_t currentQueNum = TILE_LENGTH / currentQueLength; bool switchFlag = false; // Multiple MrgSort until we have one generally sorted tile. while (currentQueLength < TILE_LENGTH) { - const uint16_t elementLengths[4] = { - currentQueLength, currentQueLength, currentQueLength, currentQueLength}; + const uint16_t elementLengths[4] = {currentQueLength, currentQueLength, currentQueLength, currentQueLength}; const uint16_t fullMrgSortTime = currentQueNum / 4; if (fullMrgSortTime > 0) { MrgSort4Info params = {elementLengths, false, 0b1111, fullMrgSortTime}; MrgSort(sortedQue[!switchFlag], - {sortedQue[switchFlag][0], - sortedQue[switchFlag][currentQueLength * 1 * 2], - sortedQue[switchFlag][currentQueLength * 2 * 2], - sortedQue[switchFlag][currentQueLength * 3 * 2]}, - params); + {sortedQue[switchFlag][0], sortedQue[switchFlag][currentQueLength * 1 * 2], + sortedQue[switchFlag][currentQueLength * 2 * 2], sortedQue[switchFlag][currentQueLength * 3 * 2]}, + params); PipeBarrier(); switchFlag = !switchFlag; } @@ -372,22 +348,18 @@ __aicore__ inline void KernelUnique::TileSort(const int32_t progress) { calcBuf[2].FreeTensor(sortedLocal2); } -template -template -__aicore__ inline void KernelUnique::DataCopyGM2GM(const GlobalTensor &dst, - const GlobalTensor &src, - const LocalTensor &tmpLocal, - const int elemLength, - const int bufByteLength) { +template +template +__aicore__ inline void KernelUnique::DataCopyGM2GM(const GlobalTensor& dst, const GlobalTensor& src, + const LocalTensor& tmpLocal, const int elemLength, const int bufByteLength) +{ // Max byte size of DataCopyPad in one repeat is 65535. 
int bufElemLength = min(bufByteLength, 65535) / sizeof(T1); int restLen = elemLength; while (restLen > 0) { int copyLen = min(restLen, bufElemLength); - DataCopyPad(tmpLocal, - src[elemLength - restLen], - {1, static_cast(sizeof(T1) * copyLen), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(tmpLocal, src[elemLength - restLen], {1, static_cast(sizeof(T1) * copyLen), 0, 0}, + {false, 0, 0, 0}); PipeBarrier(); DataCopyPad(dst[elemLength - restLen], tmpLocal, {1, static_cast(sizeof(T1) * copyLen), 0, 0}); PipeBarrier(); @@ -395,10 +367,10 @@ __aicore__ inline void KernelUnique::DataCopyGM2GM(const GlobalTensor &ds } } -template -__aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGlobal, - GMSSrcList &srcList, - GMSParams ¶ms) { +template +__aicore__ inline void KernelUnique::MrgSortGM( + GlobalTensor&& dstGlobal, GMSSrcList& srcList, GMSParams& params) +{ int restLen[4] {params.GMSLengths[0], params.GMSLengths[1], params.GMSLengths[2], params.GMSLengths[3]}; int currentHead[4] {}; int totalMrgLen {}; @@ -415,24 +387,21 @@ __aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGloba } // CopyIn for (int i = 0; i < queNum; i++) { - DataCopyPad(params.buffLocal[i], - srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], - {1, static_cast(sizeof(float) * mrgLen[i] * SORT_DATATYPE_SIZE_FACTOR), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(params.buffLocal[i], srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], + {1, static_cast(sizeof(float) * mrgLen[i] * SORT_DATATYPE_SIZE_FACTOR), 0, 0}, + {false, 0, 0, 0}); } PipeBarrier(); // MrgSort MrgSort4Info localParams {mrgLen, true, VALID_QUE[queNum], 1}; MrgSort(params.buffLocal[4], - {params.buffLocal[0], params.buffLocal[1], params.buffLocal[2], params.buffLocal[3]}, - localParams); + {params.buffLocal[0], params.buffLocal[1], params.buffLocal[2], params.buffLocal[3]}, localParams); PipeBarrier(); GetMrgSortResult(sortedLen[0], sortedLen[1], sortedLen[2], sortedLen[3]); const uint16_t localMrgLen = sortedLen[0] + sortedLen[1] + sortedLen[2] + sortedLen[3]; // CopyOut - DataCopyPad(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], - params.buffLocal[4], - {1, static_cast(sizeof(float) * localMrgLen * SORT_DATATYPE_SIZE_FACTOR), 0, 0}); + DataCopyPad(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4], + {1, static_cast(sizeof(float) * localMrgLen * SORT_DATATYPE_SIZE_FACTOR), 0, 0}); PipeBarrier(); // renew currentHead, restLen totalMrgLen += localMrgLen; @@ -450,7 +419,7 @@ __aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGloba } restLen[3] = 0; queNum--; - break; // because ifExhaustedSuspension == true, there is 0 or 1 empty que. + break; // because ifExhaustedSuspension == true, there is 0 or 1 empty que. 
} } } @@ -458,17 +427,16 @@ __aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGloba for (int i = 0; i < params.queNum; i++) { if (restLen[i] > 0) { DataCopyGM2GM(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], - srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], - params.buffLocal[4], - restLen[i] * SORT_DATATYPE_SIZE_FACTOR, - TILE_LEN_BYTE); + srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4], + restLen[i] * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE); break; } } }; -template -__aicore__ inline void KernelUnique::BlockSortV2() { +template +__aicore__ inline void KernelUnique::BlockSortV2() +{ LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); LocalTensor mrgLocal = calcBuf[2].AllocTensor(); @@ -497,27 +465,23 @@ __aicore__ inline void KernelUnique::BlockSortV2() { } lengths[queNum - 1] = TILE_LENGTH * lastQueTileNum; GMSSrcList srcList {srcGlobal}; - GMSParams params {lengths, queNum, {sortedLocal1, sortedLocal1[TILE_LENGTH], - sortedLocal2, sortedLocal2[TILE_LENGTH], - mrgLocal}}; + GMSParams params {lengths, queNum, + {sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}}; MrgSortGM(sortedBlock[!switchFlag][TILE_LEN_ELEM * tileIdx], srcList, params); } switchFlag = !switchFlag; } if (switchFlag) { - DataCopyGM2GM(sortedBlock1, - sortedBlock2, - sortedLocal1, - blockLength * SORT_DATATYPE_SIZE_FACTOR, - TILE_LEN_BYTE); + DataCopyGM2GM(sortedBlock1, sortedBlock2, sortedLocal1, blockLength * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE); } calcBuf[0].FreeTensor(sortedLocal1); calcBuf[1].FreeTensor(sortedLocal2); calcBuf[2].FreeTensor(mrgLocal); } -template -__aicore__ inline void KernelUnique::GlobalSortV2() { +template +__aicore__ inline void KernelUnique::GlobalSortV2() +{ LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); LocalTensor mrgLocal = calcBuf[2].AllocTensor(); @@ -531,8 +495,7 @@ __aicore__ inline void KernelUnique::GlobalSortV2() { int lengths[4]; for (int bindBlock = 1; bindBlock < blockNum; bindBlock *= PREFIX_QUE_NUM, eventID++) { for (int blockIdx = 0; blockIdx < blockNum; blockIdx += bindBlock * PREFIX_QUE_NUM) { - if ((GetBlockIdx() == blockIdx + bindBlock) || - (GetBlockIdx() == blockIdx + bindBlock * 2) || + if ((GetBlockIdx() == blockIdx + bindBlock) || (GetBlockIdx() == blockIdx + bindBlock * 2) || (GetBlockIdx() == blockIdx + bindBlock * 3)) { PipeBarrier(); IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID); @@ -552,22 +515,20 @@ __aicore__ inline void KernelUnique::GlobalSortV2() { } // Init GMSSrcList, GMSParams for (int i = 0; i < queNum; i++) { - srcGlobal[i] = sortedGlobal[switchFlag][GetGlobalOffset(blockIdx + bindBlock * i) * - SORT_DATATYPE_SIZE_FACTOR]; + srcGlobal[i] = + sortedGlobal[switchFlag][GetGlobalOffset(blockIdx + bindBlock * i) * SORT_DATATYPE_SIZE_FACTOR]; } for (int i = 0; i < queNum - 1; i++) { - lengths[i] = GetGlobalOffset(blockIdx + (bindBlock * (i + 1))) - - GetGlobalOffset(blockIdx + (bindBlock * i)); + lengths[i] = + GetGlobalOffset(blockIdx + (bindBlock * (i + 1))) - GetGlobalOffset(blockIdx + (bindBlock * i)); } - lengths[queNum - 1] = GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)) + lastQueBlockNum) - + lengths[queNum - 1] = GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)) + lastQueBlockNum) - GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1))); GMSSrcList srcList {srcGlobal}; - GMSParams params {lengths, queNum, 
{sortedLocal1, sortedLocal1[TILE_LENGTH], - sortedLocal2, sortedLocal2[TILE_LENGTH], - mrgLocal}}; - MrgSortGM(sortedGlobal[!switchFlag][GetGlobalOffset(blockIdx) * SORT_DATATYPE_SIZE_FACTOR], - srcList, - params); + GMSParams params {lengths, queNum, + {sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}}; + MrgSortGM( + sortedGlobal[!switchFlag][GetGlobalOffset(blockIdx) * SORT_DATATYPE_SIZE_FACTOR], srcList, params); } } switchFlag = !switchFlag; @@ -591,13 +552,11 @@ __aicore__ inline void KernelUnique::GlobalSortV2() { calcBuf[2].FreeTensor(mrgLocal); } -template -__aicore__ inline void KernelUnique::ConsecutiveUnique(const LocalTensor &dstVal, - const LocalTensor &srcLocal, - const LocalTensor &shiftedLocal, - const LocalTensor &bitMask32, - const uint16_t elemLength, - uint64_t &tileUniqueCnt) { +template +__aicore__ inline void KernelUnique::ConsecutiveUnique(const LocalTensor& dstVal, + const LocalTensor& srcLocal, const LocalTensor& shiftedLocal, const LocalTensor& bitMask32, + const uint16_t elemLength, uint64_t& tileUniqueCnt) +{ LocalTensor bitMask16 = bitMask32.ReinterpretCast(); uint64_t rsvdCnt = 0; // Seperate Val and Idx. @@ -624,8 +583,9 @@ __aicore__ inline void KernelUnique::ConsecutiveUnique(const LocalTensor(); } -template -__aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { +template +__aicore__ inline void KernelUnique::TileUnique(const int32_t progress) +{ LocalTensor bitMask32 = calcBuf[0].AllocTensor(); LocalTensor shiftedLocal = bitMask32[TILE_LENGTH].ReinterpretCast(); LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); @@ -649,9 +609,8 @@ __aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { if (progress != 0 && lastTileUniqueVal == sortedLocal2.GetValue(0)) { accUniqueCnt--; } - DataCopyPad(sortedBlock1[accUniqueCnt - tileUniqueCnt], - sortedLocal2, - {1, static_cast(sizeof(float) * tileUniqueCnt), 0, 0}); + DataCopyPad(sortedBlock1[accUniqueCnt - tileUniqueCnt], sortedLocal2, + {1, static_cast(sizeof(float) * tileUniqueCnt), 0, 0}); PipeBarrier(); lastTileUniqueVal = sortedLocal2.GetValue(tileUniqueCnt - 1); } @@ -659,9 +618,8 @@ __aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { // upload uniqueCnt. if (progress == tileNum - 1) { uniqueCntLocal.SetValue(0, accUniqueCnt); - DataCopyPad(blockUniqueCntGlobal[GetBlockIdx()], - uniqueCntLocal, - {1, static_cast(sizeof(uint32_t) * 1), 0, 0}); + DataCopyPad(blockUniqueCntGlobal[GetBlockIdx()], uniqueCntLocal, + {1, static_cast(sizeof(uint32_t) * 1), 0, 0}); PipeBarrier(); } calcBuf[0].FreeTensor(shiftedLocal); @@ -669,8 +627,9 @@ __aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { calcBuf[2].FreeTensor(sortedLocal2); } -template -__aicore__ inline void KernelUnique::CopyOut() { +template +__aicore__ inline void KernelUnique::CopyOut() +{ LocalTensor copyLocal0 = calcBuf[0].AllocTensor(); LocalTensor copyLocal1 = calcBuf[1].AllocTensor(); LocalTensor IBSyncLocal = copyLocal1.ReinterpretCast(); @@ -692,7 +651,7 @@ __aicore__ inline void KernelUnique::CopyOut() { uint64_t restLen = thisUniqueCnt; // max(Ta a, Tb b) function does not support compilation period calc. - constexpr uint64_t bottleneckTypeSize = sizeof(T) > sizeof(float)? sizeof(T): sizeof(float); + constexpr uint64_t bottleneckTypeSize = sizeof(T) > sizeof(float) ? 
sizeof(T) : sizeof(float); LocalTensor copyVal32 = copyLocal0.template ReinterpretCast(); LocalTensor uniqueVal32 = copyLocal1.ReinterpretCast(); // Copy unique values (and counts) from Workspace to dst. @@ -702,20 +661,16 @@ __aicore__ inline void KernelUnique::CopyOut() { uint64_t copyLen = min(restLen, TILE_LEN_BYTE / bottleneckTypeSize); copyLen = min(copyLen, 65535 / bottleneckTypeSize); if constexpr (!IsSameType::value) { - DataCopyPad(copyLocal1, - sortedBlock1[thisUniqueCnt - restLen], - {1, static_cast(sizeof(float) * copyLen), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(copyLocal1, sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0}); PipeBarrier(); Muls(copyLocal1, copyLocal1, (float)-1, copyLen); PipeBarrier(); Cast(copyLocal0, copyLocal1, RoundMode::CAST_RINT, copyLen); PipeBarrier(); } else { - DataCopyPad(copyLocal0, - sortedBlock1[thisUniqueCnt - restLen], - {1, static_cast(sizeof(float) * copyLen), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(copyLocal0, sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0}); PipeBarrier(); Muls(copyLocal0, copyLocal0, (float)-1, copyLen); PipeBarrier(); @@ -723,12 +678,10 @@ __aicore__ inline void KernelUnique::CopyOut() { // DataCopyPad does not support int64_t. Copy them as uint32_t. if constexpr (sizeof(T) > 4) { DataCopyPad(dstGlobal1As32[(lastAccUniqueCnt + thisUniqueCnt - restLen) * sizeof(T) / sizeof(uint32_t)], - copyVal32, - {1, static_cast(sizeof(T) * copyLen), 0, 0}); + copyVal32, {1, static_cast(sizeof(T) * copyLen), 0, 0}); } else { - DataCopyPad(dstGlobal1[lastAccUniqueCnt + thisUniqueCnt - restLen], - copyLocal0, - {1, static_cast(sizeof(T) * copyLen), 0, 0}); + DataCopyPad(dstGlobal1[lastAccUniqueCnt + thisUniqueCnt - restLen], copyLocal0, + {1, static_cast(sizeof(T) * copyLen), 0, 0}); } PipeBarrier(); restLen -= copyLen; @@ -742,4 +695,4 @@ __aicore__ inline void KernelUnique::CopyOut() { calcBuf[0].FreeTensor(copyLocal0); calcBuf[1].FreeTensor(copyLocal1); } -} // namespace AscendC +} // namespace AscendC diff --git a/mx_driving/ops/npu_unique.py b/mx_driving/ops/npu_unique.py index 7ccde38a..1fd367b3 100644 --- a/mx_driving/ops/npu_unique.py +++ b/mx_driving/ops/npu_unique.py @@ -9,11 +9,13 @@ Modification 1. Add support for Ascend NPU from torch.autograd import Function import mx_driving._C + class UniqueFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx, input): - y = mx_driving._C.npu_unique(input) + def forward(ctx, input_tensor): + y = mx_driving._C.npu_unique(input_tensor) return y + npu_unique = UniqueFunction.apply diff --git a/tests/torch/test_npu_unique.py b/tests/torch/test_npu_unique.py index 8cc8f81c..d9144752 100644 --- a/tests/torch/test_npu_unique.py +++ b/tests/torch/test_npu_unique.py @@ -1,24 +1,25 @@ """ Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 
""" +import random +import os import torch import torch_npu from data_cache import golden_data_cache from torch_npu.testing.testcase import TestCase, run_tests - from mx_driving import npu_unique -import random -import os def gen_inputs(input_shape, dtype): input_tensor = torch.randint(-256, 256, input_shape, dtype=dtype) return input_tensor + def gen_cpu_outputs(input_tensor): cpu_result = torch.unique(input_tensor) return cpu_result + def gen_npu_outputs(input_tensor): npu_result = npu_unique(input_tensor.npu()) return npu_result.cpu() -- Gitee From ee0b2abcecb4a0b1b5770b92633c3a8474a32203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Mon, 24 Mar 2025 16:44:02 +0800 Subject: [PATCH 6/7] cleancode --- mx_driving/csrc/Unique.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mx_driving/csrc/Unique.cpp b/mx_driving/csrc/Unique.cpp index ee7aa6a7..b99f9ed0 100644 --- a/mx_driving/csrc/Unique.cpp +++ b/mx_driving/csrc/Unique.cpp @@ -18,7 +18,8 @@ #include "csrc/functions.h" -at::Tensor npu_unique(const at::Tensor& input) { +at::Tensor npu_unique(const at::Tensor& input) +{ if (input.numel() < 2) { at::Tensor output = at::Tensor(input).clone(); return output; -- Gitee From aa4cc88824ebf046111802474fe47b57c9aff9b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Mon, 24 Mar 2025 16:44:02 +0800 Subject: [PATCH 7/7] cleancode --- kernels/op_host/unique.cpp | 11 ++++++++++- mx_driving/csrc/Unique.cpp | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/kernels/op_host/unique.cpp b/kernels/op_host/unique.cpp index f37169d9..40e33c95 100644 --- a/kernels/op_host/unique.cpp +++ b/kernels/op_host/unique.cpp @@ -11,11 +11,17 @@ constexpr size_t SYS_RSVD_WS_SIZE = 16 * 1024 * 1024; namespace optiling { static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context) { + if (!context) { + return ge::GRAPH_FAILED; + } UniqueTilingData tiling; constexpr uint16_t tileLength = 8192; - const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); const gert::StorageShape* inputShape = context->GetInputShape(0); + if (!inputShape) { + return ge::GRAPH_FAILED; + } + const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); uint32_t totalLength = 1; for (int i = 0; i < dimNum; i++) { totalLength *= inputShape->GetStorageShape().GetDim(i); @@ -57,6 +63,9 @@ static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context) { const gert::Shape* x1_shape = context->GetInputShape(0); gert::Shape* y_shape = context->GetOutputShape(0); + if (!x1_shape || !y_shape) { + return GRAPH_FAILED; + } *y_shape = *x1_shape; return GRAPH_SUCCESS; } diff --git a/mx_driving/csrc/Unique.cpp b/mx_driving/csrc/Unique.cpp index ee7aa6a7..b99f9ed0 100644 --- a/mx_driving/csrc/Unique.cpp +++ b/mx_driving/csrc/Unique.cpp @@ -18,7 +18,8 @@ #include "csrc/functions.h" -at::Tensor npu_unique(const at::Tensor& input) { +at::Tensor npu_unique(const at::Tensor& input) +{ if (input.numel() < 2) { at::Tensor output = at::Tensor(input).clone(); return output; -- Gitee