From 3b82a40bcb897238a887a60632922d4055e4f5d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Tue, 18 Mar 2025 17:18:39 +0800 Subject: [PATCH 1/7] feat: unique --- include/csrc/functions.h | 1 + kernels/op_host/unique.cpp | 100 +++++ kernels/op_host/unique_tiling.h | 18 + kernels/op_kernel/unique.cpp | 26 ++ kernels/op_kernel/unique.h | 745 ++++++++++++++++++++++++++++++++ mx_driving/__init__.py | 2 + mx_driving/csrc/Unique.cpp | 32 ++ mx_driving/csrc/pybind.cpp | 3 + mx_driving/fused.py | 1 + mx_driving/ops/npu_unique.py | 19 + 10 files changed, 947 insertions(+) create mode 100644 kernels/op_host/unique.cpp create mode 100644 kernels/op_host/unique_tiling.h create mode 100644 kernels/op_kernel/unique.cpp create mode 100644 kernels/op_kernel/unique.h create mode 100644 mx_driving/csrc/Unique.cpp create mode 100644 mx_driving/ops/npu_unique.py diff --git a/include/csrc/functions.h b/include/csrc/functions.h index 1df7ed71..2508cf26 100644 --- a/include/csrc/functions.h +++ b/include/csrc/functions.h @@ -288,4 +288,5 @@ std::tuple calc_poly_start_end_sl(const at:: at::Tensor npu_subm_sparse_conv3d_with_key(const at::Tensor& ouidx_offset, const at::Tensor& valid_indices, const at::Tensor& weight, const at::Tensor& feature, int indices_number, at::IntArrayRef kernel_size); +at::Tensor npu_unique(const at::Tensor& input); #endif // CSRC_FUNCTIONS_H_ diff --git a/kernels/op_host/unique.cpp b/kernels/op_host/unique.cpp new file mode 100644 index 00000000..bbc0dfbf --- /dev/null +++ b/kernels/op_host/unique.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + * + */ +#include "unique_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" +constexpr size_t SYS_RSVD_WS_SIZE = 16 * 1024 * 1024; + + +namespace optiling { +static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context) { + UniqueTilingData tiling; + + constexpr uint16_t tileLength = 8192; + const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); + const gert::StorageShape* inputShape = context->GetInputShape(0); + uint32_t totalLength = 1; + for (int i = 0; i < dimNum; i++) { + totalLength *= inputShape->GetStorageShape().GetDim(i); + } + const uint32_t tileNum = (totalLength + tileLength - 1) / tileLength; + const uint16_t tailLength = totalLength % tileLength; + const auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + const uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); + const uint8_t blockNum = tileNum >= aivNum? aivNum: tileNum; + const uint32_t shortBlockTileNum = tileNum / blockNum; + const uint8_t longBlockNum = tileNum % blockNum; + const uint8_t shortBlockNum = blockNum - longBlockNum; + + tiling.set_totalLength(totalLength); + tiling.set_tileNum(tileNum); + tiling.set_shortBlockTileNum(shortBlockTileNum); + tiling.set_tailLength(tailLength); + tiling.set_blockNum(blockNum); + tiling.set_shortBlockNum(shortBlockNum); + + context->SetBlockDim(blockNum); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + // Workspace for IBSet/IBWait up to 8 times, and 2 times full data. 
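+    // usrSize below = a small flag area intended for the IBSet/IBWait inter-core handshakes and the
+    // per-block unique counters, plus two ping-pong copies of the tile-aligned data in the 8-byte
+    // {value, payload} records that Sort32/MrgSort operate on.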
+ uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + auto&& currentWorkspace = context->GetWorkspaceSizes(1); + if (currentWorkspace == nullptr) { + return ge::GRAPH_FAILED; + } + size_t usrSize = (aivNum * 8 + 1) * 8 * sizeof(uint32_t) + (tileNum * tileLength) * 2 * sizeof(float) * 2; + currentWorkspace[0] = usrSize + sysWorkspaceSize; + return ge::GRAPH_SUCCESS; +} +} + + +namespace ge { +static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context) { + const gert::Shape* x1_shape = context->GetInputShape(0); + gert::Shape* y_shape = context->GetOutputShape(0); + *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} + +static ge::graphStatus UniqueInferDtype(gert::InferDataTypeContext* context) +{ + auto inputDtype = context->GetInputDataType(0); + context->SetOutputDataType(0, inputDtype); + return ge::GRAPH_SUCCESS; +} +} + + +namespace ops { +class Unique : public OpDef { +public: + explicit Unique(const char* name) : OpDef(name) { + this->Input("input") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}) + .IgnoreContiguous(); + this->Output("output") + .ParamType(REQUIRED) + .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + this->Output("uniqueCnt") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32, ge::DT_INT32}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND, ge::FORMAT_ND}); + + this->SetInferShape(ge::UniqueInferShape); + this->SetInferDataType(ge::UniqueInferDtype); + + this->AICore() + .SetTiling(optiling::UniqueTilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910_93"); + } +}; + +OP_ADD(Unique); +} diff --git a/kernels/op_host/unique_tiling.h b/kernels/op_host/unique_tiling.h new file mode 100644 index 00000000..f42483c4 --- /dev/null +++ b/kernels/op_host/unique_tiling.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + * + */ +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(UniqueTilingData) + TILING_DATA_FIELD_DEF(uint32_t, totalLength); + TILING_DATA_FIELD_DEF(uint32_t, tileNum); + TILING_DATA_FIELD_DEF(uint32_t, shortBlockTileNum); + TILING_DATA_FIELD_DEF(uint16_t, tailLength); + TILING_DATA_FIELD_DEF(uint8_t, blockNum); + TILING_DATA_FIELD_DEF(uint8_t, shortBlockNum); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(Unique, UniqueTilingData) +} diff --git a/kernels/op_kernel/unique.cpp b/kernels/op_kernel/unique.cpp new file mode 100644 index 00000000..e9cedcf5 --- /dev/null +++ b/kernels/op_kernel/unique.cpp @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. 
+ */ + +#include "kernel_operator.h" +#include "unique.h" + + +extern "C" __global__ __aicore__ void unique( + GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, GM_ADDR tiling) { + GM_ADDR usrWorkspace = AscendC::GetUserWorkspace(workspace); + GET_TILING_DATA(tiling_data, tiling); + TPipe pipe; + KernelUnique op(pipe); + op.Init(input, + output, + uniqueCnt, + usrWorkspace, + tiling_data.totalLength, + tiling_data.tileNum, + tiling_data.shortBlockTileNum, + tiling_data.tailLength, + tiling_data.blockNum, + tiling_data.shortBlockNum); + op.Process(); +} diff --git a/kernels/op_kernel/unique.h b/kernels/op_kernel/unique.h new file mode 100644 index 00000000..0960db13 --- /dev/null +++ b/kernels/op_kernel/unique.h @@ -0,0 +1,745 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + */ + +#include "kernel_operator.h" +using namespace AscendC; + + +namespace AscendC { +template +__aicore__ inline static Ta min(const Ta a, const Tb b) { + if (a > b) { + return b; + } + return a; +} + +template +__aicore__ inline static Ta max(const Ta a, const Tb b) { + if (a < b) { + return b; + } + return a; +} + +template +class KernelUnique { +public: + __aicore__ inline KernelUnique(TPipe &pipe) : pipe(pipe) {} + // Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. + __aicore__ inline size_t GetGlobalOffset(const uint32_t blockIdx); + __aicore__ inline void Init(GM_ADDR input, + GM_ADDR output, + GM_ADDR uniqueCnt, + GM_ADDR workspace, + const uint32_t totalLength, + const uint32_t totalTileNum, + const uint32_t shortBlockTileNum, + const uint16_t tailLength, + const uint8_t blockNum, + const uint8_t shortBlockNum); + __aicore__ inline void Process(); + +private: + __aicore__ inline void CopyIn(const int32_t progress); + __aicore__ inline void Elem32Sort(const int32_t progress); + __aicore__ inline void TileSort(const int32_t progress); + template + __aicore__ inline static void DataCopyGM2GM(const GlobalTensor &dst, + const GlobalTensor &src, + const LocalTensor &tmpLocal, + const int elemLength, + const int bufByteLength); + using GMSSrcList = GlobalTensor (&)[4]; + struct GMSParams { + int (&GMSLengths)[4]; + uint8_t &queNum; + LocalTensor (&&buffLocal)[5]; + }; + __aicore__ inline static void MrgSortGM(GlobalTensor &&dstGlobal, + GMSSrcList &srcList, + GMSParams ¶ms); + __aicore__ inline void BlockSortV2(); + __aicore__ inline void GlobalSortV2(); + __aicore__ inline static void ConsecutiveUnique(const LocalTensor &dstVal, + const LocalTensor &srcLocal, + const LocalTensor &shiftedLocal, + const LocalTensor &bitMask16, + const uint16_t elemLength, + uint64_t &tileUniqueCnt); + __aicore__ inline void TileUnique(const int32_t progress); + __aicore__ inline void CopyOut(); + +private: + static constexpr int32_t TILE_LENGTH = 8192; + // INF to fill the tail blank, so that tail is automatically removed by Compare in Unique. + static constexpr float FLOAT_INF = 3e+99; + // Indicates the factor converting float to data structure used by Sort32&MrgSort. 
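+    // Each sorted element is an 8-byte {float32 value, uint32 payload} record produced by Sort32,
+    // which is why element counts are scaled by SORT_DATATYPE_SIZE_FACTOR and byte sizes by SORT_DATATYPE_SIZE.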
+ static constexpr int16_t SORT_DATATYPE_SIZE = sizeof(float) + sizeof(uint32_t); // 8 + static constexpr int16_t SORT_DATATYPE_SIZE_FACTOR = SORT_DATATYPE_SIZE / sizeof(float); // 2 + static constexpr int32_t TILE_LEN_BYTE = TILE_LENGTH * SORT_DATATYPE_SIZE; // 8192 * 8 = 65536 + static constexpr int32_t TILE_LEN_ELEM = TILE_LENGTH * SORT_DATATYPE_SIZE_FACTOR; // 8192 * 2 = 16384 + static constexpr uint16_t VALID_QUE[5] = {0, 0, 0b11, 0b111, 0b1111}; // Converts queue number to validBit of MrgSort. + + TPipe &pipe; + TQue calcBuf[3]; + + GlobalTensor srcGlobal; + GlobalTensor srcGlobalAsUint; + GlobalTensor dstGlobal1; + GlobalTensor dstGlobal1As32; + GlobalTensor uniqueCntGlobal; + + GlobalTensor sortedBlock1; + GlobalTensor sortedBlock1AsInt; + GlobalTensor sortedBlock2; + GlobalTensor sortedBlock2AsInt; + GlobalTensor sortedGlobal1; + GlobalTensor sortedGlobal2; + + GlobalTensor IBSyncGlobal; + GlobalTensor blockUniqueCntGlobal; + + uint16_t syncWorkspaceSize; + uint8_t eventID {0}; + uint64_t accUniqueCnt {0}; + float lastTileUniqueVal; + + uint32_t totalLength; + uint32_t alignedTotalLength; + uint32_t tileNum; + uint32_t shortBlockTileNum; + uint16_t tailLength; + uint8_t blockNum; + uint8_t shortBlockNum; + + size_t globalOffset; // Offset of data for current block. + size_t blockLength; // Length of current block. + bool hasInfFlag {false}; +}; + +// Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. +template +__aicore__ inline size_t KernelUnique::GetGlobalOffset(const uint32_t blockIdx) { + // (shortBlockTileNum + 1) indicates longBlockTileNum. + const size_t offset = (this->shortBlockTileNum * min(this->shortBlockNum, blockIdx) + + (this->shortBlockTileNum + 1) * + (this->shortBlockNum >= blockIdx? 0: blockIdx - this->shortBlockNum)) * + TILE_LENGTH; + return offset; +} + +template +__aicore__ inline void KernelUnique::Init(GM_ADDR input, + GM_ADDR output, + GM_ADDR uniqueCnt, + GM_ADDR workspace, + const uint32_t totalLength, + const uint32_t totalTileNum, + const uint32_t shortBlockTileNum, + const uint16_t tailLength, + const uint8_t blockNum, + const uint8_t shortBlockNum) { + this->totalLength = totalLength; + this->alignedTotalLength = totalTileNum * TILE_LENGTH; + this->shortBlockTileNum = shortBlockTileNum; + this->tailLength = tailLength; + this->blockNum = blockNum; + this->shortBlockNum = shortBlockNum; + + const bool isShortBlock = this->shortBlockNum > GetBlockIdx(); + // (shortBlockTileNum + 1) indicates longBlockTileNum. + this->tileNum = isShortBlock? shortBlockTileNum: shortBlockTileNum + 1; + this->blockLength = this->tileNum * TILE_LENGTH; + this->globalOffset = GetGlobalOffset(GetBlockIdx()); + + srcGlobal.SetGlobalBuffer((__gm__ T *)input + globalOffset, this->blockLength); + srcGlobalAsUint.SetGlobalBuffer((__gm__ uint32_t *)input + globalOffset * sizeof(T) / sizeof(uint32_t), + this->blockLength * sizeof(T) / sizeof(uint32_t)); + dstGlobal1.SetGlobalBuffer((__gm__ T *)output, this->alignedTotalLength); + dstGlobal1As32.SetGlobalBuffer((__gm__ int32_t *)output, + this->alignedTotalLength * sizeof(T) / sizeof(int32_t)); + uniqueCntGlobal.SetGlobalBuffer((__gm__ int32_t *)uniqueCnt, 1); + + // sortedBlock is offsetted, and could only see the data that this block should process. 
+ sortedBlock1.SetGlobalBuffer((__gm__ float *)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock1AsInt.SetGlobalBuffer((__gm__ int32_t *)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2.SetGlobalBuffer((__gm__ float *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2AsInt.SetGlobalBuffer((__gm__ int32_t *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + // sortedGlobal could see all data in the workspace. + sortedGlobal1.SetGlobalBuffer((__gm__ float *)workspace, alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + sortedGlobal2.SetGlobalBuffer((__gm__ float *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR, + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + + // Buff size for syncronizing according to document of IBWait&IBSet. + this->syncWorkspaceSize = (blockNum * 32 + 1) * 8; + IBSyncGlobal.SetGlobalBuffer((__gm__ int32_t *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR * 2, + syncWorkspaceSize); + blockUniqueCntGlobal.SetGlobalBuffer((__gm__ uint32_t *)workspace + alignedTotalLength * 4 + syncWorkspaceSize, + (blockNum + 7) / 8 * 8); // Length aligned up to 32B. + + pipe.InitBuffer(calcBuf[0], 1, TILE_LEN_BYTE); + pipe.InitBuffer(calcBuf[1], 1, TILE_LEN_BYTE); + pipe.InitBuffer(calcBuf[2], 1, TILE_LEN_BYTE); +} + +template +__aicore__ inline void KernelUnique::Process() { + LocalTensor IBSyncLocal; + // Initialize sync buff. + if (GetBlockIdx() == 0) { + IBSyncLocal = calcBuf[0].AllocTensor(); + Duplicate(IBSyncLocal, 0, syncWorkspaceSize); + PipeBarrier(); + DataCopy(IBSyncGlobal, IBSyncLocal, syncWorkspaceSize); + PipeBarrier(); + calcBuf[0].FreeTensor(IBSyncLocal); + } // Initialize sync buff. + + // Sort within each tile. + for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) { + CopyIn(tileIdx); + Elem32Sort(tileIdx); + TileSort(tileIdx); + } + + if (GetBlockNum() > 1) { + if (this->tileNum > 1) { + BlockSortV2(); // Sort within each block. + } + + GlobalSortV2(); // Sort globally. + + PipeBarrier(); + SyncAll(); + PipeBarrier(); + } + + // Check if an inf value exists. If do, inf will be append to the result in TileUnique(). + if ((IsSameType::value || IsSameType::value || IsSameType::value) && + GetBlockIdx() == blockNum - 1) { + PipeBarrier(); + const uint32_t totalLength32BAligned = (totalLength * sizeof(T) + 31) / 32 * 32 / sizeof(float); + if (sortedGlobal1.GetValue((totalLength32BAligned - 1) * 2) == -FLOAT_INF) { + hasInfFlag = true; + } + } + + // Do unique in each block based on tiles. + for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) { + TileUnique(tileIdx); + } + + if (this->blockNum > 1) { + // Each block waits for its former block to upload blockUniqueCnt. + IBSyncLocal = calcBuf[0].AllocTensor(); + if (GetBlockIdx() != 0) { + PipeBarrier(); + IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx() - 1, eventID); + PipeBarrier(); + } + PipeBarrier(); + IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID); + PipeBarrier(); + calcBuf[0].FreeTensor(IBSyncLocal); + } + + // Gather result from every block. 
+ CopyOut(); +} + +template +__aicore__ inline void KernelUnique::CopyIn(const int32_t progress) { + LocalTensor srcLocal = calcBuf[0].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[2].AllocTensor(); + + // To process tail, fill the whole tile with INF, then cover it with tail. + int32_t castLen; // Valid length of the last block. + if ((progress != tileNum - 1) || (GetBlockIdx() != blockNum - 1) || tailLength == 0) { + // Must determine during compilation, otherwise we get a compilation error. + if constexpr (!IsSameType::value) { + DataCopy(srcLocal, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH); + } else { + DataCopy(sortedLocal2, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH); + } + castLen = TILE_LENGTH; + } else { + // Process tail. + LocalTensor srcAsUint = srcLocal.template ReinterpretCast(); + Duplicate(sortedLocal2, FLOAT_INF, TILE_LENGTH); + PipeBarrier(); + if constexpr (IsSameType::value) { + DataCopyPad(sortedLocal2, + srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, + {false, 0, 0, 0}); + } else if constexpr (sizeof(T) >= sizeof(float)) { + DataCopyPad(srcAsUint, + srcGlobalAsUint[progress * TILE_LENGTH * sizeof(T) / sizeof(uint32_t)], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, + {false, 0, 0, 0}); + } else { + DataCopyPad(srcLocal, + srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, + {false, 0, 0, 0}); + } + castLen = tailLength; + } + PipeBarrier(); + if constexpr (!IsSameType::value) { + if constexpr (sizeof(T) >= sizeof(float)) { + Cast(sortedLocal2, srcLocal, RoundMode::CAST_ROUND, castLen); + } else { + Cast(sortedLocal2, srcLocal, RoundMode::CAST_NONE, castLen); + } + PipeBarrier(); + } + Muls(sortedLocal2, sortedLocal2, (float)-1, TILE_LENGTH); + calcBuf[0].EnQue(srcLocal); + calcBuf[2].EnQue(sortedLocal2); +} + +template +__aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) { + LocalTensor srcLocal = calcBuf[0].DeQue(); + LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[2].DeQue(); + LocalTensor arithLocal = srcLocal.template ReinterpretCast()[TILE_LENGTH]; + + int32_t baseOffset = progress * TILE_LENGTH + this->globalOffset; // calc tileOffset + Duplicate(arithLocal, baseOffset, TILE_LENGTH); + PipeBarrier(); + + LocalTensor uidArray = arithLocal.template ReinterpretCast(); + // Max repeatTime of Sort32 is 255, which is exceeded because TILE_LENGTH is 8192. + constexpr uint8_t sort32BatchSize = 32; + constexpr uint8_t sort32RepeatLimit = 255; + int instrRepeatTime = 0; + int restLen = TILE_LENGTH; + while (restLen) { + int repTime = min(restLen / sort32BatchSize, sort32RepeatLimit); + Sort32(sortedLocal1[sort32BatchSize * sort32RepeatLimit * SORT_DATATYPE_SIZE_FACTOR * instrRepeatTime], + sortedLocal2[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], + uidArray[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], + repTime); + restLen -= repTime * sort32BatchSize; + instrRepeatTime++; + } + PipeBarrier(); + calcBuf[0].FreeTensor(srcLocal); + calcBuf[1].EnQue(sortedLocal1); + calcBuf[2].EnQue(sortedLocal2); +} + +template +__aicore__ inline void KernelUnique::TileSort(const int32_t progress) { + LocalTensor sortedLocal1 = calcBuf[1].DeQue(); + LocalTensor sortedLocal2 = calcBuf[2].DeQue(); + LocalTensor sortedQue[2] = {sortedLocal1, sortedLocal2}; + uint16_t currentQueLength = 32; // Initial queue length is 32 because data is from Sort32. 
+ uint16_t currentQueNum = TILE_LENGTH / currentQueLength; + bool switchFlag = false; + // Multiple MrgSort until we have one generally sorted tile. + while (currentQueLength < TILE_LENGTH) { + const uint16_t elementLengths[4] = { + currentQueLength, currentQueLength, currentQueLength, currentQueLength}; + const uint16_t fullMrgSortTime = currentQueNum / 4; + if (fullMrgSortTime > 0) { + MrgSort4Info params = {elementLengths, false, 0b1111, fullMrgSortTime}; + MrgSort(sortedQue[!switchFlag], + {sortedQue[switchFlag][0], + sortedQue[switchFlag][currentQueLength * 1 * 2], + sortedQue[switchFlag][currentQueLength * 2 * 2], + sortedQue[switchFlag][currentQueLength * 3 * 2]}, + params); + PipeBarrier(); + switchFlag = !switchFlag; + } + currentQueNum = fullMrgSortTime; + currentQueLength *= 4; + } + DataCopy(sortedBlock1[progress * TILE_LEN_ELEM], sortedQue[switchFlag], TILE_LEN_ELEM); + PipeBarrier(); + calcBuf[1].FreeTensor(sortedLocal1); + calcBuf[2].FreeTensor(sortedLocal2); +} + +template +template +__aicore__ inline void KernelUnique::DataCopyGM2GM(const GlobalTensor &dst, + const GlobalTensor &src, + const LocalTensor &tmpLocal, + const int elemLength, + const int bufByteLength) { + // Max byte size of DataCopyPad in one repeat is 65535. + int bufElemLength = min(bufByteLength, 65535) / sizeof(T1); + int restLen = elemLength; + while (restLen > 0) { + int copyLen = min(restLen, bufElemLength); + DataCopyPad(tmpLocal, + src[elemLength - restLen], + {1, static_cast(sizeof(T1) * copyLen), 0, 0}, + {false, 0, 0, 0}); + PipeBarrier(); + DataCopyPad(dst[elemLength - restLen], tmpLocal, {1, static_cast(sizeof(T1) * copyLen), 0, 0}); + PipeBarrier(); + restLen -= copyLen; + } +} + +template +__aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGlobal, + GMSSrcList &srcList, + GMSParams ¶ms) { + int restLen[4] {params.GMSLengths[0], params.GMSLengths[1], params.GMSLengths[2], params.GMSLengths[3]}; + int currentHead[4] {}; + int totalMrgLen {}; + uint8_t queNum = params.queNum; + // limited by MrgSort api constraint and mrgLocal size, we set different buffer length due to diffent queNum. + // mrgLocal contains 8192 elems, and MrgSort limits max 4095 elems per queue. 
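+    // BUFFER_LEN[queNum]: per-queue chunk size so that queNum * BUFFER_LEN fits the 8192-element
+    // mrgLocal buffer while staying within the 4095-element-per-queue MrgSort limit
+    // (2 queues -> 4095, 3 -> 2730, 4 -> 2048).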
+ constexpr int BUFFER_LEN[5] {0, 0, 4095, 2730, 2048}; + uint16_t sortedLen[4]; + uint16_t mrgLen[4] {}; + while (queNum > 1) { + int currentBufferLen = BUFFER_LEN[queNum]; + for (int i = 0; i < queNum; i++) { + mrgLen[i] = min(restLen[i], currentBufferLen); + } + // CopyIn + for (int i = 0; i < queNum; i++) { + DataCopyPad(params.buffLocal[i], + srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], + {1, static_cast(sizeof(float) * mrgLen[i] * SORT_DATATYPE_SIZE_FACTOR), 0, 0}, + {false, 0, 0, 0}); + } + PipeBarrier(); + // MrgSort + MrgSort4Info localParams {mrgLen, true, VALID_QUE[queNum], 1}; + MrgSort(params.buffLocal[4], + {params.buffLocal[0], params.buffLocal[1], params.buffLocal[2], params.buffLocal[3]}, + localParams); + PipeBarrier(); + GetMrgSortResult(sortedLen[0], sortedLen[1], sortedLen[2], sortedLen[3]); + const uint16_t localMrgLen = sortedLen[0] + sortedLen[1] + sortedLen[2] + sortedLen[3]; + // CopyOut + DataCopyPad(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], + params.buffLocal[4], + {1, static_cast(sizeof(float) * localMrgLen * SORT_DATATYPE_SIZE_FACTOR), 0, 0}); + PipeBarrier(); + // renew currentHead, restLen + totalMrgLen += localMrgLen; + for (int i = 0; i < queNum; i++) { + restLen[i] -= sortedLen[i]; + currentHead[i] += sortedLen[i]; + } + // Switch empty to tail + for (int i = 0; i < queNum; i++) { + if (restLen[i] == 0) { + for (int j = i; j < 3; j++) { + restLen[j] = restLen[j + 1]; + currentHead[j] = currentHead[j + 1]; + srcList[j] = srcList[j + 1]; + } + restLen[3] = 0; + queNum--; + break; // because ifExhaustedSuspension == true, there is 0 or 1 empty que. + } + } + } + // Process tail + for (int i = 0; i < params.queNum; i++) { + if (restLen[i] > 0) { + DataCopyGM2GM(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], + srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], + params.buffLocal[4], + restLen[i] * SORT_DATATYPE_SIZE_FACTOR, + TILE_LEN_BYTE); + break; + } + } +}; + +template +__aicore__ inline void KernelUnique::BlockSortV2() { + LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); + LocalTensor mrgLocal = calcBuf[2].AllocTensor(); + GlobalTensor sortedBlock[2] = {sortedBlock1, sortedBlock2}; + + // Each time merge 4 queues into 1 queue. 
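+    // bindTile is how many tiles each input queue spans; it grows x4 per pass, so after roughly
+    // ceil(log4(tileNum)) passes the whole block is one sorted run, ping-ponging between
+    // sortedBlock1 and sortedBlock2 in GM.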
+ constexpr uint8_t PREFIX_QUE_NUM = 4; + bool switchFlag = false; + GlobalTensor srcGlobal[4]; + LocalTensor buffLocal[5]; + int lengths[4]; + for (int bindTile = 1; bindTile < tileNum; bindTile *= PREFIX_QUE_NUM) { + for (int tileIdx = 0; tileIdx < tileNum; tileIdx += bindTile * PREFIX_QUE_NUM) { + int mrgTileNum = min(tileNum - tileIdx, bindTile * PREFIX_QUE_NUM); + uint8_t queNum = (mrgTileNum + bindTile - 1) / bindTile; + uint8_t lastQueTileNum = mrgTileNum % bindTile; + if (lastQueTileNum == 0) { + lastQueTileNum = bindTile; + } + // Init GMSSrcList, GMSParams + for (int i = 0; i < queNum; i++) { + srcGlobal[i] = sortedBlock[switchFlag][TILE_LEN_ELEM * (tileIdx + bindTile * i)]; + } + for (int i = 0; i < queNum - 1; i++) { + lengths[i] = TILE_LENGTH * bindTile; + } + lengths[queNum - 1] = TILE_LENGTH * lastQueTileNum; + GMSSrcList srcList {srcGlobal}; + GMSParams params {lengths, queNum, {sortedLocal1, sortedLocal1[TILE_LENGTH], + sortedLocal2, sortedLocal2[TILE_LENGTH], + mrgLocal}}; + MrgSortGM(sortedBlock[!switchFlag][TILE_LEN_ELEM * tileIdx], srcList, params); + } + switchFlag = !switchFlag; + } + if (switchFlag) { + DataCopyGM2GM(sortedBlock1, + sortedBlock2, + sortedLocal1, + blockLength * SORT_DATATYPE_SIZE_FACTOR, + TILE_LEN_BYTE); + } + calcBuf[0].FreeTensor(sortedLocal1); + calcBuf[1].FreeTensor(sortedLocal2); + calcBuf[2].FreeTensor(mrgLocal); +} + +template +__aicore__ inline void KernelUnique::GlobalSortV2() { + LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); + LocalTensor mrgLocal = calcBuf[2].AllocTensor(); + LocalTensor IBSyncLocal = sortedLocal2.ReinterpretCast(); + GlobalTensor sortedGlobal[2] = {sortedGlobal1, sortedGlobal2}; + + // Each time merge up to 4 queues into 1 queue. + constexpr uint8_t PREFIX_QUE_NUM = 4; + bool switchFlag = false; + GlobalTensor srcGlobal[4]; + int lengths[4]; + for (int bindBlock = 1; bindBlock < blockNum; bindBlock *= PREFIX_QUE_NUM, eventID++) { + for (int blockIdx = 0; blockIdx < blockNum; blockIdx += bindBlock * PREFIX_QUE_NUM) { + if ((GetBlockIdx() == blockIdx + bindBlock) || + (GetBlockIdx() == blockIdx + bindBlock * 2) || + (GetBlockIdx() == blockIdx + bindBlock * 3)) { + PipeBarrier(); + IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID); + PipeBarrier(); + } else if (GetBlockIdx() == blockIdx) { + int mrgBlockNum = min(blockNum - blockIdx, bindBlock * PREFIX_QUE_NUM); + uint8_t queNum = (mrgBlockNum + bindBlock - 1) / bindBlock; + for (int i = 1; i < queNum; i++) { + PipeBarrier(); + IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)blockIdx + (bindBlock * i), eventID); + PipeBarrier(); + } + // 判断最后一个队列包含了多少个block的数据. 
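+                // i.e. determine how many blocks' worth of data the last (possibly partial) queue holds.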
+ uint8_t lastQueBlockNum = mrgBlockNum % bindBlock; + if (lastQueBlockNum == 0) { + lastQueBlockNum = bindBlock; + } + // Init GMSSrcList, GMSParams + for (int i = 0; i < queNum; i++) { + srcGlobal[i] = sortedGlobal[switchFlag][GetGlobalOffset(blockIdx + bindBlock * i) * + SORT_DATATYPE_SIZE_FACTOR]; + } + for (int i = 0; i < queNum - 1; i++) { + lengths[i] = GetGlobalOffset(blockIdx + (bindBlock * (i + 1))) - + GetGlobalOffset(blockIdx + (bindBlock * i)); + } + lengths[queNum - 1] = GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)) + lastQueBlockNum) - + GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1))); + GMSSrcList srcList {srcGlobal}; + GMSParams params {lengths, queNum, {sortedLocal1, sortedLocal1[TILE_LENGTH], + sortedLocal2, sortedLocal2[TILE_LENGTH], + mrgLocal}}; + MrgSortGM(sortedGlobal[!switchFlag][GetGlobalOffset(blockIdx) * SORT_DATATYPE_SIZE_FACTOR], + srcList, + params); + } + } + switchFlag = !switchFlag; + } + // Switch valid workspace pointer. + if (switchFlag) { + GlobalTensor tmpGlobal = sortedGlobal1; + sortedGlobal1 = sortedGlobal2; + sortedGlobal2 = tmpGlobal; + + GlobalTensor tmpGlobal1 = sortedBlock1; + sortedBlock1 = sortedBlock2; + sortedBlock2 = tmpGlobal1; + + GlobalTensor tmpGlobal2 = sortedBlock1AsInt; + sortedBlock1AsInt = sortedBlock2AsInt; + sortedBlock2AsInt = tmpGlobal2; + } + calcBuf[0].FreeTensor(sortedLocal1); + calcBuf[1].FreeTensor(sortedLocal2); + calcBuf[2].FreeTensor(mrgLocal); +} + +template +__aicore__ inline void KernelUnique::ConsecutiveUnique(const LocalTensor &dstVal, + const LocalTensor &srcLocal, + const LocalTensor &shiftedLocal, + const LocalTensor &bitMask32, + const uint16_t elemLength, + uint64_t &tileUniqueCnt) { + LocalTensor bitMask16 = bitMask32.ReinterpretCast(); + uint64_t rsvdCnt = 0; + // Seperate Val and Idx. + GatherMask(dstVal, srcLocal, 1, false, 0, {1, static_cast((elemLength * 2 + 63) / 64), 8, 0}, rsvdCnt); + PipeBarrier(); + + // Gen bitMask to calc shifted array. + Duplicate(bitMask16, (uint16_t)0b1111111111111111, elemLength / 16); + PipeBarrier(); + bitMask16.SetValue(0, 0b1111111111111110); + + // Calc shifted array. + GatherMask(shiftedLocal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, rsvdCnt); + PipeBarrier(); + // Set the last val as INF in order to avoid dropping the last unique val. + shiftedLocal.SetValue(elemLength - 1, -FLOAT_INF); + + // Generate bitMask which represents unique numbers. + Compare(bitMask16, dstVal, shiftedLocal, CMPMODE::NE, (elemLength + 63) / 64 * 64); + PipeBarrier(); + + // Gather unique numbers and their idx. + GatherMask(dstVal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, tileUniqueCnt); + PipeBarrier(); +} + +template +__aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { + LocalTensor bitMask32 = calcBuf[0].AllocTensor(); + LocalTensor shiftedLocal = bitMask32[TILE_LENGTH].ReinterpretCast(); + LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); + LocalTensor sortedLocal2 = calcBuf[2].AllocTensor(); + LocalTensor uniqueCntLocal = shiftedLocal.ReinterpretCast(); + uint64_t tileUniqueCnt; + uint64_t tmpRsvdCnt; + + DataCopy(sortedLocal1, sortedBlock1[progress * TILE_LEN_ELEM], TILE_LEN_ELEM); + PipeBarrier(); + + ConsecutiveUnique(sortedLocal2, sortedLocal1, shiftedLocal, bitMask32, TILE_LENGTH, tileUniqueCnt); + // If has inf, append. 
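+    // A genuine +inf in the input is negated to -FLOAT_INF and therefore coincides with the tail
+    // padding, so ConsecutiveUnique strips it together with the padding; re-append one copy here.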
+ if ((progress == tileNum - 1) && hasInfFlag) { + sortedLocal2.SetValue(tileUniqueCnt, -FLOAT_INF); + tileUniqueCnt++; + } + + if (tileUniqueCnt != 0) { + accUniqueCnt += tileUniqueCnt; + if (progress != 0 && lastTileUniqueVal == sortedLocal2.GetValue(0)) { + accUniqueCnt--; + } + DataCopyPad(sortedBlock1[accUniqueCnt - tileUniqueCnt], + sortedLocal2, + {1, static_cast(sizeof(float) * tileUniqueCnt), 0, 0}); + PipeBarrier(); + lastTileUniqueVal = sortedLocal2.GetValue(tileUniqueCnt - 1); + } + + // upload uniqueCnt. + if (progress == tileNum - 1) { + uniqueCntLocal.SetValue(0, accUniqueCnt); + DataCopyPad(blockUniqueCntGlobal[GetBlockIdx()], + uniqueCntLocal, + {1, static_cast(sizeof(uint32_t) * 1), 0, 0}); + PipeBarrier(); + } + calcBuf[0].FreeTensor(shiftedLocal); + calcBuf[1].FreeTensor(sortedLocal1); + calcBuf[2].FreeTensor(sortedLocal2); +} + +template +__aicore__ inline void KernelUnique::CopyOut() { + LocalTensor copyLocal0 = calcBuf[0].AllocTensor(); + LocalTensor copyLocal1 = calcBuf[1].AllocTensor(); + LocalTensor IBSyncLocal = copyLocal1.ReinterpretCast(); + LocalTensor copyLocal2 = calcBuf[2].AllocTensor(); + + uint64_t lastAccUniqueCnt = 0; + // Get every blockUniqueCnt before current block. Calc accumulate uniqueCnt. + for (int i = 0; i < GetBlockIdx(); i++) { + uint64_t lastUniqueCnt = blockUniqueCntGlobal.GetValue(i); + lastAccUniqueCnt += lastUniqueCnt; + // If the first val of (i+1)th block equals to the last val of (i)th block, then they should be placed in + // the same position, accUniqueCnt--. + if (sortedGlobal1[GetGlobalOffset(i + 1) * SORT_DATATYPE_SIZE_FACTOR].GetValue(0) == + sortedGlobal1[GetGlobalOffset(i) * SORT_DATATYPE_SIZE_FACTOR].GetValue(lastUniqueCnt - 1)) { + lastAccUniqueCnt--; + } + } + uint64_t thisUniqueCnt = blockUniqueCntGlobal.GetValue(GetBlockIdx()); + + uint64_t restLen = thisUniqueCnt; + // max(Ta a, Tb b) function does not support compilation period calc. + constexpr uint64_t bottleneckTypeSize = sizeof(T) > sizeof(float)? sizeof(T): sizeof(float); + LocalTensor copyVal32 = copyLocal0.template ReinterpretCast(); + LocalTensor uniqueVal32 = copyLocal1.ReinterpretCast(); + // Copy unique values (and counts) from Workspace to dst. + while (restLen > 0) { + // DataCopyPad could copy up to 65535B in one cycle. And one tile may contain up to 65536B. So we should + // process multiple cycles. + uint64_t copyLen = min(restLen, TILE_LEN_BYTE / bottleneckTypeSize); + copyLen = min(copyLen, 65535 / bottleneckTypeSize); + if constexpr (!IsSameType::value) { + DataCopyPad(copyLocal1, + sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, + {false, 0, 0, 0}); + PipeBarrier(); + Muls(copyLocal1, copyLocal1, (float)-1, copyLen); + PipeBarrier(); + Cast(copyLocal0, copyLocal1, RoundMode::CAST_RINT, copyLen); + PipeBarrier(); + } else { + DataCopyPad(copyLocal0, + sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, + {false, 0, 0, 0}); + PipeBarrier(); + Muls(copyLocal0, copyLocal0, (float)-1, copyLen); + PipeBarrier(); + } + // DataCopyPad does not support int64_t. Copy them as uint32_t. 
+ if constexpr (sizeof(T) > 4) { + DataCopyPad(dstGlobal1As32[(lastAccUniqueCnt + thisUniqueCnt - restLen) * sizeof(T) / sizeof(uint32_t)], + copyVal32, + {1, static_cast(sizeof(T) * copyLen), 0, 0}); + } else { + DataCopyPad(dstGlobal1[lastAccUniqueCnt + thisUniqueCnt - restLen], + copyLocal0, + {1, static_cast(sizeof(T) * copyLen), 0, 0}); + } + PipeBarrier(); + restLen -= copyLen; + } + // Return unique count. + if (GetBlockIdx() == blockNum - 1) { + uniqueVal32.SetValue(0, lastAccUniqueCnt + thisUniqueCnt); + DataCopyPad(uniqueCntGlobal, uniqueVal32, {1, static_cast(sizeof(uint32_t) * 1), 0, 0}); + PipeBarrier(); + } + calcBuf[0].FreeTensor(copyLocal0); + calcBuf[1].FreeTensor(copyLocal1); +} +} // namespace AscendC diff --git a/mx_driving/__init__.py b/mx_driving/__init__.py index f4cc2fa5..9f708370 100644 --- a/mx_driving/__init__.py +++ b/mx_driving/__init__.py @@ -61,6 +61,7 @@ __all__ = [ "diff_iou_rotated_2d", "nms3d_on_sight", "cartesian_to_frenet", + "npu_unique", ] import os @@ -124,6 +125,7 @@ from .ops.npu_batch_matmul import npu_batch_matmul from .ops.nms3d_on_sight import nms3d_on_sight from .ops.cartesian_to_frenet import cartesian_to_frenet from .patcher import default_patcher_builder, patch_mmcv_version +from .ops.npu_unique import npu_unique def _set_env(): diff --git a/mx_driving/csrc/Unique.cpp b/mx_driving/csrc/Unique.cpp new file mode 100644 index 00000000..ee7aa6a7 --- /dev/null +++ b/mx_driving/csrc/Unique.cpp @@ -0,0 +1,32 @@ +// Copyright (c) 2024 Huawei Technologies Co., Ltd +// Copyright (c) 2019, Facebook CORPORATION. +// All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
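+
+// Host-side wrapper: allocate a full-size output and a one-element count tensor, run the Unique
+// kernel synchronously, then narrow the output to the returned unique count; inputs with fewer
+// than two elements are simply cloned.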
+ +#include "csrc/OpApiCommon.h" +#include "csrc/functions.h" + + +at::Tensor npu_unique(const at::Tensor& input) { + if (input.numel() < 2) { + at::Tensor output = at::Tensor(input).clone(); + return output; + } else { + at::Tensor output = at::empty({input.numel()}, at::TensorOptions().dtype(input.dtype()).device(input.device())); + at::Tensor uniqueCnt = at::empty({1}, at::TensorOptions().dtype(at::ScalarType::Int).device(input.device())); + EXEC_NPU_CMD_SYNC(aclnnUnique, input, output, uniqueCnt); + int uniqueCount = uniqueCnt.item(); + return output.narrow(0, 0, uniqueCount); + } +} diff --git a/mx_driving/csrc/pybind.cpp b/mx_driving/csrc/pybind.cpp index fea4904f..41f5a35c 100644 --- a/mx_driving/csrc/pybind.cpp +++ b/mx_driving/csrc/pybind.cpp @@ -242,4 +242,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) // npu_subm_sparse_conv3d_with_key m.def("npu_subm_sparse_conv3d_with_key", &npu_subm_sparse_conv3d_with_key); + + // npu_unique + m.def("npu_unique", &npu_unique); } diff --git a/mx_driving/fused.py b/mx_driving/fused.py index 209f2046..123f751e 100644 --- a/mx_driving/fused.py +++ b/mx_driving/fused.py @@ -10,6 +10,7 @@ from .ops.multi_scale_deformable_attn import ( from .ops.npu_add_relu import npu_add_relu from .ops.npu_deformable_aggregation import npu_deformable_aggregation from .ops.npu_max_pool2d import npu_max_pool2d +from .ops.npu_unique import npu_unique warnings.warn( "This package is deprecated and will be removed in future. Please use `mx_driving.api` instead.", DeprecationWarning diff --git a/mx_driving/ops/npu_unique.py b/mx_driving/ops/npu_unique.py new file mode 100644 index 00000000..7ccde38a --- /dev/null +++ b/mx_driving/ops/npu_unique.py @@ -0,0 +1,19 @@ +""" +Copyright (c) OpenMMLab. All rights reserved. +Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. +Modification by: Huawei Developers +Modification date: 2024-06-04 +Modification Description: +Modification 1. Add support for Ascend NPU +""" +from torch.autograd import Function +import mx_driving._C + +class UniqueFunction(Function): + @staticmethod + # 'pylint: disable=too-many-arguments,huawei-too-many-arguments + def forward(ctx, input): + y = mx_driving._C.npu_unique(input) + return y + +npu_unique = UniqueFunction.apply -- Gitee From ae16b0bdf6bbef9f2f62d69843a784568d468769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Tue, 18 Mar 2025 17:33:21 +0800 Subject: [PATCH 2/7] bugfix: prevent updating deprecated api --- mx_driving/fused.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mx_driving/fused.py b/mx_driving/fused.py index 123f751e..209f2046 100644 --- a/mx_driving/fused.py +++ b/mx_driving/fused.py @@ -10,7 +10,6 @@ from .ops.multi_scale_deformable_attn import ( from .ops.npu_add_relu import npu_add_relu from .ops.npu_deformable_aggregation import npu_deformable_aggregation from .ops.npu_max_pool2d import npu_max_pool2d -from .ops.npu_unique import npu_unique warnings.warn( "This package is deprecated and will be removed in future. 
Please use `mx_driving.api` instead.", DeprecationWarning -- Gitee From 07871465f587b4a5112bc0b4f09b2b3f34e5b19c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Wed, 19 Mar 2025 10:45:14 +0800 Subject: [PATCH 3/7] feat: Add npu_unique tests --- tests/torch/test_npu_unique.py | 67 ++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 tests/torch/test_npu_unique.py diff --git a/tests/torch/test_npu_unique.py b/tests/torch/test_npu_unique.py new file mode 100644 index 00000000..8cc8f81c --- /dev/null +++ b/tests/torch/test_npu_unique.py @@ -0,0 +1,67 @@ +""" +Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. +""" +import torch +import torch_npu +from data_cache import golden_data_cache +from torch_npu.testing.testcase import TestCase, run_tests + +from mx_driving import npu_unique +import random +import os + + +def gen_inputs(input_shape, dtype): + input_tensor = torch.randint(-256, 256, input_shape, dtype=dtype) + return input_tensor + +def gen_cpu_outputs(input_tensor): + cpu_result = torch.unique(input_tensor) + return cpu_result + +def gen_npu_outputs(input_tensor): + npu_result = npu_unique(input_tensor.npu()) + return npu_result.cpu() + + +class TestNpuUnique(TestCase): + def test_bfloat16(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.bfloat16) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().to(torch.float32).numpy(), + npu_result.cpu().detach().to(torch.float32).numpy()) + + def test_float16(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.float16) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_float32(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.float32) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_int16(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int16) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_int32(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int32) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + def test_int64(self, device='npu'): + input_tensor = gen_inputs([random.randint(1, 10000000)], torch.int64) + cpu_result = gen_cpu_outputs(input_tensor) + npu_result = gen_npu_outputs(input_tensor) + self.assertRtolEqual(cpu_result.cpu().detach().numpy(), npu_result.cpu().detach().numpy()) + + +if __name__ == "__main__": + run_tests() -- Gitee From 2e3ae6a8db8d344a5356612fff174be66b6e883c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Thu, 20 Mar 2025 15:08:01 +0800 Subject: [PATCH 4/7] feat: unique doc --- docs/api/README.md | 6 +++++- docs/api/context/npu_unique.md | 37 ++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 docs/api/context/npu_unique.md diff 
--git a/docs/api/README.md b/docs/api/README.md index 071598bb..313c5029 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -54,7 +54,7 @@ Y - 采样 + 采样 roipoint_pool3d Y @@ -94,6 +94,10 @@ grid_sampler2d_v2 N + + npu_unique + N + 体素化 voxelization diff --git a/docs/api/context/npu_unique.md b/docs/api/context/npu_unique.md new file mode 100644 index 00000000..843be385 --- /dev/null +++ b/docs/api/context/npu_unique.md @@ -0,0 +1,37 @@ +## npu_unique[beta] + +### 接口原型 + +```python +mx_driving.npu_unique(Tensor input) -> Tensor +``` + +### 功能描述 + +从小到大排序并去重. 提供一个输入`tensor`, 对`tensor`的输入进行排序, 并去掉`tensor`中的重复元素. + +### 参数说明 + +- `input(Tensor)`:表示输入张量,数据类型支持 `float16`, `bfloat16`, `int16`, `float32`, `int32`, `int64`. shape 为 1 ~ 8 维的任意shape. + +### 返回值 + +- `output(Tensor)`:表示输出张量,数据类型支持 `float16`, `bfloat16`, `int16`, `float32`, `int32`, `int64`, 与输入张量`input`一致. shape 为 1 维。 + +### 约束说明 + +- int32, int64输入时, 每个元素的值须在[-16777216, 16777216] (±2^24)之间,否则会引入精度损失. + +### 支持的型号 + +- Atlas A2 训练系列产品 + +### 调用示例 + +```python +import torch, torch_npu +from mx_driving import npu_unique + +rand_tensor = torch.rand(559794, dtype=torch.int64) +output = npu_unique(rand_tensor.npu()) +``` -- Gitee From b83b41ba3dc2fec616195beefa17e299035934d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Mon, 24 Mar 2025 10:30:27 +0800 Subject: [PATCH 5/7] bugfix: cleancode --- kernels/op_host/unique.cpp | 88 +++---- kernels/op_host/unique_tiling.h | 12 +- kernels/op_kernel/unique.h | 427 ++++++++++++++------------------ mx_driving/ops/npu_unique.py | 6 +- tests/torch/test_npu_unique.py | 7 +- 5 files changed, 249 insertions(+), 291 deletions(-) diff --git a/kernels/op_host/unique.cpp b/kernels/op_host/unique.cpp index bbc0dfbf..f37169d9 100644 --- a/kernels/op_host/unique.cpp +++ b/kernels/op_host/unique.cpp @@ -2,57 +2,59 @@ * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. * */ -#include "unique_tiling.h" #include "register/op_def_registry.h" #include "tiling/platform/platform_ascendc.h" +#include "unique_tiling.h" constexpr size_t SYS_RSVD_WS_SIZE = 16 * 1024 * 1024; namespace optiling { -static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context) { - UniqueTilingData tiling; +static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context) +{ + UniqueTilingData tiling; - constexpr uint16_t tileLength = 8192; - const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); - const gert::StorageShape* inputShape = context->GetInputShape(0); - uint32_t totalLength = 1; - for (int i = 0; i < dimNum; i++) { - totalLength *= inputShape->GetStorageShape().GetDim(i); - } - const uint32_t tileNum = (totalLength + tileLength - 1) / tileLength; - const uint16_t tailLength = totalLength % tileLength; - const auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - const uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); - const uint8_t blockNum = tileNum >= aivNum? 
aivNum: tileNum; - const uint32_t shortBlockTileNum = tileNum / blockNum; - const uint8_t longBlockNum = tileNum % blockNum; - const uint8_t shortBlockNum = blockNum - longBlockNum; + constexpr uint16_t tileLength = 8192; + const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); + const gert::StorageShape* inputShape = context->GetInputShape(0); + uint32_t totalLength = 1; + for (int i = 0; i < dimNum; i++) { + totalLength *= inputShape->GetStorageShape().GetDim(i); + } + const uint32_t tileNum = (totalLength + tileLength - 1) / tileLength; + const uint16_t tailLength = totalLength % tileLength; + const auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + const uint32_t aivNum = ascendcPlatform.GetCoreNumAiv(); + const uint8_t blockNum = tileNum >= aivNum ? aivNum : tileNum; + const uint32_t shortBlockTileNum = tileNum / blockNum; + const uint8_t longBlockNum = tileNum % blockNum; + const uint8_t shortBlockNum = blockNum - longBlockNum; - tiling.set_totalLength(totalLength); - tiling.set_tileNum(tileNum); - tiling.set_shortBlockTileNum(shortBlockTileNum); - tiling.set_tailLength(tailLength); - tiling.set_blockNum(blockNum); - tiling.set_shortBlockNum(shortBlockNum); + tiling.set_totalLength(totalLength); + tiling.set_tileNum(tileNum); + tiling.set_shortBlockTileNum(shortBlockTileNum); + tiling.set_tailLength(tailLength); + tiling.set_blockNum(blockNum); + tiling.set_shortBlockNum(shortBlockNum); - context->SetBlockDim(blockNum); - tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); - context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); - // Workspace for IBSet/IBWait up to 8 times, and 2 times full data. - uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); - auto&& currentWorkspace = context->GetWorkspaceSizes(1); - if (currentWorkspace == nullptr) { - return ge::GRAPH_FAILED; - } - size_t usrSize = (aivNum * 8 + 1) * 8 * sizeof(uint32_t) + (tileNum * tileLength) * 2 * sizeof(float) * 2; - currentWorkspace[0] = usrSize + sysWorkspaceSize; - return ge::GRAPH_SUCCESS; -} + context->SetBlockDim(blockNum); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + // Workspace for IBSet/IBWait up to 8 times, and 2 times full data. 
+ uint32_t sysWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + auto&& currentWorkspace = context->GetWorkspaceSizes(1); + if (currentWorkspace == nullptr) { + return ge::GRAPH_FAILED; + } + size_t usrSize = (aivNum * 8 + 1) * 8 * sizeof(uint32_t) + (tileNum * tileLength) * 2 * sizeof(float) * 2; + currentWorkspace[0] = usrSize + sysWorkspaceSize; + return ge::GRAPH_SUCCESS; } +} // namespace optiling namespace ge { -static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context) { +static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context) +{ const gert::Shape* x1_shape = context->GetInputShape(0); gert::Shape* y_shape = context->GetOutputShape(0); *y_shape = *x1_shape; @@ -65,13 +67,14 @@ static ge::graphStatus UniqueInferDtype(gert::InferDataTypeContext* context) context->SetOutputDataType(0, inputDtype); return ge::GRAPH_SUCCESS; } -} +} // namespace ge namespace ops { class Unique : public OpDef { public: - explicit Unique(const char* name) : OpDef(name) { + explicit Unique(const char* name) : OpDef(name) + { this->Input("input") .ParamType(REQUIRED) .DataType({ge::DT_BF16, ge::DT_FLOAT16, ge::DT_INT16, ge::DT_FLOAT, ge::DT_INT32, ge::DT_INT64}) @@ -89,12 +92,11 @@ public: this->SetInferShape(ge::UniqueInferShape); this->SetInferDataType(ge::UniqueInferDtype); - this->AICore() - .SetTiling(optiling::UniqueTilingFunc); + this->AICore().SetTiling(optiling::UniqueTilingFunc); this->AICore().AddConfig("ascend910b"); this->AICore().AddConfig("ascend910_93"); } }; OP_ADD(Unique); -} +} // namespace ops diff --git a/kernels/op_host/unique_tiling.h b/kernels/op_host/unique_tiling.h index f42483c4..aed40f6f 100644 --- a/kernels/op_host/unique_tiling.h +++ b/kernels/op_host/unique_tiling.h @@ -6,12 +6,12 @@ namespace optiling { BEGIN_TILING_DATA_DEF(UniqueTilingData) - TILING_DATA_FIELD_DEF(uint32_t, totalLength); - TILING_DATA_FIELD_DEF(uint32_t, tileNum); - TILING_DATA_FIELD_DEF(uint32_t, shortBlockTileNum); - TILING_DATA_FIELD_DEF(uint16_t, tailLength); - TILING_DATA_FIELD_DEF(uint8_t, blockNum); - TILING_DATA_FIELD_DEF(uint8_t, shortBlockNum); + TILING_DATA_FIELD_DEF(uint32_t, totalLength); + TILING_DATA_FIELD_DEF(uint32_t, tileNum); + TILING_DATA_FIELD_DEF(uint32_t, shortBlockTileNum); + TILING_DATA_FIELD_DEF(uint16_t, tailLength); + TILING_DATA_FIELD_DEF(uint8_t, blockNum); + TILING_DATA_FIELD_DEF(uint8_t, shortBlockNum); END_TILING_DATA_DEF; REGISTER_TILING_DATA_CLASS(Unique, UniqueTilingData) diff --git a/kernels/op_kernel/unique.h b/kernels/op_kernel/unique.h index 0960db13..9470d73b 100644 --- a/kernels/op_kernel/unique.h +++ b/kernels/op_kernel/unique.h @@ -8,7 +8,8 @@ using namespace AscendC; namespace AscendC { template -__aicore__ inline static Ta min(const Ta a, const Tb b) { +__aicore__ inline Ta min(const Ta a, const Tb b) +{ if (a > b) { return b; } @@ -16,131 +17,113 @@ __aicore__ inline static Ta min(const Ta a, const Tb b) { } template -__aicore__ inline static Ta max(const Ta a, const Tb b) { +__aicore__ inline Ta max(const Ta a, const Tb b) +{ if (a < b) { return b; } return a; } -template +template class KernelUnique { public: - __aicore__ inline KernelUnique(TPipe &pipe) : pipe(pipe) {} + __aicore__ inline KernelUnique(TPipe& pipe) : pipe(pipe) {} // Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. 
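    // Tiles are split as evenly as possible: the first shortBlockNum cores take shortBlockTileNum
    // tiles each, the remaining cores take one extra. E.g. tileNum=10, blockNum=4 gives 2/2/3/3
    // tiles and offsets of 0, 2, 4 and 7 tiles (x TILE_LENGTH elements).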
__aicore__ inline size_t GetGlobalOffset(const uint32_t blockIdx); - __aicore__ inline void Init(GM_ADDR input, - GM_ADDR output, - GM_ADDR uniqueCnt, - GM_ADDR workspace, - const uint32_t totalLength, - const uint32_t totalTileNum, - const uint32_t shortBlockTileNum, - const uint16_t tailLength, - const uint8_t blockNum, - const uint8_t shortBlockNum); + __aicore__ inline void Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, + const uint32_t totalLength, const uint32_t totalTileNum, const uint32_t shortBlockTileNum, + const uint16_t tailLength, const uint8_t blockNum, const uint8_t shortBlockNum); __aicore__ inline void Process(); private: __aicore__ inline void CopyIn(const int32_t progress); __aicore__ inline void Elem32Sort(const int32_t progress); __aicore__ inline void TileSort(const int32_t progress); - template - __aicore__ inline static void DataCopyGM2GM(const GlobalTensor &dst, - const GlobalTensor &src, - const LocalTensor &tmpLocal, - const int elemLength, - const int bufByteLength); + template + __aicore__ inline static void DataCopyGM2GM(const GlobalTensor& dst, const GlobalTensor& src, + const LocalTensor& tmpLocal, const int elemLength, const int bufByteLength); using GMSSrcList = GlobalTensor (&)[4]; struct GMSParams { int (&GMSLengths)[4]; - uint8_t &queNum; + uint8_t& queNum; LocalTensor (&&buffLocal)[5]; }; - __aicore__ inline static void MrgSortGM(GlobalTensor &&dstGlobal, - GMSSrcList &srcList, - GMSParams ¶ms); + __aicore__ inline static void MrgSortGM(GlobalTensor&& dstGlobal, GMSSrcList& srcList, GMSParams& params); __aicore__ inline void BlockSortV2(); __aicore__ inline void GlobalSortV2(); - __aicore__ inline static void ConsecutiveUnique(const LocalTensor &dstVal, - const LocalTensor &srcLocal, - const LocalTensor &shiftedLocal, - const LocalTensor &bitMask16, - const uint16_t elemLength, - uint64_t &tileUniqueCnt); + __aicore__ inline static void ConsecutiveUnique(const LocalTensor& dstVal, + const LocalTensor& srcLocal, const LocalTensor& shiftedLocal, + const LocalTensor& bitMask16, const uint16_t elemLength, uint64_t& tileUniqueCnt); __aicore__ inline void TileUnique(const int32_t progress); __aicore__ inline void CopyOut(); private: - static constexpr int32_t TILE_LENGTH = 8192; + static constexpr int32_t TILE_LENGTH = 8192; // INF to fill the tail blank, so that tail is automatically removed by Compare in Unique. - static constexpr float FLOAT_INF = 3e+99; + static constexpr float FLOAT_INF = 3e+99; // Indicates the factor converting float to data structure used by Sort32&MrgSort. - static constexpr int16_t SORT_DATATYPE_SIZE = sizeof(float) + sizeof(uint32_t); // 8 - static constexpr int16_t SORT_DATATYPE_SIZE_FACTOR = SORT_DATATYPE_SIZE / sizeof(float); // 2 - static constexpr int32_t TILE_LEN_BYTE = TILE_LENGTH * SORT_DATATYPE_SIZE; // 8192 * 8 = 65536 - static constexpr int32_t TILE_LEN_ELEM = TILE_LENGTH * SORT_DATATYPE_SIZE_FACTOR; // 8192 * 2 = 16384 - static constexpr uint16_t VALID_QUE[5] = {0, 0, 0b11, 0b111, 0b1111}; // Converts queue number to validBit of MrgSort. 
- - TPipe &pipe; - TQue calcBuf[3]; - - GlobalTensor srcGlobal; - GlobalTensor srcGlobalAsUint; - GlobalTensor dstGlobal1; - GlobalTensor dstGlobal1As32; - GlobalTensor uniqueCntGlobal; - - GlobalTensor sortedBlock1; - GlobalTensor sortedBlock1AsInt; - GlobalTensor sortedBlock2; - GlobalTensor sortedBlock2AsInt; - GlobalTensor sortedGlobal1; - GlobalTensor sortedGlobal2; - - GlobalTensor IBSyncGlobal; - GlobalTensor blockUniqueCntGlobal; - - uint16_t syncWorkspaceSize; - uint8_t eventID {0}; - uint64_t accUniqueCnt {0}; - float lastTileUniqueVal; - - uint32_t totalLength; - uint32_t alignedTotalLength; - uint32_t tileNum; - uint32_t shortBlockTileNum; - uint16_t tailLength; - uint8_t blockNum; - uint8_t shortBlockNum; - - size_t globalOffset; // Offset of data for current block. - size_t blockLength; // Length of current block. - bool hasInfFlag {false}; + static constexpr int16_t SORT_DATATYPE_SIZE = sizeof(float) + sizeof(uint32_t); // 8 + static constexpr int16_t SORT_DATATYPE_SIZE_FACTOR = SORT_DATATYPE_SIZE / sizeof(float); // 2 + static constexpr int32_t TILE_LEN_BYTE = TILE_LENGTH * SORT_DATATYPE_SIZE; // 8192 * 8 = 65536 + static constexpr int32_t TILE_LEN_ELEM = TILE_LENGTH * SORT_DATATYPE_SIZE_FACTOR; // 8192 * 2 = 16384 + static constexpr uint16_t VALID_QUE[5] = { + 0, 0, 0b11, 0b111, 0b1111}; // Converts queue number to validBit of MrgSort. + + TPipe& pipe; + TQue calcBuf[3]; + + GlobalTensor srcGlobal; + GlobalTensor srcGlobalAsUint; + GlobalTensor dstGlobal1; + GlobalTensor dstGlobal1As32; + GlobalTensor uniqueCntGlobal; + + GlobalTensor sortedBlock1; + GlobalTensor sortedBlock1AsInt; + GlobalTensor sortedBlock2; + GlobalTensor sortedBlock2AsInt; + GlobalTensor sortedGlobal1; + GlobalTensor sortedGlobal2; + + GlobalTensor IBSyncGlobal; + GlobalTensor blockUniqueCntGlobal; + + uint16_t syncWorkspaceSize; + uint8_t eventID {0}; + uint64_t accUniqueCnt {0}; + float lastTileUniqueVal; + + uint32_t totalLength; + uint32_t alignedTotalLength; + uint32_t tileNum; + uint32_t shortBlockTileNum; + uint16_t tailLength; + uint8_t blockNum; + uint8_t shortBlockNum; + + size_t globalOffset; // Offset of data for current block. + size_t blockLength; // Length of current block. + bool hasInfFlag {false}; }; // Each block process diffent part of data. This function returns the element-wise first index of data by blockIdx. -template -__aicore__ inline size_t KernelUnique::GetGlobalOffset(const uint32_t blockIdx) { +template +__aicore__ inline size_t KernelUnique::GetGlobalOffset(const uint32_t blockIdx) +{ // (shortBlockTileNum + 1) indicates longBlockTileNum. - const size_t offset = (this->shortBlockTileNum * min(this->shortBlockNum, blockIdx) + - (this->shortBlockTileNum + 1) * - (this->shortBlockNum >= blockIdx? 0: blockIdx - this->shortBlockNum)) * - TILE_LENGTH; + const size_t offset = + (this->shortBlockTileNum * min(this->shortBlockNum, blockIdx) + + (this->shortBlockTileNum + 1) * (this->shortBlockNum >= blockIdx ? 
0 : blockIdx - this->shortBlockNum)) * + TILE_LENGTH; return offset; } -template -__aicore__ inline void KernelUnique::Init(GM_ADDR input, - GM_ADDR output, - GM_ADDR uniqueCnt, - GM_ADDR workspace, - const uint32_t totalLength, - const uint32_t totalTileNum, - const uint32_t shortBlockTileNum, - const uint16_t tailLength, - const uint8_t blockNum, - const uint8_t shortBlockNum) { +template +__aicore__ inline void KernelUnique::Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace, + const uint32_t totalLength, const uint32_t totalTileNum, const uint32_t shortBlockTileNum, + const uint16_t tailLength, const uint8_t blockNum, const uint8_t shortBlockNum) +{ this->totalLength = totalLength; this->alignedTotalLength = totalTileNum * TILE_LENGTH; this->shortBlockTileNum = shortBlockTileNum; @@ -150,48 +133,48 @@ __aicore__ inline void KernelUnique::Init(GM_ADDR input, const bool isShortBlock = this->shortBlockNum > GetBlockIdx(); // (shortBlockTileNum + 1) indicates longBlockTileNum. - this->tileNum = isShortBlock? shortBlockTileNum: shortBlockTileNum + 1; + this->tileNum = isShortBlock ? shortBlockTileNum : shortBlockTileNum + 1; this->blockLength = this->tileNum * TILE_LENGTH; this->globalOffset = GetGlobalOffset(GetBlockIdx()); - srcGlobal.SetGlobalBuffer((__gm__ T *)input + globalOffset, this->blockLength); - srcGlobalAsUint.SetGlobalBuffer((__gm__ uint32_t *)input + globalOffset * sizeof(T) / sizeof(uint32_t), - this->blockLength * sizeof(T) / sizeof(uint32_t)); - dstGlobal1.SetGlobalBuffer((__gm__ T *)output, this->alignedTotalLength); - dstGlobal1As32.SetGlobalBuffer((__gm__ int32_t *)output, - this->alignedTotalLength * sizeof(T) / sizeof(int32_t)); - uniqueCntGlobal.SetGlobalBuffer((__gm__ int32_t *)uniqueCnt, 1); + srcGlobal.SetGlobalBuffer((__gm__ T*)input + globalOffset, this->blockLength); + srcGlobalAsUint.SetGlobalBuffer((__gm__ uint32_t*)input + globalOffset * sizeof(T) / sizeof(uint32_t), + this->blockLength * sizeof(T) / sizeof(uint32_t)); + dstGlobal1.SetGlobalBuffer((__gm__ T*)output, this->alignedTotalLength); + dstGlobal1As32.SetGlobalBuffer((__gm__ int32_t*)output, this->alignedTotalLength * sizeof(T) / sizeof(int32_t)); + uniqueCntGlobal.SetGlobalBuffer((__gm__ int32_t*)uniqueCnt, 1); // sortedBlock is offsetted, and could only see the data that this block should process. 
- sortedBlock1.SetGlobalBuffer((__gm__ float *)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, - this->blockLength * SORT_DATATYPE_SIZE_FACTOR); - sortedBlock1AsInt.SetGlobalBuffer((__gm__ int32_t *)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, - this->blockLength * SORT_DATATYPE_SIZE_FACTOR); - sortedBlock2.SetGlobalBuffer((__gm__ float *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + - globalOffset * SORT_DATATYPE_SIZE_FACTOR, - this->blockLength * SORT_DATATYPE_SIZE_FACTOR); - sortedBlock2AsInt.SetGlobalBuffer((__gm__ int32_t *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + - globalOffset * SORT_DATATYPE_SIZE_FACTOR, - this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock1.SetGlobalBuffer((__gm__ float*)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock1AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2.SetGlobalBuffer((__gm__ float*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); + sortedBlock2AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR + + globalOffset * SORT_DATATYPE_SIZE_FACTOR, + this->blockLength * SORT_DATATYPE_SIZE_FACTOR); // sortedGlobal could see all data in the workspace. - sortedGlobal1.SetGlobalBuffer((__gm__ float *)workspace, alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); - sortedGlobal2.SetGlobalBuffer((__gm__ float *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR, - alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + sortedGlobal1.SetGlobalBuffer((__gm__ float*)workspace, alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); + sortedGlobal2.SetGlobalBuffer((__gm__ float*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR, + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR); // Buff size for syncronizing according to document of IBWait&IBSet. this->syncWorkspaceSize = (blockNum * 32 + 1) * 8; - IBSyncGlobal.SetGlobalBuffer((__gm__ int32_t *)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR * 2, - syncWorkspaceSize); - blockUniqueCntGlobal.SetGlobalBuffer((__gm__ uint32_t *)workspace + alignedTotalLength * 4 + syncWorkspaceSize, - (blockNum + 7) / 8 * 8); // Length aligned up to 32B. + IBSyncGlobal.SetGlobalBuffer( + (__gm__ int32_t*)workspace + alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR * 2, syncWorkspaceSize); + blockUniqueCntGlobal.SetGlobalBuffer((__gm__ uint32_t*)workspace + alignedTotalLength * 4 + syncWorkspaceSize, + (blockNum + 7) / 8 * 8); // Length aligned up to 32B. pipe.InitBuffer(calcBuf[0], 1, TILE_LEN_BYTE); pipe.InitBuffer(calcBuf[1], 1, TILE_LEN_BYTE); pipe.InitBuffer(calcBuf[2], 1, TILE_LEN_BYTE); } -template -__aicore__ inline void KernelUnique::Process() { +template +__aicore__ inline void KernelUnique::Process() +{ LocalTensor IBSyncLocal; // Initialize sync buff. if (GetBlockIdx() == 0) { @@ -201,7 +184,7 @@ __aicore__ inline void KernelUnique::Process() { DataCopy(IBSyncGlobal, IBSyncLocal, syncWorkspaceSize); PipeBarrier(); calcBuf[0].FreeTensor(IBSyncLocal); - } // Initialize sync buff. + } // Initialize sync buff. // Sort within each tile. 
for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) { @@ -212,10 +195,10 @@ __aicore__ inline void KernelUnique::Process() { if (GetBlockNum() > 1) { if (this->tileNum > 1) { - BlockSortV2(); // Sort within each block. + BlockSortV2(); // Sort within each block. } - GlobalSortV2(); // Sort globally. + GlobalSortV2(); // Sort globally. PipeBarrier(); SyncAll(); @@ -255,13 +238,14 @@ __aicore__ inline void KernelUnique::Process() { CopyOut(); } -template -__aicore__ inline void KernelUnique::CopyIn(const int32_t progress) { +template +__aicore__ inline void KernelUnique::CopyIn(const int32_t progress) +{ LocalTensor srcLocal = calcBuf[0].AllocTensor(); LocalTensor sortedLocal2 = calcBuf[2].AllocTensor(); // To process tail, fill the whole tile with INF, then cover it with tail. - int32_t castLen; // Valid length of the last block. + int32_t castLen; // Valid length of the last block. if ((progress != tileNum - 1) || (GetBlockIdx() != blockNum - 1) || tailLength == 0) { // Must determine during compilation, otherwise we get a compilation error. if constexpr (!IsSameType::value) { @@ -276,20 +260,14 @@ __aicore__ inline void KernelUnique::CopyIn(const int32_t progress) { Duplicate(sortedLocal2, FLOAT_INF, TILE_LENGTH); PipeBarrier(); if constexpr (IsSameType::value) { - DataCopyPad(sortedLocal2, - srcGlobal[progress * TILE_LENGTH], - {1, static_cast(sizeof(T) * tailLength), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(sortedLocal2, srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0}); } else if constexpr (sizeof(T) >= sizeof(float)) { - DataCopyPad(srcAsUint, - srcGlobalAsUint[progress * TILE_LENGTH * sizeof(T) / sizeof(uint32_t)], - {1, static_cast(sizeof(T) * tailLength), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(srcAsUint, srcGlobalAsUint[progress * TILE_LENGTH * sizeof(T) / sizeof(uint32_t)], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0}); } else { - DataCopyPad(srcLocal, - srcGlobal[progress * TILE_LENGTH], - {1, static_cast(sizeof(T) * tailLength), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(srcLocal, srcGlobal[progress * TILE_LENGTH], + {1, static_cast(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0}); } castLen = tailLength; } @@ -307,14 +285,15 @@ __aicore__ inline void KernelUnique::CopyIn(const int32_t progress) { calcBuf[2].EnQue(sortedLocal2); } -template -__aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) { +template +__aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) +{ LocalTensor srcLocal = calcBuf[0].DeQue(); LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); LocalTensor sortedLocal2 = calcBuf[2].DeQue(); LocalTensor arithLocal = srcLocal.template ReinterpretCast()[TILE_LENGTH]; - int32_t baseOffset = progress * TILE_LENGTH + this->globalOffset; // calc tileOffset + int32_t baseOffset = progress * TILE_LENGTH + this->globalOffset; // calc tileOffset Duplicate(arithLocal, baseOffset, TILE_LENGTH); PipeBarrier(); @@ -327,9 +306,8 @@ __aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) { while (restLen) { int repTime = min(restLen / sort32BatchSize, sort32RepeatLimit); Sort32(sortedLocal1[sort32BatchSize * sort32RepeatLimit * SORT_DATATYPE_SIZE_FACTOR * instrRepeatTime], - sortedLocal2[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], - uidArray[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], - repTime); + sortedLocal2[sort32BatchSize * sort32RepeatLimit * instrRepeatTime], + uidArray[sort32BatchSize * sort32RepeatLimit * 
instrRepeatTime], repTime); restLen -= repTime * sort32BatchSize; instrRepeatTime++; } @@ -339,27 +317,25 @@ __aicore__ inline void KernelUnique::Elem32Sort(const int32_t progress) { calcBuf[2].EnQue(sortedLocal2); } -template -__aicore__ inline void KernelUnique::TileSort(const int32_t progress) { +template +__aicore__ inline void KernelUnique::TileSort(const int32_t progress) +{ LocalTensor sortedLocal1 = calcBuf[1].DeQue(); LocalTensor sortedLocal2 = calcBuf[2].DeQue(); LocalTensor sortedQue[2] = {sortedLocal1, sortedLocal2}; - uint16_t currentQueLength = 32; // Initial queue length is 32 because data is from Sort32. + uint16_t currentQueLength = 32; // Initial queue length is 32 because data is from Sort32. uint16_t currentQueNum = TILE_LENGTH / currentQueLength; bool switchFlag = false; // Multiple MrgSort until we have one generally sorted tile. while (currentQueLength < TILE_LENGTH) { - const uint16_t elementLengths[4] = { - currentQueLength, currentQueLength, currentQueLength, currentQueLength}; + const uint16_t elementLengths[4] = {currentQueLength, currentQueLength, currentQueLength, currentQueLength}; const uint16_t fullMrgSortTime = currentQueNum / 4; if (fullMrgSortTime > 0) { MrgSort4Info params = {elementLengths, false, 0b1111, fullMrgSortTime}; MrgSort(sortedQue[!switchFlag], - {sortedQue[switchFlag][0], - sortedQue[switchFlag][currentQueLength * 1 * 2], - sortedQue[switchFlag][currentQueLength * 2 * 2], - sortedQue[switchFlag][currentQueLength * 3 * 2]}, - params); + {sortedQue[switchFlag][0], sortedQue[switchFlag][currentQueLength * 1 * 2], + sortedQue[switchFlag][currentQueLength * 2 * 2], sortedQue[switchFlag][currentQueLength * 3 * 2]}, + params); PipeBarrier(); switchFlag = !switchFlag; } @@ -372,22 +348,18 @@ __aicore__ inline void KernelUnique::TileSort(const int32_t progress) { calcBuf[2].FreeTensor(sortedLocal2); } -template -template -__aicore__ inline void KernelUnique::DataCopyGM2GM(const GlobalTensor &dst, - const GlobalTensor &src, - const LocalTensor &tmpLocal, - const int elemLength, - const int bufByteLength) { +template +template +__aicore__ inline void KernelUnique::DataCopyGM2GM(const GlobalTensor& dst, const GlobalTensor& src, + const LocalTensor& tmpLocal, const int elemLength, const int bufByteLength) +{ // Max byte size of DataCopyPad in one repeat is 65535. 
int bufElemLength = min(bufByteLength, 65535) / sizeof(T1); int restLen = elemLength; while (restLen > 0) { int copyLen = min(restLen, bufElemLength); - DataCopyPad(tmpLocal, - src[elemLength - restLen], - {1, static_cast(sizeof(T1) * copyLen), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(tmpLocal, src[elemLength - restLen], {1, static_cast(sizeof(T1) * copyLen), 0, 0}, + {false, 0, 0, 0}); PipeBarrier(); DataCopyPad(dst[elemLength - restLen], tmpLocal, {1, static_cast(sizeof(T1) * copyLen), 0, 0}); PipeBarrier(); @@ -395,10 +367,10 @@ __aicore__ inline void KernelUnique::DataCopyGM2GM(const GlobalTensor &ds } } -template -__aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGlobal, - GMSSrcList &srcList, - GMSParams ¶ms) { +template +__aicore__ inline void KernelUnique::MrgSortGM( + GlobalTensor&& dstGlobal, GMSSrcList& srcList, GMSParams& params) +{ int restLen[4] {params.GMSLengths[0], params.GMSLengths[1], params.GMSLengths[2], params.GMSLengths[3]}; int currentHead[4] {}; int totalMrgLen {}; @@ -415,24 +387,21 @@ __aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGloba } // CopyIn for (int i = 0; i < queNum; i++) { - DataCopyPad(params.buffLocal[i], - srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], - {1, static_cast(sizeof(float) * mrgLen[i] * SORT_DATATYPE_SIZE_FACTOR), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(params.buffLocal[i], srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], + {1, static_cast(sizeof(float) * mrgLen[i] * SORT_DATATYPE_SIZE_FACTOR), 0, 0}, + {false, 0, 0, 0}); } PipeBarrier(); // MrgSort MrgSort4Info localParams {mrgLen, true, VALID_QUE[queNum], 1}; MrgSort(params.buffLocal[4], - {params.buffLocal[0], params.buffLocal[1], params.buffLocal[2], params.buffLocal[3]}, - localParams); + {params.buffLocal[0], params.buffLocal[1], params.buffLocal[2], params.buffLocal[3]}, localParams); PipeBarrier(); GetMrgSortResult(sortedLen[0], sortedLen[1], sortedLen[2], sortedLen[3]); const uint16_t localMrgLen = sortedLen[0] + sortedLen[1] + sortedLen[2] + sortedLen[3]; // CopyOut - DataCopyPad(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], - params.buffLocal[4], - {1, static_cast(sizeof(float) * localMrgLen * SORT_DATATYPE_SIZE_FACTOR), 0, 0}); + DataCopyPad(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4], + {1, static_cast(sizeof(float) * localMrgLen * SORT_DATATYPE_SIZE_FACTOR), 0, 0}); PipeBarrier(); // renew currentHead, restLen totalMrgLen += localMrgLen; @@ -450,7 +419,7 @@ __aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGloba } restLen[3] = 0; queNum--; - break; // because ifExhaustedSuspension == true, there is 0 or 1 empty que. + break; // because ifExhaustedSuspension == true, there is 0 or 1 empty que. 
} } } @@ -458,17 +427,16 @@ __aicore__ inline void KernelUnique::MrgSortGM(GlobalTensor &&dstGloba for (int i = 0; i < params.queNum; i++) { if (restLen[i] > 0) { DataCopyGM2GM(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], - srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], - params.buffLocal[4], - restLen[i] * SORT_DATATYPE_SIZE_FACTOR, - TILE_LEN_BYTE); + srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4], + restLen[i] * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE); break; } } }; -template -__aicore__ inline void KernelUnique::BlockSortV2() { +template +__aicore__ inline void KernelUnique::BlockSortV2() +{ LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); LocalTensor mrgLocal = calcBuf[2].AllocTensor(); @@ -497,27 +465,23 @@ __aicore__ inline void KernelUnique::BlockSortV2() { } lengths[queNum - 1] = TILE_LENGTH * lastQueTileNum; GMSSrcList srcList {srcGlobal}; - GMSParams params {lengths, queNum, {sortedLocal1, sortedLocal1[TILE_LENGTH], - sortedLocal2, sortedLocal2[TILE_LENGTH], - mrgLocal}}; + GMSParams params {lengths, queNum, + {sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}}; MrgSortGM(sortedBlock[!switchFlag][TILE_LEN_ELEM * tileIdx], srcList, params); } switchFlag = !switchFlag; } if (switchFlag) { - DataCopyGM2GM(sortedBlock1, - sortedBlock2, - sortedLocal1, - blockLength * SORT_DATATYPE_SIZE_FACTOR, - TILE_LEN_BYTE); + DataCopyGM2GM(sortedBlock1, sortedBlock2, sortedLocal1, blockLength * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE); } calcBuf[0].FreeTensor(sortedLocal1); calcBuf[1].FreeTensor(sortedLocal2); calcBuf[2].FreeTensor(mrgLocal); } -template -__aicore__ inline void KernelUnique::GlobalSortV2() { +template +__aicore__ inline void KernelUnique::GlobalSortV2() +{ LocalTensor sortedLocal1 = calcBuf[0].AllocTensor(); LocalTensor sortedLocal2 = calcBuf[1].AllocTensor(); LocalTensor mrgLocal = calcBuf[2].AllocTensor(); @@ -531,8 +495,7 @@ __aicore__ inline void KernelUnique::GlobalSortV2() { int lengths[4]; for (int bindBlock = 1; bindBlock < blockNum; bindBlock *= PREFIX_QUE_NUM, eventID++) { for (int blockIdx = 0; blockIdx < blockNum; blockIdx += bindBlock * PREFIX_QUE_NUM) { - if ((GetBlockIdx() == blockIdx + bindBlock) || - (GetBlockIdx() == blockIdx + bindBlock * 2) || + if ((GetBlockIdx() == blockIdx + bindBlock) || (GetBlockIdx() == blockIdx + bindBlock * 2) || (GetBlockIdx() == blockIdx + bindBlock * 3)) { PipeBarrier(); IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID); @@ -552,22 +515,20 @@ __aicore__ inline void KernelUnique::GlobalSortV2() { } // Init GMSSrcList, GMSParams for (int i = 0; i < queNum; i++) { - srcGlobal[i] = sortedGlobal[switchFlag][GetGlobalOffset(blockIdx + bindBlock * i) * - SORT_DATATYPE_SIZE_FACTOR]; + srcGlobal[i] = + sortedGlobal[switchFlag][GetGlobalOffset(blockIdx + bindBlock * i) * SORT_DATATYPE_SIZE_FACTOR]; } for (int i = 0; i < queNum - 1; i++) { - lengths[i] = GetGlobalOffset(blockIdx + (bindBlock * (i + 1))) - - GetGlobalOffset(blockIdx + (bindBlock * i)); + lengths[i] = + GetGlobalOffset(blockIdx + (bindBlock * (i + 1))) - GetGlobalOffset(blockIdx + (bindBlock * i)); } - lengths[queNum - 1] = GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)) + lastQueBlockNum) - + lengths[queNum - 1] = GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)) + lastQueBlockNum) - GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1))); GMSSrcList srcList {srcGlobal}; - GMSParams params {lengths, queNum, 
{sortedLocal1, sortedLocal1[TILE_LENGTH], - sortedLocal2, sortedLocal2[TILE_LENGTH], - mrgLocal}}; - MrgSortGM(sortedGlobal[!switchFlag][GetGlobalOffset(blockIdx) * SORT_DATATYPE_SIZE_FACTOR], - srcList, - params); + GMSParams params {lengths, queNum, + {sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}}; + MrgSortGM( + sortedGlobal[!switchFlag][GetGlobalOffset(blockIdx) * SORT_DATATYPE_SIZE_FACTOR], srcList, params); } } switchFlag = !switchFlag; @@ -591,13 +552,11 @@ __aicore__ inline void KernelUnique::GlobalSortV2() { calcBuf[2].FreeTensor(mrgLocal); } -template -__aicore__ inline void KernelUnique::ConsecutiveUnique(const LocalTensor &dstVal, - const LocalTensor &srcLocal, - const LocalTensor &shiftedLocal, - const LocalTensor &bitMask32, - const uint16_t elemLength, - uint64_t &tileUniqueCnt) { +template +__aicore__ inline void KernelUnique::ConsecutiveUnique(const LocalTensor& dstVal, + const LocalTensor& srcLocal, const LocalTensor& shiftedLocal, const LocalTensor& bitMask32, + const uint16_t elemLength, uint64_t& tileUniqueCnt) +{ LocalTensor bitMask16 = bitMask32.ReinterpretCast(); uint64_t rsvdCnt = 0; // Seperate Val and Idx. @@ -624,8 +583,9 @@ __aicore__ inline void KernelUnique::ConsecutiveUnique(const LocalTensor(); } -template -__aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { +template +__aicore__ inline void KernelUnique::TileUnique(const int32_t progress) +{ LocalTensor bitMask32 = calcBuf[0].AllocTensor(); LocalTensor shiftedLocal = bitMask32[TILE_LENGTH].ReinterpretCast(); LocalTensor sortedLocal1 = calcBuf[1].AllocTensor(); @@ -649,9 +609,8 @@ __aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { if (progress != 0 && lastTileUniqueVal == sortedLocal2.GetValue(0)) { accUniqueCnt--; } - DataCopyPad(sortedBlock1[accUniqueCnt - tileUniqueCnt], - sortedLocal2, - {1, static_cast(sizeof(float) * tileUniqueCnt), 0, 0}); + DataCopyPad(sortedBlock1[accUniqueCnt - tileUniqueCnt], sortedLocal2, + {1, static_cast(sizeof(float) * tileUniqueCnt), 0, 0}); PipeBarrier(); lastTileUniqueVal = sortedLocal2.GetValue(tileUniqueCnt - 1); } @@ -659,9 +618,8 @@ __aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { // upload uniqueCnt. if (progress == tileNum - 1) { uniqueCntLocal.SetValue(0, accUniqueCnt); - DataCopyPad(blockUniqueCntGlobal[GetBlockIdx()], - uniqueCntLocal, - {1, static_cast(sizeof(uint32_t) * 1), 0, 0}); + DataCopyPad(blockUniqueCntGlobal[GetBlockIdx()], uniqueCntLocal, + {1, static_cast(sizeof(uint32_t) * 1), 0, 0}); PipeBarrier(); } calcBuf[0].FreeTensor(shiftedLocal); @@ -669,8 +627,9 @@ __aicore__ inline void KernelUnique::TileUnique(const int32_t progress) { calcBuf[2].FreeTensor(sortedLocal2); } -template -__aicore__ inline void KernelUnique::CopyOut() { +template +__aicore__ inline void KernelUnique::CopyOut() +{ LocalTensor copyLocal0 = calcBuf[0].AllocTensor(); LocalTensor copyLocal1 = calcBuf[1].AllocTensor(); LocalTensor IBSyncLocal = copyLocal1.ReinterpretCast(); @@ -692,7 +651,7 @@ __aicore__ inline void KernelUnique::CopyOut() { uint64_t restLen = thisUniqueCnt; // max(Ta a, Tb b) function does not support compilation period calc. - constexpr uint64_t bottleneckTypeSize = sizeof(T) > sizeof(float)? sizeof(T): sizeof(float); + constexpr uint64_t bottleneckTypeSize = sizeof(T) > sizeof(float) ? 
sizeof(T) : sizeof(float); LocalTensor copyVal32 = copyLocal0.template ReinterpretCast(); LocalTensor uniqueVal32 = copyLocal1.ReinterpretCast(); // Copy unique values (and counts) from Workspace to dst. @@ -702,20 +661,16 @@ __aicore__ inline void KernelUnique::CopyOut() { uint64_t copyLen = min(restLen, TILE_LEN_BYTE / bottleneckTypeSize); copyLen = min(copyLen, 65535 / bottleneckTypeSize); if constexpr (!IsSameType::value) { - DataCopyPad(copyLocal1, - sortedBlock1[thisUniqueCnt - restLen], - {1, static_cast(sizeof(float) * copyLen), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(copyLocal1, sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0}); PipeBarrier(); Muls(copyLocal1, copyLocal1, (float)-1, copyLen); PipeBarrier(); Cast(copyLocal0, copyLocal1, RoundMode::CAST_RINT, copyLen); PipeBarrier(); } else { - DataCopyPad(copyLocal0, - sortedBlock1[thisUniqueCnt - restLen], - {1, static_cast(sizeof(float) * copyLen), 0, 0}, - {false, 0, 0, 0}); + DataCopyPad(copyLocal0, sortedBlock1[thisUniqueCnt - restLen], + {1, static_cast(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0}); PipeBarrier(); Muls(copyLocal0, copyLocal0, (float)-1, copyLen); PipeBarrier(); @@ -723,12 +678,10 @@ __aicore__ inline void KernelUnique::CopyOut() { // DataCopyPad does not support int64_t. Copy them as uint32_t. if constexpr (sizeof(T) > 4) { DataCopyPad(dstGlobal1As32[(lastAccUniqueCnt + thisUniqueCnt - restLen) * sizeof(T) / sizeof(uint32_t)], - copyVal32, - {1, static_cast(sizeof(T) * copyLen), 0, 0}); + copyVal32, {1, static_cast(sizeof(T) * copyLen), 0, 0}); } else { - DataCopyPad(dstGlobal1[lastAccUniqueCnt + thisUniqueCnt - restLen], - copyLocal0, - {1, static_cast(sizeof(T) * copyLen), 0, 0}); + DataCopyPad(dstGlobal1[lastAccUniqueCnt + thisUniqueCnt - restLen], copyLocal0, + {1, static_cast(sizeof(T) * copyLen), 0, 0}); } PipeBarrier(); restLen -= copyLen; @@ -742,4 +695,4 @@ __aicore__ inline void KernelUnique::CopyOut() { calcBuf[0].FreeTensor(copyLocal0); calcBuf[1].FreeTensor(copyLocal1); } -} // namespace AscendC +} // namespace AscendC diff --git a/mx_driving/ops/npu_unique.py b/mx_driving/ops/npu_unique.py index 7ccde38a..1fd367b3 100644 --- a/mx_driving/ops/npu_unique.py +++ b/mx_driving/ops/npu_unique.py @@ -9,11 +9,13 @@ Modification 1. Add support for Ascend NPU from torch.autograd import Function import mx_driving._C + class UniqueFunction(Function): @staticmethod # 'pylint: disable=too-many-arguments,huawei-too-many-arguments - def forward(ctx, input): - y = mx_driving._C.npu_unique(input) + def forward(ctx, input_tensor): + y = mx_driving._C.npu_unique(input_tensor) return y + npu_unique = UniqueFunction.apply diff --git a/tests/torch/test_npu_unique.py b/tests/torch/test_npu_unique.py index 8cc8f81c..d9144752 100644 --- a/tests/torch/test_npu_unique.py +++ b/tests/torch/test_npu_unique.py @@ -1,24 +1,25 @@ """ Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. 
""" +import random +import os import torch import torch_npu from data_cache import golden_data_cache from torch_npu.testing.testcase import TestCase, run_tests - from mx_driving import npu_unique -import random -import os def gen_inputs(input_shape, dtype): input_tensor = torch.randint(-256, 256, input_shape, dtype=dtype) return input_tensor + def gen_cpu_outputs(input_tensor): cpu_result = torch.unique(input_tensor) return cpu_result + def gen_npu_outputs(input_tensor): npu_result = npu_unique(input_tensor.npu()) return npu_result.cpu() -- Gitee From ee0b2abcecb4a0b1b5770b92633c3a8474a32203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Mon, 24 Mar 2025 16:44:02 +0800 Subject: [PATCH 6/7] cleancode --- mx_driving/csrc/Unique.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mx_driving/csrc/Unique.cpp b/mx_driving/csrc/Unique.cpp index ee7aa6a7..b99f9ed0 100644 --- a/mx_driving/csrc/Unique.cpp +++ b/mx_driving/csrc/Unique.cpp @@ -18,7 +18,8 @@ #include "csrc/functions.h" -at::Tensor npu_unique(const at::Tensor& input) { +at::Tensor npu_unique(const at::Tensor& input) +{ if (input.numel() < 2) { at::Tensor output = at::Tensor(input).clone(); return output; -- Gitee From aa4cc88824ebf046111802474fe47b57c9aff9b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B1=E6=AD=A3=E9=98=B3?= Date: Mon, 24 Mar 2025 16:44:02 +0800 Subject: [PATCH 7/7] cleancode --- kernels/op_host/unique.cpp | 11 ++++++++++- mx_driving/csrc/Unique.cpp | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/kernels/op_host/unique.cpp b/kernels/op_host/unique.cpp index f37169d9..40e33c95 100644 --- a/kernels/op_host/unique.cpp +++ b/kernels/op_host/unique.cpp @@ -11,11 +11,17 @@ constexpr size_t SYS_RSVD_WS_SIZE = 16 * 1024 * 1024; namespace optiling { static ge::graphStatus UniqueTilingFunc(gert::TilingContext* context) { + if (!context) { + return ge::GRAPH_FAILED; + } UniqueTilingData tiling; constexpr uint16_t tileLength = 8192; - const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); const gert::StorageShape* inputShape = context->GetInputShape(0); + if (!inputShape) { + return ge::GRAPH_FAILED; + } + const uint8_t dimNum = context->GetInputShape(0)->GetStorageShape().GetDimNum(); uint32_t totalLength = 1; for (int i = 0; i < dimNum; i++) { totalLength *= inputShape->GetStorageShape().GetDim(i); @@ -57,6 +63,9 @@ static ge::graphStatus UniqueInferShape(gert::InferShapeContext* context) { const gert::Shape* x1_shape = context->GetInputShape(0); gert::Shape* y_shape = context->GetOutputShape(0); + if (!x1_shape || !y_shape) { + return GRAPH_FAILED; + } *y_shape = *x1_shape; return GRAPH_SUCCESS; } diff --git a/mx_driving/csrc/Unique.cpp b/mx_driving/csrc/Unique.cpp index ee7aa6a7..b99f9ed0 100644 --- a/mx_driving/csrc/Unique.cpp +++ b/mx_driving/csrc/Unique.cpp @@ -18,7 +18,8 @@ #include "csrc/functions.h" -at::Tensor npu_unique(const at::Tensor& input) { +at::Tensor npu_unique(const at::Tensor& input) +{ if (input.numel() < 2) { at::Tensor output = at::Tensor(input).clone(); return output; -- Gitee