diff --git a/kernels/op_host/deformable_conv2d.cpp b/kernels/op_host/deformable_conv2d.cpp index d4facf60a3b0dfaadbbcb969d6fd7c6e59bfe626..9e3d90630267f9c2028839d8bb6a820fdfef53a5 100644 --- a/kernels/op_host/deformable_conv2d.cpp +++ b/kernels/op_host/deformable_conv2d.cpp @@ -4,6 +4,12 @@ #include "tiling/platform/platform_ascendc.h" using namespace matmul_tiling; + +namespace { +constexpr uint32_t SIZE_OF_FP16 = 2; +constexpr uint32_t SIZE_OF_FP32 = 4; +} // namespace + namespace optiling { static ge::graphStatus TilingForDeformableConv2d(gert::TilingContext* context) { @@ -64,12 +70,30 @@ static ge::graphStatus TilingForDeformableConv2d(gert::TilingContext* context) context->SetTilingKey(*modulatedPtr); + auto inputInf = context->GetInputDesc(0); + if (inputInf == nullptr) { + return ge::GRAPH_FAILED; + } + auto dtype = inputInf->GetDataType(); + DeformableConv2dTilingData tilingData; matmul_tiling::MatmulApiTiling mmTiling(ascendPlatformInfo); - mmTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - mmTiling.SetBType( - matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT, true); - mmTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + if (dtype == ge::DT_FLOAT) { + mmTiling.SetAType( + matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + mmTiling.SetBType( + matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT, true); + mmTiling.SetCType( + matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + } else { + mmTiling.SetAType( + matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + mmTiling.SetBType( + matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16, true); + mmTiling.SetCType( + matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + } + mmTiling.SetShape(cOut / groups, wOut, kH * kW * cIn / groups); mmTiling.SetOrgShape(cOut / groups, wOut, kH * kW * cIn / groups); mmTiling.SetBias(false); @@ -99,7 +123,8 @@ static ge::graphStatus TilingForDeformableConv2d(gert::TilingContext* context) ADD_TILING_DATA(context, tilingData); size_t systemWorkspaceSize = ascendPlatformInfo.GetLibApiWorkSpaceSize(); - size_t auxSize = 2 * kH * kW * wOut * sizeof(float); + size_t auxSize = + (dtype == ge::DT_FLOAT) ? (2 * kH * kW * wOut * SIZE_OF_FP32) : (2 * kH * kW * wOut * SIZE_OF_FP16); size_t* currentWorkspace = context->GetWorkspaceSizes(1); CHECK_NULLPTR(currentWorkspace); currentWorkspace[0] = systemWorkspaceSize + auxSize; @@ -153,33 +178,33 @@ public: { this->Input("x") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}) .AutoContiguous(); this->Input("weight") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}) .AutoContiguous(); this->Input("bias") .ParamType(OPTIONAL) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}) .AutoContiguous(); this->Input("offset") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}) .AutoContiguous(); this->Input("mask") .ParamType(OPTIONAL) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}) + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}) .AutoContiguous(); this->Attr("kernel_size").ListInt(); @@ -193,14 +218,14 @@ public: this->Output("y") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->Output("offset_output") .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); + .DataType({ge::DT_FLOAT, ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND, ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND}); this->SetInferShape(ge::InferShapeForDeformableConv2d).SetInferDataType(ge::InferDataTypeForDeformableConv2d); this->AICore().SetTiling(optiling::TilingForDeformableConv2d); diff --git a/kernels/op_kernel/deformable_conv2d.cpp b/kernels/op_kernel/deformable_conv2d.cpp index 3f0f06105e75e24521fa870af26ad78302474a78..f7199b2e13cb06d45542daacf0b63154dbee3f84 100644 --- a/kernels/op_kernel/deformable_conv2d.cpp +++ b/kernels/op_kernel/deformable_conv2d.cpp @@ -3,17 +3,28 @@ using namespace AscendC; +constexpr uint32_t byteBlock = 32; +constexpr uint32_t byteSingleRepeat = 256; +constexpr uint32_t doubleBuffer = 2; +constexpr uint32_t fourTimesCin = 4; // one pixel from 4 weighted cin +constexpr uint32_t fourTimesOffsetSpace = 4; // (x,y,x0,y0) +constexpr uint32_t xySpace = 2; // (x,y) constexpr MatmulConfig DEFORMABLE_CONV2D_CFG = GetNormalConfig(); - -template + +template class DeformableConv2dKernel { public: - using AType = matmul::MatmulType; - using BType = matmul::MatmulType; - using CType = matmul::MatmulType; + using AType = matmul::MatmulType; + using BType = matmul::MatmulType; + using CType = matmul::MatmulType; matmul::Matmul mm_; + const uint32_t DATA_SIZE = sizeof(T); + const uint32_t INPUT_DATA_NUM_PER_REPEAT = byteSingleRepeat / DATA_SIZE; + const uint32_t INPUT_BYTE_SIZE = DATA_SIZE; + const uint32_t INPUT_DATA_NUM_PER_BLOCK = byteBlock / DATA_SIZE; + __aicore__ inline DeformableConv2dKernel() = default; __aicore__ inline void Init(GM_ADDR x, GM_ADDR weight, GM_ADDR bias, GM_ADDR offset, GM_ADDR mask, GM_ADDR y, @@ -27,8 +38,7 @@ public: InitGM(x, weight, bias, offset, mask, y, offsetOutput, workspace); InitBuffer(); InitEvent(); - - SetVectorMask(FULL_MASK, FULL_MASK); + SetVectorMask(FULL_MASK, FULL_MASK); SetAtomicNone(); } @@ -36,9 +46,9 @@ public: protected: TPipe* pipe_; - GlobalTensor xGm_, weightGm_, offsetGm_, biasGm_, maskGm_; - GlobalTensor yGm_, offsetOutputGm_; - GlobalTensor auxHGm_, auxWGm_; + GlobalTensor xGm_, weightGm_, offsetGm_, biasGm_, maskGm_; + GlobalTensor yGm_, offsetOutputGm_; + GlobalTensor auxHGm_, auxWGm_; TBuf auxHBuf_, auxWBuf_; TBuf offsetBuf_, offsetIntBuf_, weightBuf_, maskBuf_, featureBuf_, offsetOutputBuf_; @@ -68,16 +78,15 @@ private: __aicore__ inline void ProcessVector(uint32_t taskIdx); - __aicore__ inline void CopyInOffset( - uint32_t taskIdx, const LocalTensor& offset, const LocalTensor& mask); + __aicore__ inline void CopyInOffset(uint32_t taskIdx, const LocalTensor& offset, const LocalTensor& mask); - __aicore__ inline void ComputeWeight(uint32_t taskIdx, const LocalTensor& auxW, - const LocalTensor& auxH, const LocalTensor& offset, const LocalTensor& offsetInt, - const LocalTensor& weight, const LocalTensor& mask); + __aicore__ inline void ComputeWeight(uint32_t taskIdx, const LocalTensor& auxW, const LocalTensor& auxH, + const LocalTensor& offset, const LocalTensor& offsetInt, const LocalTensor& weight, + const LocalTensor& mask); - __aicore__ inline void ComputeBilinearInterpolation(uint32_t w, const LocalTensor& offset, - const LocalTensor& offsetInt, const LocalTensor& feature, const LocalTensor& weight, - const LocalTensor& offsetOutput); + __aicore__ inline void ComputeBilinearInterpolation(uint32_t w, const LocalTensor& offset, + const LocalTensor& offsetInt, const LocalTensor& feature, const LocalTensor& weight, + const LocalTensor& offsetOutput); __aicore__ inline void InitTiling(const DeformableConv2dTilingData* tilingData) { @@ -99,7 +108,7 @@ private: dilationW_ = tilingData->dilationW; groups_ = tilingData->groups; usedBlkNum_ = tilingData->usedBlkNum; - featureOffset_ = 4 * cIn_; + featureOffset_ = fourTimesCin * cIn_; rowOut_ = wOut_ * cOut_; rowOutPerGroup_ = rowOut_ / groups_; kwIn_ = kernelSize_ * cIn_; @@ -109,10 +118,10 @@ private: cInPerGroup_ = cIn_ / groups_; kernelPerGroup_ = cOut_ / groups_ * kwInPerGroup_; rowOffset_ = wOut_ * kernelSize_; - alignedRowOffset_ = AlignUp(rowOffset_, B32_DATA_NUM_PER_REPEAT); - rowOffsetBlk_ = Ceil(rowOffset_, B32_DATA_NUM_PER_BLOCK); - doubleRowOffsetBlk_ = Ceil(2 * rowOffset_, B32_DATA_NUM_PER_BLOCK); - cInBlk_ = Ceil(cIn_, B32_DATA_NUM_PER_BLOCK); + alignedRowOffset_ = AlignUp(rowOffset_, INPUT_DATA_NUM_PER_REPEAT); + rowOffsetBlk_ = Ceil(rowOffset_, INPUT_DATA_NUM_PER_BLOCK); + doubleRowOffsetBlk_ = Ceil(xySpace * rowOffset_, INPUT_DATA_NUM_PER_BLOCK); + cInBlk_ = Ceil(cIn_, INPUT_DATA_NUM_PER_BLOCK); cpOneValParams_.blockLen = cInBlk_; cpRowDoubleValParams_.blockLen = 2 * cInBlk_; @@ -124,8 +133,8 @@ private: cpOffsetOutParams_.blockCount = kernelSize_; cpOffsetOutParams_.blockLen = cInBlk_ / groups_; cpOffsetOutParams_.srcStride = cInBlk_ - cInBlk_ / groups_; - rptTimes_ = alignedRowOffset_ / B32_DATA_NUM_PER_REPEAT; - valRptTimes_ = cIn_ / B32_DATA_NUM_PER_REPEAT; + rptTimes_ = alignedRowOffset_ / INPUT_DATA_NUM_PER_REPEAT; + valRptTimes_ = cIn_ / INPUT_DATA_NUM_PER_REPEAT; gatherParams_.repeatTimes = rptTimes_ * 2; } @@ -151,65 +160,65 @@ private: __aicore__ inline void InitBuffer() { - pipe_->InitBuffer(auxHBuf_, alignedRowOffset_ * B32_BYTE_SIZE); // 9 * 100 - pipe_->InitBuffer(auxWBuf_, alignedRowOffset_ * B32_BYTE_SIZE); - pipe_->InitBuffer(offsetBuf_, 4 * alignedRowOffset_ * B32_BYTE_SIZE); - pipe_->InitBuffer(offsetIntBuf_, 2 * alignedRowOffset_ * B32_BYTE_SIZE); - pipe_->InitBuffer(weightBuf_, 4 * alignedRowOffset_ * B32_BYTE_SIZE); + pipe_->InitBuffer(auxHBuf_, alignedRowOffset_ * INPUT_BYTE_SIZE); // 9 * 100 + pipe_->InitBuffer(auxWBuf_, alignedRowOffset_ * INPUT_BYTE_SIZE); + pipe_->InitBuffer(offsetBuf_, fourTimesOffsetSpace * alignedRowOffset_ * INPUT_BYTE_SIZE); + pipe_->InitBuffer(offsetIntBuf_, xySpace * alignedRowOffset_ * sizeof(int32_t)); + pipe_->InitBuffer(weightBuf_, fourTimesOffsetSpace * alignedRowOffset_ * INPUT_BYTE_SIZE); if (modulated) { - pipe_->InitBuffer(maskBuf_, alignedRowOffset_ * B32_BYTE_SIZE); + pipe_->InitBuffer(maskBuf_, alignedRowOffset_ * INPUT_BYTE_SIZE); } - pipe_->InitBuffer(offsetOutputBuf_, 2 * kwIn_ * B32_BYTE_SIZE); // 2 for double buffer - pipe_->InitBuffer(featureBuf_, 2 * 4 * cIn_ * B32_BYTE_SIZE); + pipe_->InitBuffer(offsetOutputBuf_, doubleBuffer * kwIn_ * INPUT_BYTE_SIZE); // 2 for double buffer + pipe_->InitBuffer(featureBuf_, doubleBuffer * fourTimesCin * cIn_ * INPUT_BYTE_SIZE); } __aicore__ inline void InitGM(GM_ADDR x, GM_ADDR weight, GM_ADDR bias, GM_ADDR offset, GM_ADDR mask, GM_ADDR y, GM_ADDR offsetOutput, GM_ADDR workspace) { - xGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(x)); - weightGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(weight)); - biasGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(bias)); - offsetGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(offset)); - yGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(y)); - offsetOutputGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(offsetOutput)); - - auxHGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(workspace)); - auxWGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(workspace) + rowOffset_); + xGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(x)); + weightGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(weight)); + biasGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(bias)); + offsetGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(offset)); + yGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(y)); + offsetOutputGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(offsetOutput)); + + auxHGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(workspace)); + auxWGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(workspace) + rowOffset_); if (modulated) { - maskGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(mask)); + maskGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(mask)); } } }; -template -__aicore__ inline void DeformableConv2dKernel::PreProcess() +template +__aicore__ inline void DeformableConv2dKernel::PreProcess() { - LocalTensor auxH = auxHBuf_.Get(); - LocalTensor auxW = auxWBuf_.Get(); + LocalTensor auxH = auxHBuf_.Get(); + LocalTensor auxW = auxWBuf_.Get(); uint32_t idx = 0; for (int32_t w = auxStart_; w < auxEnd_; ++w) { for (int32_t i = 0; i < kH_; ++i) { for (int32_t j = 0; j < kW_; ++j) { - auxW.SetValue(idx, static_cast(w * strideW_ - padW_ + j * dilationW_)); - auxH.SetValue(idx, static_cast(-padH_ + i * dilationH_)); + auxW.SetValue(idx, static_cast(w * strideW_ - padW_ + j * dilationW_)); + auxH.SetValue(idx, static_cast(-padH_ + i * dilationH_)); ++idx; } } } DataCopyPad(auxWGm_[auxStart_ * kernelSize_], auxW, - {1, static_cast(B32_BYTE_SIZE * (auxEnd_ - auxStart_) * kernelSize_), 0, 0}); + {1, static_cast(INPUT_BYTE_SIZE * (auxEnd_ - auxStart_) * kernelSize_), 0, 0}); DataCopyPad(auxHGm_[auxStart_ * kernelSize_], auxH, - {1, static_cast(B32_BYTE_SIZE * (auxEnd_ - auxStart_) * kernelSize_), 0, 0}); + {1, static_cast(INPUT_BYTE_SIZE * (auxEnd_ - auxStart_) * kernelSize_), 0, 0}); SyncAll(); DataCopy(auxW, auxWGm_, {1, rowOffsetBlk_, 0, 0}); DataCopy(auxH, auxHGm_, {1, rowOffsetBlk_, 0, 0}); - LocalTensor feature = featureBuf_.Get(); - Duplicate(feature, 0.f, MASK_PLACEHOLDER, 4 * valRptTimes_, 1, 8); + LocalTensor feature = featureBuf_.Get(); + Duplicate(feature, 0.f, MASK_PLACEHOLDER, 4 * valRptTimes_, 1, 8); } -template -__aicore__ inline void DeformableConv2dKernel::ProcessCube(uint32_t taskIdx) +template +__aicore__ inline void DeformableConv2dKernel::ProcessCube(uint32_t taskIdx) { uint64_t aOffset = 0; uint64_t bOffset = taskIdx * rowIn_; @@ -223,23 +232,23 @@ __aicore__ inline void DeformableConv2dKernel::ProcessCube(uint32_t t cOffset += rowOutPerGroup_; } } -template -__aicore__ inline void DeformableConv2dKernel::ProcessVector(uint32_t taskIdx) +template +__aicore__ inline void DeformableConv2dKernel::ProcessVector(uint32_t taskIdx) { uint32_t batch = taskIdx / hOut_; srcOffset_ = batch * hIn_ * wIn_ * cIn_; dstOffset_ = taskIdx * rowIn_; - LocalTensor offset = offsetBuf_.Get(); - LocalTensor auxW = auxWBuf_.Get(); - LocalTensor auxH = auxHBuf_.Get(); + LocalTensor offset = offsetBuf_.Get(); + LocalTensor auxW = auxWBuf_.Get(); + LocalTensor auxH = auxHBuf_.Get(); LocalTensor offsetInt = offsetIntBuf_.Get(); - LocalTensor weight = weightBuf_.Get(); - LocalTensor feature = featureBuf_.Get(); - LocalTensor mask; + LocalTensor weight = weightBuf_.Get(); + LocalTensor feature = featureBuf_.Get(); + LocalTensor mask; if (modulated) { - mask = maskBuf_.Get(); + mask = maskBuf_.Get(); } - LocalTensor offsetOutput = offsetOutputBuf_.Get(); + LocalTensor offsetOutput = offsetOutputBuf_.Get(); CopyInOffset(taskIdx, offset, mask); ComputeWeight(taskIdx, auxW, auxH, offset, offsetInt, weight, mask); @@ -260,9 +269,9 @@ __aicore__ inline void DeformableConv2dKernel::ProcessVector(uint32_t WaitFlag(1); } -template -__aicore__ inline void DeformableConv2dKernel::CopyInOffset( - uint32_t taskIdx, const LocalTensor& offset, const LocalTensor& mask) +template +__aicore__ inline void DeformableConv2dKernel::CopyInOffset( + uint32_t taskIdx, const LocalTensor& offset, const LocalTensor& mask) { uint32_t offsetIdx = taskIdx * rowOffset_ * 2; DataCopy(offset, offsetGm_[offsetIdx], {1, doubleRowOffsetBlk_, 0, 0}); @@ -274,55 +283,55 @@ __aicore__ inline void DeformableConv2dKernel::CopyInOffset( uint64_t cnt; GatherMask(offset[2 * alignedRowOffset_], offset, 2, false, MASK_PLACEHOLDER, gatherParams_, cnt); GatherMask(offset[3 * alignedRowOffset_], offset, 1, false, MASK_PLACEHOLDER, gatherParams_, cnt); - SetVectorMask(FULL_MASK, FULL_MASK); + SetVectorMask(FULL_MASK, FULL_MASK); } -template -__aicore__ inline void DeformableConv2dKernel::ComputeWeight(uint32_t taskIdx, - const LocalTensor& auxW, const LocalTensor& auxH, const LocalTensor& offset, - const LocalTensor& offsetInt, const LocalTensor& weight, const LocalTensor& mask) +template +__aicore__ inline void DeformableConv2dKernel::ComputeWeight(uint32_t taskIdx, const LocalTensor& auxW, + const LocalTensor& auxH, const LocalTensor& offset, const LocalTensor& offsetInt, + const LocalTensor& weight, const LocalTensor& mask) { int32_t h = taskIdx % hOut_; - Copy(offset, auxW, MASK_PLACEHOLDER, rptTimes_, {1, 1, 8, 8}); - Adds(offset[alignedRowOffset_], auxH, float(h * strideH_), MASK_PLACEHOLDER, rptTimes_, {1, 1, 8, 8}); - Add( - offset, offset, offset[2 * alignedRowOffset_], MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 1, 8, 8, 8}); - - Cast( - offsetInt, offset, RoundMode::CAST_FLOOR, MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 8, 8}); - Cast( - offset[2 * alignedRowOffset_], offsetInt, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 8, 8}); - Sub( + Copy(offset, auxW, MASK_PLACEHOLDER, rptTimes_, {1, 1, 8, 8}); + Adds(offset[alignedRowOffset_], auxH, T(h * strideH_), MASK_PLACEHOLDER, rptTimes_, {1, 1, 8, 8}); + Add(offset, offset, offset[2 * alignedRowOffset_], MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 1, 8, 8, 8}); + + Cast(offsetInt, offset, RoundMode::CAST_FLOOR, 2 * alignedRowOffset_); + half scale = 1.0; + AscendC::SetDeqScale(scale); + Cast(offset[2 * alignedRowOffset_], offsetInt, RoundMode::CAST_NONE, 2 * alignedRowOffset_); + + Sub( offset, offset, offset[2 * alignedRowOffset_], MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 1, 8, 8, 8}); // lw, lh - Duplicate(weight, 1.f, MASK_PLACEHOLDER, 2 * rptTimes_, 1, 8); - Sub( + Duplicate(weight, 1.f, MASK_PLACEHOLDER, 2 * rptTimes_, 1, 8); + Sub( offset[2 * alignedRowOffset_], weight, offset, MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 1, 8, 8, 8}); // hw, hh - Mul(weight, offset[2 * alignedRowOffset_], offset[3 * alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_, + Mul(weight, offset[2 * alignedRowOffset_], offset[3 * alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8}); // hw * hh - Mul(weight[alignedRowOffset_], offset, offset[3 * alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_, + Mul(weight[alignedRowOffset_], offset, offset[3 * alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8}); // lw * hh - Mul(weight[2 * alignedRowOffset_], offset[alignedRowOffset_], offset[2 * alignedRowOffset_], + Mul(weight[2 * alignedRowOffset_], offset[alignedRowOffset_], offset[2 * alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8}); // hw * lh - Mul(weight[3 * alignedRowOffset_], offset, offset[alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_, + Mul(weight[3 * alignedRowOffset_], offset, offset[alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8}); // lh * lw if (modulated) { - Mul(weight, weight, mask, MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8}); - Mul(weight[alignedRowOffset_], weight[alignedRowOffset_], mask, MASK_PLACEHOLDER, rptTimes_, + Mul(weight, weight, mask, MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8}); + Mul(weight[alignedRowOffset_], weight[alignedRowOffset_], mask, MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8}); // lw * hh - Mul(weight[2 * alignedRowOffset_], weight[2 * alignedRowOffset_], mask, MASK_PLACEHOLDER, - rptTimes_, {1, 1, 1, 8, 8, 8}); // hw * lh - Mul(weight[3 * alignedRowOffset_], weight[3 * alignedRowOffset_], mask, MASK_PLACEHOLDER, - rptTimes_, {1, 1, 1, 8, 8, 8}); // lh * lw + Mul(weight[2 * alignedRowOffset_], weight[2 * alignedRowOffset_], mask, MASK_PLACEHOLDER, rptTimes_, + {1, 1, 1, 8, 8, 8}); // hw * lh + Mul(weight[3 * alignedRowOffset_], weight[3 * alignedRowOffset_], mask, MASK_PLACEHOLDER, rptTimes_, + {1, 1, 1, 8, 8, 8}); // lh * lw } } -template -__aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpolation(uint32_t w, - const LocalTensor& offset, const LocalTensor& offsetInt, const LocalTensor& feature, - const LocalTensor& weight, const LocalTensor& offsetOutput) +template +__aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpolation(uint32_t w, + const LocalTensor& offset, const LocalTensor& offsetInt, const LocalTensor& feature, + const LocalTensor& weight, const LocalTensor& offsetOutput) { - Duplicate(offsetOutput, 0.f, MASK_PLACEHOLDER, kernelSize_ * valRptTimes_, 1, 8); + Duplicate(offsetOutput, 0.f, MASK_PLACEHOLDER, kernelSize_ * valRptTimes_, 1, 8); uint8_t ping = 0; uint32_t kernelOffset = w * kernelSize_; SetFlag(0); @@ -346,16 +355,16 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo SetFlag(copyEvt_); WaitFlag(copyEvt_); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), - MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); + Axpy(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), MASK_PLACEHOLDER, + valRptTimes_, {1, 1, 8, 8}); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph), + Axpy(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_], + Axpy(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_], weight.GetValue(pw + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_], + Axpy(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_], weight.GetValue(ph + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); } else if (w1 == 0) { uint64_t gmOffset = srcOffset_ + (h0 * wIn_) * cIn_; @@ -363,10 +372,10 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo SetFlag(copyEvt_); WaitFlag(copyEvt_); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph), + Axpy(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_], + Axpy(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_], weight.GetValue(ph + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); } else if (w1 == wIn_) { uint64_t gmOffset = srcOffset_ + (h0 * wIn_ + w0) * cIn_; @@ -374,10 +383,10 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo SetFlag(copyEvt_); WaitFlag(copyEvt_); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), - MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); + Axpy(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), MASK_PLACEHOLDER, + valRptTimes_, {1, 1, 8, 8}); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_], + Axpy(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_], weight.GetValue(pw + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); } } else if (h1 == 0) { @@ -387,10 +396,10 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo SetFlag(copyEvt_); WaitFlag(copyEvt_); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_], + Axpy(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_], weight.GetValue(pw + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_], + Axpy(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_], weight.GetValue(ph + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); } else if (w1 == 0) { uint64_t gmOffset = srcOffset_; @@ -398,7 +407,7 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo SetFlag(copyEvt_); WaitFlag(copyEvt_); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_], + Axpy(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_], weight.GetValue(ph + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); } else if (w1 == wIn_) { uint64_t gmOffset = srcOffset_ + w0 * cIn_; @@ -406,7 +415,7 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo SetFlag(copyEvt_); WaitFlag(copyEvt_); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_], + Axpy(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_], weight.GetValue(pw + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); } } else if (h1 == hIn_) { @@ -416,10 +425,10 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo SetFlag(copyEvt_); WaitFlag(copyEvt_); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), - MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); + Axpy(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), MASK_PLACEHOLDER, + valRptTimes_, {1, 1, 8, 8}); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph), + Axpy(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); } else if (w1 == 0) { uint64_t gmOffset = srcOffset_ + (h0 * wIn_) * cIn_; @@ -427,7 +436,7 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo SetFlag(copyEvt_); WaitFlag(copyEvt_); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph), + Axpy(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); } else if (w1 == wIn_) { uint64_t gmOffset = srcOffset_ + (h0 * wIn_ + w0) * cIn_; @@ -435,8 +444,8 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo SetFlag(copyEvt_); WaitFlag(copyEvt_); PipeBarrier(); - Axpy(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), - MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8}); + Axpy(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), MASK_PLACEHOLDER, + valRptTimes_, {1, 1, 8, 8}); } } SetFlag(ping); @@ -452,8 +461,8 @@ __aicore__ inline void DeformableConv2dKernel::ComputeBilinearInterpo WaitFlag(1); } -template -__aicore__ inline void DeformableConv2dKernel::Process() +template +__aicore__ inline void DeformableConv2dKernel::Process() { PreProcess(); for (uint32_t taskIdx = start_; taskIdx < end_; ++taskIdx) { @@ -475,12 +484,12 @@ extern "C" __global__ __aicore__ void deformable_conv2d(GM_ADDR x, GM_ADDR weigh TPipe pipe; if (TILING_KEY_IS(0)) { - DeformableConv2dKernel op; + DeformableConv2dKernel op; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), op.mm_, &(tilingData.mmTilingData)); op.Init(x, weight, bias, offset, mask, y, offsetOutput, usrWorkspace, &tilingData, &pipe); op.Process(); } else if (TILING_KEY_IS(1)) { - DeformableConv2dKernel op; + DeformableConv2dKernel op; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), op.mm_, &(tilingData.mmTilingData)); op.Init(x, weight, bias, offset, mask, y, offsetOutput, usrWorkspace, &tilingData, &pipe); op.Process(); diff --git a/mx_driving/csrc/DeformableConv2dBackward.cpp b/mx_driving/csrc/DeformableConv2dBackward.cpp index 8ed3f008891097a8f5e617874b61abc66473ee9b..63f1a6e8a27adc0897d996b9a70311b9a08af22e 100644 --- a/mx_driving/csrc/DeformableConv2dBackward.cpp +++ b/mx_driving/csrc/DeformableConv2dBackward.cpp @@ -29,6 +29,7 @@ std::tuple deformable_conv2d_backward(const TORCH_CHECK(offset.dim() == 4, "offset has to be a 4D Tensor, but got: ", offset.dim()); TORCH_CHECK(weight.dim() == 4, "weight has to be a 4D Tensor, but got: ", offset.dim()); TORCH_CHECK(groups > 0, "groups must be greater than 0"); + TORCH_CHECK(input.scalar_type() == at::kFloat, "deformable_conv2d only support float32 input.") const at::Tensor& bias = at::Tensor(); const at::Tensor& grad_bias = at::Tensor(); @@ -42,7 +43,7 @@ std::tuple deformable_conv2d_backward(const at::Tensor grad_input = at::zeros(input_sizes, input.options()); at::Tensor grad_offset = at::empty(offset_sizes, offset.options()); at::Tensor grad_weight = at::zeros(weight_sizes, weight.options()); - + bool modulated = false; bool with_bias = false;