diff --git a/kernels/op_host/deformable_conv2d.cpp b/kernels/op_host/deformable_conv2d.cpp
index d4facf60a3b0dfaadbbcb969d6fd7c6e59bfe626..9e3d90630267f9c2028839d8bb6a820fdfef53a5 100644
--- a/kernels/op_host/deformable_conv2d.cpp
+++ b/kernels/op_host/deformable_conv2d.cpp
@@ -4,6 +4,12 @@
 #include "tiling/platform/platform_ascendc.h"
 using namespace matmul_tiling;
 
+
+namespace {
+constexpr uint32_t SIZE_OF_FP16 = 2;
+constexpr uint32_t SIZE_OF_FP32 = 4;
+} // namespace
+
 namespace optiling {
 static ge::graphStatus TilingForDeformableConv2d(gert::TilingContext* context)
 {
@@ -64,12 +70,30 @@ static ge::graphStatus TilingForDeformableConv2d(gert::TilingContext* context)
 
     context->SetTilingKey(*modulatedPtr);
 
+    auto inputInf = context->GetInputDesc(0);
+    if (inputInf == nullptr) {
+        return ge::GRAPH_FAILED;
+    }
+    auto dtype = inputInf->GetDataType();
+
     DeformableConv2dTilingData tilingData;
     matmul_tiling::MatmulApiTiling mmTiling(ascendPlatformInfo);
-    mmTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT);
-    mmTiling.SetBType(
-        matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT, true);
-    mmTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT);
+    if (dtype == ge::DT_FLOAT) {
+        mmTiling.SetAType(
+            matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT);
+        mmTiling.SetBType(
+            matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT, true);
+        mmTiling.SetCType(
+            matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT);
+    } else {
+        mmTiling.SetAType(
+            matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16);
+        mmTiling.SetBType(
+            matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16, true);
+        mmTiling.SetCType(
+            matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16);
+    }
+
     mmTiling.SetShape(cOut / groups, wOut, kH * kW * cIn / groups);
     mmTiling.SetOrgShape(cOut / groups, wOut, kH * kW * cIn / groups);
     mmTiling.SetBias(false);
@@ -99,7 +123,8 @@ static ge::graphStatus TilingForDeformableConv2d(gert::TilingContext* context)
     ADD_TILING_DATA(context, tilingData);
 
     size_t systemWorkspaceSize = ascendPlatformInfo.GetLibApiWorkSpaceSize();
-    size_t auxSize = 2 * kH * kW * wOut * sizeof(float);
+    size_t auxSize =
+        (dtype == ge::DT_FLOAT) ? (2 * kH * kW * wOut * SIZE_OF_FP32) : (2 * kH * kW * wOut * SIZE_OF_FP16);
     size_t* currentWorkspace = context->GetWorkspaceSizes(1);
     CHECK_NULLPTR(currentWorkspace);
     currentWorkspace[0] = systemWorkspaceSize + auxSize;
@@ -153,33 +178,33 @@ public:
     {
         this->Input("x")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND})
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND})
             .AutoContiguous();
         this->Input("weight")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND})
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND})
             .AutoContiguous();
         this->Input("bias")
             .ParamType(OPTIONAL)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND})
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND})
             .AutoContiguous();
         this->Input("offset")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND})
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND})
             .AutoContiguous();
         this->Input("mask")
             .ParamType(OPTIONAL)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND})
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND})
             .AutoContiguous();
 
         this->Attr("kernel_size").ListInt();
@@ -193,14 +218,14 @@ public:
 
         this->Output("y")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
         this->Output("offset_output")
             .ParamType(REQUIRED)
-            .DataType({ge::DT_FLOAT})
-            .Format({ge::FORMAT_ND})
-            .UnknownShapeFormat({ge::FORMAT_ND});
+            .DataType({ge::DT_FLOAT, ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND, ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND, ge::FORMAT_ND});
 
         this->SetInferShape(ge::InferShapeForDeformableConv2d).SetInferDataType(ge::InferDataTypeForDeformableConv2d);
         this->AICore().SetTiling(optiling::TilingForDeformableConv2d);
diff --git a/kernels/op_kernel/deformable_conv2d.cpp b/kernels/op_kernel/deformable_conv2d.cpp
index 3f0f06105e75e24521fa870af26ad78302474a78..f7199b2e13cb06d45542daacf0b63154dbee3f84 100644
--- a/kernels/op_kernel/deformable_conv2d.cpp
+++ b/kernels/op_kernel/deformable_conv2d.cpp
@@ -3,17 +3,28 @@
 
 using namespace AscendC;
 
+constexpr uint32_t byteBlock = 32;
+constexpr uint32_t byteSingleRepeat = 256;
+constexpr uint32_t doubleBuffer = 2;
+constexpr uint32_t fourTimesCin = 4;          // one pixel from 4 weighted cin
+constexpr uint32_t fourTimesOffsetSpace = 4; // (x,y,x0,y0)
+constexpr uint32_t xySpace = 2;                // (x,y)
 constexpr MatmulConfig DEFORMABLE_CONV2D_CFG = GetNormalConfig();
-
-template<bool modulated>
+    
+template<bool modulated, typename T>
 class DeformableConv2dKernel {
 public:
-    using AType = matmul::MatmulType<TPosition::GM, CubeFormat::ND, float>;
-    using BType = matmul::MatmulType<TPosition::GM, CubeFormat::ND, float, true, LayoutMode::NONE, true>;
-    using CType = matmul::MatmulType<TPosition::GM, CubeFormat::ND, float>;
+    using AType = matmul::MatmulType<TPosition::GM, CubeFormat::ND, T>;
+    using BType = matmul::MatmulType<TPosition::GM, CubeFormat::ND, T, true, LayoutMode::NONE, true>;
+    using CType = matmul::MatmulType<TPosition::GM, CubeFormat::ND, T>;
 
     matmul::Matmul<AType, BType, CType, CType, DEFORMABLE_CONV2D_CFG> mm_;
 
+    const uint32_t DATA_SIZE = sizeof(T);
+    const uint32_t INPUT_DATA_NUM_PER_REPEAT = byteSingleRepeat / DATA_SIZE;
+    const uint32_t INPUT_BYTE_SIZE = DATA_SIZE;
+    const uint32_t INPUT_DATA_NUM_PER_BLOCK = byteBlock / DATA_SIZE;
+
     __aicore__ inline DeformableConv2dKernel() = default;
 
     __aicore__ inline void Init(GM_ADDR x, GM_ADDR weight, GM_ADDR bias, GM_ADDR offset, GM_ADDR mask, GM_ADDR y,
@@ -27,8 +38,7 @@ public:
         InitGM(x, weight, bias, offset, mask, y, offsetOutput, workspace);
         InitBuffer();
         InitEvent();
-
-        SetVectorMask<float>(FULL_MASK, FULL_MASK);
+        SetVectorMask<T>(FULL_MASK, FULL_MASK);
         SetAtomicNone();
     }
 
@@ -36,9 +46,9 @@ public:
 
 protected:
     TPipe* pipe_;
-    GlobalTensor<float> xGm_, weightGm_, offsetGm_, biasGm_, maskGm_;
-    GlobalTensor<float> yGm_, offsetOutputGm_;
-    GlobalTensor<float> auxHGm_, auxWGm_;
+    GlobalTensor<T> xGm_, weightGm_, offsetGm_, biasGm_, maskGm_;
+    GlobalTensor<T> yGm_, offsetOutputGm_;
+    GlobalTensor<T> auxHGm_, auxWGm_;
     TBuf<TPosition::VECCALC> auxHBuf_, auxWBuf_;
     TBuf<TPosition::VECCALC> offsetBuf_, offsetIntBuf_, weightBuf_, maskBuf_, featureBuf_, offsetOutputBuf_;
 
@@ -68,16 +78,15 @@ private:
 
     __aicore__ inline void ProcessVector(uint32_t taskIdx);
 
-    __aicore__ inline void CopyInOffset(
-        uint32_t taskIdx, const LocalTensor<float>& offset, const LocalTensor<float>& mask);
+    __aicore__ inline void CopyInOffset(uint32_t taskIdx, const LocalTensor<T>& offset, const LocalTensor<T>& mask);
 
-    __aicore__ inline void ComputeWeight(uint32_t taskIdx, const LocalTensor<float>& auxW,
-        const LocalTensor<float>& auxH, const LocalTensor<float>& offset, const LocalTensor<int32_t>& offsetInt,
-        const LocalTensor<float>& weight, const LocalTensor<float>& mask);
+    __aicore__ inline void ComputeWeight(uint32_t taskIdx, const LocalTensor<T>& auxW, const LocalTensor<T>& auxH,
+        const LocalTensor<T>& offset, const LocalTensor<int32_t>& offsetInt, const LocalTensor<T>& weight,
+        const LocalTensor<T>& mask);
 
-    __aicore__ inline void ComputeBilinearInterpolation(uint32_t w, const LocalTensor<float>& offset,
-        const LocalTensor<int32_t>& offsetInt, const LocalTensor<float>& feature, const LocalTensor<float>& weight,
-        const LocalTensor<float>& offsetOutput);
+    __aicore__ inline void ComputeBilinearInterpolation(uint32_t w, const LocalTensor<T>& offset,
+        const LocalTensor<int32_t>& offsetInt, const LocalTensor<T>& feature, const LocalTensor<T>& weight,
+        const LocalTensor<T>& offsetOutput);
 
     __aicore__ inline void InitTiling(const DeformableConv2dTilingData* tilingData)
     {
@@ -99,7 +108,7 @@ private:
         dilationW_ = tilingData->dilationW;
         groups_ = tilingData->groups;
         usedBlkNum_ = tilingData->usedBlkNum;
-        featureOffset_ = 4 * cIn_;
+        featureOffset_ = fourTimesCin * cIn_;
         rowOut_ = wOut_ * cOut_;
         rowOutPerGroup_ = rowOut_ / groups_;
         kwIn_ = kernelSize_ * cIn_;
@@ -109,10 +118,10 @@ private:
         cInPerGroup_ = cIn_ / groups_;
         kernelPerGroup_ = cOut_ / groups_ * kwInPerGroup_;
         rowOffset_ = wOut_ * kernelSize_;
-        alignedRowOffset_ = AlignUp(rowOffset_, B32_DATA_NUM_PER_REPEAT);
-        rowOffsetBlk_ = Ceil(rowOffset_, B32_DATA_NUM_PER_BLOCK);
-        doubleRowOffsetBlk_ = Ceil(2 * rowOffset_, B32_DATA_NUM_PER_BLOCK);
-        cInBlk_ = Ceil(cIn_, B32_DATA_NUM_PER_BLOCK);
+        alignedRowOffset_ = AlignUp(rowOffset_, INPUT_DATA_NUM_PER_REPEAT);
+        rowOffsetBlk_ = Ceil(rowOffset_, INPUT_DATA_NUM_PER_BLOCK);
+        doubleRowOffsetBlk_ = Ceil(xySpace * rowOffset_, INPUT_DATA_NUM_PER_BLOCK);
+        cInBlk_ = Ceil(cIn_, INPUT_DATA_NUM_PER_BLOCK);
 
         cpOneValParams_.blockLen = cInBlk_;
         cpRowDoubleValParams_.blockLen = 2 * cInBlk_;
@@ -124,8 +133,8 @@ private:
         cpOffsetOutParams_.blockCount = kernelSize_;
         cpOffsetOutParams_.blockLen = cInBlk_ / groups_;
         cpOffsetOutParams_.srcStride = cInBlk_ - cInBlk_ / groups_;
-        rptTimes_ = alignedRowOffset_ / B32_DATA_NUM_PER_REPEAT;
-        valRptTimes_ = cIn_ / B32_DATA_NUM_PER_REPEAT;
+        rptTimes_ = alignedRowOffset_ / INPUT_DATA_NUM_PER_REPEAT;
+        valRptTimes_ = cIn_ / INPUT_DATA_NUM_PER_REPEAT;
         gatherParams_.repeatTimes = rptTimes_ * 2;
     }
 
@@ -151,65 +160,65 @@ private:
 
     __aicore__ inline void InitBuffer()
     {
-        pipe_->InitBuffer(auxHBuf_, alignedRowOffset_ * B32_BYTE_SIZE); // 9 * 100
-        pipe_->InitBuffer(auxWBuf_, alignedRowOffset_ * B32_BYTE_SIZE);
-        pipe_->InitBuffer(offsetBuf_, 4 * alignedRowOffset_ * B32_BYTE_SIZE);
-        pipe_->InitBuffer(offsetIntBuf_, 2 * alignedRowOffset_ * B32_BYTE_SIZE);
-        pipe_->InitBuffer(weightBuf_, 4 * alignedRowOffset_ * B32_BYTE_SIZE);
+        pipe_->InitBuffer(auxHBuf_, alignedRowOffset_ * INPUT_BYTE_SIZE); // 9 * 100
+        pipe_->InitBuffer(auxWBuf_, alignedRowOffset_ * INPUT_BYTE_SIZE);
+        pipe_->InitBuffer(offsetBuf_, fourTimesOffsetSpace * alignedRowOffset_ * INPUT_BYTE_SIZE);
+        pipe_->InitBuffer(offsetIntBuf_, xySpace * alignedRowOffset_ * sizeof(int32_t));
+        pipe_->InitBuffer(weightBuf_, fourTimesOffsetSpace * alignedRowOffset_ * INPUT_BYTE_SIZE);
         if (modulated) {
-            pipe_->InitBuffer(maskBuf_, alignedRowOffset_ * B32_BYTE_SIZE);
+            pipe_->InitBuffer(maskBuf_, alignedRowOffset_ * INPUT_BYTE_SIZE);
         }
-        pipe_->InitBuffer(offsetOutputBuf_, 2 * kwIn_ * B32_BYTE_SIZE); // 2 for double buffer
-        pipe_->InitBuffer(featureBuf_, 2 * 4 * cIn_ * B32_BYTE_SIZE);
+        pipe_->InitBuffer(offsetOutputBuf_, doubleBuffer * kwIn_ * INPUT_BYTE_SIZE); // 2 for double buffer
+        pipe_->InitBuffer(featureBuf_, doubleBuffer * fourTimesCin * cIn_ * INPUT_BYTE_SIZE);
     }
     __aicore__ inline void InitGM(GM_ADDR x, GM_ADDR weight, GM_ADDR bias, GM_ADDR offset, GM_ADDR mask, GM_ADDR y,
         GM_ADDR offsetOutput, GM_ADDR workspace)
     {
-        xGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(x));
-        weightGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(weight));
-        biasGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(bias));
-        offsetGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(offset));
-        yGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(y));
-        offsetOutputGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(offsetOutput));
-
-        auxHGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(workspace));
-        auxWGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(workspace) + rowOffset_);
+        xGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(x));
+        weightGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(weight));
+        biasGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(bias));
+        offsetGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(offset));
+        yGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(y));
+        offsetOutputGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(offsetOutput));
+
+        auxHGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(workspace));
+        auxWGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(workspace) + rowOffset_);
         if (modulated) {
-            maskGm_.SetGlobalBuffer(reinterpret_cast<__gm__ float*>(mask));
+            maskGm_.SetGlobalBuffer(reinterpret_cast<__gm__ T*>(mask));
         }
     }
 };
 
-template<bool modulated>
-__aicore__ inline void DeformableConv2dKernel<modulated>::PreProcess()
+template<bool modulated, typename T>
+__aicore__ inline void DeformableConv2dKernel<modulated, T>::PreProcess()
 {
-    LocalTensor<float> auxH = auxHBuf_.Get<float>();
-    LocalTensor<float> auxW = auxWBuf_.Get<float>();
+    LocalTensor<T> auxH = auxHBuf_.Get<T>();
+    LocalTensor<T> auxW = auxWBuf_.Get<T>();
 
     uint32_t idx = 0;
     for (int32_t w = auxStart_; w < auxEnd_; ++w) {
         for (int32_t i = 0; i < kH_; ++i) {
             for (int32_t j = 0; j < kW_; ++j) {
-                auxW.SetValue(idx, static_cast<float>(w * strideW_ - padW_ + j * dilationW_));
-                auxH.SetValue(idx, static_cast<float>(-padH_ + i * dilationH_));
+                auxW.SetValue(idx, static_cast<T>(w * strideW_ - padW_ + j * dilationW_));
+                auxH.SetValue(idx, static_cast<T>(-padH_ + i * dilationH_));
                 ++idx;
             }
         }
     }
     DataCopyPad(auxWGm_[auxStart_ * kernelSize_], auxW,
-        {1, static_cast<uint16_t>(B32_BYTE_SIZE * (auxEnd_ - auxStart_) * kernelSize_), 0, 0});
+        {1, static_cast<uint16_t>(INPUT_BYTE_SIZE * (auxEnd_ - auxStart_) * kernelSize_), 0, 0});
     DataCopyPad(auxHGm_[auxStart_ * kernelSize_], auxH,
-        {1, static_cast<uint16_t>(B32_BYTE_SIZE * (auxEnd_ - auxStart_) * kernelSize_), 0, 0});
+        {1, static_cast<uint16_t>(INPUT_BYTE_SIZE * (auxEnd_ - auxStart_) * kernelSize_), 0, 0});
     SyncAll();
     DataCopy(auxW, auxWGm_, {1, rowOffsetBlk_, 0, 0});
     DataCopy(auxH, auxHGm_, {1, rowOffsetBlk_, 0, 0});
 
-    LocalTensor<float> feature = featureBuf_.Get<float>();
-    Duplicate<float, false>(feature, 0.f, MASK_PLACEHOLDER, 4 * valRptTimes_, 1, 8);
+    LocalTensor<T> feature = featureBuf_.Get<T>();
+    Duplicate<T, false>(feature, 0.f, MASK_PLACEHOLDER, 4 * valRptTimes_, 1, 8);
 }
 
-template<bool modulated>
-__aicore__ inline void DeformableConv2dKernel<modulated>::ProcessCube(uint32_t taskIdx)
+template<bool modulated, typename T>
+__aicore__ inline void DeformableConv2dKernel<modulated, T>::ProcessCube(uint32_t taskIdx)
 {
     uint64_t aOffset = 0;
     uint64_t bOffset = taskIdx * rowIn_;
@@ -223,23 +232,23 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ProcessCube(uint32_t t
         cOffset += rowOutPerGroup_;
     }
 }
-template<bool modulated>
-__aicore__ inline void DeformableConv2dKernel<modulated>::ProcessVector(uint32_t taskIdx)
+template<bool modulated, typename T>
+__aicore__ inline void DeformableConv2dKernel<modulated, T>::ProcessVector(uint32_t taskIdx)
 {
     uint32_t batch = taskIdx / hOut_;
     srcOffset_ = batch * hIn_ * wIn_ * cIn_;
     dstOffset_ = taskIdx * rowIn_;
-    LocalTensor<float> offset = offsetBuf_.Get<float>();
-    LocalTensor<float> auxW = auxWBuf_.Get<float>();
-    LocalTensor<float> auxH = auxHBuf_.Get<float>();
+    LocalTensor<T> offset = offsetBuf_.Get<T>();
+    LocalTensor<T> auxW = auxWBuf_.Get<T>();
+    LocalTensor<T> auxH = auxHBuf_.Get<T>();
     LocalTensor<int32_t> offsetInt = offsetIntBuf_.Get<int32_t>();
-    LocalTensor<float> weight = weightBuf_.Get<float>();
-    LocalTensor<float> feature = featureBuf_.Get<float>();
-    LocalTensor<float> mask;
+    LocalTensor<T> weight = weightBuf_.Get<T>();
+    LocalTensor<T> feature = featureBuf_.Get<T>();
+    LocalTensor<T> mask;
     if (modulated) {
-        mask = maskBuf_.Get<float>();
+        mask = maskBuf_.Get<T>();
     }
-    LocalTensor<float> offsetOutput = offsetOutputBuf_.Get<float>();
+    LocalTensor<T> offsetOutput = offsetOutputBuf_.Get<T>();
 
     CopyInOffset(taskIdx, offset, mask);
     ComputeWeight(taskIdx, auxW, auxH, offset, offsetInt, weight, mask);
@@ -260,9 +269,9 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ProcessVector(uint32_t
     WaitFlag<HardEvent::MTE3_V>(1);
 }
 
-template<bool modulated>
-__aicore__ inline void DeformableConv2dKernel<modulated>::CopyInOffset(
-    uint32_t taskIdx, const LocalTensor<float>& offset, const LocalTensor<float>& mask)
+template<bool modulated, typename T>
+__aicore__ inline void DeformableConv2dKernel<modulated, T>::CopyInOffset(
+    uint32_t taskIdx, const LocalTensor<T>& offset, const LocalTensor<T>& mask)
 {
     uint32_t offsetIdx = taskIdx * rowOffset_ * 2;
     DataCopy(offset, offsetGm_[offsetIdx], {1, doubleRowOffsetBlk_, 0, 0});
@@ -274,55 +283,55 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::CopyInOffset(
     uint64_t cnt;
     GatherMask(offset[2 * alignedRowOffset_], offset, 2, false, MASK_PLACEHOLDER, gatherParams_, cnt);
     GatherMask(offset[3 * alignedRowOffset_], offset, 1, false, MASK_PLACEHOLDER, gatherParams_, cnt);
-    SetVectorMask<float>(FULL_MASK, FULL_MASK);
+    SetVectorMask<T>(FULL_MASK, FULL_MASK);
 }
 
-template<bool modulated>
-__aicore__ inline void DeformableConv2dKernel<modulated>::ComputeWeight(uint32_t taskIdx,
-    const LocalTensor<float>& auxW, const LocalTensor<float>& auxH, const LocalTensor<float>& offset,
-    const LocalTensor<int32_t>& offsetInt, const LocalTensor<float>& weight, const LocalTensor<float>& mask)
+template<bool modulated, typename T>
+__aicore__ inline void DeformableConv2dKernel<modulated, T>::ComputeWeight(uint32_t taskIdx, const LocalTensor<T>& auxW,
+    const LocalTensor<T>& auxH, const LocalTensor<T>& offset, const LocalTensor<int32_t>& offsetInt,
+    const LocalTensor<T>& weight, const LocalTensor<T>& mask)
 {
     int32_t h = taskIdx % hOut_;
-    Copy<float, false>(offset, auxW, MASK_PLACEHOLDER, rptTimes_, {1, 1, 8, 8});
-    Adds<float, false>(offset[alignedRowOffset_], auxH, float(h * strideH_), MASK_PLACEHOLDER, rptTimes_, {1, 1, 8, 8});
-    Add<float, false>(
-        offset, offset, offset[2 * alignedRowOffset_], MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 1, 8, 8, 8});
-
-    Cast<int32_t, float, false>(
-        offsetInt, offset, RoundMode::CAST_FLOOR, MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 8, 8});
-    Cast<float, int32_t, false>(
-        offset[2 * alignedRowOffset_], offsetInt, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 8, 8});
-    Sub<float, false>(
+    Copy<T, false>(offset, auxW, MASK_PLACEHOLDER, rptTimes_, {1, 1, 8, 8});
+    Adds<T, false>(offset[alignedRowOffset_], auxH, T(h * strideH_), MASK_PLACEHOLDER, rptTimes_, {1, 1, 8, 8});
+    Add<T, false>(offset, offset, offset[2 * alignedRowOffset_], MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 1, 8, 8, 8});
+
+    Cast<int32_t, T>(offsetInt, offset, RoundMode::CAST_FLOOR, 2 * alignedRowOffset_);
+    half scale = 1.0;
+    AscendC::SetDeqScale(scale);
+    Cast<T, int32_t>(offset[2 * alignedRowOffset_], offsetInt, RoundMode::CAST_NONE, 2 * alignedRowOffset_);
+
+    Sub<T, false>(
         offset, offset, offset[2 * alignedRowOffset_], MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 1, 8, 8, 8}); // lw, lh
-    Duplicate<float, false>(weight, 1.f, MASK_PLACEHOLDER, 2 * rptTimes_, 1, 8);
-    Sub<float, false>(
+    Duplicate<T, false>(weight, 1.f, MASK_PLACEHOLDER, 2 * rptTimes_, 1, 8);
+    Sub<T, false>(
         offset[2 * alignedRowOffset_], weight, offset, MASK_PLACEHOLDER, 2 * rptTimes_, {1, 1, 1, 8, 8, 8}); // hw, hh
 
-    Mul<float, false>(weight, offset[2 * alignedRowOffset_], offset[3 * alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_,
+    Mul<T, false>(weight, offset[2 * alignedRowOffset_], offset[3 * alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_,
         {1, 1, 1, 8, 8, 8}); // hw * hh
-    Mul<float, false>(weight[alignedRowOffset_], offset, offset[3 * alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_,
+    Mul<T, false>(weight[alignedRowOffset_], offset, offset[3 * alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_,
         {1, 1, 1, 8, 8, 8}); // lw * hh
-    Mul<float, false>(weight[2 * alignedRowOffset_], offset[alignedRowOffset_], offset[2 * alignedRowOffset_],
+    Mul<T, false>(weight[2 * alignedRowOffset_], offset[alignedRowOffset_], offset[2 * alignedRowOffset_],
         MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8}); // hw * lh
-    Mul<float, false>(weight[3 * alignedRowOffset_], offset, offset[alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_,
+    Mul<T, false>(weight[3 * alignedRowOffset_], offset, offset[alignedRowOffset_], MASK_PLACEHOLDER, rptTimes_,
         {1, 1, 1, 8, 8, 8}); // lh * lw
     if (modulated) {
-        Mul<float, false>(weight, weight, mask, MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8});
-        Mul<float, false>(weight[alignedRowOffset_], weight[alignedRowOffset_], mask, MASK_PLACEHOLDER, rptTimes_,
+        Mul<T, false>(weight, weight, mask, MASK_PLACEHOLDER, rptTimes_, {1, 1, 1, 8, 8, 8});
+        Mul<T, false>(weight[alignedRowOffset_], weight[alignedRowOffset_], mask, MASK_PLACEHOLDER, rptTimes_,
             {1, 1, 1, 8, 8, 8}); // lw * hh
-        Mul<float, false>(weight[2 * alignedRowOffset_], weight[2 * alignedRowOffset_], mask, MASK_PLACEHOLDER,
-            rptTimes_, {1, 1, 1, 8, 8, 8}); // hw * lh
-        Mul<float, false>(weight[3 * alignedRowOffset_], weight[3 * alignedRowOffset_], mask, MASK_PLACEHOLDER,
-            rptTimes_, {1, 1, 1, 8, 8, 8}); // lh * lw
+        Mul<T, false>(weight[2 * alignedRowOffset_], weight[2 * alignedRowOffset_], mask, MASK_PLACEHOLDER, rptTimes_,
+            {1, 1, 1, 8, 8, 8}); // hw * lh
+        Mul<T, false>(weight[3 * alignedRowOffset_], weight[3 * alignedRowOffset_], mask, MASK_PLACEHOLDER, rptTimes_,
+            {1, 1, 1, 8, 8, 8}); // lh * lw
     }
 }
 
-template<bool modulated>
-__aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpolation(uint32_t w,
-    const LocalTensor<float>& offset, const LocalTensor<int32_t>& offsetInt, const LocalTensor<float>& feature,
-    const LocalTensor<float>& weight, const LocalTensor<float>& offsetOutput)
+template<bool modulated, typename T>
+__aicore__ inline void DeformableConv2dKernel<modulated, T>::ComputeBilinearInterpolation(uint32_t w,
+    const LocalTensor<T>& offset, const LocalTensor<int32_t>& offsetInt, const LocalTensor<T>& feature,
+    const LocalTensor<T>& weight, const LocalTensor<T>& offsetOutput)
 {
-    Duplicate<float, false>(offsetOutput, 0.f, MASK_PLACEHOLDER, kernelSize_ * valRptTimes_, 1, 8);
+    Duplicate<T, false>(offsetOutput, 0.f, MASK_PLACEHOLDER, kernelSize_ * valRptTimes_, 1, 8);
     uint8_t ping = 0;
     uint32_t kernelOffset = w * kernelSize_;
     SetFlag<HardEvent::V_MTE2>(0);
@@ -346,16 +355,16 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
                 SetFlag<HardEvent::MTE2_V>(copyEvt_);
                 WaitFlag<HardEvent::MTE2_V>(copyEvt_);
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw),
-                    MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), MASK_PLACEHOLDER,
+                    valRptTimes_, {1, 1, 8, 8});
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph),
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph),
                     MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_],
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_],
                     weight.GetValue(pw + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_],
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_],
                     weight.GetValue(ph + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
             } else if (w1 == 0) {
                 uint64_t gmOffset = srcOffset_ + (h0 * wIn_) * cIn_;
@@ -363,10 +372,10 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
                 SetFlag<HardEvent::MTE2_V>(copyEvt_);
                 WaitFlag<HardEvent::MTE2_V>(copyEvt_);
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph),
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph),
                     MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_],
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_],
                     weight.GetValue(ph + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
             } else if (w1 == wIn_) {
                 uint64_t gmOffset = srcOffset_ + (h0 * wIn_ + w0) * cIn_;
@@ -374,10 +383,10 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
                 SetFlag<HardEvent::MTE2_V>(copyEvt_);
                 WaitFlag<HardEvent::MTE2_V>(copyEvt_);
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw),
-                    MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), MASK_PLACEHOLDER,
+                    valRptTimes_, {1, 1, 8, 8});
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_],
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_],
                     weight.GetValue(pw + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
             }
         } else if (h1 == 0) {
@@ -387,10 +396,10 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
                 SetFlag<HardEvent::MTE2_V>(copyEvt_);
                 WaitFlag<HardEvent::MTE2_V>(copyEvt_);
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_],
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_],
                     weight.GetValue(pw + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_],
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_],
                     weight.GetValue(ph + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
             } else if (w1 == 0) {
                 uint64_t gmOffset = srcOffset_;
@@ -398,7 +407,7 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
                 SetFlag<HardEvent::MTE2_V>(copyEvt_);
                 WaitFlag<HardEvent::MTE2_V>(copyEvt_);
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_],
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + 3 * cIn_],
                     weight.GetValue(ph + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
             } else if (w1 == wIn_) {
                 uint64_t gmOffset = srcOffset_ + w0 * cIn_;
@@ -406,7 +415,7 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
                 SetFlag<HardEvent::MTE2_V>(copyEvt_);
                 WaitFlag<HardEvent::MTE2_V>(copyEvt_);
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_],
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + 2 * cIn_],
                     weight.GetValue(pw + 2 * alignedRowOffset_), MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
             }
         } else if (h1 == hIn_) {
@@ -416,10 +425,10 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
                 SetFlag<HardEvent::MTE2_V>(copyEvt_);
                 WaitFlag<HardEvent::MTE2_V>(copyEvt_);
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw),
-                    MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), MASK_PLACEHOLDER,
+                    valRptTimes_, {1, 1, 8, 8});
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph),
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph),
                     MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
             } else if (w1 == 0) {
                 uint64_t gmOffset = srcOffset_ + (h0 * wIn_) * cIn_;
@@ -427,7 +436,7 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
                 SetFlag<HardEvent::MTE2_V>(copyEvt_);
                 WaitFlag<HardEvent::MTE2_V>(copyEvt_);
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph),
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset + cIn_], weight.GetValue(ph),
                     MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
             } else if (w1 == wIn_) {
                 uint64_t gmOffset = srcOffset_ + (h0 * wIn_ + w0) * cIn_;
@@ -435,8 +444,8 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
                 SetFlag<HardEvent::MTE2_V>(copyEvt_);
                 WaitFlag<HardEvent::MTE2_V>(copyEvt_);
                 PipeBarrier<PIPE_V>();
-                Axpy<float, float, false>(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw),
-                    MASK_PLACEHOLDER, valRptTimes_, {1, 1, 8, 8});
+                Axpy<T, T, false>(offsetOutput[outOffset], feature[ftOffset], weight.GetValue(pw), MASK_PLACEHOLDER,
+                    valRptTimes_, {1, 1, 8, 8});
             }
         }
         SetFlag<HardEvent::V_MTE2>(ping);
@@ -452,8 +461,8 @@ __aicore__ inline void DeformableConv2dKernel<modulated>::ComputeBilinearInterpo
     WaitFlag<HardEvent::V_MTE2>(1);
 }
 
-template<bool modulated>
-__aicore__ inline void DeformableConv2dKernel<modulated>::Process()
+template<bool modulated, typename T>
+__aicore__ inline void DeformableConv2dKernel<modulated, T>::Process()
 {
     PreProcess();
     for (uint32_t taskIdx = start_; taskIdx < end_; ++taskIdx) {
@@ -475,12 +484,12 @@ extern "C" __global__ __aicore__ void deformable_conv2d(GM_ADDR x, GM_ADDR weigh
     TPipe pipe;
 
     if (TILING_KEY_IS(0)) {
-        DeformableConv2dKernel<false> op;
+        DeformableConv2dKernel<false, DTYPE_X> op;
         REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), op.mm_, &(tilingData.mmTilingData));
         op.Init(x, weight, bias, offset, mask, y, offsetOutput, usrWorkspace, &tilingData, &pipe);
         op.Process();
     } else if (TILING_KEY_IS(1)) {
-        DeformableConv2dKernel<true> op;
+        DeformableConv2dKernel<true, DTYPE_X> op;
         REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), op.mm_, &(tilingData.mmTilingData));
         op.Init(x, weight, bias, offset, mask, y, offsetOutput, usrWorkspace, &tilingData, &pipe);
         op.Process();
diff --git a/mx_driving/csrc/DeformableConv2dBackward.cpp b/mx_driving/csrc/DeformableConv2dBackward.cpp
index 8ed3f008891097a8f5e617874b61abc66473ee9b..63f1a6e8a27adc0897d996b9a70311b9a08af22e 100644
--- a/mx_driving/csrc/DeformableConv2dBackward.cpp
+++ b/mx_driving/csrc/DeformableConv2dBackward.cpp
@@ -29,6 +29,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> deformable_conv2d_backward(const
     TORCH_CHECK(offset.dim() == 4, "offset has to be a 4D Tensor, but got: ", offset.dim());
     TORCH_CHECK(weight.dim() == 4, "weight has to be a 4D Tensor, but got: ", offset.dim());
     TORCH_CHECK(groups > 0, "groups must be greater than 0");
+    TORCH_CHECK(input.scalar_type() == at::kFloat, "deformable_conv2d only support float32 input.")
 
     const at::Tensor& bias = at::Tensor();
     const at::Tensor& grad_bias = at::Tensor();
@@ -42,7 +43,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> deformable_conv2d_backward(const
     at::Tensor grad_input = at::zeros(input_sizes, input.options());
     at::Tensor grad_offset = at::empty(offset_sizes, offset.options());
     at::Tensor grad_weight = at::zeros(weight_sizes, weight.options());
-    
+
     bool modulated = false;
     bool with_bias = false;