From a26acd0946a28b4e1085537ddc44043653c3cf3c Mon Sep 17 00:00:00 2001 From: chenmingkai Date: Mon, 25 Mar 2024 07:08:22 +0000 Subject: [PATCH 1/6] =?UTF-8?q?!116=20[Sec]=20=E5=A2=9E=E5=8A=A0=E5=AE=89?= =?UTF-8?q?=E5=85=A8=E7=BC=96=E8=AF=91=E6=9D=83=E9=99=90=20Merge=20pull=20?= =?UTF-8?q?request=20!116=20from=20chenmingkai/cmake?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmake/config.cmake | 2 +- setup.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cmake/config.cmake b/cmake/config.cmake index ef0e1cc0..2105c2cc 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -8,7 +8,7 @@ if(NOT DEFINED vendor_name) endif() # read ASCEND_HOME_PATH from environment variable, change ASCEND_CANN_PACKAGE_PATH to ASCEND_HOME_PATH if (DEFINED ENV{ASCEND_AICPU_PATH}) - set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME_PATH}) + set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_AICPU_PATH}) endif() if(NOT DEFINED ASCEND_CANN_PACKAGE_PATH) set(ASCEND_CANN_PACKAGE_PATH diff --git a/setup.py b/setup.py index 585b7cee..5a1b35d6 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,13 @@ ext1 = extension.NpuExtension( '-D__FILENAME__="$$(notdir $$(abspath $$<))"', "-fprofile-arcs", "-ftest-coverage", + "-fPIC", + "-fstack-protector-all", + ], + extra_link_args=[ + "-Wl,-z,relro", + "-Wl,-z,now", + "-s" ], libraries=["gcov"], ) -- Gitee From bc40b880834166b32a72f51c83005fe81a42aa92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E7=BB=B4=E7=90=9B?= Date: Mon, 25 Mar 2024 12:16:10 +0000 Subject: [PATCH 2/6] =?UTF-8?q?!118=20Fix=20MSDAGrad=20Precison=20Merge=20?= =?UTF-8?q?pull=20request=20!118=20from=20=E6=9C=B1=E7=BB=B4=E7=90=9B/fix?= =?UTF-8?q?=5Fprecision=5Fmsdag?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ops/csrc/MultiScaleDeformableAttnFunctionKernelNpu.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ads/common/ops/csrc/MultiScaleDeformableAttnFunctionKernelNpu.cpp b/ads/common/ops/csrc/MultiScaleDeformableAttnFunctionKernelNpu.cpp index 1bc2cdc3..89873d75 100644 --- a/ads/common/ops/csrc/MultiScaleDeformableAttnFunctionKernelNpu.cpp +++ b/ads/common/ops/csrc/MultiScaleDeformableAttnFunctionKernelNpu.cpp @@ -109,9 +109,9 @@ std::tuple multi_scale_deformable_attn_grad( auto grad_sample_loc_size = {location_size[0], location_size[1], location_size[2], location_size[3], location_size[5], location_size[4]}; at::Tensor value1 = value.transpose(1, 2).contiguous(); at::Tensor location1 = location.transpose(4, 5).contiguous(); - at::Tensor result1 = at::empty(grad_value_size, value.options().dtype(at::kFloat)); - at::Tensor result2 = at::empty(grad_sample_loc_size, location.options().dtype(at::kFloat)); - at::Tensor result3 = at::empty(grad_atten_weight_size, attn_weight.options().dtype(at::kFloat)); + at::Tensor result1 = at::zeros(grad_value_size, value.options().dtype(at::kFloat)); + at::Tensor result2 = at::zeros(grad_sample_loc_size, location.options().dtype(at::kFloat)); + at::Tensor result3 = at::zeros(grad_atten_weight_size, attn_weight.options().dtype(at::kFloat)); at::Tensor value_fp = value1.to(at::kFloat); at::Tensor shape_fp = shape.to(at::kInt); -- Gitee From 57153d5b5a8cf601b60db299171b6be80c19459a Mon Sep 17 00:00:00 2001 From: chenmingkai Date: Mon, 25 Mar 2024 12:32:46 +0000 Subject: [PATCH 3/6] =?UTF-8?q?!121=20[SCA]=20=E4=BF=AE=E5=A4=8DSCA?= =?UTF-8?q?=E6=89=AB=E6=8F=8F=E9=97=AE=E9=A2=98=20Merge=20pull=20request?= =?UTF-8?q?=20!121=20from=20chenmingkai/6.0.rc1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ads/common/ops/kernels/op_kernel/nms3d.cpp | 21 ++++++++++----------- cmake/func.cmake | 4 ++-- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/ads/common/ops/kernels/op_kernel/nms3d.cpp b/ads/common/ops/kernels/op_kernel/nms3d.cpp index f52853a7..35e32fdd 100644 --- a/ads/common/ops/kernels/op_kernel/nms3d.cpp +++ b/ads/common/ops/kernels/op_kernel/nms3d.cpp @@ -328,17 +328,16 @@ private: Point center_a(boxATensor.GetValue(0), boxATensor.GetValue(1)); Point center_b(boxBTensor.GetValue(0), boxBTensor.GetValue(1)); - Point box_a_corners[5]; - box_a_corners[0].set(a_x1, a_y1); - box_a_corners[1].set(a_x2, a_y1); - box_a_corners[2].set(a_x2, a_y2); - box_a_corners[3].set(a_x1, a_y2); - - Point box_b_corners[5]; - box_b_corners[0].set(b_x1, b_y1); - box_b_corners[1].set(b_x2, b_y1); - box_b_corners[2].set(b_x2, b_y2); - box_b_corners[3].set(b_x1, b_y2); + Point box_a_corners[5] = {{a_x1, a_y1}, + {a_x2, a_y1}, + {a_x2, a_y2}, + {a_x1, a_y2}, + {a_x1, a_y1}}; + Point box_b_corners[5] = {{b_x1, b_y1}, + {b_x2, b_y1}, + {b_x2, b_y2}, + {b_x1, b_y2}, + {b_x1, b_y1}}; // get oriented corners LocalTensor angleLocal = angleBuf.Get(); diff --git a/cmake/func.cmake b/cmake/func.cmake index e37977f3..865f7654 100644 --- a/cmake/func.cmake +++ b/cmake/func.cmake @@ -53,10 +53,10 @@ function(opbuild) endif() endif() if(NOT EXISTS ${CANN_INCLUDE_PATH}) - message(FATAL_ERROR "CANN include path not found: ${CANN_INCLUDE_PATH}") + message(FATAL_ERROR "CANN include path not found: ${CANN_PATHS}") endif() if(NOT EXISTS ${CANN_LIB_PATH}) - message(FATAL_ERROR "CANN lib path not found: ${CANN_LIB_PATH}") + message(FATAL_ERROR "CANN lib path not found: ${CANN_PATHS}") endif () execute_process( COMMAND -- Gitee From 71d621e53a64eec5809fb9fb81c557bc0a7bbeb0 Mon Sep 17 00:00:00 2001 From: chenmingkai Date: Tue, 26 Mar 2024 16:18:10 +0800 Subject: [PATCH 4/6] modify nms3d, fix SCA problem --- ads/common/ops/kernels/op_kernel/nms3d.cpp | 49 ++++++++++------------ 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/ads/common/ops/kernels/op_kernel/nms3d.cpp b/ads/common/ops/kernels/op_kernel/nms3d.cpp index 35e32fdd..b4e40e25 100644 --- a/ads/common/ops/kernels/op_kernel/nms3d.cpp +++ b/ads/common/ops/kernels/op_kernel/nms3d.cpp @@ -365,7 +365,7 @@ private: // get intersection of lines Point cross_points[16]; Point poly_center; - int cnt = 0; + int count = 0; int flag = 0; poly_center.set(0, 0); @@ -373,10 +373,10 @@ private: for (int j = 0; j < 4; j++) { flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], - cross_points[cnt]); + cross_points[count]); if (flag) { - poly_center = poly_center + cross_points[cnt]; - cnt++; + poly_center = poly_center + cross_points[count]; + count++; } } } @@ -385,40 +385,37 @@ private: for (int k = 0; k < 4; k++) { if (check_in_box2d(boxATensor, box_b_corners[k])) { poly_center = poly_center + box_b_corners[k]; - cross_points[cnt] = box_b_corners[k]; - cnt++; + cross_points[count] = box_b_corners[k]; + count++; } if (check_in_box2d(boxBTensor, box_a_corners[k])) { poly_center = poly_center + box_a_corners[k]; - cross_points[cnt] = box_a_corners[k]; - cnt++; + cross_points[count] = box_a_corners[k]; + count++; } } - if (cnt != 0) { - poly_center.x /= cnt; - poly_center.y /= cnt; + if (count != 0) { + poly_center.x /= count; + poly_center.y /= count; } - // sort the points of polygon - Point temp; - for (int j = 0; j < cnt - 1; j++) { - for (int i = 0; i < cnt - j - 1; i++) { - if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) { - temp = cross_points[i]; - cross_points[i] = cross_points[i + 1]; - cross_points[i + 1] = temp; - } + for (size_t i = 1; i < count; ++i) { + Point key = cross_points[i]; + int j = i - 1; + while (j >= 0 && point_cmp(cross_points[j], key, poly_center)) { + cross_points[j + 1] = cross_points[j]; + --j; } + cross_points[j + 1] = key; } - // get the overlap areas - float area = 0; - for (int k = 0; k < cnt - 1; k++) { - area += cross(cross_points[k] - cross_points[0], - cross_points[k + 1] - cross_points[0]); + float cross_area = 0; + for (int k = 0; k < count - 1; k++) { + cross_area += cross(cross_points[k] - cross_points[0], + cross_points[k + 1] - cross_points[0]); } - return abs(area) / static_cast(2.0); + return abs(cross_area) / static_cast(2.0); } __aicore__ inline float iou_bev(const LocalTensor &boxATensor, -- Gitee From e3b3e8a5e894de6e7fc84a1f85d44aa536740845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E7=BB=B4=E7=90=9B?= Date: Tue, 26 Mar 2024 13:09:33 +0000 Subject: [PATCH 5/6] =?UTF-8?q?!126=20Fix=20precision=20error=20&=20optimi?= =?UTF-8?q?zed=20msdagrad=20Merge=20pull=20request=20!126=20from=20?= =?UTF-8?q?=E6=9C=B1=E7=BB=B4=E7=90=9B/precision=5Ffix=5Fmsdagrad?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...tiScaleDeformableAttnFunctionKernelNpu.cpp | 8 +- ...lti_scale_deformable_attention_v2_grad.cpp | 683 ++++++++---------- 2 files changed, 304 insertions(+), 387 deletions(-) diff --git a/ads/common/ops/csrc/MultiScaleDeformableAttnFunctionKernelNpu.cpp b/ads/common/ops/csrc/MultiScaleDeformableAttnFunctionKernelNpu.cpp index 89873d75..32b51a57 100644 --- a/ads/common/ops/csrc/MultiScaleDeformableAttnFunctionKernelNpu.cpp +++ b/ads/common/ops/csrc/MultiScaleDeformableAttnFunctionKernelNpu.cpp @@ -101,7 +101,7 @@ std::tuple multi_scale_deformable_attn_grad( auto data_total = channels + num_points + num_levels; TORCH_CHECK(data_total < 512, "data_total is over 512: channels ", channels, " num_points is ", num_points, " num_level is ", num_levels, "."); - TORCH_CHECK(channels % 32 == 0, "channels must be a multiple of 32, but channels is ", channels, "."); + TORCH_CHECK(channels % 16 == 0, "channels must be a multiple of 16, but channels is ", channels, "."); TORCH_CHECK(num_points % 4 == 0, "num_points must be a multiple of 4, but num_points is ", num_points, "."); TORCH_CHECK(num_heads % 4 == 0, "num_heads must be a multiple of 4, but num_heads is ", num_heads, "."); auto grad_value_size = {value_size[0], value_size[2], value_size[1], value_size[3]}; @@ -109,9 +109,9 @@ std::tuple multi_scale_deformable_attn_grad( auto grad_sample_loc_size = {location_size[0], location_size[1], location_size[2], location_size[3], location_size[5], location_size[4]}; at::Tensor value1 = value.transpose(1, 2).contiguous(); at::Tensor location1 = location.transpose(4, 5).contiguous(); - at::Tensor result1 = at::zeros(grad_value_size, value.options().dtype(at::kFloat)); - at::Tensor result2 = at::zeros(grad_sample_loc_size, location.options().dtype(at::kFloat)); - at::Tensor result3 = at::zeros(grad_atten_weight_size, attn_weight.options().dtype(at::kFloat)); + at::Tensor result1 = at::empty(grad_value_size, value.options().dtype(at::kFloat)); + at::Tensor result2 = at::empty(grad_sample_loc_size, location.options().dtype(at::kFloat)); + at::Tensor result3 = at::empty(grad_atten_weight_size, attn_weight.options().dtype(at::kFloat)); at::Tensor value_fp = value1.to(at::kFloat); at::Tensor shape_fp = shape.to(at::kInt); diff --git a/ads/common/ops/kernels/op_kernel/multi_scale_deformable_attention_v2_grad.cpp b/ads/common/ops/kernels/op_kernel/multi_scale_deformable_attention_v2_grad.cpp index 0f084999..73043d10 100644 --- a/ads/common/ops/kernels/op_kernel/multi_scale_deformable_attention_v2_grad.cpp +++ b/ads/common/ops/kernels/op_kernel/multi_scale_deformable_attention_v2_grad.cpp @@ -36,6 +36,7 @@ public: { pipe = tmpPipe; curBlockIdx = GetBlockIdx(); + blockBytes = 32; dataAlign = blockBytes / sizeof(DTYPE_VALUE); numKeys = tiling_data->numKeys; @@ -50,32 +51,50 @@ public: taskNum = numQueries; taskNumPerCore = DivCeil(taskNum, coreNum); - embedDimsAlign = AlignUp(embedDims, dataAlign); numPointsAlign = AlignUp(numPoints, dataAlign); numLevelsAlign = AlignUp(numLevels, dataAlign); - batchOffset = numPoints * embedDimsAlign; - - curBlockIdx = GetBlockIdx(); startOffset = curBlockIdx * taskNumPerCore; endOffset = (curBlockIdx + 1) * taskNumPerCore; if (endOffset > taskNum) { endOffset = taskNum; } + // offsets + gradOutStride0 = embedDims; + gradOutStride1 = numHeads * gradOutStride0; + gradOutStride2 = numQueries * gradOutStride1; + weightStride0 = numLevels * numPoints; + weightStride1 = numHeads * weightStride0; + weightStride2 = numQueries * weightStride1; + valueStride0 = embedDims; + valueStride1 = numKeys * valueStride0; + valueStride2 = numHeads * valueStride1; + + hOffsetUb = numPointsAlign; + baseOffsetUb = numPoints * embedDims; + + eventIdMte2ToV = static_cast(pipe->AllocEventID()); + eventIdMte3ToV = static_cast(pipe->AllocEventID()); + eventIdVToMte2 = static_cast(pipe->AllocEventID()); + eventIdVToMte3 = static_cast(pipe->AllocEventID()); + eventIdVToMteWeight = static_cast(pipe->AllocEventID()); + eventIdVToMte3X = static_cast(pipe->AllocEventID()); + eventIdVToMte3Y = static_cast(pipe->AllocEventID()); + + copyParams = {1, (uint16_t)(numPoints * sizeof(DTYPE_VALUE)), 0, 0}; + sumParams = {numPoints, embedDims, embedDims}; + valueGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_VALUE *>(value_gm), batchSize * numKeys * numHeads * embedDims); - valueSpatialShapesGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_SPATIAL_SHAPES *>(spatial_shapes_gm), numLevels * 2); valueLevelStartIndexGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_SPATIAL_SHAPES *>(level_start_index_gm), numLevels); - locationGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_VALUE *>(sampling_loc_gm), batchSize * numQueries * numHeads * numLevels * numPoints * 2); attentionWeightsGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_VALUE *>(attn_weight_gm), batchSize * numQueries * numHeads * numLevels * numPoints); - gradOutputGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_VALUE *>(grad_output_gm), batchSize * numQueries * numHeads * embedDims); @@ -85,94 +104,105 @@ public: batchSize * numQueries * numHeads * numLevels * 2 * numPoints); gradWeightGm.SetGlobalBuffer(reinterpret_cast<__gm__ DTYPE_VALUE *>(grad_attn_weight_gm), batchSize * numQueries * numHeads * numLevels * numPoints); + } - pipe->InitBuffer(shapeQueue, BUFFER_NUM, AlignUp(numLevels * 2, dataAlign) * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(offsetQueue, BUFFER_NUM, numLevelsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(locationQueue, BUFFER_NUM, - AlignUp(numHeads * numLevels * numPoints * 2, dataAlign) * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(attentionWeightsUb, BUFFER_NUM, - AlignUp(numHeads * numLevels * numPoints, dataAlign) * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(gradQueue, BUFFER_NUM, embedDimsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(gradValueQueue, BUFFER_NUM, - AlignUp(numHeads * numLevels * numPoints * 2, dataAlign) * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(gradLocationQueue, BUFFER_NUM, - AlignUp(numHeads * numLevels * numPoints * 2, dataAlign) * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(gradWeightQueue, BUFFER_NUM, - AlignUp(numHeads * numLevels * numPoints, dataAlign) * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(floatOneUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(topGradUb, BUFFER_NUM, embedDimsAlign * sizeof(DTYPE_VALUE)); - - + __aicore__ inline void InitBuffer() + { + pipe->InitBuffer(shapeUb, BUFFER_NUM, 2 * numLevelsAlign * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(offsetUb, BUFFER_NUM, numLevelsAlign * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(locationUb, BUFFER_NUM, numHeads * numLevels * numPointsAlign * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(attentionWeightsUb, BUFFER_NUM, numHeads * numLevels * numPointsAlign * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(topGradUb, BUFFER_NUM, embedDims * sizeof(DTYPE_VALUE)); + + pipe->InitBuffer(floatOneUb, BUFFER_NUM, 2 * numPointsAlign * sizeof(DTYPE_VALUE)); pipe->InitBuffer(tmpXUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); pipe->InitBuffer(tmpYUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); pipe->InitBuffer(weightSumUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(weightQueue, BUFFER_NUM, 4 * numPointsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(valueUb, BUFFER_NUM, batchOffset * 4 * sizeof(DTYPE_VALUE)); pipe->InitBuffer(locWUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); pipe->InitBuffer(locHUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(hImUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(wImUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(hLowUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_SPATIAL_SHAPES)); - pipe->InitBuffer(wLowUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_SPATIAL_SHAPES)); - pipe->InitBuffer(hHighUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_SPATIAL_SHAPES)); - pipe->InitBuffer(wHighUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_SPATIAL_SHAPES)); - - pipe->InitBuffer(hLowFloatUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(wLowFloatUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(hHighFloatUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(wHighFloatUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(hHighPtrOffsetUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(hLowPtrOffsetUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(wHighPtrOffsetUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(wLowPtrOffsetUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(w1Ub, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(w2Ub, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(w3Ub, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(w4Ub, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(v1Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(v2Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(v3Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(v4Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(lwUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(lhUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(hwUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(hhUb, BUFFER_NUM, numPointsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(gradHWeightUb, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(gradWWeightUb, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(topGradValueUb, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(gradWeightUb, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(tmpUb, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp1Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp2Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp3Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp4Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp5Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp6Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp7Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp8Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp9Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmp10Ub, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(tmpAUb, BUFFER_NUM, embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(tmpBUb, BUFFER_NUM, embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(midUb, BUFFER_NUM, 4 * numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - - pipe->InitBuffer(gradSampleXLocUb, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); - pipe->InitBuffer(gradSampleYLocUb, BUFFER_NUM, numPoints * embedDimsAlign * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(imUb, BUFFER_NUM, 2 * numPointsAlign * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(lowUb, BUFFER_NUM, 2 * numPointsAlign * sizeof(DTYPE_SPATIAL_SHAPES)); + pipe->InitBuffer(lowFloatUb, BUFFER_NUM, 2 * numPointsAlign * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(distLowUb, BUFFER_NUM, 2 * numPointsAlign * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(distHighUb, BUFFER_NUM, 2 * numPointsAlign * sizeof(DTYPE_VALUE)); + + pipe->InitBuffer(zerosUb, BUFFER_NUM, 8 * numPoints * embedDims * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(w1v1Ub, BUFFER_NUM, numPoints * embedDims * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(w2v2Ub, BUFFER_NUM, numPoints * embedDims * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(w3v3Ub, BUFFER_NUM, numPoints * embedDims * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(w4v4Ub, BUFFER_NUM, numPoints * embedDims * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(tmpUb, BUFFER_NUM, numPoints * embedDims * sizeof(DTYPE_VALUE)); + + pipe->InitBuffer(tmpAUb, BUFFER_NUM, embedDims * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(tmpBUb, BUFFER_NUM, embedDims * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(midUb, BUFFER_NUM, 4 * numPoints * embedDims * sizeof(DTYPE_VALUE)); + + pipe->InitBuffer(gradSampleXLocUb, BUFFER_NUM, numPoints * embedDims * sizeof(DTYPE_VALUE)); + pipe->InitBuffer(gradSampleYLocUb, BUFFER_NUM, numPoints * embedDims * sizeof(DTYPE_VALUE)); } + + __aicore__ inline void GetLocalTensor() + { + locationLocal = locationUb.Get(); + attentionWeightLocal = attentionWeightsUb.Get(); + shapesLocal = shapeUb.Get(); + offsetLocal = offsetUb.Get(); + xLocal = tmpXUb.Get(); + yLocal = tmpYUb.Get(); + weightSumLocal = weightSumUb.Get(); + floatOneLocal = floatOneUb.Get(); + topGradLocal = topGradUb.Get(); + locWLocal = locWUb.Get(); + locHLocal = locHUb.Get(); + + imLocal = imUb.Get(); + lowLocal = lowUb.Get(); + lowFloatLocal = lowFloatUb.Get(); + zerosLocal = zerosUb.Get(); + + distLowLocal = distLowUb.Get(); + distHighLocal = distHighUb.Get(); + + w1v1Local = w1v1Ub.Get(); + w2v2Local = w2v2Ub.Get(); + w3v3Local = w3v3Ub.Get(); + w4v4Local = w4v4Ub.Get(); + tmpLocal = tmpUb.Get(); + + tmpALocal = tmpAUb.Get(); + tmpBLocal = tmpBUb.Get(); + midLocal = midUb.Get(); + + gradSampleXLocLocal = gradSampleXLocUb.Get(); + gradSampleYLocLocal = gradSampleYLocUb.Get(); + } + + __aicore__ inline void ClearOutput() + { + switch (curBlockIdx) { + case 0: + InitOutput(gradValueGm, batchSize * numKeys * numHeads * embedDims, 0); + break; + case 1: + InitOutput(gradLocationGm, 2 * batchSize * numQueries * numHeads * numLevels * numPoints); + break; + case 2: + InitOutput(gradWeightGm, batchSize * numQueries * numHeads * numLevels * numPoints); + break; + default: + break; + } + if ASCEND_IS_AIV { + SyncAll(); + } + } + __aicore__ inline void Process() { + DataCopy(shapesLocal, valueSpatialShapesGm, 2 * numLevelsAlign); + DataCopy(offsetLocal, valueLevelStartIndexGm, numLevelsAlign); + Duplicate(floatOneLocal, (DTYPE_VALUE)1, 2 * numPointsAlign); for (uint32_t taskIdx = startOffset; taskIdx < endOffset; taskIdx++) { SetAtomicAdd(); Compute(taskIdx); @@ -180,290 +210,171 @@ public: } } -private: - __aicore__ inline void Compute(uint32_t query) + __aicore__ inline void ReleaseEventID() { - LocalTensor locationLocal = locationQueue.Get(); - LocalTensor attentionWeightLocal = attentionWeightsUb.Get(); + pipe->ReleaseEventID(eventIdMte2ToV); + pipe->ReleaseEventID(eventIdMte3ToV); + pipe->ReleaseEventID(eventIdVToMte2); + pipe->ReleaseEventID(eventIdVToMte3); + pipe->ReleaseEventID(eventIdVToMteWeight); + pipe->ReleaseEventID(eventIdVToMte3X); + pipe->ReleaseEventID(eventIdVToMte3Y); + } - LocalTensor shapesLocal = shapeQueue.Get(); - LocalTensor offsetLocal = offsetQueue.Get(); +private: + template + __aicore__ inline void ComputeGrad(uint32_t midId, uint32_t vId, DTYPE_VALUE distH, DTYPE_VALUE distW, + uint32_t hPtrOffset, uint32_t wPtrOffset, DTYPE_VALUE w) + { + uint32_t offsetMid = (point + midId * numPoints) * embedDims; + uint32_t offsetV = vId * baseOffsetUb; + uint32_t offsetGradHWeight = pointOffset + gradHWeightId * baseOffsetUb; + uint32_t offsetGradWWeight = pointOffset + gradWWeightId * baseOffsetUb; + uint32_t ptr = hPtrOffset + wPtrOffset; + DataCopy(zerosLocal[pointOffset + offsetV], valueGm[offsetValue + ptr], embedDims); + SetFlag(eventIdMte2ToV); + + Muls(midLocal[offsetMid], zerosLocal[pointOffset + topGradValueId * baseOffsetUb], w, embedDims); + SetFlag(eventIdVToMte3); + + WaitFlag(eventIdMte2ToV); + Muls(tmpALocal, zerosLocal[pointOffset + offsetV], distW, embedDims); + Muls(tmpBLocal, zerosLocal[pointOffset + offsetV], distH, embedDims); + if (AddH) { + Add(zerosLocal[offsetGradHWeight], zerosLocal[offsetGradHWeight], tmpALocal, embedDims); + } else { + Sub(zerosLocal[offsetGradHWeight], zerosLocal[offsetGradHWeight], tmpALocal, embedDims); + } + if (AddW) { + Add(zerosLocal[offsetGradWWeight], zerosLocal[offsetGradWWeight], tmpBLocal, embedDims); + } else { + Sub(zerosLocal[offsetGradWWeight], zerosLocal[offsetGradWWeight], tmpBLocal, embedDims); + } - DataCopy(shapesLocal, valueSpatialShapesGm, AlignUp(numLevels * 2, dataAlign)); - DataCopy(offsetLocal, valueLevelStartIndexGm, numLevelsAlign); + WaitFlag(eventIdVToMte3); + DataCopy(gradValueGm[offsetValue + ptr], midLocal[offsetMid], embedDims); + } - DataCopyParams copyParamsA{1, (uint16_t)(embedDims * sizeof(DTYPE_VALUE)), 0, 0}; - DataCopyParams copyParamsB{1, (uint16_t)(numPoints * sizeof(DTYPE_VALUE)), 0, 0}; - - LocalTensor valueLocal = valueUb.Get(); - - event_t eventIdVToMte3 = static_cast(GetTPipePtr()->AllocEventID()); - event_t eventIdMte2ToV = static_cast(GetTPipePtr()->AllocEventID()); - event_t eventIdMte3ToV = static_cast(GetTPipePtr()->AllocEventID()); - - for (uint32_t batch = 0; batch < batchSize; batch++) { - LocalTensor weightLocal = weightQueue.Get(); - LocalTensor xLocal = tmpXUb.Get(); - LocalTensor yLocal = tmpYUb.Get(); - LocalTensor weightSumLocal = weightSumUb.Get(); - LocalTensor floatOneLocal = floatOneUb.Get(); - LocalTensor topGradLocal = topGradUb.Get(); - LocalTensor lwLocal = lwUb.Get(); - LocalTensor lhLocal = lhUb.Get(); - LocalTensor locWLocal = locWUb.Get(); - LocalTensor locHLocal = locHUb.Get(); - - LocalTensor hImLocal = hImUb.Get(); - LocalTensor wImLocal = wImUb.Get(); - LocalTensor hLowLocal = hLowUb.Get(); - LocalTensor wLowLocal = wLowUb.Get(); - LocalTensor hHighLocal = hHighUb.Get(); - LocalTensor wHighLocal = wHighUb.Get(); - - LocalTensor hLowFloatLocal = hLowFloatUb.Get(); - LocalTensor wLowFloatLocal = wLowFloatUb.Get(); - - LocalTensor hHighPtrOffsetLocal = hHighPtrOffsetUb.Get(); - LocalTensor hLowPtrOffsetLocal = hLowPtrOffsetUb.Get(); - LocalTensor wHighPtrOffsetLocal = wHighPtrOffsetUb.Get(); - LocalTensor wLowPtrOffsetLocal = wLowPtrOffsetUb.Get(); - LocalTensor w1Local = w1Ub.Get(); - LocalTensor w2Local = w2Ub.Get(); - LocalTensor w3Local = w3Ub.Get(); - LocalTensor w4Local = w4Ub.Get(); - - LocalTensor v1Local = v1Ub.Get(); - LocalTensor v2Local = v2Ub.Get(); - LocalTensor v3Local = v3Ub.Get(); - LocalTensor v4Local = v4Ub.Get(); - - LocalTensor hwLocal = hwUb.Get(); - LocalTensor hhLocal = hhUb.Get(); - - LocalTensor gradHWeightLocal = gradHWeightUb.Get(); - LocalTensor gradWWeightLocal = gradWWeightUb.Get(); - LocalTensor topGradValueLocal = topGradValueUb.Get(); - LocalTensor gradWeightLocal = gradWeightUb.Get(); - - LocalTensor tmpLocal = tmpUb.Get(); - LocalTensor tmp1Local = tmp1Ub.Get(); - LocalTensor tmp2Local = tmp2Ub.Get(); - LocalTensor tmp3Local = tmp3Ub.Get(); - LocalTensor tmp4Local = tmp4Ub.Get(); - LocalTensor tmp5Local = tmp5Ub.Get(); - LocalTensor tmp6Local = tmp6Ub.Get(); - LocalTensor tmp7Local = tmp7Ub.Get(); - LocalTensor tmp8Local = tmp8Ub.Get(); - LocalTensor tmp9Local = tmp9Ub.Get(); - LocalTensor tmp10Local = tmp10Ub.Get(); - - LocalTensor tmpALocal = tmpAUb.Get(); - LocalTensor tmpBLocal = tmpBUb.Get(); - LocalTensor midLocal = midUb.Get(); - - LocalTensor gradSampleXLocLocal = gradSampleXLocUb.Get(); - LocalTensor gradSampleYLocLocal = gradSampleYLocUb.Get(); - - Duplicate(floatOneLocal, (DTYPE_VALUE)1, numPointsAlign); - for (uint32_t head = 0; head < numHeads; head++) { - offsetWeight = (batch * numQueries * numHeads + query * numHeads + head) * numLevels * numPoints; + __aicore__ inline void Compute(uint32_t query) + { + for (batch = 0; batch < batchSize; batch++) { + for (head = 0; head < numHeads; head++) { + offsetWeight = batch * weightStride2 + query * weightStride1 + head * weightStride0; offsetLocation = 2 * offsetWeight; - DataCopy(topGradLocal, gradOutputGm[batch * numQueries * numHeads * embedDims + query * numHeads * embedDims + head * embedDims], - embedDimsAlign); - for (uint32_t level = 0; level < numLevels; level++) { + DataCopy(topGradLocal, + gradOutputGm[batch * gradOutStride2 + query * gradOutStride1 + head * gradOutStride0], + embedDims); + for (level = 0; level < numLevels; level++) { levelStartId = offsetLocal.GetValue(level); h = shapesLocal.GetValue(level * 2); w = shapesLocal.GetValue(level * 2 + 1); - offsetValue = batch * numHeads * numKeys * embedDims + head * numKeys * embedDims + levelStartId * embedDims; + offsetValue = batch * valueStride2 + head * valueStride1 + levelStartId * valueStride0; + wStride = embedDims; + hStride = w * wStride; DataCopy(locWLocal, locationGm[offsetLocation + level * numPoints * 2], numPointsAlign); DataCopy(locHLocal, locationGm[offsetLocation + level * numPoints * 2 + numPoints], numPointsAlign); + SetFlag(eventIdMte2ToV); + WaitFlag(eventIdMte2ToV); DataCopy(attentionWeightLocal, attentionWeightsGm[offsetWeight + level * numPoints], numPointsAlign); + Muls(imLocal[hOffsetUb], locHLocal, (DTYPE_VALUE)h, numPointsAlign); + Muls(imLocal, locWLocal, (DTYPE_VALUE)w, numPointsAlign); + Adds(imLocal, imLocal, DTYPE_VALUE(-0.5), 2 * numPointsAlign); + Cast(lowLocal, imLocal, RoundMode::CAST_FLOOR, 2 * numPointsAlign); + Cast(lowFloatLocal, lowLocal, RoundMode::CAST_NONE, 2 * numPointsAlign); + + Sub(distLowLocal, imLocal, lowFloatLocal, 2 * numPointsAlign); + Sub(distHighLocal, floatOneLocal, distLowLocal, 2 * numPointsAlign); + + Duplicate(zerosLocal, (DTYPE_VALUE)0, 8 * numPoints * embedDims); + SetFlag(eventIdMte2ToV); WaitFlag(eventIdMte2ToV); - Muls(hImLocal, locHLocal, (DTYPE_VALUE)h, numPointsAlign); - Muls(wImLocal, locWLocal, (DTYPE_VALUE)w, numPointsAlign); - Adds(hImLocal, hImLocal, DTYPE_VALUE(-0.5), numPointsAlign); - Adds(wImLocal, wImLocal, DTYPE_VALUE(-0.5), numPointsAlign); - Cast(hLowLocal, hImLocal, RoundMode::CAST_FLOOR, numPointsAlign); - Cast(wLowLocal, wImLocal, RoundMode::CAST_FLOOR, numPointsAlign); - Adds(hHighLocal, hLowLocal, (DTYPE_SPATIAL_SHAPES)1, numPointsAlign); - Adds(wHighLocal, wLowLocal, (DTYPE_SPATIAL_SHAPES)1, numPointsAlign); - - Cast(wLowFloatLocal, wLowLocal, RoundMode::CAST_NONE, numPointsAlign); - Cast(hLowFloatLocal, hLowLocal, RoundMode::CAST_NONE, numPointsAlign); - - Sub(lhLocal, hImLocal, hLowFloatLocal, numPointsAlign); - Sub(lwLocal, wImLocal, wLowFloatLocal, numPointsAlign); - - Sub(hhLocal, floatOneLocal, lhLocal, numPointsAlign); - Sub(hwLocal, floatOneLocal, lwLocal, numPointsAlign); - wStride = embedDims; - hStride = w * wStride; - Muls(hLowPtrOffsetLocal, hLowLocal, hStride, numPointsAlign); - Adds(hHighPtrOffsetLocal, hLowPtrOffsetLocal, hStride, numPointsAlign); - Muls(wLowPtrOffsetLocal, wLowLocal, wStride, numPointsAlign); - Adds(wHighPtrOffsetLocal, wLowPtrOffsetLocal, wStride, numPointsAlign); - basePtr = head * embedDims; - - Mul(w1Local, hhLocal, hwLocal, numPointsAlign); - Mul(w2Local, hhLocal, lwLocal, numPointsAlign); - Mul(w3Local, lhLocal, hwLocal, numPointsAlign); - Mul(w4Local, lhLocal, lwLocal, numPointsAlign); - - Duplicate(gradHWeightLocal, (DTYPE_VALUE)0, numPoints * embedDimsAlign); - Duplicate(gradWWeightLocal, (DTYPE_VALUE)0, numPoints * embedDimsAlign); - Duplicate(topGradValueLocal, (DTYPE_VALUE)0, numPoints * embedDimsAlign); - Duplicate(gradWeightLocal, (DTYPE_VALUE)0, numPoints * embedDimsAlign); - - Duplicate(v1Local, (DTYPE_VALUE)0, numPoints * embedDimsAlign); - Duplicate(v2Local, (DTYPE_VALUE)0, numPoints * embedDimsAlign); - Duplicate(v3Local, (DTYPE_VALUE)0, numPoints * embedDimsAlign); - Duplicate(v4Local, (DTYPE_VALUE)0, numPoints * embedDimsAlign); - - for (uint32_t point = 0; point < numPoints; point++) { - if (hImLocal.GetValue(point) > -1 && wImLocal.GetValue(point) > -1 && - hImLocal.GetValue(point) < h && wImLocal.GetValue(point) < w) { - Muls(topGradValueLocal[point * embedDimsAlign], topGradLocal, - attentionWeightLocal.GetValue(point), embedDimsAlign); - if (hLowLocal.GetValue(point) >= 0) { - if (wLowLocal.GetValue(point) >= 0) { - ptr = hLowPtrOffsetLocal.GetValue(point) + wLowPtrOffsetLocal.GetValue(point); - DataCopy(v1Local[point * embedDimsAlign], valueGm[offsetValue + ptr], - embedDimsAlign); - SetFlag(eventIdMte2ToV); - - Muls(midLocal[point * embedDimsAlign], topGradValueLocal[point * embedDimsAlign], - w1Local.GetValue(point), embedDims); - SetFlag(eventIdVToMte3); - - WaitFlag(eventIdMte2ToV); - Muls(tmpALocal, v1Local[point * embedDimsAlign], hwLocal.GetValue(point), - embedDims); - Muls(tmpBLocal, v1Local[point * embedDimsAlign], hhLocal.GetValue(point), - embedDims); - Sub(gradHWeightLocal[point * embedDimsAlign], - gradHWeightLocal[point * embedDimsAlign], tmpALocal, embedDims); - Sub(gradWWeightLocal[point * embedDimsAlign], - gradWWeightLocal[point * embedDimsAlign], tmpBLocal, embedDims); - WaitFlag(eventIdVToMte3); - DataCopyPad(gradValueGm[offsetValue + ptr], midLocal[point * embedDimsAlign], - copyParamsA); + + for (point = 0; point < numPoints; point++) { + pointOffset = point * embedDims; + hIm = imLocal.GetValue(hOffsetUb + point); + wIm = imLocal.GetValue(point); + if (hIm > -1 && wIm > -1 && hIm < h && wIm < w) { + hLow = lowLocal.GetValue(hOffsetUb + point); + wLow = lowLocal.GetValue(point); + hLowPtrOffset = hLow * hStride; + wLowPtrOffset = wLow * wStride; + Muls(zerosLocal[pointOffset + topGradValueId * baseOffsetUb], topGradLocal, + attentionWeightLocal.GetValue(point), embedDims); + if (hLow >= 0) { + if (wLow >= 0) { + DTYPE_VALUE distH = distHighLocal.GetValue(hOffsetUb + point); + DTYPE_VALUE distW = distHighLocal.GetValue(point); + w1 = distH * distW; + ComputeGrad(mid1Id, v1Id, distH, distW, hLowPtrOffset, wLowPtrOffset, + w1); } - if (wHighLocal.GetValue(point) < w) { - ptr = hLowPtrOffsetLocal.GetValue(point) + wHighPtrOffsetLocal.GetValue(point); - DataCopy(v2Local[point * embedDimsAlign], valueGm[offsetValue + ptr], - embedDimsAlign); - SetFlag(eventIdMte2ToV); - - Muls(midLocal[point * embedDimsAlign + numPoints * embedDimsAlign], - topGradValueLocal[point * embedDimsAlign], w2Local.GetValue(point), embedDims); - SetFlag(eventIdVToMte3); - - WaitFlag(eventIdMte2ToV); - Muls(tmpALocal, v2Local[point * embedDimsAlign], lwLocal.GetValue(point), - embedDims); - Muls(tmpBLocal, v2Local[point * embedDimsAlign], hhLocal.GetValue(point), - embedDims); - Sub(gradHWeightLocal[point * embedDimsAlign], - gradHWeightLocal[point * embedDimsAlign], tmpALocal, embedDims); - Add(gradWWeightLocal[point * embedDimsAlign], - gradWWeightLocal[point * embedDimsAlign], tmpBLocal, embedDims); - WaitFlag(eventIdVToMte3); - DataCopyPad(gradValueGm[offsetValue + ptr], - midLocal[point * embedDimsAlign + numPoints * embedDimsAlign], - copyParamsA); + if (wLow < w - 1) { + DTYPE_VALUE distH = distHighLocal.GetValue(hOffsetUb + point); + DTYPE_VALUE distW = distLowLocal.GetValue(point); + w2 = distH * distW; + ComputeGrad(mid2Id, v2Id, distH, distW, hLowPtrOffset, wLowPtrOffset + wStride, + w2); } } - if (hHighLocal.GetValue(point) < h) { - if (wLowLocal.GetValue(point) >= 0) { - ptr = hHighPtrOffsetLocal.GetValue(point) + wLowPtrOffsetLocal.GetValue(point); - DataCopy(v3Local[point * embedDimsAlign], valueGm[offsetValue + ptr], - embedDimsAlign); - SetFlag(eventIdMte2ToV); - - Muls(midLocal[point * embedDimsAlign + numPoints * embedDimsAlign * 2], - topGradValueLocal[point * embedDimsAlign], w3Local.GetValue(point), embedDims); - SetFlag(eventIdVToMte3); - - WaitFlag(eventIdMte2ToV); - Muls(tmpALocal, v3Local[point * embedDimsAlign], hwLocal.GetValue(point), - embedDims); - Muls(tmpBLocal, v3Local[point * embedDimsAlign], lhLocal.GetValue(point), - embedDims); - Add(gradHWeightLocal[point * embedDimsAlign], - gradHWeightLocal[point * embedDimsAlign], tmpALocal, embedDims); - Sub(gradWWeightLocal[point * embedDimsAlign], - gradWWeightLocal[point * embedDimsAlign], tmpBLocal, embedDims); - WaitFlag(eventIdVToMte3); - DataCopyPad(gradValueGm[offsetValue + ptr], - midLocal[point * embedDimsAlign + numPoints * embedDimsAlign * 2], - copyParamsA); + if (hLow < h - 1) { + if (wLow >= 0) { + DTYPE_VALUE distH = distLowLocal.GetValue(hOffsetUb + point); + DTYPE_VALUE distW = distHighLocal.GetValue(point); + w3 = distH * distW; + ComputeGrad(mid3Id, v3Id, distH, distW, hLowPtrOffset + hStride, wLowPtrOffset, + w3); } - if (wHighLocal.GetValue(point) < w) { - ptr = hHighPtrOffsetLocal.GetValue(point) + wHighPtrOffsetLocal.GetValue(point); - DataCopy(v4Local[point * embedDimsAlign], valueGm[offsetValue + ptr], - embedDimsAlign); - SetFlag(eventIdMte2ToV); - Muls(midLocal[point * embedDimsAlign + numPoints * embedDimsAlign * 3], - topGradValueLocal[point * embedDimsAlign], w4Local.GetValue(point), embedDims); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdMte2ToV); - Muls(tmpALocal, v4Local[point * embedDimsAlign], lwLocal.GetValue(point), - embedDims); - Muls(tmpBLocal, v4Local[point * embedDimsAlign], lhLocal.GetValue(point), - embedDims); - Add(gradHWeightLocal[point * embedDimsAlign], - gradHWeightLocal[point * embedDimsAlign], tmpALocal, embedDims); - Add(gradWWeightLocal[point * embedDimsAlign], - gradWWeightLocal[point * embedDimsAlign], tmpBLocal, embedDims); - WaitFlag(eventIdVToMte3); - DataCopyPad(gradValueGm[offsetValue + ptr], - midLocal[point * embedDimsAlign + numPoints * embedDimsAlign * 3], - copyParamsA); + if (wLow < w - 1) { + DTYPE_VALUE distH = distLowLocal.GetValue(hOffsetUb + point); + DTYPE_VALUE distW = distLowLocal.GetValue(point); + w4 = distH * distW; + ComputeGrad(mid4Id, v4Id, distH, distW, hLowPtrOffset + hStride, wLowPtrOffset + wStride, + w4); } } - Muls(tmp1Local[point * embedDimsAlign], v1Local[point * embedDimsAlign], - w1Local.GetValue(point), embedDimsAlign); - Muls(tmp2Local[point * embedDimsAlign], v2Local[point * embedDimsAlign], - w2Local.GetValue(point), embedDimsAlign); - Muls(tmp3Local[point * embedDimsAlign], v3Local[point * embedDimsAlign], - w3Local.GetValue(point), embedDimsAlign); - Muls(tmp4Local[point * embedDimsAlign], v4Local[point * embedDimsAlign], - w4Local.GetValue(point), embedDimsAlign); - Add(tmp5Local[point * embedDimsAlign], tmp1Local[point * embedDimsAlign], - tmp2Local[point * embedDimsAlign], embedDimsAlign); - Add(tmp6Local[point * embedDimsAlign], tmp3Local[point * embedDimsAlign], - tmp4Local[point * embedDimsAlign], embedDimsAlign); - Add(tmp7Local[point * embedDimsAlign], tmp5Local[point * embedDimsAlign], - tmp6Local[point * embedDimsAlign], embedDimsAlign); - Mul(gradWeightLocal[point * embedDimsAlign], topGradLocal, - tmp7Local[point * embedDimsAlign], embedDimsAlign); + Muls(w1v1Local[pointOffset], zerosLocal[pointOffset + v1Id * baseOffsetUb], + w1, embedDims); + Muls(w2v2Local[pointOffset], zerosLocal[pointOffset + v2Id * baseOffsetUb], + w2, embedDims); + Muls(w3v3Local[pointOffset], zerosLocal[pointOffset + v3Id * baseOffsetUb], + w3, embedDims); + Muls(w4v4Local[pointOffset], zerosLocal[pointOffset + v4Id * baseOffsetUb], + w4, embedDims); + Add(w1v1Local[pointOffset], w1v1Local[pointOffset], w2v2Local[pointOffset], embedDims); + Add(w1v1Local[pointOffset], w1v1Local[pointOffset], w3v3Local[pointOffset], embedDims); + Add(w1v1Local[pointOffset], w1v1Local[pointOffset], w4v4Local[pointOffset], embedDims); + Mul(zerosLocal[pointOffset + gradWeightId * baseOffsetUb], topGradLocal, + w1v1Local[pointOffset], embedDims); } } SetFlag(eventIdMte3ToV); - Mul(tmp9Local, topGradValueLocal, gradWWeightLocal, numPoints * embedDimsAlign); - Muls(gradSampleXLocLocal, tmp9Local, (DTYPE_VALUE)w, numPoints * embedDimsAlign); - Mul(tmp10Local, topGradValueLocal, gradHWeightLocal, numPoints * embedDimsAlign); - Muls(gradSampleYLocLocal, tmp10Local, (DTYPE_VALUE)h, numPoints * embedDimsAlign); - SumParams sumParams{numPoints, embedDimsAlign, embedDims}; + Mul(tmpLocal, zerosLocal[topGradValueId * baseOffsetUb], zerosLocal[gradWWeightId * baseOffsetUb], + numPoints * embedDims); + Muls(gradSampleXLocLocal, tmpLocal, (DTYPE_VALUE)w, numPoints * embedDims); + Mul(tmpLocal, zerosLocal[topGradValueId * baseOffsetUb], zerosLocal[gradHWeightId * baseOffsetUb], + numPoints * embedDims); + Muls(gradSampleYLocLocal, tmpLocal, (DTYPE_VALUE)h, numPoints * embedDims); + Sum(weightSumLocal, zerosLocal[gradWeightId * baseOffsetUb], sumParams); + SetFlag(eventIdVToMteWeight); Sum(xLocal, gradSampleXLocLocal, sumParams); + SetFlag(eventIdVToMte3X); Sum(yLocal, gradSampleYLocLocal, sumParams); - Sum(weightSumLocal, gradWeightLocal, sumParams); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdVToMte3); - DataCopyPad(gradWeightGm[offsetWeight + level * numPoints], weightSumLocal, copyParamsB); - DataCopyPad(gradLocationGm[offsetLocation + level * 2 * numPoints], xLocal, copyParamsB); - DataCopyPad(gradLocationGm[offsetLocation + level * 2 * numPoints + numPoints], yLocal, - copyParamsB); + SetFlag(eventIdVToMte3Y); + + WaitFlag(eventIdVToMteWeight); + DataCopyPad(gradWeightGm[offsetWeight + level * numPoints], weightSumLocal, copyParams); + WaitFlag(eventIdVToMte3X); + DataCopyPad(gradLocationGm[offsetLocation + level * 2 * numPoints], xLocal, copyParams); + WaitFlag(eventIdVToMte3Y); + DataCopyPad(gradLocationGm[offsetLocation + level * 2 * numPoints + numPoints], yLocal, copyParams); WaitFlag(eventIdMte3ToV); - pipe_barrier(PIPE_ALL); + SetFlag(eventIdVToMte2); + WaitFlag(eventIdVToMte2); } } } - GetTPipePtr()->ReleaseEventID(eventIdVToMte3); - GetTPipePtr()->ReleaseEventID(eventIdMte2ToV); - GetTPipePtr()->ReleaseEventID(eventIdMte3ToV); } private: @@ -472,52 +383,55 @@ private: gradWeightGm; GlobalTensor valueSpatialShapesGm, valueLevelStartIndexGm; - TBuf locationQueue, attentionWeightsUb, shapeQueue, offsetQueue, gradQueue; - TBuf gradValueQueue, gradLocationQueue, gradWeightQueue; - + TBuf locationUb, attentionWeightsUb, shapeUb, offsetUb, topGradUb; TBuf tmpXUb, tmpYUb, weightSumUb; - TBuf intOneUb, floatOneUb, weightQueue, emptyUb, topGradUb; - TBuf valueUb, locWUb, locHUb, hImUb, wImUb, hLowUb, wLowUb, hHighUb, wHighUb, hLowFloatUb, - wLowFloatUb, hHighFloatUb, wHighFloatUb, hHighPtrOffsetUb, hLowPtrOffsetUb, wHighPtrOffsetUb, wLowPtrOffsetUb; - - TBuf lwUb, lhUb, hwUb, hhUb, w1Ub, w2Ub, w3Ub, w4Ub, v1Ub, v2Ub, v3Ub, v4Ub; - - TBuf tmpUb, tmp1Ub, tmp2Ub, tmp3Ub, tmp4Ub, tmp5Ub, tmp6Ub, tmp7Ub, tmp8Ub, tmp9Ub, tmp10Ub, - tmpAUb, tmpBUb, midUb; - TBuf gradHWeightUb, gradWWeightUb, topGradValueUb, gradWeightUb, gradSampleXLocUb, - gradSampleYLocUb; - - uint32_t batchSize; - uint32_t numKeys; - uint32_t numHeads; - uint32_t embedDims; + TBuf floatOneUb, zerosUb; + TBuf locWUb, locHUb, imUb, lowUb, lowFloatUb; + TBuf distLowUb, distHighUb, w1Ub, w2Ub, w3Ub, w4Ub; + TBuf w1v1Ub, w2v2Ub, w3v3Ub, w4v4Ub, tmpUb, tmpAUb, tmpBUb, midUb; + TBuf gradSampleXLocUb, gradSampleYLocUb; - uint32_t numLevels; - uint32_t numQueries; - uint32_t numPoints; uint32_t coreNum; - - uint32_t embedDimsAlign; - uint32_t numPointsAlign; - uint32_t numLevelsAlign; - - uint32_t batch; - uint32_t query; - uint32_t head; - - uint32_t taskNum; - uint32_t taskNumPerCore; + uint32_t batchSize, numKeys, numHeads, embedDims, numLevels, numQueries, numPoints; + uint32_t numPointsAlign, numLevelsAlign; + uint32_t batch, query, head, level, point; uint32_t curBlockIdx; - uint32_t startOffset; - uint32_t endOffset; - uint32_t dataAlign; - uint32_t blockBytes = 32; - - DTYPE_VALUE tmp1, tmp2, leftTopWeight, rightTopWeiight, leftBottomWeight, rightBottomWeight, attnWeight; - DTYPE_SPATIAL_SHAPES h, w, x0, y0, x1, y1, valueOffset, weightOffset, locationOffset, batchOffset, levelStartId, - offsetValue; - - DTYPE_SPATIAL_SHAPES offsetWeight, offsetLocation, wStride, hStride, basePtr, ptr; + uint32_t taskNum, taskNumPerCore; + uint32_t startOffset, endOffset; + uint32_t dataAlign, blockBytes; + uint32_t gradOutStride0, gradOutStride1, gradOutStride2; + uint32_t weightStride0, weightStride1, weightStride2; + uint32_t valueStride0, valueStride1, valueStride2; + uint32_t hOffsetUb, baseOffsetUb, pointOffset; + uint32_t mid1Id = 0, mid2Id = 1, mid3Id = 2, mid4Id = 3; + uint32_t gradHWeightId = 0, gradWWeightId = 1, topGradValueId = 2, gradWeightId = 3; + uint32_t v1Id = 4, v2Id = 5, v3Id = 6, v4Id = 7; + + DTYPE_VALUE hIm, wIm; + DTYPE_VALUE w1 = 0, w2 = 0, w3 = 0, w4 = 0; + DTYPE_SPATIAL_SHAPES h, w, levelStartId; + DTYPE_SPATIAL_SHAPES offsetValue, offsetWeight, offsetLocation, wStride, hStride; + DTYPE_SPATIAL_SHAPES hLowPtrOffset, wLowPtrOffset; + DTYPE_SPATIAL_SHAPES hLow, wLow; + + LocalTensor lowFloatLocal; + LocalTensor floatOneLocal; + LocalTensor xLocal, yLocal; + LocalTensor distLowLocal, distHighLocal; + LocalTensor locWLocal, locHLocal; + LocalTensor imLocal; + LocalTensor zerosLocal; + LocalTensor w1v1Local, w2v2Local, w3v3Local, w4v4Local; + LocalTensor weightSumLocal, midLocal, tmpLocal, tmpALocal, tmpBLocal; + LocalTensor gradSampleXLocLocal, gradSampleYLocLocal; + LocalTensor topGradLocal, locationLocal, attentionWeightLocal; + LocalTensor shapesLocal, offsetLocal; + LocalTensor lowLocal; + + SumParams sumParams; + DataCopyParams copyParams; + event_t eventIdVToMte2, eventIdVToMte3, eventIdMte2ToV, eventIdMte3ToV, + eventIdVToMteWeight, eventIdVToMte3X, eventIdVToMte3Y; }; // core func @@ -532,6 +446,9 @@ extern "C" __global__ __aicore__ void multi_scale_deformable_attention_v2_grad( MultiScaleDeformableAttentionV2Grad op; op.Init(value_gm, spatial_shapes_gm, level_start_index_gm, sampling_loc_gm, attn_weight_gm, grad_output_gm, grad_value_gm, grad_sampling_loc_gm, grad_attn_weight_gm, &tiling_datas, &pipe); - + op.InitBuffer(); + op.GetLocalTensor(); + op.ClearOutput(); op.Process(); + op.ReleaseEventID(); } -- Gitee From ee1e9395f4f5c0787b8ef2508888b5753a9a37e9 Mon Sep 17 00:00:00 2001 From: chenmingkai Date: Thu, 28 Mar 2024 10:41:43 +0800 Subject: [PATCH 6/6] remove unavailable apis from api/README --- docs/api/README.md | 649 ++------------------------------------------- 1 file changed, 20 insertions(+), 629 deletions(-) diff --git a/docs/api/README.md b/docs/api/README.md index 841ddde4..d63fd883 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -1,3 +1,4 @@ +> Note: 以prototype标注的接口,表示该接口为预发布接口,可能会有变动,不建议在生产环境中使用。 # Common 算子 ## scatter_max ### 接口原型 @@ -38,63 +39,39 @@ tensor([[0., 0., 0., 0., 0., 0.], tensor([[2, 2, 2, 2, 2, 2], [ 1, 1, 1, 1, 0, 0]]) ``` - ## npu_rotated_box_decode +## \[prototype\] npu_rotated_overlaps ### 接口原型 ```python -ads.common.npu_rotated_box_decode(Tensor anchor_boxes, Tensor deltas, Tensor weight) -> Tensor -``` -### 功能描述 -解码旋转框的坐标。 -### 参数说明 -- `anchor_box(Tensor)`:锚框张量,数据类型为`float32, float16`,形状为`[B, 5, N]`,其中`B`为批大小,`N`为锚框个数, 值`5`分别代表`x0, x1, y0, y1, angle`。 -- `deltas(Tensor)`:偏移量张量,数据类型为`float32, float16`,形状为`[B, 5, N]`,其中`B`为批大小,`N`为锚框个数, 值`5`分别代表`dx, dy, dw, dh, dangle`。 -- `weight(Tensor)`:权重张量,数据类型为`float32, float16`,形状为`[5]`,其中`5`分别代表`wx, wy, ww, wh, wangle`。默认值为`[1, 1, 1, 1, 1]`。 -### 返回值 -- `Tensor`:解码后的旋转框坐标张量,数据类型为`float32, float16`,形状为`[B, 5, N]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_rotated_box_decode -anchor_boxes = torch.tensor([[[4.137], [33.72], [29.4], [54.06], [41.28]]], dtype=torch.float16).npu() -deltas = torch.tensor([[[0.0244], [-1.992], [0.2109], [0.315], [-37.25]]], dtype=torch.float16).npu() -wegiht = torch.tensor([1, 1, 1, 1, 1], dtype=torch.float16).npu() -out = npu_rotated_box_decode(anchor_boxes, deltas, weight) -print(out) -``` -```text -tensor([[[1.7861], [-10.5781], [33.0000], [17.2969], [-88.4375]]], dtype=torch.float16) -``` -## npu_rotated_box_encode -### 接口原型 -```python -ads.common.npu_rotated_box_encode(Tensor anchor_boxes, Tensor gt_bboxes, Tensor weight) -> Tensor +ads.common.npu_rotated_overlaps(Tensor self, Tensor query_boxes, bool trans=False) -> Tensor ``` ### 功能描述 -编码旋转框的坐标。 +计算旋转框的重叠面积。 ### 参数说明 -- `anchor_box(Tensor)`:锚框张量,数据类型为`float32, float16`,形状为`[B, 5, N]`,其中`B`为批大小,`N`为锚框个数, 值`5`分别代表`x0, x1, y0, y1, angle`。 -- `gt_bboxes(Tensor)`:真实框张量,数据类型为`float32, float16`,形状为`[B, 5, N]`,其中`B`为批大小,`N`为锚框个数, 值`5`分别代表`x0, x1, y0, y1, angle`。 -- `weight(Tensor)`:权重张量,数据类型为`float32, float16`,形状为`[5]`,其中`5`分别代表`wx, wy, ww, wh, wangle`。默认值为`[1, 1, 1, 1, 1]`。 +- `self(Tensor)`:梯度增量,数据类型为`float32, float16`,形状为`[B, 5, N]`。 +- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, 5, M]`。 +- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`。 ### 返回值 -- `Tensor`:编码后的旋转框坐标张量,数据类型为`float32, float16`,形状为`[B, 5, N]`。 +- `Tensor`:重叠面积张量,数据类型为`float32, float16`,形状为`[B, N, M]`。 ### 支持的型号 - Atlas A2 训练系列产品 ### 调用示例 ```python import torch, torch_npu -from ads.common import npu_rotated_box_encode -anchor_boxes = torch.tensor([[[30.69], [32.6], [45.94], [59.88], [-44.53]]], dtype=torch.float16).npu() -gt_bboxes = torch.tensor([[[30.44], [18.72], [33.22], [45.56], [8.5]]], dtype=torch.float16).npu() -weight = torch.tensor([1, 1, 1, 1, 1], dtype=torch.float16).npu() -out = npu_rotated_box_encode(anchor_boxes, gt_bboxes, weight) -print(out) +import numpy as np +from ads.common import npu_rotated_overlaps +a = np.random.uniform(0, 1, (1, 3, 5)).astype(np.float16) +b = np.random.uniform(0, 1, (1, 2, 5)).astype(np.float16) +box1 = torch.from_numpy(a).npu() +box2 = torch.from_numpy(b).npu() +output = npu_rotated_overlaps(box1, box2) +print(output) ``` ```text -tensor([[[-0.4253], [-0.5166], [-1.7021], [-0.0162], [1.1328]]], dtype=torch.float16) +tensor([[[0.0000, 0.1562, 0.0000], + [0.1562, 0.3713, 0.0611], + [0.0000, 0.0611, 0.0000]]], dtype=torch.float16) ``` -## npu_rotated_iou +## \[prototype\] npu_rotated_iou ### 接口原型 ```python ads.common.npu_rotated_iou(Tensor self, Tensor query_boxes, bool trans=False, int mode=0, bool is_cross=True, float v_threshold=0.0, float e_threshold=0.0) -> Tensor @@ -132,566 +109,6 @@ tensor([[[3.3325e-01, 1.0162e-01], [[0.0000e+00, 0.0000e+00], [0.0000e+00, 5.9605e-08]]], dtype=torch.float16) ``` -## npu_rotated_overlaps -### 接口原型 -```python -ads.common.npu_rotated_overlaps(Tensor self, Tensor query_boxes, bool trans=False) -> Tensor -``` -### 功能描述 -计算旋转框的重叠面积。 -### 参数说明 -- `self(Tensor)`:梯度增量,数据类型为`float32, float16`,形状为`[B, 5, N]`。 -- `query_boxes(Tensor)`:查询框张量,数据类型为`float32, float16`,形状为`[B, 5, M]`。 -- `trans(bool)`:是否进行坐标变换。默认值为`False`。值为`True`时,表示`xyxyt`, 值为`False`时,表示`xywht`。 -### 返回值 -- `Tensor`:重叠面积张量,数据类型为`float32, float16`,形状为`[B, N, M]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -import numpy as np -from ads.common import npu_rotated_overlaps -a = np.random.uniform(0, 1, (1, 3, 5)).astype(np.float16) -b = np.random.uniform(0, 1, (1, 2, 5)).astype(np.float16) -box1 = torch.from_numpy(a).npu() -box2 = torch.from_numpy(b).npu() -output = npu_rotated_overlaps(box1, box2) -print(output) -``` -```text -tensor([[[0.0000, 0.1562, 0.0000], - [0.1562, 0.3713, 0.0611], - [0.0000, 0.0611, 0.0000]]], dtype=torch.float16) -``` -## npu_sign_bits_pack -### 接口原型 -```python -ads.common.npu_sign_bits_pack(Tensor self, int size) -> Tensor -``` -### 功能描述 -将输入张量的数据按位打包为uint8类型。 -### 参数说明 -- `self(Tensor)`:1D输入张量,数据类型为`float32, float16`。 -- `size(int)`:reshape 时输出张量的第一个维度。 -### 返回值 -- `Tensor`:打包后的张量,数据类型为`uint8`。 -### 约束说明 -Size为可被float打包的输出整除的整数。如果self的size可被8整除,则size为self.size/8,否则size为self.size/8+1。将在小端位置添加-1浮点值以填充可整除性。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_sign_bits_pack -a = torch.tensor([5, 4, 3, 2, 0, -1, -2, 4, 3, 2, 1, 0, -1, -2], dtype=torch.float32).npu() -out = npu_sign_bits_pack(a, 2) -print(out) -``` -```text -tensor([[159], [15]], dtype=torch.uint8) -``` -## npu_sign_bits_unpack -### 接口原型 -```python -ads.common.npu_sign_bits_unpack(Tensor x, int dtype, int size) -> Tensor -``` -### 功能描述 -将输入张量的数据按位解包为float类型。 -### 参数说明 -- `x(Tensor)`:1D输入张量,数据类型为`uint8`。 -- `dtype(torch.dtype)`:输出张量的数据类型。值为1时,表示`float32`,值为0时,表示`float16`。 -- `size(int)`:reshape 时输出张量的第一个维度。 -### 返回值 -- `Tensor`:解包后的张量,数据类型为`float32, float16`。 -### 约束说明 -Size为可被uint8s解包的输出整数。输出大小为(size of x)*8。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_sign_bits_unpack -a = torch.tensor([159, 15], dtype=torch.uint8).npu() -out = npu_sign_bits_unpack(a, 0, 2) -print(out) -``` -```text -tensor([[1., 1., 1., 1., 1., -1., -1., 1.], [1., 1., 1., 1., -1., -1., -1., -1.]], dtype=torch.float16) -``` -## npu_softmax_cross_entropy_with_logits -### 接口原型 -```python -ads.common.npu_softmax_cross_entropy_with_logits(Tensor features, Tensor labels) -> Tensor -``` -### 功能描述 -计算softmax交叉熵。 -### 参数说明 -- `features(Tensor)`:输入张量,数据类型为`float32, float16`。shape为`[B, N]`, 其中`B`为批大小,`N`为类别数。 -- `labels(Tensor)`:标签张量, 与`features`的shape相同。 -### 返回值 -- `Tensor`:交叉熵张量,数据类型为`float32, float16`,shape为`[B]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_softmax_cross_entropy_with_logits -features = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32).npu() -labels = torch.tensor([[0, 1, 0], [1, 0, 0]], dtype=torch.float32).npu() -out = npu_softmax_cross_entropy_with_logits(features, labels) -print(out) -``` -```text -tensor([1.4076, 2.4076], dtype=torch.float32) -``` -## npu_stride_add -### 接口原型 -```python -ads.common.npu_stride_add(Tensor x1, Tensor x2, int offset1, int offset2, int c1_len) -> Tensor -``` -### 功能描述 -将两个张量按照指定的偏移量进行相加, 格式为`NC1HWC0`。 -### 参数说明 -- `x1(Tensor)`:输入张量,`5HD`格式,数据类型为`float32, float16`。 -- `x2(Tensor)`:输入张量,与`x1`的shape相同,数据类型为`float32, float16`。 -- `offset1(int)`:`x1`的偏移量。 -- `offset2(int)`:`x2`的偏移量。 -- `c1_len(int)`:输出张量的`C1`维度。该值必须小于`x1`和`x2`中`C1`与`offset`的差值。 -### 返回值 -- `Tensor`:相加后的张量,数据类型为`float32, float16`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_stride_add -x1 = torch.tensor([[[[[1]]]]], dtype=torch.float32).npu() -out = npu_stride_add(x1, x1, 0, 0, 1) -print(out) -``` -```text -tensor([[[[[2]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]], [[[0]]]]], dtype=torch.float32) -``` -## npu_transpose -### 接口原型 -```python -ads.common.npu_transpose(Tensor x, List[int] perm, bool require_contiguous=True) -> Tensor -``` -### 功能描述 -将输入张量的维度按照指定的顺序进行转置。支持`FakeTensor`模式。 -### 参数说明 -- `x(Tensor)`:输入张量,数据类型为`float32, float16`。 -- `perm(List[int])`:转置顺序。 -- `require_contiguous(bool)`:是否要求输出张量是连续的。默认值为`True`。 -### 返回值 -- `Tensor`:转置后的张量,数据类型为`float32, float16`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_transpose -x = torch.tensor([[[1, 2, 3], [4, 5, 6]]], dtype=torch.float32).npu() -y = npu_transpose(x, [0, 2, 1]) -print(y) -``` -```text -tensor([[[1., 4.], [2., 5.], [3., 6.]]], dtype=torch.float -``` -## npu_yolo_boxes_encode -### 接口原型 -```python -ads.common.npu_yolo_boxes_encode(Tensor anchors, Tensor gt_bboxes, Tensor stride, bool perfermance_mode=False) -> Tensor -``` -### 功能描述 -根据YOLO的锚点框(anchor)和真实框(gt_bboxes)生成编码后的框。 -### 参数说明 -- `anchors(Tensor)`:锚点框张量,数据类型为`float32, float16`,形状为`[N, 4]`,其中`N`为`ROI`的个数,`4`分别代表`tx, ty, tw, th`。 -- `gt_bboxes(Tensor)`:真实框张量,数据类型为`float32, float16`,形状为`[N, 4]`,其中`N`为`ROI`的个数,`4`分别代表`dx, dy, dw, dh`。 -- `stride(Tensor)`:步长张量,数据类型为`int32`,形状为`[N]`,其中`N`为`ROI`的个数。 -- `perfermance_mode(bool)`:是否为性能模式。默认值为`False`。当值为`True`时,表示为性能模式,输入类型为`float16`时,将是最新的性能模式,但精度只小于`0.005`;当值为`False`时,表示为精度模式,输入类型为`float32`是,输出精度小于`0.0001`。 -### 返回值 -- `Tensor`:编码后的框张量,数据类型为`float32, float16`,形状为`[N, 4]`。 -### 约束说明 -- `anchors`和`gt_bboxes`的`N`必须相同,且`N`的值必须小于`20480`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_yolo_boxes_encode -anchors = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=torch.float32).npu() -gt_bboxes = torch.tensor([[5, 6, 7, 8], [1, 2, 3, 4]], dtype=torch.float32).npu() -stride = torch.tensor([1, 2], dtype=torch.int32).npu() -out = npu_yolo_boxes_encode(anchors, gt_bboxes, stride) -print(out) -``` -```text -tensor([[ 1.0000, 1.0000, 0.0000, 0.0000], - [1.0133e-06, 1.0133e-06, 0.0000, 0.0000]], dtype=torch.float32) -``` -## npu_scatter -### 接口原型 -```python -ads.common.npu_scatter(Tensor self, Tensor indices, Tensor updates, int dim) -> Tensor -``` -### 功能描述 -将`updates`张量中的元素按照`indices`张量中的索引进行分散,然后将分散的元素加到`self`张量中。 -### 参数说明 -- `self(Tensor)`:被更新张量,数据类型为`float32, float16`。 -- `indices(Tensor)`:索引张量,数据类型为`int32`。可以为空,也可以与`updates`有相同的维数。当为空时,操作返回`self unchanged`。 -- `updates(Tensor)`:更新源张量,数据类型为`float32, float16`。 -- `dim(int)`:分散的维度。 -### 返回值 -- `Tensor`:更新后的张量,数据类型为`float32, float16`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_scatter -input = torch.tensor([[1.6279, 0.1226], [0.9041, 1.0980]], dtype=torch.float32).npu() -indices = torch.tensor([0, 1], dtype=torch.int32).npu() -updates = torch.tensor([-1.1993, -1.5247], dtype=torch.float32).npu() -out = npu_scatter(input, indices, updates, 0) -print(out) -``` -```text -tensor([[-0.1993, 0.1226], [ 0.9041, -1.5247]], dtype=torch.float32) -``` -## npu_silu -### 接口原型 -```python -ads.common.npu_silu(Tensor x) -> Tensor -``` -### 功能描述 -计算Sigmoid Linear Unit(SiLU)激活函数。公式如下: -$$f(x) = x * sigmoid(x)$$ -### 参数说明 -- `x(Tensor)`:输入张量,数据类型为`float32, float16`。 -### 返回值 -- `Tensor`:激活后的张量,数据类型为`float32, float16`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_silu -x = torch.tensor([1, 2, 3, 4], dtype=torch.float32).npu() -out = npu_silu(x) -print(out) -``` -```text -tensor([0.7311, 1.7646, 2.8577, 3.9281], dtype=torch.float32) -``` -> 注意:可以通过`npu_silu_`接口实现原地操作。 -## npu_rotary_mul -### 接口原型 -```python -ads.common.npu_rotary_mul(Tensor x, Tensor r1, Tensor r2) -> Tensor -``` -### 功能描述 -计算旋转乘法。公式如下: -$$x1, x2 = x[..., :C//2], x[..., C//2:]$$ -$$x_new = [-x2, x1]$$ -$$y = x * r1 + x_new * r2$$ -### 参数说明 -- `x(Tensor)`:输入张量,数据类型为`float32, float16`。要求`x`的维度为`4`。 -- `r1(Tensor)`:旋转因子张量,数据类型为`float32, float16`。代表`cos`。 -- `r2(Tensor)`:旋转因子张量,数据类型为`float32, float16`。代表`sin`。 -### 返回值 -- `Tensor`:旋转乘法后的张量,数据类型为`float32, float16`。 -### 约束说明 -- `x`的维度必须为`4`, 一般为`[B, N, S, D]`或`[B, S, N, D]`或`[S, B, N, D]`。 -- `r1`和`r2`的维度必须为`4`, 一般为`[1, 1, S, D]`或`[S, 1, 1, D]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_rotary_mul -x = torch.tensor([[[[1, 2], [3, 4]], [[5, 6], [7, 8]]]], dtype=torch.float32).npu() -r1 = torch.tensor([[[[0.1, 0.2], [0.3, 0.4]], [[0.5, 0.6], [0.7, 0.8]]]], dtype=torch.float32).npu() -r2 = torch.tensor([[[[0.2, 0.3], [0.4, 0.5]], [[0.6, 0.7], [0.8, 0.9]]]], dtype=torch.float32).npu() -out = npu_rotary_mul(x, r1, r2) -print(out) -``` -```text -tensor([[[[-0.3000, 0.7000], [-0.7000, 3.1000]], [[-1.1000, 7.1000], [-1.5000, 12.7000]]]], dtype=torch.float32) -``` -## npu_abs -### 接口原型 -```python -ads.common.npu_abs(Tensor x) -> Tensor -``` -### 功能描述 -计算输入张量的绝对值。 -### 参数说明 -- `x(Tensor)`:输入张量,数据类型为`float32, float16`。 -### 返回值 -- `Tensor`:绝对值张量,数据类型为`float32, float16`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_abs -x = torch.tensor([1, -2, 3, -4], dtype=torch.float32).npu() -out = npu_abs(x) -print(out) -``` -```text -tensor([1., 2., 3., 4.], dtype=torch.float32) -``` -## fast_gelu -### 接口原型 -```python -ads.common.fast_gelu(Tensor x) -> Tensor -``` -### 功能描述 -计算输入张量的GELU激活函数。公式如下: -$$f(x) = x/(1+exp(-1.702 * |x|))*exp(0.851*(x-|x|))$$ -### 参数说明 -- `x(Tensor)`:输入张量,数据类型为`float32, float16`。 -### 返回值 -- `Tensor`:激活后的张量,数据类型为`float32, float16`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -import numpy as np -from ads.common import fast_gelu -x = torch.from_numpy(np.array([[-1.0, 4.0, -8.0], [2.0, -5.0, 9.0]])).float().npu() -output = fast_gelu(x) -print(output) -``` -```text -tensor([[-1.5418735e-01 3.9921875e+00 -9.7473649e-06], [ 1.9375000e+00 -1.0052517e-03 8.9824219e+00]], dtype=torch.float32) -``` -## npu_anchor_response_flags -### 接口原型 -```python -ads.common.npu_anchor_response_flags(Tensor gt_bboxes, List[int] featmap_size, List[int] strides, int num_base_anchors) -> Tensor -``` -### 功能描述 -根据真实框(gt_bboxes)和特征图大小(featmap_size)生成锚点响应标志。 -### 参数说明 -- `gt_bboxes(Tensor)`:真实框张量,数据类型为`float32, float16`,形状为`[N, 4]`,其中`N`为`ROI`的个数,`4`分别代表`x0, y0, x1, y1`。 -- `featmap_size(List[int])`:特征图大小,形状为`[2]`,其中`2`分别代表`H, W`。 -- `strides(List[int])`:步长,形状为`[2]`,其中`2`分别代表`stride_h, stride_w`。 -- `num_base_anchors(int)`:基础锚点数。 -### 返回值 -- `Tensor`:锚点响应标志张量,数据类型为`uint8`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_anchor_response_flags -gt_bboxes = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=torch.float32).npu() -featmap_size = [2, 3] -strides = [1, 2] -num_base_anchors = 2 -out = npu_anchor_response_flags(gt_bboxes, featmap_size, strides, num_base_anchors) -print(out) -``` -```text -tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], dtype=torch.uint8) -``` -## npu_bounding_box_decode -### 接口原型 -```python -ads.common.npu_bounding_box_decode(Tensor rois, Tensor deltas, float means0, float means1, float means2, float means3, float stds0, float stds1, float stds2, float stds3, int max_shape, float wh_ratio_clip) -> Tensor -``` -### 功能描述 -根据`rois`和`deltas`生成解码后的框。 -### 参数说明 -- `rois(Tensor)`:区域候选网络(RPN)生成的ROI,数据类型为`float32, float16`,形状为`[N, 4]`,其中`N`为`ROI`的个数,`4`分别代表`x0, y0, x1, y1`。 -- `deltas(Tensor)`:偏移量张量,数据类型为`float32, float16`,形状为`[N, 4]`,其中`N`为`ROI`的个数,`4`分别代表`dx, dy, dw, dh`。 -- `means0(float)`:均值,用于归一化`dx`。 -- `means1(float)`:均值,用于归一化`dy`。 -- `means2(float)`:均值,用于归一化`dw`。 -- `means3(float)`:均值,用于归一化`dh`。 -- `stds0(float)`:标准差,用于归一化`dx`。 -- `stds1(float)`:标准差,用于归一化`dy`。 -- `stds2(float)`:标准差,用于归一化`dw`。 -- `stds3(float)`:标准差,用于归一化`dh`。 - - 以上参数均为`float32`类型,`meas`默认值为`0`, `std`默认值为`1`。`delta`的归一化公式为:`delta = (delta - means) / stds`。 -- `max_shape(int)`:最大形状。用于确保转换后的bbox不超过最大形状。默认值为`0`。 -- `wh_ratio_clip(float)`:宽高比裁剪。`dw`和`dh`的值在`(-wh_ratio_clip, wh_ratio_clip)`之间。 -### 返回值 -- `Tensor`:解码后的框张量,数据类型为`float32, float16`,形状为`[N, 4]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_bounding_box_decode -rois = torch.tensor([[1, 2, 3, 4], [3, 4, 5, 6]], dtype=torch.float32).npu() -deltas = torch.tensor([[5, 6, 7, 8], [7, 8, 9, 6]], dtype=torch.float32).npu() -out = npu_bounding_box_decode(rois, deltas, 0, 0, 0, 0, 1, 1, 1, 1, (10, 10), 0.1) -print(out) -``` -```text -tensor([[ 2.5000, 6.5000, 9.0000, 9.0000], [ 9.0000, 9.0000, 9.0000, 9.0000]], dtype=torch.float32) -``` -## npu_bounding_box_encode -### 接口原型 -```python -ads.common.npu_bounding_box_encode(Tensor anchor_boxes, Tensor gt_bboxes, float means0, float means1, float means2, float means3, float stds0, float stds1, float stds2, float stds3) -> Tensor -``` -### 功能描述 -根据`anchor_boxes`和`gt_bboxes`生成编码后的框。 -### 参数说明 -- `anchor_boxes(Tensor)`:锚框张量,数据类型为`float32, float16`,形状为`[N, 4]`,其中`N`为`ROI`的个数,`4`分别代表`x0, y0, x1, y1`。 -- `gt_bboxes(Tensor)`:真实框张量,数据类型为`float32, float16`,形状为`[N, 4]`,其中`N`为`ROI`的个数,`4`分别代表`x0, y0, x1, y1`。 -- `means0(float)`:均值,用于归一化`dx`。 -- `means1(float)`:均值,用于归一化`dy`。 -- `means2(float)`:均值,用于归一化`dw`。 -- `means3(float)`:均值,用于归一化`dh`。 -- `stds0(float)`:标准差,用于归一化`dx`。 -- `stds1(float)`:标准差,用于归一化`dy`。 -- `stds2(float)`:标准差,用于归一化`dw`。 -- `stds3(float)`:标准差,用于归一化`dh`。 - - 以上参数均为`float32`类型,`meas`默认值为`0`, `std`默认值为`1`。`delta`的归一化公式为:`delta = (delta - means) / stds`。 -### 返回值 -- `Tensor`:编码后的框张量,数据类型为`float32, float16`,形状为`[N, 4]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_bounding_box_encode -anchor_boxes = torch.tensor([[1, 2, 3, 4], [3, 4, 5, 6]], dtype=torch.float32).npu() -gt_bboxes = torch.tensor([[5, 6, 7, 8], [7, 8, 9, 6]], dtype=torch.float32).npu() -out = npu_bounding_box_encode(anchor_boxes, gt_bboxes, 0, 0, 0, 0, 0.1, 0.1, 0.2, 0.2) -print(out) -``` -```text -tensor([[13.3281, 13.3281, 0.0000, 0.0000], [ 13.3281, 6.6641, 0.0000, -5.4922]], dtype=torch.float32) -``` -## npu_batch_nms -### 接口原型 -```python -ads.common.npu_batch_nms(Tensor self, Tensor scores, float score_threshold, float iou_threshold, int max_size_per_class, int max_total_size, bool change_coordinate_frame=False, bool transpose_box=False) -> (Tensor, Tensor, Tensor, Tensor) -``` -### 功能描述 -根据`batch` 分类计算输入框评分,通过评分排序,删除评分高于阈值的框。通过NMS操作,删除重叠度高于阈值的框。 -### 参数说明 -- `self(Tensor)`:输入张量,数据类型为`float16`,形状为`[B, N, q, 1]`,其中`B`为批大小,`N`为框的个数,`q=1`或`q=num_classes`。 -- `scores(Tensor)`:评分张量,数据类型为`float16`,形状为`[B, N, num_classes]`。 -- `score_threshold(float)`:评分阈值,用于过滤评分低于阈值的框。 -- `iou_threshold(float)`:IoU阈值,用于过滤重叠度高于阈值的框。 -- `max_size_per_class(int)`:每个类别的最大框数。 -- `max_total_size(int)`:总的最大框数。 -- `change_coordinate_frame(bool)`:是否正则化输出框坐标矩阵。默认值为`False`。 -- `transpose_box(bool)`:是否转置输出框坐标矩阵。默认值为`False`。 -### 返回值 -- nmsed_boxes(Tensor):NMS后的框张量,数据类型为`float16`,形状为`[B, max_total_size, 4]`。 -- nmsed_scores(Tensor):NMS后的评分张量,数据类型为`float16`,形状为`[B, max_total_size]`。 -- nmsed_classes(Tensor):NMS后的类别张量,数据类型为`float16`,形状为`[B, max_total_size]`。 -- nmsed_num(Tensor):NMS后的框数张量,数据类型为`int32`,形状为`[B]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_batch_nms -self = torch.tensor([[[[1, 2, 3, 4]]]], dtype=torch.float16).npu() -scores = torch.tensor([[[1, 2, 3]]], dtype=torch.float16).npu() -nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num = npu_batch_nms(self, scores, 0.5, 0.5, 1, 1) -print(nmsed_boxes) -print(nmsed_scores) -print(nmsed_classes) -print(nmsed_num) -``` -```text -tensor([[[1.0000, 2.0000, 3.0000, 4.0000]]], dtype=torch.float16) -tensor([[3.]], dtype=torch.float16) -tensor([[2.]], dtype=torch.float16) -tensor([1], dtype=torch.int32) -``` -## npu_confusion_transpose -### 接口原型 -```python -ads.common.npu_confusion_transpose(Tensor self, List[int] perm, List[int] shape, bool transpose_first) -> Tensor -``` -### 功能描述 -根据`perm`和`shape`对输入张量进行转置。 -### 参数说明 -- `self(Tensor)`:输入张量,数据类型为`float32, float16, int8, int16, int32, int64, uint8, uint16, uint32, uint64`。 -- `perm(List[int])`:转置顺序。 -- `shape(List[int])`:输入张量的形状。 -- `transpose_first(bool)`:是否先转置。默认值为`False`。 -### 返回值 -- `Tensor`:转置后的张量,数据类型为`float32, float16, int8, int16, int32, int64, uint8, uint16, uint32, uint64`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_confusion_transpose -x = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], dtype=torch.float32).npu() -out = npu_confusion_transpose(x, [0, 2, 1], [2, 2, 2], False) -print(out) -``` -```text -tensor([[[1., 3.], [2., 4.]], [[5., 7.], [6., 8.]]], dtype=torch.float32) -``` -## npu_broadcast -### 接口原型 -```python -ads.common.npu_broadcast(Tensor self, List[int] size) -> Tensor -``` -### 功能描述 -根据`size`对输入张量进行广播。 -### 参数说明 -- `self(Tensor)`:输入张量,数据类型为`float32, float16, int8, int16, int32, int64, uint8, uint16, uint32, uint64`。 -- `size(List[int])`:广播后的形状。 -### 返回值 -- `Tensor`:广播后的张量,数据类型为`float32, float16, int8, int16, int32, int64, uint8, uint16, uint32, uint64`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_broadcast -x = torch.tensor([[1], [2], [3]], dtype=torch.float32).npu() -out = npu_broadcast(x, [3, 4]) -print(out) -``` -```text -tensor([[1., 1., 1., 1.], [2., 2., 2., 2.], [3., 3., 3., 3.]], dtype=torch.float32) -``` -## npu_moe_tutel -### 接口原型 -```python -ads.common.npu_moe_tutel(Tensor x, Tensor gates, Tensor indices, Tensor locations, int capacity) -``` -### 功能描述 -Expert parallelism 把专家分配到不同的计算资源上,比如,一个专家分配1-N个NPU。 -### 参数说明 -- `x(Tensor)`:MHA层输出的全量token,数据类型为`float32, float16, bf16`。 -- `gates(Tensor)`:门控函数的输出结果,数据类型为`float32, float16, bf16`。 -- `indices(Tensor)`:batch值对应的索引,数据类型为`int32`。 -- `locations(Tensor)`:capacity值对应的索引,数据类型为`int32`。 -### 返回值 -- `y(Tensor)`: 专家输出的结果,数据类型为`float32, float16, bf16`。shape 为`[B, capacity, x[1]]`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_moe_tutel -x = torch.tensor([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=torch.float32).npu() -gates = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], dtype=torch.float32).npu() -indices = torch.tensor([1, 2], dtype=torch.int32).npu() -locations = torch.tensor([1, 2], dtype=torch.int32).npu() -out = npu_moe_tutel(x, gates, indices, locations, 2) -print(out) -``` ## npu_dynamic_scatter ### 接口原型 ```python @@ -744,32 +161,6 @@ print(out) ```text tensor([[0, 1]], dtype=torch.int32) ``` -## npu_ads_add -### 接口原型 -```python -ads.common.npu_ads_add(Tensor x, Tensor y) -> Tensor -``` -### 功能描述 -计算两个张量的和。 -### 参数说明 -- `x(Tensor)`:输入张量,数据类型为`float32, float16`。 -- `y(Tensor)`:输入张量,数据类型为`float32, float16`。 -### 返回值 -- `Tensor`:和张量,数据类型为`float32, float16`。 -### 支持的型号 -- Atlas A2 训练系列产品 -### 调用示例 -```python -import torch, torch_npu -from ads.common import npu_ads_add -x = torch.tensor([1, 2, 3, 4], dtype=torch.float32).npu() -y = torch.tensor([5, 6, 7, 8], dtype=torch.float32).npu() -out = npu_ads_add(x, y) -print(out) -``` -```text -tensor([6., 8., 10., 12.], dtype=torch.float32) -``` ## npu_multi_scale_deformable_attn_function ### 接口原型 ```python -- Gitee