diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
index 0ac77517aa9d965731f73ae41e934ab9b66db525..18a9119a3b8e90ea4b32d85c9307027a197e2905 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
@@ -565,6 +565,7 @@ aclError AclrtMapMem(void *virPtr, size_t size, size_t offset, aclrtDrvMemHandle
     if (func == nullptr) {
         func = (AclrtMapMem)GET_FUNC(aclrtMapMem);
     }
+    ASCEND_LOGE("AclrtMapMem virPtr:%p, size:%zu", virPtr, size);
     TORCH_CHECK(func, "Failed to find function ", "aclrtMapMem", PTA_ERROR(ErrCode::NOT_FOUND));
     auto ret = func(virPtr, size, offset, handle, flags);
     if (hcclComm) {
@@ -580,6 +581,7 @@ aclError AclrtUnmapMem(void *virPtr, HcclComm hcclComm)
     if (func == nullptr) {
         func = (AclrtUnmapMem)GET_FUNC(aclrtUnmapMem);
     }
+    ASCEND_LOGE("AclrtUnmapMem virPtr:%p", virPtr);
     TORCH_CHECK(func, "Failed to find function ", "aclrtUnmapMem", PTA_ERROR(ErrCode::NOT_FOUND));
     auto ret = func(virPtr);
     if (hcclComm) {
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 1b5852101c3f8fc9f255a08af36f7d4e9a53cd16..efa35cc9e541413f22392853a4b0d71e0a6fb6db 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -4306,6 +4306,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allgather(
     auto outputDataPtr = output.data_ptr();
     auto numel = getNumelForHCCL(input);
     auto hcclType = getHcclDataType(input.scalar_type());
+    ASCEND_LOGE("HcclAllgather inputptr:%p, outputptr:%p, numel:%llu", inputDataPtr, outputDataPtr, static_cast<unsigned long long>(numel));
     auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int {
         torch_npu::profiler::MstxRange range(
             getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId, -1, -1), stream.stream(false),
@@ -4501,6 +4502,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allgather_into_tensor_coalesced(
     auto outputDataPtr = output.data_ptr();
     auto numel = getNumelForHCCL(input);
     auto hcclType = getHcclDataType(input.scalar_type());
+    ASCEND_LOGE("HcclAllgatherBase allgather_into_tensor_coalesced inputptr:%p, outputptr:%p, numel:%llu", inputDataPtr, outputDataPtr, static_cast<unsigned long long>(numel));
     auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int {
         torch_npu::profiler::MstxRange range(
             getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId, -1, -1), stream.stream(false),
@@ -4547,6 +4549,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allgather_togather(
     auto outputDataPtr = output.data_ptr();
     auto numel = getNumelForHCCL(input);
     auto hcclType = getHcclDataType(input.scalar_type());
+    ASCEND_LOGE("HcclAllgatherTogather inputptr:%p, outputptr:%p, numel:%llu", inputDataPtr, outputDataPtr, static_cast<unsigned long long>(numel));
     auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int {
         torch_npu::profiler::MstxRange range(
             getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId, -1, -1), stream.stream(false),
@@ -4598,6 +4601,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::_allgather_base(
     auto outputDataPtr = output.data_ptr();
     auto numel = getNumelForHCCL(input);
     auto hcclType = getHcclDataType(input.scalar_type());
+    ASCEND_LOGE("HcclAllgatherBase inputptr:%p, outputptr:%p, numel:%llu", inputDataPtr, outputDataPtr, static_cast<unsigned long long>(numel));
     auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int {
         torch_npu::profiler::MstxRange range(
             getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId, -1, -1), stream.stream(false),