diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h
index 4b897a33ec37eadd0cdfb4479454e32a45b59fe9..803a7cbc297838ec5dbfa8ea87b439d7257eb8f4 100755
--- a/third_party/acl/inc/acl/acl_rt.h
+++ b/third_party/acl/inc/acl/acl_rt.h
@@ -1610,6 +1610,9 @@ ACL_FUNC_VISIBILITY aclError aclrtSetDeviceResLimit(int32_t deviceId, aclrtDevRe
  */
 ACL_FUNC_VISIBILITY aclError aclrtResetDeviceResLimit(int32_t deviceId);
 
+ACL_FUNC_VISIBILITY aclError aclrtGetOpTimeoutInterval(uint64_t *interval);
+ACL_FUNC_VISIBILITY aclError aclrtSetOpExecuteTimeOutV2(uint64_t timeout, uint64_t *actualTimeout);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
index dae26952dd0570cf9a5363ee3a63ae2e1f3f98a2..6f415f3c40dffd47d22c58629daa9bbfec327231 100644
--- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp
+++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp
@@ -95,6 +95,9 @@ LOAD_FUNCTION(aclrtSetDeviceResLimit)
 LOAD_FUNCTION(aclrtResetDeviceResLimit)
 LOAD_FUNCTION(aclrtStreamGetId)
 LOAD_FUNCTION(aclrtMemcpyAsyncWithCondition)
+LOAD_FUNCTION(aclrtGetOpTimeoutInterval)
+LOAD_FUNCTION(aclrtSetOpExecuteTimeOutV2)
+
 
 aclprofStepInfoPtr init_stepinfo() {
     typedef aclprofStepInfoPtr(*npdInitFunc)();
@@ -378,14 +381,43 @@ aclError AclrtSetStreamOverflowSwitch(aclrtStream stream, uint32_t flag) {
 }
 
 aclError AclrtSetOpExecuteTimeOut(uint32_t timeout) {
+    typedef aclError (*AclrtGetOpTimeoutInterval)(uint64_t *);
+    static AclrtGetOpTimeoutInterval intervalFunc = nullptr;
+    if (intervalFunc == nullptr) {
+        intervalFunc = (AclrtGetOpTimeoutInterval)GET_FUNC(aclrtGetOpTimeoutInterval);
+    }
+
+    if (intervalFunc) {
+        uint64_t interval = 0;
+        NPU_CHECK_ERROR(intervalFunc(&interval));
+        uint32_t customTimeout = c10_npu::option::OptionsManager::GetNpuOpTimeoutMs();
+        // `value` is what gets handed to the V2 API: the NPU_OP_TIMEOUT_MS override scaled by 1e3,
+        // otherwise `timeout` scaled by 1e6. Widen to 64 bits first so the product cannot wrap.
+        uint64_t value = 0;
+        if (customTimeout != 0) {
+            value = static_cast<uint64_t>(customTimeout) * 1000;
+        } else {
+            value = static_cast<uint64_t>(timeout) * 1000 * 1000;
+        }
+
+        typedef aclError (*AclrtSetOpExecuteTimeOutV2)(uint64_t, uint64_t *);
+        static AclrtSetOpExecuteTimeOutV2 func = nullptr;
+        if (func == nullptr) {
+            func = (AclrtSetOpExecuteTimeOutV2)GET_FUNC(aclrtSetOpExecuteTimeOutV2);
+        }
+        TORCH_CHECK(func, "Failed to find function ", "aclrtSetOpExecuteTimeOutV2", PTA_ERROR(ErrCode::NOT_FOUND));
+        uint64_t actualTimeout = 0;
+        auto ret = func(value, &actualTimeout);
+        ASCEND_LOGI("AclrtSetOpExecuteTimeOutV2 set actual timeout: %zuus", static_cast<size_t>(actualTimeout));
+        return ret;
+    }
+
     typedef aclError (*AclrtSetOpExecuteTimeOut)(uint32_t);
     static AclrtSetOpExecuteTimeOut func = nullptr;
     if (func == nullptr) {
         func = (AclrtSetOpExecuteTimeOut)GET_FUNC(aclrtSetOpExecuteTimeOut);
     }
-    if (func == nullptr) {
-        return ACL_ERROR_RT_FEATURE_NOT_SUPPORT;
-    }
+    TORCH_CHECK(func, "Failed to find function ", "aclrtSetOpExecuteTimeOut", PTA_ERROR(ErrCode::NOT_FOUND));
     return func(timeout);
 }
 
diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp
index a049743655c259f39ec3a733e54e5c152cb81763..1b3a100aa35bb6cb6b7d5d775524ef096ac04a96 100644
--- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp
+++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp
@@ -149,17 +149,16 @@ uint32_t OptionsManager::GetHCCLConnectTimeout()
 int32_t OptionsManager::GetHCCLExecTimeout()
 {
     char* env_val = std::getenv("HCCL_EXEC_TIMEOUT");
-    int64_t envFlag;
-    if (env_val != nullptr) {
-        envFlag = strtol(env_val, nullptr, 10);
-        if (envFlag < 0) {
-            envFlag = -1;
-            TORCH_NPU_WARN_ONCE("Get env HCCL_EXEC_TIMEOUT less than 0, so reset it to the default value.");
-        }
-    } else {
-        envFlag = -1;
+    if (env_val == nullptr) {
+        return -1;
     }
-    return static_cast<int32_t>(envFlag);
+    // Parse as floating point so decimal settings are accepted; the result is rounded to the nearest integer.
+    double value = strtod(env_val, nullptr);
+    if (value < 0) {
+        TORCH_NPU_WARN_ONCE("Get env HCCL_EXEC_TIMEOUT less than 0, so reset it to the default value.");
+        return -1;
+    }
+    return static_cast<int32_t>(std::lround(value));
 }
 
 int32_t OptionsManager::GetHCCLEventTimeout()
@@ -640,5 +639,12 @@ bool OptionsManager::LazySetDevice()
     return lazy_set;
 }
 
+uint32_t OptionsManager::GetNpuOpTimeoutMs()
+{
+    char* env_val = std::getenv("NPU_OP_TIMEOUT_MS");
+    uint32_t envFlag = (env_val != nullptr) ? strtoul(env_val, nullptr, 10) : 0;
+    return envFlag;
+}
+
 } // namespace option
 } // namespace c10_npu
diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h
index 7f07b943f984df278d12f8540a5e77f181d851fe..6db035a6db8556a2aba2ce19de8122ff8efb2b14 100644
--- a/torch_npu/csrc/core/npu/register/OptionsManager.h
+++ b/torch_npu/csrc/core/npu/register/OptionsManager.h
@@ -135,6 +135,7 @@ public:
     static bool ShouldPrintWarning();
     static bool IsCompactErrorOutput();
     static bool LazySetDevice();
+    static uint32_t GetNpuOpTimeoutMs();
 
 private:
     static int GetBoolTypeOption(const char* env_str, int defaultVal = 0);