diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 20257ede29551f03c82209aa6d67eaa2274940a8..1714c5352fb32d2347320664b6d48a9e7f48eb09 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -131,6 +131,13 @@ uint32_t OptionsManager::GetHCCLEventTimeout() return static_cast(envFlag); } +/* Reads the TORCH_HCCL_DISPATCH_TIMEOUT environment variable and returns it as an unsigned 32-bit value. The text is parsed as a base-10 integer with strtol; when the variable is not set the result is 0. NOTE(review): the value is not range-checked, so a negative or very large setting would presumably wrap when converted to uint32_t, and because no end pointer is passed to strtol, trailing non-numeric text goes undetected - confirm whether validation is wanted. */ uint32_t OptionsManager::GetHCCLDispatchTimeout() +{ + char* env_val = std::getenv("TORCH_HCCL_DISPATCH_TIMEOUT"); + int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 0; + /* NOTE(review): static_cast is shown here without a target type; presumably static_cast<uint32_t> lost its angle-bracket argument when this patch text was extracted - confirm against the repository. */ return static_cast(envFlag); +} + int32_t OptionsManager::GetACLExecTimeout() { char* env_val = std::getenv("ACL_STREAM_TIMEOUT"); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 718bc36ce65f0bbd8194369c3a89f905b8e04f66..c60f42c24e33fa5721f6f6f2b8b18a9b4f3a354c 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -43,6 +43,7 @@ public: static uint32_t GetHCCLConnectTimeout(); static uint32_t GetHCCLExecTimeout(); static uint32_t GetHCCLEventTimeout(); + static uint32_t GetHCCLDispatchTimeout(); static std::string CheckDisableDynamicPath(); static int32_t GetACLExecTimeout(); static int32_t GetACLDeviceSyncTimeout(); diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index d1f5a6837290642887bfa5b41705362ee5f64b6b..ff3c4ad39e5126f1783a2babf89d5acf4c5b58db 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -444,11 +444,20 @@ std::chrono::milliseconds GetDispatchTimeout() uint32_t mindispatchTimeout_ = 120U; uint32_t hccl_exec_timeout = c10_npu::option::OptionsManager::GetHCCLExecTimeout(); + uint32_t hccl_dispatch_timeout = c10_npu::option::OptionsManager::GetHCCLDispatchTimeout(); if
(hccl_exec_timeout > 0) { if (hccl_exec_timeout < dispatchTimeout_ + dispatchoffset && hccl_exec_timeout > mindispatchTimeout_ + dispatchoffset) { dispatchTimeout_ = hccl_exec_timeout - dispatchoffset; }; }; + if (hccl_dispatch_timeout > 0) { + dispatchTimeout_ = hccl_dispatch_timeout; + if (hccl_exec_timeout > 0 && hccl_exec_timeout < hccl_dispatch_timeout + dispatchoffset) { + dispatchTimeout_ = hccl_exec_timeout - dispatchoffset; + } else if (hccl_exec_timeout == 0 && hccl_dispatch_timeout > 1770) { + dispatchTimeout_ = 1770; + } + } ASCEND_LOGI("set dispatchTimeout_ %u s.", dispatchTimeout_); return std::chrono::milliseconds(dispatchTimeout_ * 1000U); }