diff --git "a/ascend_log\357\274\210\346\216\250\347\220\206\346\212\245\351\224\231\357\274\211.zip" "b/ascend_log\357\274\210\346\216\250\347\220\206\346\212\245\351\224\231\357\274\211.zip" new file mode 100644 index 0000000000000000000000000000000000000000..c7e1608768094439a9a2db9f925ef79f5ab65dd6 Binary files /dev/null and "b/ascend_log\357\274\210\346\216\250\347\220\206\346\212\245\351\224\231\357\274\211.zip" differ diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 7a01246c9a74b1ed72c7b0efd81be8974bfa25c1..c0c42d668d96eb138381531af9cef0020f156e90 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -190,6 +190,7 @@ def _new_process_group_hccl_helper(dist_backend_opts, pg_options): pg_options.is_high_priority_stream = False pg_options._timeout = dist_backend_opts.timeout pg_options.global_ranks_in_group = dist_backend_opts.global_ranks_in_group + pg_options.group_id = dist_backend_opts.group_id return torch_npu._C._distributed_c10d.ProcessGroupHCCL(store, group_rank, group_size, pg_options) diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 35abac738fa8ce5cabe687fa8445ae95a246ae61..9e33ab84f01b0f70a6c04620cc4cc40939e08d02 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -422,7 +422,9 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) { .def_readwrite("is_high_priority_stream", &::c10d_npu::ProcessGroupHCCL::Options::is_high_priority_stream) .def_readwrite("global_ranks_in_group", - &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group); + &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group) + .def_readwrite("group_id", + &::c10d_npu::ProcessGroupHCCL::Options::group_id); auto cDist = py::module_::import("torch._C._distributed_c10d"); auto parallelStore = intrusive_ptr_no_gil_destructor_class_<::c10d::ParallelTcpStore>( diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index e8a9f5a2834b9de7e81c98e96f1f54e8399c2f1b..50dea7baa79c2e4e781cce664d6c9769299000c0 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -233,8 +233,6 @@ const int64_t ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis = 10 * 1000; thread_local uint64_t ProcessGroupHCCL::hcclActiveGroupCounter_ = 0; const int64_t ProcessGroupHCCL::kWatchdogThreadSleepMillis = 1000; std::string ProcessGroupHCCL::perfdumppath = ""; -std::unordered_map ProcessGroupHCCL::group_ranks_map_; -std::mutex ProcessGroupHCCL::group_ranks_map_mutex_; ProcessGroupHCCL* ProcessGroupHCCL::global_ = nullptr; std::ostream& operator<<(std::ostream& output, const ProcessGroupHCCL::WorkHCCL& workHCCL) @@ -692,28 +690,6 @@ ProcessGroupHCCL::ProcessGroupHCCL( hcclCommWatchdogThread_ = std::thread(&ProcessGroupHCCL::hcclCommWatchdog, this); #endif - const std::vector& ranks = groupRanks(); - std::stringstream ss; - for (size_t i = 0; i < ranks.size(); ++i) { - ss << ranks[i]; - if (i != ranks.size() - 1) { - ss << ", "; - } - } - std::string group_ranks = ss.str(); - - { - std::lock_guard lock(group_ranks_map_mutex_); - auto it = group_ranks_map_.find(group_ranks); - if (it != group_ranks_map_.end()) { - group_ranks_map_[group_ranks]++; - } else { - group_ranks_map_[group_ranks] = 0; - } - - global_hccl_id_ = group_ranks + "_" + std::to_string(group_ranks_map_[group_ranks]); - } - if (options_->global_ranks_in_group.empty()) { global_ = this; } @@ -786,6 +762,7 @@ 
         }
         devHCCLCommMap_.clear();
     }
+    ASCEND_LOGI("process group destroyed, group id is %s.", options_->group_id.c_str());
 }

 void ProcessGroupHCCL::hcclCommWatchdog()
@@ -1218,7 +1195,7 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector<at::Device>& devices,
         return false;
     }

-    uint64_t hcclid = (std::hash<std::string>{}(global_hccl_id_));
+    uint64_t hcclid = (std::hash<std::string>{}(options_->group_id));
     auto subStartTime = std::chrono::steady_clock::now();
     for (size_t i = 0; i < devices.size(); ++i) {
         int numRanks = getSize();
@@ -1235,7 +1212,8 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector<at::Device>& devices,
         }
         auto subComm = HCCLComm::createSubHcclComm(globalHcclComm, numRanks, options_->global_ranks_in_group.data(), hcclid, rank, commConfig);
         if (subComm == nullptr) {
-            ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed.");
+            ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed, group id is %s, subCommId is %llu.",
+                options_->group_id.c_str(), hcclid);
             return false;
         }
         hcclComms[i] = subComm;
@@ -1244,8 +1222,8 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector<at::Device>& devices,
     }
     auto subEndTime = std::chrono::steady_clock::now();
     auto subTimeElapsed = std::chrono::duration_cast<std::chrono::milliseconds>(subEndTime - subStartTime);
-    ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig success, subCommId is %llu, take %d milliseconds.",
-        hcclid, subTimeElapsed.count());
+    ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig success, group id is %s, subCommId is %llu, use %d ms.",
+        options_->group_id.c_str(), hcclid, subTimeElapsed.count());
     return true;
 }

@@ -1902,6 +1880,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allreduce(
 {
     check_npu_tensors_different_devices(tensors);
     std::vector<at::Tensor> tensors_cp = {tensors[0]};
+    ASCEND_LOGE("allreduce tensors_cp %p", tensors_cp[0].data_ptr());
     std::string functionName = __FUNCTION__;
     return collective(
         tensors_cp,
@@ -1915,9 +1894,11 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allreduce(
             auto outputDataPtr = output.data_ptr();
             auto numel = getNumelForHCCL(input);
             auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input);
+            ASCEND_LOGE("allreduce fn inputDataPtr %p, outputDataPtr %p", inputDataPtr, outputDataPtr);
             auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int {
                 torch_npu::profiler::MstxRange range(
                     getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm), stream.stream(false));
+                ASCEND_LOGE("allreduce hccl inputDataPtr %p, outputDataPtr %p", inputDataPtr, outputDataPtr);
                 auto hccl_result = HcclAllReduce(
                     inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false));
                 *is_dispatched = true;
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
index f8cf3c30904839b71d53b4c7339f90079720d51b..960ab8df38d38b51cd1df0495b9a295d603e7f96 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
@@ -261,6 +261,8 @@ public:
     bool is_high_priority_stream;

     std::vector<uint32_t> global_ranks_in_group;
+
+    std::string group_id;
 };

 // If you wish to create multiple process groups, each with a potentially
@@ -679,10 +681,6 @@ private:
     WatchdogStatus watchdogStatus;

-    static std::mutex group_ranks_map_mutex_;
-    static std::unordered_map<std::string, int> group_ranks_map_;
-    std::string global_hccl_id_;
-
     static ProcessGroupHCCL* global_;
 };

 } // namespace c10d_npu
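
Note on the new id derivation: after this patch, createHCCLCommEx derives the 64-bit subCommId passed to HcclCreateSubCommConfig by hashing the caller-supplied Options::group_id string, instead of the removed rank-list-plus-counter scheme. A minimal standalone sketch of that derivation (the group_id literal below is hypothetical; in the patch the value arrives from dist_backend_opts.group_id on the Python side):

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

int main()
{
    // Hypothetical group_id; in the patch it is carried in
    // ProcessGroupHCCL::Options::group_id.
    std::string group_id = "builtin_pg_2";

    // Same derivation as the patched createHCCLCommEx: hash the string
    // into the 64-bit subCommId handed to HcclCreateSubCommConfig.
    uint64_t subCommId = std::hash<std::string>{}(group_id);
    std::cout << "subCommId = " << subCommId << std::endl;
    return 0;
}

One caveat worth noting: std::hash is implementation-defined, so all ranks only agree on the subCommId when every process runs the same C++ standard library, which presumably holds for a homogeneous Ascend deployment.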
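For contrast, here is the removed derivation, reassembled from the deleted constructor code into a standalone sketch (the statics are pulled out of the class; the logic is otherwise unchanged). The id was the comma-joined rank list plus a per-process counter, so the result depended on the order in which same-ranked groups were constructed in each process, which is presumably what the globally agreed group_id fixes:

#include <cstdint>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

// Per-process bookkeeping, as in the removed static members.
static std::unordered_map<std::string, int> group_ranks_map;
static std::mutex group_ranks_map_mutex;

// Removed scheme: comma-join the ranks, then append a counter that
// increments each time a group with an identical rank list is created.
std::string legacyGlobalHcclId(const std::vector<uint32_t>& ranks)
{
    std::stringstream ss;
    for (size_t i = 0; i < ranks.size(); ++i) {
        ss << ranks[i];
        if (i != ranks.size() - 1) {
            ss << ", ";
        }
    }
    std::string group_ranks = ss.str();

    std::lock_guard<std::mutex> lock(group_ranks_map_mutex);
    auto it = group_ranks_map.find(group_ranks);
    if (it != group_ranks_map.end()) {
        group_ranks_map[group_ranks]++;
    } else {
        group_ranks_map[group_ranks] = 0;
    }
    return group_ranks + "_" + std::to_string(group_ranks_map[group_ranks]);
}

int main()
{
    std::cout << legacyGlobalHcclId({0, 1}) << std::endl;  // "0, 1_0"
    std::cout << legacyGlobalHcclId({0, 1}) << std::endl;  // "0, 1_1"
    return 0;
}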