diff --git "a/ascend_log\357\274\210\346\216\250\347\220\206\346\212\245\351\224\231\357\274\211.zip" "b/ascend_log\357\274\210\346\216\250\347\220\206\346\212\245\351\224\231\357\274\211.zip" new file mode 100644 index 0000000000000000000000000000000000000000..c7e1608768094439a9a2db9f925ef79f5ab65dd6 Binary files /dev/null and "b/ascend_log\357\274\210\346\216\250\347\220\206\346\212\245\351\224\231\357\274\211.zip" differ diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 7a01246c9a74b1ed72c7b0efd81be8974bfa25c1..c0c42d668d96eb138381531af9cef0020f156e90 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -190,6 +190,7 @@ def _new_process_group_hccl_helper(dist_backend_opts, pg_options): pg_options.is_high_priority_stream = False pg_options._timeout = dist_backend_opts.timeout pg_options.global_ranks_in_group = dist_backend_opts.global_ranks_in_group + pg_options.group_id = dist_backend_opts.group_id return torch_npu._C._distributed_c10d.ProcessGroupHCCL(store, group_rank, group_size, pg_options) diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 35abac738fa8ce5cabe687fa8445ae95a246ae61..9e33ab84f01b0f70a6c04620cc4cc40939e08d02 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -422,7 +422,9 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) { .def_readwrite("is_high_priority_stream", &::c10d_npu::ProcessGroupHCCL::Options::is_high_priority_stream) .def_readwrite("global_ranks_in_group", - &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group); + &::c10d_npu::ProcessGroupHCCL::Options::global_ranks_in_group) + .def_readwrite("group_id", + &::c10d_npu::ProcessGroupHCCL::Options::group_id); auto cDist = py::module_::import("torch._C._distributed_c10d"); auto parallelStore = intrusive_ptr_no_gil_destructor_class_<::c10d::ParallelTcpStore>( diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index e8a9f5a2834b9de7e81c98e96f1f54e8399c2f1b..50dea7baa79c2e4e781cce664d6c9769299000c0 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -233,8 +233,6 @@ const int64_t ProcessGroupHCCL::kProcessGroupHCCLOpTimeoutMillis = 10 * 1000; thread_local uint64_t ProcessGroupHCCL::hcclActiveGroupCounter_ = 0; const int64_t ProcessGroupHCCL::kWatchdogThreadSleepMillis = 1000; std::string ProcessGroupHCCL::perfdumppath = ""; -std::unordered_map ProcessGroupHCCL::group_ranks_map_; -std::mutex ProcessGroupHCCL::group_ranks_map_mutex_; ProcessGroupHCCL* ProcessGroupHCCL::global_ = nullptr; std::ostream& operator<<(std::ostream& output, const ProcessGroupHCCL::WorkHCCL& workHCCL) @@ -692,28 +690,6 @@ ProcessGroupHCCL::ProcessGroupHCCL( hcclCommWatchdogThread_ = std::thread(&ProcessGroupHCCL::hcclCommWatchdog, this); #endif - const std::vector& ranks = groupRanks(); - std::stringstream ss; - for (size_t i = 0; i < ranks.size(); ++i) { - ss << ranks[i]; - if (i != ranks.size() - 1) { - ss << ", "; - } - } - std::string group_ranks = ss.str(); - - { - std::lock_guard lock(group_ranks_map_mutex_); - auto it = group_ranks_map_.find(group_ranks); - if (it != group_ranks_map_.end()) { - group_ranks_map_[group_ranks]++; - } else { - group_ranks_map_[group_ranks] = 0; - } - - global_hccl_id_ = group_ranks + "_" + std::to_string(group_ranks_map_[group_ranks]); - } - if (options_->global_ranks_in_group.empty()) { global_ = this; } @@ -786,6 +762,7 @@ 
         }
         devHCCLCommMap_.clear();
     }
+    ASCEND_LOGI("process group destroyed, group id is %s.", options_->group_id.c_str());
 }

 void ProcessGroupHCCL::hcclCommWatchdog()
@@ -1218,7 +1195,7 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector<at::Device>& devices,
         return false;
     }

-    uint64_t hcclid = (std::hash<std::string>{}(global_hccl_id_));
+    uint64_t hcclid = (std::hash<std::string>{}(options_->group_id));
     auto subStartTime = std::chrono::steady_clock::now();
     for (size_t i = 0; i < devices.size(); ++i) {
         int numRanks = getSize();
@@ -1235,7 +1212,8 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector<at::Device>& devices,
         }
         auto subComm = HCCLComm::createSubHcclComm(globalHcclComm, numRanks, options_->global_ranks_in_group.data(), hcclid, rank, commConfig);
         if (subComm == nullptr) {
-            ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed.");
+            ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig failed, group id is %s, subCommId is %llu.",
+                options_->group_id.c_str(), hcclid);
             return false;
         }
         hcclComms[i] = subComm;
@@ -1244,8 +1222,8 @@ bool ProcessGroupHCCL::createHCCLCommEx(const std::vector<at::Device>& devices,
     }
     auto subEndTime = std::chrono::steady_clock::now();
     auto subTimeElapsed = std::chrono::duration_cast<std::chrono::milliseconds>(subEndTime - subStartTime);
-    ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig success, subCommId is %llu, take %d milliseconds.",
-        hcclid, subTimeElapsed.count());
+    ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig success, group id is %s, subCommId is %llu, use %d ms.",
+        options_->group_id.c_str(), hcclid, subTimeElapsed.count());
     return true;
 }

@@ -1902,6 +1880,7 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allreduce(
 {
     check_npu_tensors_different_devices(tensors);
     std::vector<at::Tensor> tensors_cp = {tensors[0]};
+    ASCEND_LOGE("allreduce tensors_cp %p", tensors_cp[0].data_ptr());
     std::string functionName = __FUNCTION__;
     return collective(
         tensors_cp,
@@ -1915,9 +1894,11 @@ c10::intrusive_ptr<c10d::Work> ProcessGroupHCCL::allreduce(
             auto outputDataPtr = output.data_ptr();
             auto numel = getNumelForHCCL(input);
             auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input);
+            ASCEND_LOGE("allreduce fn inputDataPtr %p, outputDataPtr %p", inputDataPtr, outputDataPtr);
             auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int {
                 torch_npu::profiler::MstxRange range(
                     getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm), stream.stream(false));
+                ASCEND_LOGE("allreduce hccl inputDataPtr %p, outputDataPtr %p", inputDataPtr, outputDataPtr);
                 auto hccl_result = HcclAllReduce(
                     inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false));
                 *is_dispatched = true;
diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
index f8cf3c30904839b71d53b4c7339f90079720d51b..960ab8df38d38b51cd1df0495b9a295d603e7f96 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp
@@ -261,6 +261,8 @@ public:
     bool is_high_priority_stream;

     std::vector<uint32_t> global_ranks_in_group;
+
+    std::string group_id;
 };

 // If you wish to create multiple process groups, each with a potentially
@@ -679,10 +681,6 @@ private:
     WatchdogStatus watchdogStatus;

-    static std::mutex group_ranks_map_mutex_;
-    static std::unordered_map<std::string, int> group_ranks_map_;
-    std::string global_hccl_id_;
-
     static ProcessGroupHCCL* global_;
 };

 } // namespace c10d_npu
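
Note on the new id derivation: after this patch, createHCCLCommEx derives the 64-bit subCommId passed to HcclCreateSubCommConfig by hashing the caller-supplied Options::group_id string, instead of the removed rank-list-plus-counter scheme. A minimal standalone sketch of that derivation (the group_id literal below is hypothetical; in the patch the value arrives from dist_backend_opts.group_id on the Python side):

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>

int main()
{
    // Hypothetical group_id; in the patch it is carried in
    // ProcessGroupHCCL::Options::group_id.
    std::string group_id = "builtin_pg_2";

    // Same derivation as the patched createHCCLCommEx: hash the string
    // into the 64-bit subCommId handed to HcclCreateSubCommConfig.
    uint64_t subCommId = std::hash<std::string>{}(group_id);
    std::cout << "subCommId = " << subCommId << std::endl;
    return 0;
}

One caveat worth noting: std::hash is implementation-defined, so all ranks only agree on the subCommId when every process runs the same C++ standard library, which presumably holds for a homogeneous Ascend deployment.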
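For contrast, here is the removed derivation, reassembled from the deleted constructor code into a standalone sketch (the statics are pulled out of the class; the logic is otherwise unchanged). The id was the comma-joined rank list plus a per-process counter, so the result depended on the order in which same-ranked groups were constructed in each process, which is presumably what the globally agreed group_id fixes:

#include <cstdint>
#include <iostream>
#include <mutex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

// Per-process bookkeeping, as in the removed static members.
static std::unordered_map<std::string, int> group_ranks_map;
static std::mutex group_ranks_map_mutex;

// Removed scheme: comma-join the ranks, then append a counter that
// increments each time a group with an identical rank list is created.
std::string legacyGlobalHcclId(const std::vector<uint32_t>& ranks)
{
    std::stringstream ss;
    for (size_t i = 0; i < ranks.size(); ++i) {
        ss << ranks[i];
        if (i != ranks.size() - 1) {
            ss << ", ";
        }
    }
    std::string group_ranks = ss.str();

    std::lock_guard<std::mutex> lock(group_ranks_map_mutex);
    auto it = group_ranks_map.find(group_ranks);
    if (it != group_ranks_map.end()) {
        group_ranks_map[group_ranks]++;
    } else {
        group_ranks_map[group_ranks] = 0;
    }
    return group_ranks + "_" + std::to_string(group_ranks_map[group_ranks]);
}

int main()
{
    std::cout << legacyGlobalHcclId({0, 1}) << std::endl;  // "0, 1_0"
    std::cout << legacyGlobalHcclId({0, 1}) << std::endl;  // "0, 1_1"
    return 0;
}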