From 08b6e1070c9a34d6e81c1a7cee371f04ac44c06e Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 14 Feb 2025 01:10:51 +0000 Subject: [PATCH 001/358] !17959 Update torchair commit id Merge pull request !17959 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index adfaa941ff..bef63ceb25 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit adfaa941ff59504d9ef46497426f83a53c036e18 +Subproject commit bef63ceb255eaf030a2a67dbe09ba52cced24eaf -- Gitee From 9e962957b74eb117884f9138752aafb3e3edc4dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A1=91=E6=9E=97?= Date: Fri, 14 Feb 2025 09:29:05 +0000 Subject: [PATCH 002/358] =?UTF-8?q?!17949=20moe=5Fdistribute=5Fdispatch/co?= =?UTF-8?q?mbine=E7=AE=97=E5=AD=90=E4=BF=AE=E6=94=B9tp=E5=9F=9F=E5=85=A5?= =?UTF-8?q?=E5=8F=82=E4=B8=BA=E5=8F=AF=E9=80=89=20Merge=20pull=20request?= =?UTF-8?q?=20!17949=20from=20=E6=A1=91=E6=9E=97/master?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/meta/_meta_registrations.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch_npu/meta/_meta_registrations.py b/torch_npu/meta/_meta_registrations.py index e880a91f81..019ce61117 100644 --- a/torch_npu/meta/_meta_registrations.py +++ b/torch_npu/meta/_meta_registrations.py @@ -270,8 +270,8 @@ def npu_masked_softmax_with_rel_pos_bias_meta(x, atten_mask, relative_pos_bias, @impl(m, "npu_moe_distribute_dispatch") -def npu_moe_distribute_dispatch_meta(x, expert_ids, group_ep, group_tp, ep_world_size, tp_world_size, ep_rank_id, tp_rank_id, expert_shard_type, - shared_expert_rank_num, moe_expert_num, scales=None, quant_mode=0, global_bs=0): +def npu_moe_distribute_dispatch_meta(x, expert_ids, group_ep, ep_world_size, ep_rank_id, moe_expert_num, scales=None, group_tp="", tp_world_size=0, + tp_rank_id=0, expert_shard_type=0, shared_expert_rank_num=0, quant_mode=0, global_bs=0): n = x.size(0) h = x.size(1) k = expert_ids.size(1) @@ -316,8 +316,8 @@ def npu_moe_distribute_dispatch_meta(x, expert_ids, group_ep, group_tp, ep_world @impl(m, "npu_moe_distribute_combine") -def npu_moe_distribute_combine_meta(expand_x, expert_ids, expand_idx, ep_send_counts, tp_send_counts, expert_scales, group_ep, group_tp, ep_world_size, tp_world_size, ep_rank_id, tp_rank_id, expert_shard_type, - shared_expert_rank_num, moe_expert_num, global_bs=0): +def npu_moe_distribute_combine_meta(expand_x, expert_ids, expand_idx, ep_send_counts, expert_scales, group_ep, ep_world_size, ep_rank_id, moe_expert_num, + tp_send_counts=None, group_tp="", tp_world_size=0, tp_rank_id=0, expert_shard_type=0, shared_expert_rank_num=0, global_bs=0): dim_list = [] dim_list.append(expert_ids.size(0)) dim_list.append(expand_x.size(1)) -- Gitee From 22e405feaa0c41eb7de13784a1f3e7648e6a7703 Mon Sep 17 00:00:00 2001 From: wangjie Date: Fri, 14 Feb 2025 09:52:02 +0000 Subject: [PATCH 003/358] !17957 [PROF] Profiler fix get group info error Merge pull request !17957 from wangjie/profiler_group_info_fix260 --- torch_npu/profiler/profiler_interface.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/torch_npu/profiler/profiler_interface.py b/torch_npu/profiler/profiler_interface.py index e2df003acf..3b1127ed0d 100644 --- a/torch_npu/profiler/profiler_interface.py +++ b/torch_npu/profiler/profiler_interface.py @@ 
-254,19 +254,21 @@ class _ProfInterface: if backend != "hccl": continue hccl_group = group._get_backend(torch.device("npu")) - comm_name = hccl_group.get_hccl_comm_name(global_rank) + comm_name = hccl_group.get_hccl_comm_name(global_rank, init_comm=False) + if comm_name: + group_info[comm_name] = { + "group_name": hccl_group.options.hccl_config.get("group_name", ""), + "group_rank": torch.distributed.get_group_rank(group, global_rank), + "global_ranks": torch.distributed.get_process_group_ranks(group) + } + default_group = torch.distributed.distributed_c10d._get_default_group() + comm_name = default_group._get_backend(torch.device("npu")).get_hccl_comm_name(global_rank, init_comm=False) + if comm_name: group_info[comm_name] = { - "group_name": hccl_group.options.hccl_config.get("group_name", ""), - "group_rank": torch.distributed.get_group_rank(group, global_rank), - "global_ranks": torch.distributed.get_process_group_ranks(group) + "group_name": "default_group", + "group_rank": torch.distributed.get_group_rank(default_group, global_rank), + "global_ranks": torch.distributed.get_process_group_ranks(default_group) } - default_group = torch.distributed.distributed_c10d._get_default_group() - comm_name = default_group._get_backend(torch.device("npu")).get_hccl_comm_name(global_rank) - group_info[comm_name] = { - "group_name": "default_group", - "group_rank": torch.distributed.get_group_rank(default_group, global_rank), - "global_ranks": torch.distributed.get_process_group_ranks(default_group) - } if group_info: self.metadata.update({self.PARALLEL_GROUP_KEY: group_info}) except Exception as err: -- Gitee From 919ae5ab417cb3432857a500fd9a1b1b57fd3172 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 14 Feb 2025 23:39:52 +0000 Subject: [PATCH 004/358] !17997 Update torchair commit id Merge pull request !17997 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index bef63ceb25..4ed6f8606a 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit bef63ceb255eaf030a2a67dbe09ba52cced24eaf +Subproject commit 4ed6f8606ae15e4136af04356de9f4fdfea6a7a5 -- Gitee From 5929a78c68ce6fc84d3187c76ea752e27e1b8ca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A9=B9=E6=98=8A?= Date: Sat, 15 Feb 2025 01:44:56 +0000 Subject: [PATCH 005/358] =?UTF-8?q?!17968=20add=20new=20api=20npu=5Fgroup?= =?UTF-8?q?=5Fnorm=5Fswish=20Merge=20pull=20request=20!17968=20from=20?= =?UTF-8?q?=E8=A9=B9=E6=98=8A/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/allowlist_for_publicAPI.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 045819a6d4..4d0b97dc94 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2850,7 +2850,8 @@ "scatter_update_", "npu_moe_compute_expert_tokens", "npu_moe_gating_top_k_softmax", - "npu_moe_init_routing" + "npu_moe_init_routing", + "npu_group_norm_swish" ], "torch_npu.contrib": [ "npu_fused_attention_with_layernorm", -- Gitee From de260a36ca896b345b90ecdec557439e5a549ddd Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 15 Feb 2025 03:19:56 +0000 Subject: [PATCH 006/358] !18002 Update op_plugin commit id Merge pull request !18002 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 7d3403fa9b..8f6490f029 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 7d3403fa9b4d3fd7fc027e160dded5a5596fd324 +Subproject commit 8f6490f029f51d8d2c3f78fc2d4540996ceb491a -- Gitee From 6bf33eadbcbb8c9dac81eb294997cc00296c6cf6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 15 Feb 2025 04:49:55 +0000 Subject: [PATCH 007/358] !18006 Update op_plugin commit id Merge pull request !18006 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 8f6490f029..e8fe30bd83 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 8f6490f029f51d8d2c3f78fc2d4540996ceb491a +Subproject commit e8fe30bd83ae021b90a1ecd7d520f87d424481ea -- Gitee From 3cf1b942ee20b9cc6e00e09a2b866a141fd8c134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Sat, 15 Feb 2025 10:14:05 +0000 Subject: [PATCH 008/358] =?UTF-8?q?!17977=20[Feature]=20Add=20npu=5Fcross?= =?UTF-8?q?=5Fentropy=5Floss=20for=20allowlist=5Ffor=5FpublicAPI=20Merge?= =?UTF-8?q?=20pull=20request=20!17977=20from=20=E5=88=98=E5=98=89=E5=B7=8D?= =?UTF-8?q?/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/allowlist_for_publicAPI.json | 1 + 1 file changed, 1 insertion(+) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 4d0b97dc94..9ed3179d1d 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2815,6 +2815,7 @@ "npu_apply_adam", "npu_bert_apply_adam", "npu_clear_float_status", + "npu_cross_entropy_loss", "npu_format_cast_", "npu_fusion_attention", "npu_get_float_status", -- Gitee From e60ecf5bcb3101dc038fb269434d8981887b370f Mon Sep 17 00:00:00 2001 From: wgb Date: Mon, 17 Feb 2025 04:21:25 +0000 Subject: [PATCH 009/358] !17956 Enable atb for PTA Merge pull request !17956 from wgb/2.6_copy --- setup.py | 17 +++++++++++++++++ torch_npu/__init__.py | 1 + 2 files changed, 18 insertions(+) diff --git a/setup.py b/setup.py index 4409a99caa..1ca33aab47 100644 --- a/setup.py +++ b/setup.py @@ -414,6 +414,21 @@ def add_ops_files(base_dir, file_list): return +def add_ops_python_files(ret_list): + # add ops python files + opplugin_path = os.path.join(BASE_DIR, 'third_party/op-plugin/op_plugin/python') + + if os.path.exists(opplugin_path): + ops_python_files = glob.glob(os.path.join(opplugin_path, '**/*.py'), recursive=True) + for src in ops_python_files: + dst = os.path.join( + os.path.join(BASE_DIR, "build/packages/torch_npu/op_plugin"), + os.path.relpath(src, opplugin_path)) + os.makedirs(os.path.dirname(dst), exist_ok=True) + ret_list.append((src, dst)) + return + + def get_src_py_and_dst(): ret = [] generated_python_files = glob.glob( @@ -432,6 +447,8 @@ def get_src_py_and_dst(): os.makedirs(os.path.dirname(dst), exist_ok=True) ret.append((src, dst)) + add_ops_python_files(ret) + header_files = [ "torch_npu/csrc/*.h", "torch_npu/csrc/*/*.h", diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index e6e4d7534f..d46af43d83 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -74,6 +74,7 @@ from torch_npu.utils import _apply_module_patch, _add_tensor_methods, _add_colle from torch_npu.npu._stream_check import apply_sanitizer_patch import torch_npu.utils.custom_ops import torch_npu.distributed.rpc +import 
torch_npu.op_plugin from torch_npu.distributed.rpc.backend_registry import _rpc_backend_registry from torch_npu.utils import _cann_package_check, _add_intercept_methods from torch_npu.utils import _register_ops_under_dtensor_rules -- Gitee From 8ed2723aa3117cfd1e1336a5070906c1840d1bcf Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 17 Feb 2025 04:49:58 +0000 Subject: [PATCH 010/358] !18031 Update op_plugin commit id Merge pull request !18031 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index e8fe30bd83..4cee96e32a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit e8fe30bd83ae021b90a1ecd7d520f87d424481ea +Subproject commit 4cee96e32a77913e086fbf2140dec76a583f63de -- Gitee From 3bed597e4b103c8af00261d8a0aed60a547a4db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 17 Feb 2025 06:33:04 +0000 Subject: [PATCH 011/358] =?UTF-8?q?!17999=20Update=20TORCH=5FNPU=5FLOG=20p?= =?UTF-8?q?atch=20method=20Merge=20pull=20request=20!17999=20from=20?= =?UTF-8?q?=E7=8E=8B=E8=B6=85/v2.6.0=5Flogging?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/_logging/__init__.py | 2 +- torch_npu/_logging/_internal.py | 11 +++++++- torch_npu/csrc/logging/LogContext.cpp | 38 ++++++++++++++------------- torch_npu/csrc/logging/LogContext.h | 6 ++--- torch_npu/csrc/logging/Logger.cpp | 8 +++--- torch_npu/csrc/logging/Logger.h | 6 ++--- 6 files changed, 41 insertions(+), 30 deletions(-) diff --git a/torch_npu/_logging/__init__.py b/torch_npu/_logging/__init__.py index 121c7fbfa3..e669356924 100644 --- a/torch_npu/_logging/__init__.py +++ b/torch_npu/_logging/__init__.py @@ -16,7 +16,7 @@ _add_logging_module() def _update_log_state_from_env(): log_setting = os.environ.get("TORCH_NPU_LOGS", None) if log_setting is not None: - os.environ["TORCH_LOGS"] = log_setting + torch._logging._internal.LOG_ENV_VAR = "TORCH_NPU_LOGS" torch._logging._internal._init_logs() _C._logging._LogContext.GetInstance().setLogs(torch._logging._internal.log_state.log_qname_to_level) elif os.environ.get("TORCH_LOGS", None) is not None: diff --git a/torch_npu/_logging/_internal.py b/torch_npu/_logging/_internal.py index 5b6edd048a..951b1854d4 100644 --- a/torch_npu/_logging/_internal.py +++ b/torch_npu/_logging/_internal.py @@ -5,6 +5,15 @@ from torch_npu import _C def _set_logs(): + """ + Propagate the results torch._logging.set_logs to the C++ layer. + + .. note:: The ``TORCH_LOGS`` or ``TORCH_NPU_LOGS`` environment variable has complete precedence + over this function, so if it was set, this function does nothing. 
+ + """ + + # ignore if env var is set if os.environ.get('TORCH_LOGS', None) is not None or os.environ.get('TORCH_NPU_LOGS', None) is not None: return @@ -25,4 +34,4 @@ def _logging_patch(): def _add_logging_module(): torch._logging._internal.register_log("memory", "torch_npu.memory") - torch._logging._internal.register_log("delivery", "torch_npu.delivery") + torch._logging._internal.register_log("dispatch", "torch_npu.dispatch") diff --git a/torch_npu/csrc/logging/LogContext.cpp b/torch_npu/csrc/logging/LogContext.cpp index 457d295b98..a05a059421 100644 --- a/torch_npu/csrc/logging/LogContext.cpp +++ b/torch_npu/csrc/logging/LogContext.cpp @@ -9,16 +9,16 @@ LogContext &LogContext::GetInstance() } // Locked from the Outside -void LogContext::GetAliasAndLevelByName(const std::string& name, std::string& alias, LoggingLevel& level) +void LogContext::GetQNameAndLevelByName(const std::string& name, std::string& qname, LoggingLevel& level) { std::string nameKey = name; level = allLevel_; - alias = ""; + qname = ""; do { - auto iterLevel = aliasLevels_.find(nameKey); - if (iterLevel != aliasLevels_.end()) { + auto iterLevel = qnameLevels_.find(nameKey); + if (iterLevel != qnameLevels_.end()) { level = static_cast(iterLevel->second); - alias = iterLevel->first; + qname = iterLevel->first; break; } auto pos = nameKey.rfind('.'); @@ -29,23 +29,25 @@ void LogContext::GetAliasAndLevelByName(const std::string& name, std::string& al } while (true); } -void LogContext::setLogs(const std::unordered_map& aliasLevels) +void LogContext::setLogs(const std::unordered_map& qnameLevels) { std::lock_guard lock(mutex_); - aliasLevels_ = aliasLevels; - auto iter = aliasLevels_.find("torch"); - if (iter != aliasLevels_.end()) { + qnameLevels_ = qnameLevels; + auto iter = qnameLevels_.find("torch"); + if (iter != qnameLevels_.end()) { allLevel_ = static_cast(iter->second); } + // Global or static logger variables are initialized prior to the invocation of set_logs, + // the logging levels associated with these loggers should be updated to reflect the new settings. 
for (auto iter = loggers_.begin(); iter != loggers_.end(); iter++) { LoggingLevel level = allLevel_; - std::string alias = iter->second->getModuleAlias(); - if (alias.empty()) { - GetAliasAndLevelByName(iter->first, alias, level); - iter->second->setModuleAlias(alias); + std::string qname = iter->second->getQName(); + if (qname.empty()) { + GetQNameAndLevelByName(iter->first, qname, level); + iter->second->setQName(qname); } - auto iterLevel = aliasLevels_.find(alias); - if (iterLevel != aliasLevels_.end()) { + auto iterLevel = qnameLevels_.find(qname); + if (iterLevel != qnameLevels_.end()) { level = static_cast(iterLevel->second); } iter->second->setAllowLevel(level); @@ -59,12 +61,12 @@ std::shared_ptr LogContext::getLogger(const std::string& name) if (iter != loggers_.end()) { return iter->second; } - std::string alias; + std::string qname; LoggingLevel level = allLevel_; - GetAliasAndLevelByName(name, alias, level); + GetQNameAndLevelByName(name, qname, level); std::shared_ptr logger = std::make_shared(name); logger->setAllowLevel(level); - logger->setModuleAlias(alias); + logger->setQName(qname); loggers_[name] = logger; return logger; } diff --git a/torch_npu/csrc/logging/LogContext.h b/torch_npu/csrc/logging/LogContext.h index ec619b7e45..f0bdd6be57 100644 --- a/torch_npu/csrc/logging/LogContext.h +++ b/torch_npu/csrc/logging/LogContext.h @@ -18,13 +18,13 @@ public: std::shared_ptr getLogger(const std::string& name = ""); static LogContext& GetInstance(); - void setLogs(const std::unordered_map& aliasLevels); + void setLogs(const std::unordered_map& qnameLevels); private: - void GetAliasAndLevelByName(const std::string& name, std::string& alias, LoggingLevel& level); + void GetQNameAndLevelByName(const std::string& name, std::string& qname, LoggingLevel& level); std::mutex mutex_; - std::unordered_map aliasLevels_; + std::unordered_map qnameLevels_; LoggingLevel allLevel_ = LoggingLevel::WARNING; std::unordered_map> loggers_; }; diff --git a/torch_npu/csrc/logging/Logger.cpp b/torch_npu/csrc/logging/Logger.cpp index e25257c3ca..385d11f6af 100644 --- a/torch_npu/csrc/logging/Logger.cpp +++ b/torch_npu/csrc/logging/Logger.cpp @@ -22,14 +22,14 @@ void Logger::setAllowLevel(LoggingLevel level) allow_level_ = level; } -void Logger::setModuleAlias(const std::string& alias) +void Logger::setQName(const std::string& qname) { - alias_ = alias; + qname_ = qname; } -std::string Logger::getModuleAlias() +std::string Logger::getQName() { - return alias_; + return qname_; } void Logger::log(LoggingLevel level, const char* format, va_list args) diff --git a/torch_npu/csrc/logging/Logger.h b/torch_npu/csrc/logging/Logger.h index 8c918498f7..6c08c6c379 100644 --- a/torch_npu/csrc/logging/Logger.h +++ b/torch_npu/csrc/logging/Logger.h @@ -21,8 +21,8 @@ public: ~Logger() = default; void setAllowLevel(LoggingLevel level); - void setModuleAlias(const std::string& alias); - std::string getModuleAlias(); + void setQName(const std::string& qname); + std::string getQName(); void debug(const char* format, ...); void info(const char* format, ...); void warn(const char* format, ...); @@ -34,7 +34,7 @@ private: LoggingLevel allow_level_ = LoggingLevel::WARNING; std::string name_; - std::string alias_; + std::string qname_; }; } -- Gitee From 3532c534ab6e2791c014c47ef667015cb8185e1e Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Mon, 17 Feb 2025 08:07:20 +0000 Subject: [PATCH 012/358] !17986 Delete -fopenmp Merge pull request !17986 from yuhaiyan/v2.6.0-dev1 --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e118364db..5045ab0a21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,7 +122,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=old-style-cast") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__FILENAME__='\"$(notdir $(abspath $<))\"'") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-builtin-macro-redefined -D__FILE__='\"$(subst $(realpath ${CMAKE_SOURCE_DIR})/,,$(abspath $<))\"'") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") # These flags are not available in GCC-4.8.5. Set only when using clang. if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-invalid-partial-specialization") -- Gitee From ba493ea1cb27a54d290150319e1617cb4f71a286 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Mon, 17 Feb 2025 08:14:53 +0000 Subject: [PATCH 013/358] !17981 Fixed the failed tests. Merge pull request !17981 from yuhaiyan/v2.6.0-dev2 --- test/distributed/test_allgather_base.py | 3 ++- test/distributed/test_allgather_into_tensor.py | 6 ++++-- test/distributed/test_reduce_scatter_tensor.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/distributed/test_allgather_base.py b/test/distributed/test_allgather_base.py index a5b9184abb..957ce3487d 100644 --- a/test/distributed/test_allgather_base.py +++ b/test/distributed/test_allgather_base.py @@ -18,7 +18,7 @@ from test_allgather import HcclAllGatherTestBase class HcclAllGatherBaseTest(HcclAllGatherTestBase): @classmethod - def _test_all_gather_base(cls, rank, input1, world_size, init_pg, c2p): + def _test_all_gather_base(cls, rank, input1, world_size, init_pg, c2p, p2c): pg = init_pg(rank, world_size) input1 = input1.npu() shape = list(input1.size()) @@ -27,6 +27,7 @@ class HcclAllGatherBaseTest(HcclAllGatherTestBase): pg._all_gather_base(gather_tensor, input1) c2p.put((rank, gather_tensor.cpu())) pg.barrier() + p2c.get() @skipIfUnsupportMultiNPU(2) def test_all_gather_base_dist(self): diff --git a/test/distributed/test_allgather_into_tensor.py b/test/distributed/test_allgather_into_tensor.py index 5c1d4b33c3..9ef4269923 100644 --- a/test/distributed/test_allgather_into_tensor.py +++ b/test/distributed/test_allgather_into_tensor.py @@ -18,7 +18,7 @@ from test_allgather import HcclAllGatherTestBase class HcclAllGatherIntoTensorTest(HcclAllGatherTestBase): @classmethod - def _test_all_gather_into_tensor(cls, rank, input1, world_size, init_pg, c2p): + def _test_all_gather_into_tensor(cls, rank, input1, world_size, init_pg, c2p, p2c): pg = init_pg(rank, world_size) input1 = input1.npu() shape = list(input1.size()) @@ -27,6 +27,7 @@ class HcclAllGatherIntoTensorTest(HcclAllGatherTestBase): pg.all_gather_into_tensor(gather_tensor, input1) c2p.put((rank, gather_tensor.cpu())) pg.barrier() + p2c.get() @skipIfUnsupportMultiNPU(2) def test_all_gather_into_tensor_dist(self): @@ -46,7 +47,7 @@ class HcclAllGatherIntoTensorTest(HcclAllGatherTestBase): HcclAllGatherIntoTensorTest._init_dist_hccl, expected, input1, world_size) @classmethod - def _test_all_gather_into_tensor_uneven(cls, rank, input1, world_size, init_pg, c2p): + def _test_all_gather_into_tensor_uneven(cls, rank, input1, world_size, init_pg, c2p, p2c): init_pg(rank, world_size) input1 = input1.npu() shape = list(input1.size()) @@ -55,6 +56,7 @@ class HcclAllGatherIntoTensorTest(HcclAllGatherTestBase): torch_npu.distributed.all_gather_into_tensor_uneven(gather_tensor, input1) c2p.put((rank, gather_tensor.cpu())) dist.barrier() + p2c.get() @skipIfUnsupportMultiNPU(2) def 
test_all_gather_into_tensor_uneven_dist(self): diff --git a/test/distributed/test_reduce_scatter_tensor.py b/test/distributed/test_reduce_scatter_tensor.py index a064daf27a..adfafe786a 100644 --- a/test/distributed/test_reduce_scatter_tensor.py +++ b/test/distributed/test_reduce_scatter_tensor.py @@ -50,7 +50,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): @classmethod # pylint:disable=huawei-too-many-arguments - def _test_reduce_scatter_tensor_uneven(cls, rank, input_list, world_size, init_pg, c2p, reduce_op=dist.ReduceOp.SUM): + def _test_reduce_scatter_tensor_uneven(cls, rank, input_list, world_size, init_pg, c2p, p2c, reduce_op=dist.ReduceOp.SUM): init_pg(rank, world_size) input_list_npu = [input.npu() for input in input_list] input_tensor = torch.cat(input_list_npu) @@ -58,6 +58,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): torch_npu.distributed.reduce_scatter_tensor_uneven(output, input_tensor, reduce_op) c2p.put((rank, output.cpu())) dist.barrier() + p2c.get() @skipIfUnsupportMultiNPU(2) def test_reduce_scatter_tensor_uneven(self): -- Gitee From 1b4ac3ff2f0cceba510cef7217e188825b564ce8 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 18 Feb 2025 03:50:02 +0000 Subject: [PATCH 014/358] !18045 Update op_plugin commit id Merge pull request !18045 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 4cee96e32a..5078c25131 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 4cee96e32a77913e086fbf2140dec76a583f63de +Subproject commit 5078c25131c46fd1e903d595260d8104a2a7061c -- Gitee From e0a7a1c740fb25f5371534bfd9512ca126beeaaf Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 18 Feb 2025 04:50:01 +0000 Subject: [PATCH 015/358] !18055 Update op_plugin commit id Merge pull request !18055 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5078c25131..4ef888c8ee 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5078c25131c46fd1e903d595260d8104a2a7061c +Subproject commit 4ef888c8ee30bd22da2c6ac2f282812fed2918ba -- Gitee From bf30ddfbd69e33b15b7e2e37084d12728bcfe1af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Tue, 18 Feb 2025 11:15:17 +0000 Subject: [PATCH 016/358] =?UTF-8?q?!18063=20Capture=20HBM=20error=20inform?= =?UTF-8?q?ation=20and=20return=20it=20when=20querying=20UCE=20errors.=20M?= =?UTF-8?q?erge=20pull=20request=20!18063=20from=20=E9=97=AB=E9=B9=8F?= =?UTF-8?q?=E5=85=A8/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUException.cpp | 8 ++++++++ torch_npu/csrc/core/npu/NPUException.h | 14 ++++++++++---- torch_npu/csrc/core/npu/NPUQueue.cpp | 4 +++- torch_npu/csrc/npu/Module.cpp | 6 +++++- 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 3f6e95bc42..0b31d34906 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -103,6 +103,14 @@ const char *c10_npu_get_error_message() return c10_npu::acl::AclGetErrMsg(); } +void record_mem_hbm_ecc_error() +{ + MemUceInfo memUceInfo_; + memUceInfo_.is_hbm_ecc_error = true; + ASCEND_LOGE("Log HBM MULTI BIT ECC ERROR, set is_hbm_ecc_error 
param is true"); + set_mem_uce_info(memUceInfo_); +} + bool checkUceErrAndRepair(bool check_error, std::string& err_msg) { int device = 0; diff --git a/torch_npu/csrc/core/npu/NPUException.h b/torch_npu/csrc/core/npu/NPUException.h index 4db5591d67..0005b4046d 100644 --- a/torch_npu/csrc/core/npu/NPUException.h +++ b/torch_npu/csrc/core/npu/NPUException.h @@ -120,7 +120,7 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) ", error code is ", error_code, PTA_ERROR(ErrCode::ACL)); \ break; \ } \ - case ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR: { \ + case ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR: { \ ASCEND_LOGE("getRepoStopFlag in Run, throw ECC ERROR."); \ std::string error_msg(c10_npu::c10_npu_get_error_message()); \ std::regex pattern(R"(time us= (\d+)\.)"); \ @@ -131,9 +131,11 @@ inline const char* getErrorFunction(const char* /* msg */, const char* args) time_msg = match[1].str(); \ } \ } \ + c10_npu::record_mem_hbm_ecc_error(); \ TORCH_CHECK(false, __func__, ":", __FILE__, ":", __LINE__, \ - " NPU function error: HBM MULTI BIT ECC ERROR.", time_msg, \ - ", error code is ", error_code, PTA_ERROR(ErrCode::ACL)); \ + " NPU function error: HBM MULTI BIT ECC ERROR.", error_msg, \ + "time is ", time_msg, ", error code is ", error_code, \ + PTA_ERROR(ErrCode::ACL)); \ break; \ } \ case ACL_ERROR_RT_DEVICE_MEM_ERROR: { \ @@ -241,8 +243,9 @@ struct MemUceInfo { aclrtMemUceInfo info[MAX_MEM_UCE_INFO_ARRAY_SIZE]; size_t retSize; int mem_type; + bool is_hbm_ecc_error; - MemUceInfo() : device(-1), retSize(0), mem_type(0) + MemUceInfo() : device(-1), retSize(0), mem_type(0), is_hbm_ecc_error(false) { std::memset(info, 0, sizeof(info)); } @@ -253,6 +256,7 @@ struct MemUceInfo { std::memset(info, 0, sizeof(info)); retSize = 0; mem_type = 0; + is_hbm_ecc_error = false; } }; @@ -260,6 +264,8 @@ C10_NPU_API const char *c10_npu_get_error_message(); bool checkUceErrAndRepair(bool check_error, std::string& err_msg); +void record_mem_hbm_ecc_error(); + void set_mem_uce_info(MemUceInfo info); MemUceInfo get_mem_uce_info(); diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 1cb1d284aa..3b39831372 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -378,7 +378,9 @@ bool Repository::ReadQueue() std::string err_msg; if (ret == ACL_ERROR_RT_DEVICE_MEM_ERROR && checkUceErrAndRepair(false, err_msg)) { SetStatus(UCE_EXIT); - } else if (ret == ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR) { + } else if (ret == ACL_ERROR_RT_HBM_MULTI_BIT_ECC_ERROR || + acl_error.find(DEVICE_HBM_ECC_ERROR) != std::string::npos) { + record_mem_hbm_ecc_error(); SetStatus(HBM_ECC_EXIT); } else if (GetStatus() != STOP_EXIT) { SetStatus(ERROR_EXIT); diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 6fb980b2f9..df23a997f1 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -323,6 +323,10 @@ PyObject* THNPModule_check_uce_in_memory_wrap(PyObject* self, PyObject* arg) HANDLE_TH_ERRORS int device = THPUtils_unpackLong(arg); auto memUceInfo_ = c10_npu::get_mem_uce_info(); + if (memUceInfo_.is_hbm_ecc_error) { + // HBM ECC error always return 3. + return PyLong_FromLong(3); + } if (memUceInfo_.retSize == 0) { // UCE error size is 0, return 0. 
memUceInfo_.mem_type = 0; @@ -356,7 +360,7 @@ PyObject* THNPModule_restart_device_wrap(PyObject* self, PyObject* arg) if (memUceInfo_.retSize > 0) { NPU_CHECK_ERROR_WITHOUT_UCE(c10_npu::acl::AclrtMemUceRepair(memUceInfo_.device, memUceInfo_.info, memUceInfo_.retSize)); } - + c10_npu::clear_mem_uce_info(); setDefaultStreamsStatus(device, c10_npu::RepoStatus::INIT); c10_npu::NPUCachingAllocator::cleanEvent(); -- Gitee From a0fe75dbb8f22e856960edede6c1d233cf212729 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 18 Feb 2025 11:50:07 +0000 Subject: [PATCH 017/358] !18071 Update op_plugin commit id Merge pull request !18071 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 4ef888c8ee..2c96962d74 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 4ef888c8ee30bd22da2c6ac2f282812fed2918ba +Subproject commit 2c96962d74b411d94f44372d286638158e0f1763 -- Gitee From 87c1b3853ef7cbfa765c4ec2ff4c07ee15d02ff6 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Tue, 18 Feb 2025 12:30:34 +0000 Subject: [PATCH 018/358] !18012 reset to default value when TORCH_HCCL_STATUS_SAVE_INTERVAL <= 0 Merge pull request !18012 from huangyunlong/2.6env --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index dbf399df5d..55dc7488e1 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -263,7 +263,14 @@ uint32_t OptionsManager::GetStatusSaveInterval() { const static uint32_t status_save_interval = []() -> uint32_t { char* env_val = std::getenv("TORCH_HCCL_STATUS_SAVE_INTERVAL"); - int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 30; + int64_t envFlag = 30; + if (env_val != nullptr) { + envFlag = strtol(env_val, nullptr, 10); + if (envFlag <= 0) { + envFlag = 30; + TORCH_NPU_WARN_ONCE("Get env TORCH_HCCL_STATUS_SAVE_INTERVAL less than or equal to 0, so reset it to the default value."); + } + } return static_cast(envFlag); }(); return status_save_interval; -- Gitee From bfa82a36216f485c36c44fb53c8eb743d2df9620 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 18 Feb 2025 14:20:02 +0000 Subject: [PATCH 019/358] !18103 Update op_plugin commit id Merge pull request !18103 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 2c96962d74..7d30b87b35 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 2c96962d74b411d94f44372d286638158e0f1763 +Subproject commit 7d30b87b35973b7f376c5db54ddfa7b6b15d1aff -- Gitee From 811fe1bb345824f60bc4927b719c0f2d4e583204 Mon Sep 17 00:00:00 2001 From: shaoyf Date: Tue, 18 Feb 2025 14:40:53 +0000 Subject: [PATCH 020/358] !18038 Register npuBasicAutogradNotImplementedFallbackImpl for AutogradPrivateUse1. 
Merge pull request !18038 from shaoyf/v260_warn --- test/autograd/test_autograd_fallback.py | 30 +++ .../csrc/aten/VariableFallbackKernel.cpp | 199 +++++++++++++++++- 2 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 test/autograd/test_autograd_fallback.py diff --git a/test/autograd/test_autograd_fallback.py b/test/autograd/test_autograd_fallback.py new file mode 100644 index 0000000000..6c940ab857 --- /dev/null +++ b/test/autograd/test_autograd_fallback.py @@ -0,0 +1,30 @@ +import torch +from torch.testing._internal.common_utils import ( + run_tests, + TestCase, +) +import torch_npu + +class TestAutogradFallback(TestCase): + + def test_pad_backward_warn(self): + + def _exec_npu_pad(): + npu_input = torch.randn(2, 3).npu() + npu_input.requires_grad = True + pads = (1, 1, 1, 1) + output = torch_npu.npu_pad(npu_input, pads) + output.backward(torch.ones_like(output)) + + # When set to "nothing," calling the reverse function directly causes an error. + torch._C._set_autograd_fallback_mode("nothing") + with self.assertRaisesRegex(RuntimeError, "does not require grad"): + _exec_npu_pad() + + # When set to "warn," calling the print function emits a warning. + torch._C._set_autograd_fallback_mode("warn") + _exec_npu_pad() + + +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/csrc/aten/VariableFallbackKernel.cpp b/torch_npu/csrc/aten/VariableFallbackKernel.cpp index f655345296..25d96107e4 100644 --- a/torch_npu/csrc/aten/VariableFallbackKernel.cpp +++ b/torch_npu/csrc/aten/VariableFallbackKernel.cpp @@ -2,6 +2,11 @@ #include #include #include +#include +#include +#include +#include +#include #include "torch_npu/csrc/core/npu/NPUException.h" @@ -25,16 +30,208 @@ using c10::DispatchKey; using c10::DispatchKeySet; using c10::Dispatcher; using c10::KernelFunction; +using torch::autograd::edge_list; +using torch::autograd::Node; +using torch::autograd::variable_list; namespace { +template +void _foreach_tensor( + F fn, + torch::jit::Stack* stack, + size_t stack_start, + size_t size) +{ + // Enumerate over tensors in a stack, including ones in TensorLists + int idx_tensor = 0; + for (const auto idx_arg : c10::irange(size)) { + auto& ivalue = (*stack)[stack_start + idx_arg]; + if (ivalue.isTensor()) { // true for optional tensor that has value + const auto& tensor = ivalue.toTensor(); + fn(idx_tensor, idx_arg, tensor); + idx_tensor++; + } else if (ivalue.isTensorList()) { + for (const auto& iv : ivalue.toListRef()) { + const auto& tensor = iv.toTensor(); + fn(idx_tensor, idx_arg, tensor); + idx_tensor++; + } + } + } +} + + +static void warnAutogradNotImplemented(const std::string& op_name) +{ + TORCH_NPU_WARN_ONCE( + op_name, + ": an autograd kernel was not registered to the Autograd key(s) ", + "but we are trying to backprop through it. This may lead to silently incorrect behavior. ", + "This behavior is deprecated and will be removed in a future version of PyTorch. ", + "If your operator is differentiable, please ensure you have registered an " + "autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, " + "DispatchKey::CompositeImplicitAutograd). 
If your operator is not " + "differentiable, or to squash this warning and use the previous behavior, " + "please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd."); +} + + +struct WarnNotImplemented : public Node { + WarnNotImplemented( + std::string op_name, + int64_t num_outputs, + edge_list&& next_edges) + : Node(std::move(next_edges)), op_name(std::move(op_name)), num_outputs(num_outputs) {} + + WarnNotImplemented(std::string op_name, int64_t num_outputs) + : op_name(std::move(op_name)), num_outputs(num_outputs) {} + + variable_list apply(variable_list&& inputs) override; + + std::string op_name; + int64_t num_outputs; +}; + +auto WarnNotImplemented::apply(variable_list&& inputs) -> variable_list +{ + warnAutogradNotImplemented(op_name); + std::vector output(num_outputs); + return output; +} + +static void npuBasicAutogradNotImplementedFallbackImpl( + const c10::OperatorHandle& op, + c10::DispatchKeySet dispatch_keys, + torch::jit::Stack* stack) +{ + const auto& schema = op.schema(); + const auto& op_name = schema.operator_name().name; + const auto num_arguments = schema.arguments().size(); + const auto num_returns = schema.returns().size(); + const auto stack_start = stack->size() - num_arguments; + + if (torch::autograd::getAutogradFallbackMode() == torch::autograd::AutogradFallbackMode::Nothing) { + op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack); + return; + } + TORCH_INTERNAL_ASSERT( + torch::autograd::getAutogradFallbackMode() == torch::autograd::AutogradFallbackMode::Warn); + + bool any_input_requires_grad = false; + _foreach_tensor( + [&](size_t _, size_t idx_arg, const at::Tensor& t) { + if (t.requires_grad()) { + any_input_requires_grad = true; + } + }, + stack, + stack_start, + num_arguments); + // Optimization: TLS access can be slow. So we only check if it necessary + // by putting it after the requires_grad checks. + any_input_requires_grad = any_input_requires_grad && at::GradMode::is_enabled(); + + std::shared_ptr grad_fn; + if (any_input_requires_grad) { + // NB: It is standard to collect edges from all tensors + // (see generated/VariableTypeEverything.cpp for examples) + std::vector all_tensors_on_stack; + _foreach_tensor( + [&](size_t _, size_t idx_arg, const at::Tensor& t) { + all_tensors_on_stack.push_back(&t); + }, + stack, + stack_start, + num_arguments); + grad_fn = std::shared_ptr( + new WarnNotImplemented(op_name, all_tensors_on_stack.size()), + torch::autograd::deleteNode); + grad_fn->set_next_edges(torch::autograd::collect_next_edges(all_tensors_on_stack)); + } + + op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack); + + if (any_input_requires_grad) { + // NB: if the operator mutates any inputs in-place and does not return them + // as outputs, we are unable to lazily raise a warning. This is OK because + // we don't expect many existing operators to do this because of the amount + // of technical expertise necessary (you would need to manually register an + // autograd kernel without using autograd.Function) + _foreach_tensor( + [&](size_t _, size_t idx_ret, const at::Tensor& t) { + if (!torch::autograd::isDifferentiableType(t.scalar_type())) { + return; + } + const bool is_mutable_output = + schema.is_aliasing({c10::SchemaArgType::output, idx_ret}) && + schema.is_mutable({c10::SchemaArgType::output, idx_ret}); + + // If the post-autograd implementation returns Tensors that require + // grad, then we install a hook that will warn during the backwards. 
+ // + // NB: If the operation is inplace and the inputs were views, + // it is possible that the history was rebased and the hook will + // not warn in all places where it should. That is, the following + // won't warn: + // >>> x = torch.randn(3, 3, requires_grad=True) + // >>> z = x.clone() + // >>> w = z[0] + // >>> k = w[0] + // >>> y = op(k) + // >>> torch.autograd.grad(z.sum(), w) + if (t.requires_grad()) { + t.register_hook([op_name](const at::Tensor& grad) { + warnAutogradNotImplemented(op_name); + }); + // If history is rebased, then we will attempt to warn + // on the view's base. This will catch most cases (because + // users typically call .backward() and backprop through + // the entire program). + if (t.is_view() && is_mutable_output) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + auto& base = const_cast(t._base()); + if (base.requires_grad()) { + // Can only register_hook on tensors that require grad. + base.register_hook([op_name](const at::TensorBase& grad) { + warnAutogradNotImplemented(op_name); + }); + } + } + return; + } + + // If the post-autograd implementation returns any Tensors that + // don't require grad, then we install the WarnNotImplemented grad_fn. + // This grad_fn warns in backward and returns undefined tensor + // gradients. + // + // NOTE [autograd fallback and in-place operations] + // If the schema says the output is mutable, and the output + // is an input, and the input is a view Tensor, then... + // we're not sure if set_history is OK to do, so we just skip + // adding the grad_fn. Builtin operators do rebase_history here, + // but custom operators may have multiple Tensor(a!) returns, + // rebase_history assumes single Tensor(a!) return, and in general + // custom ops don't have a good in-place story. + if (!is_mutable_output) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + torch::autograd::set_history(const_cast(t), grad_fn); + } + }, + stack, + stack->size() - num_returns, + num_returns); + } +} + // Register fallthrough for Autograd backends dispatch keys // NB: But not the private use ones; maybe the extension wants // to override it themselves! // (Ascend) TORCH_LIBRARY_IMPL TORCH_LIBRARY_IMPL(_, AutogradPrivateUse1, m) { - m.fallback(torch::CppFunction::makeFallthrough()); + m.fallback(torch::CppFunction::makeFromBoxedFunction<&npuBasicAutogradNotImplementedFallbackImpl>()); } bool has_op_name_warned(const std::string& op_name) -- Gitee From 7784cbe2754042068eb4a42d0dca685d1dfb2177 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Wed, 19 Feb 2025 03:15:55 +0000 Subject: [PATCH 021/358] !18115 Modify the input parameters of functions. 
Merge pull request !18115 from yuhaiyan/v2.6.0-dev1 --- test/torch_npu_schema.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 1a0f4cc5e2..e6320eb7cc 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -828,22 +828,22 @@ "signature": "(self)" }, "torch_npu.dynamo.torchair.ge.Tensor.dtype": { - "signature": "()" + "signature": "(self)" }, "torch_npu.dynamo.torchair.ge.Tensor.rank": { - "signature": "()" + "signature": "(self)" }, "torch_npu.dynamo.torchair.ge.TensorSpec": { "signature": "()" }, "torch_npu.dynamo.torchair.ge.TensorSpec.dtype": { - "signature": "()" + "signature": "(self)" }, "torch_npu.dynamo.torchair.ge.TensorSpec.rank": { - "signature": "()" + "signature": "(self)" }, "torch_npu.dynamo.torchair.ge.TensorSpec.size": { - "signature": "()" + "signature": "(self)" }, "torch_npu.dynamo.torchair.ge.custom_op": { "signature": "(op_type: str, *, inputs: Optional[Dict[str, Union[ForwardRef('Tensor'), List[ForwardRef('Tensor')], NoneType]]], outputs: Optional[List[Union[str, Tuple[str, int]]]], attrs: Optional[Dict[str, ForwardRef('_Attr')]] = None, node_name: Optional[str] = None)" -- Gitee From e9b772973b80d82a59608fd798f8265916468c15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=A4=8F=E5=A4=8F?= Date: Wed, 19 Feb 2025 03:44:49 +0000 Subject: [PATCH 022/358] =?UTF-8?q?!18123=20Skip=20the=20verification=20of?= =?UTF-8?q?=20torch=5Fnpu.op=5Fplugin.meta.=5Fmeta=5Fregistrations.=20Merg?= =?UTF-8?q?e=20pull=20request=20!18123=20from=20=E7=8E=8B=E5=A4=8F?= =?UTF-8?q?=E5=A4=8F/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_public_bindings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index bfc8ff7889..a3802878bd 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -547,6 +547,7 @@ class TestPublicBindings(TestCase): "torch_npu.dynamo.torchair._ge_concrete_graph.ge_converter.custom.npu_selu_backward", "torch_npu.dynamo.torchair._ge_concrete_graph.ge_ir_by_protoc_3_13_pb2", "torch_npu.utils.collect_hccl_info", + "torch_npu.op_plugin.meta._meta_registrations", } -- Gitee From 86ac6c16489ba4e67dbbfc91e916bd09ccb4a408 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 19 Feb 2025 04:22:59 +0000 Subject: [PATCH 023/358] !18110 Update torchair commit id Merge pull request !18110 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 4ed6f8606a..e5cc5f4981 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 4ed6f8606ae15e4136af04356de9f4fdfea6a7a5 +Subproject commit e5cc5f4981971c0f649f11f999f61ade9de9aa67 -- Gitee From 7b9c600435eb70dfe1197b99acaa7afc9eae2c6a Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Wed, 19 Feb 2025 06:11:59 +0000 Subject: [PATCH 024/358] !18127 Release 2.6.0rc1 Merge pull request !18127 from dilililiwhy/260_release_issue --- README.md | 64 +++++++++++++++++++++------------------- README.zh.md | 62 ++++++++++++++++++++------------------ SECURITYNOTE.md | 48 +++++++++++++++--------------- ci/docker/ARM/Dockerfile | 16 +++++----- ci/docker/X86/Dockerfile | 6 ++-- requirements.txt | 2 +- setup.py | 4 +++ test/requirements.txt | 2 +- 8 files changed, 108 insertions(+), 96 
deletions(-) diff --git a/README.md b/README.md index 59d79729c9..bdf44f7562 100644 --- a/README.md +++ b/README.md @@ -19,13 +19,13 @@ Install **PyTorch** through pip. **For Aarch64:** ```Python -pip3 install torch==2.1.0 +pip3 install torch==2.6.0 ``` **For x86:** ```Python -pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu +pip3 install torch==2.6.0+cpu --index-url https://download.pytorch.org/whl/cpu ``` 2. **Install torch-npu dependencies** @@ -39,21 +39,19 @@ pip3 install setuptools If the installation fails, use the download link or visit the [PyTorch official website](https://pytorch.org/) to download the installation package of the corresponding version. -| OS arch | Python version | link | -| ------- | -------------- | ------------------------------------------------------------ | -| x86 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | -| x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | -| x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | -| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | -| aarch64 | Python3.8 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | -| aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | -| aarch64 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | -| aarch64 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | +| OS arch | Python version | link | +| ------- | -------------- | ----------------------------------------------------- | +| x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=b68274aeb4047ba8c73e903f0621e2a4adb54ad5282b0845689c3e1dcd2e2546) | +| x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=35a9e78b7e4096968b54c1a198687b981569c50ae93e661aa430f9fd208da102) | +| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5b6ae523bfb67088a17ca7734d131548a2e60346c622621e4248ed09dd0790cc) | +| aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-manylinux_2_28_aarch64.whl#sha256=2ab9c6b3d6eea506bda9b82a0155e974d8ef8e38b417589d144568b4fa59afe1) | +| aarch64 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9) | +| aarch64 | Python3.11 | 
[link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl#sha256=d3dab9fb0294f268aec28e8aaba834e9d006b90a50db5bc2fe2191a9d48c6084) | 3. **Install torch-npu** ``` -pip3 install torch-npu==2.1.0.post10 +pip3 install torch-npu==2.6.0rc1 ``` ### From Source @@ -63,7 +61,7 @@ In some special scenarios, users may need to compile **torch-npu** by themselves 1. **Clone torch-npu** ``` - git clone https://github.com/ascend/pytorch.git -b 2.1.0-6.0.0 --depth 1 + git clone https://github.com/ascend/pytorch.git -b v2.6.0 --depth 1 ``` 2. **Build Docker Image** @@ -82,11 +80,11 @@ In some special scenarios, users may need to compile **torch-npu** by themselves 4. **Compile torch-npu** - Take **Python 3.8** as an example. + Take **Python 3.9** as an example. ``` cd /home/pytorch - bash ci/build.sh --python=3.8 + bash ci/build.sh --python=3.9 ``` **Tips** @@ -138,10 +136,12 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | PyTorch Version | Python Version | |-----------------|:----------------------------------------------------------| | PyTorch1.11.0 | Python3.7.x(>=3.7.5),Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.1.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | -| PyTorch2.2.0 | Python3.8.x,Python3.9.x,Python3.10.x | -| PyTorch2.3.1 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | -| PyTorch2.4.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.1.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.2.0 | Python3.8.x,Python3.9.x,Python3.10.x | +| PyTorch2.3.1 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.4.0 | Python3.8.x,Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.5.1 | Python3.9.x,Python3.10.x,Python3.11.x | +| PyTorch2.6.0 | Python3.9.x,Python3.10.x,Python3.11.x | ## Ascend Auxiliary Software @@ -149,6 +149,8 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | |-----------------------|---------------------------|-----------------------------|-------------------| +| CANN 8.0.0.beta1 | 2.6.0 | 2.6.0rc1 | v2.6.0 | +| CANN 8.0.0.alpha001 | 2.5.1 | 2.5.1rc1 | v2.5.1 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | | | 2.1.0 | 2.1.0.post10 | v2.1.0-6.0.0 | @@ -235,16 +237,18 @@ The version branches of AscendPyTorch have the following maintenance phases: ## PyTorch Maintenance Policies -| **PyTorch** | **Maintenance Policies** | **Status** | **Launch Date** | **Subsequent Status** | **EOL Date** | -|-----------|--------------------|--------------|------------|-----------------|-----------| -| 2.4.0 | Regular Release | Development | 2024/10/15 |Expected to enter maintenance status from March 15, 2025 | | -| 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from December 6, 2024 | | -| 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10th, 2025| | -| 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from March 30, 2025 | | -| 2.0.1 | Regular Release | EOL | 2023/7/19 | | 2024/3/14 | -| 1.11.0 | Long Term Support | Maintained | 2023/4/19 | Expected to enter maintenance free status from September 10th, 2025 | | -| 1.8.1 | Long Term Support | EOL | 2022/4/10 | | 2023/4/10 | -| 1.5.0 | Long Term Support | EOL | 2021/7/29 | | 2022/7/29 | +| 
**PyTorch** | **Maintenance Policies** | **Status** | **Launch Date** | **Subsequent Status** | **EOL Date** | +|-------------|--------------------------|-------------|-----------------|---------------------------------------------------------------------|--------------| +| 2.6.0 | Regular Release | Development | 2025/02/20 | Expected to enter maintenance status from July 20, 2025 | | +| 2.5.1 | Regular Release | Development | 2024/11/08 | Expected to enter maintenance status from April 8, 2025 | | +| 2.4.0 | Regular Release | Development | 2024/10/15 | Expected to enter maintenance status from March 15, 2025 | | +| 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from December 6, 2024 | | +| 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10th, 2025 | | +| 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from March 30, 2025 | | +| 2.0.1 | Regular Release | EOL | 2023/7/19 | | 2024/3/14 | +| 1.11.0 | Long Term Support | Maintained | 2023/4/19 | Expected to enter maintenance free status from September 10th, 2025 | | +| 1.8.1 | Long Term Support | EOL | 2022/4/10 | | 2023/4/10 | +| 1.5.0 | Long Term Support | EOL | 2021/7/29 | | 2022/7/29 | ## Reference Documents diff --git a/README.zh.md b/README.zh.md index 400174917a..d47c0500bd 100644 --- a/README.zh.md +++ b/README.zh.md @@ -19,27 +19,25 @@ **aarch64:** ```Python -pip3 install torch==2.1.0 +pip3 install torch==2.6.0 ``` **x86:** ```Python -pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu +pip3 install torch==2.6.0+cpu --index-url https://download.pytorch.org/whl/cpu ``` 若使用pip命令安装失败,请使用下载链接或进入[PyTorch官方网站](https://pytorch.org/)进行查询下载对应版本。 -| 架构 | Python版本 | 下载链接 | -| ------- | ---------- | ------------------------------------------------------------ | -| x86 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp38-cp38-linux_x86_64.whl#sha256=9e5cfd931a65b38d222755a45dabb53b836be31bc620532bc66fee77e3ff67dc) | -| x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=86cc28df491fa84738affe752f9870791026565342f69e4ab63e5b935f00a495) | -| x86 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=5077921fc2b54e69a534f3a9c0b98493c79a5547c49d46f5e77e42da3610e011) | -| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5954924ce74bc7e6a6c811e3fa4bdda9936d9889f6369fd068420c444bfd1cae) | -| aarch64 | Python3.8 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=761822761fffaa1c18a62c5deb13abaa780862577d3eadc428f1daa632536905) | -| aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=de7d63c6ecece118684415a3dbd4805af4a4c1ee1490cccf7405d8c240a481b4) | -| aarch64 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=a04a0296d47f28960f51c18c5489a8c3472f624ec3b5bcc8e2096314df8c3342) | -| aarch64 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl#sha256=8132efb782cd181cc2dcca5e58effbe4217cdb2581206ac71466d535bf778867) | +| 架构 | Python版本 | 下载链接 | +| 
------- | ---------- | ----------------------------------------------------- | +| x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=b68274aeb4047ba8c73e903f0621e2a4adb54ad5282b0845689c3e1dcd2e2546) | +| x86 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=35a9e78b7e4096968b54c1a198687b981569c50ae93e661aa430f9fd208da102) | +| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5b6ae523bfb67088a17ca7734d131548a2e60346c622621e4248ed09dd0790cc) | +| aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-manylinux_2_28_aarch64.whl#sha256=2ab9c6b3d6eea506bda9b82a0155e974d8ef8e38b417589d144568b4fa59afe1) | +| aarch64 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9) | +| aarch64 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl#sha256=d3dab9fb0294f268aec28e8aaba834e9d006b90a50db5bc2fe2191a9d48c6084) | 2. **安装torch_npu依赖** @@ -53,7 +51,7 @@ pip3 install setuptools 3. **安装torch_npu** ``` -pip3 install torch-npu==2.1.0.post10 +pip3 install torch-npu==2.6.0rc1 ``` 如需要保存安装日志,可在pip3 install命令后面加上参数 `--log `,并对您指定的目录``做好权限管控。 @@ -64,7 +62,7 @@ pip3 install torch-npu==2.1.0.post10 1. **克隆torch_npu代码仓** ``` - git clone https://gitee.com/ascend/pytorch.git -b v2.1.0-6.0.0 --depth 1 + git clone https://gitee.com/ascend/pytorch.git -b v2.6.0 --depth 1 ``` 2. **构建镜像** @@ -83,11 +81,11 @@ pip3 install torch-npu==2.1.0.post10 4. **编译torch_npu** - 以**Python 3.8** 为例。 + 以**Python 3.9** 为例。 ``` cd /home/pytorch - bash ci/build.sh --python=3.8 + bash ci/build.sh --python=3.9 ``` **提示** @@ -147,10 +145,12 @@ print(z) | PyTorch版本 | Python版本 | |---------------|:-------------------------------------------------------------| | PyTorch1.11.0 | Python3.7.x(>=3.7.5), Python3.8.x, Python3.9.x, Python3.10.x | -| PyTorch2.1.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.1.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | | PyTorch2.2.0 | Python3.8.x, Python3.9.x, Python3.10.x | -| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | -| PyTorch2.4.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.3.1 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.4.0 | Python3.8.x, Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.5.1 | Python3.9.x, Python3.10.x, Python 3.11.x | +| PyTorch2.6.0 | Python3.9.x, Python3.10.x, Python 3.11.x | ## 昇腾辅助软件 @@ -158,6 +158,8 @@ print(z) | CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | |-----------------------|--------------|------------------|-------------------| +| CANN 8.0.0.beta1 | 2.6.0 | 2.6.0rc1 | v2.6.0 | +| CANN 8.0.0.alpha001 | 2.5.1 | 2.5.1rc1 | v2.5.1 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | | | 2.1.0 | 2.1.0.post10 | v2.1.0-6.0.0 | @@ -237,16 +239,18 @@ AscendPyTorch版本分支的维护阶段如下: ## PyTorch版本维护策略 -| **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | -|-----------|-----------|--------|------------|-----------------------|-----------| -| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/03/15起进入维护状态 | - | -| 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2024/12/06起进入维护状态 | | -| 2.2.0 | 常规分支 | 维护 | 2024/04/01 
| 预计2025/9/10起进入无维护状态 | | -| 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/03/30起进入维护状态 | | -| 2.0.1 | 常规分支 | EOL | 2023/7/19 | | 2024/3/14 | -| 1.11.0 | 长期支持 | 维护 | 2023/4/19 | 预计2025/9/10起进入无维护状态 | | -| 1.8.1 | 长期支持 | EOL | 2022/4/10 | | 2023/4/10 | -| 1.5.0 | 长期支持 | EOL | 2021/7/29 | | 2022/7/29 | +| **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | +|---------------|----------|----------|------------|---------------------|-----------| +| 2.6.0 | 常规分支 | 开发 | 2025/02/20 | 预计2025/07/20起进入维护状态 | - | +| 2.5.1 | 常规分支 | 开发 | 2024/11/08 | 预计2025/04/08起进入维护状态 | - | +| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/03/15起进入维护状态 | - | +| 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2024/12/06起进入维护状态 | | +| 2.2.0 | 常规分支 | 维护 | 2024/04/01 | 预计2025/9/10起进入无维护状态 | | +| 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/03/30起进入维护状态 | | +| 2.0.1 | 常规分支 | EOL | 2023/7/19 | | 2024/3/14 | +| 1.11.0 | 长期支持 | 维护 | 2023/4/19 | 预计2025/9/10起进入无维护状态 | | +| 1.8.1 | 长期支持 | EOL | 2022/4/10 | | 2023/4/10 | +| 1.5.0 | 长期支持 | EOL | 2021/7/29 | | 2022/7/29 | ## 安全声明 diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index 0daedfc6f2..d7cf1140cf 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -69,30 +69,30 @@ torch_npu支持源码编译安装,在编译时会下载依赖第三方库并 ##### 公网地址 -| 类型 | 开源代码地址 | 文件名 | 公网IP地址/公网URL地址/域名/邮箱地址 | 用途说明 | -|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------|--------------------------------------------------------------------------------|--------------------------------| -| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/op-plugin.git | 依赖的开源代码仓 | -| 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/googletest.git | 依赖的开源代码仓 | -| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/torchair.git | 依赖的开源代码仓 | -| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/Tensorpipe.git | 依赖的开源代码仓 | -| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | -| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://download.pytorch.org/whl/cpu | docker配置源,用于配置torch下载连接 | -| 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | -| 自研 | 不涉及 | .github\workflows\\_build-and-test.yml | https://mirrors.huaweicloud.com/repository/pypi/simple | workflow配置文件,用于配置pip源 | -| 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch | 用于打包whl的url入参 | -| 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch/tags | 用于打包whl的download_url入参 | -| 自研 | 不涉及 | third_party\op-plugin\ci\build.sh | https://gitee.com/ascend/pytorch.git | 编译脚本根据torch_npu仓库地址拉取代码进行编译 | -| 自研 | 不涉及 | third_party\op-plugin\ci\exec_ut.sh | https://gitee.com/ascend/pytorch.git | UT脚本根据torch_npu仓库地址下拉取代码进行UT测试 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/nn/test_convolution.py
https://github.com/pytorch/pytorch/blob/main/test/test_mps.py
https://github.com/pytorch/pytorch/blob/main/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/legacy_conv2d.pt | 用于test脚本下载相关pt文件 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/legacy_serialized.pt | 用于test脚本下载相关pt文件 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/gpu_tensors.pt | 用于test脚本下载相关pt文件 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/onnx/test_utility_funs.py | test\url.ini | https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml | issue的链接 | -| 开源引入 | https://github.com/pytorch/pytorch/blob/main/test/test_nn.py
https://github.com/pytorch/pytorch/blob/main/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/linear.pt | 用于test脚本下载相关pt文件 | -| 自研 | 不涉及 | torch_npu\npu\config.yaml | https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph.pl | 火焰图脚本下载路径 | -| 自研 | 不涉及 | test\requirements.txt | https://download.pytorch.org/whl/nightly/cpu | 下载链接,用于下载torch-cpu版本 | -| 自研 | 不涉及 | test\requirements.txt | https://data.pyg.org/whl/torch-2.4.0+cpu.html | 下载链接,用于下载torch-scatter的cpu版本 | -| 自研 | 不涉及 | requirements.txt | https://download.pytorch.org/whl/nightly/cpu | 下载链接,用于下载torch-cpu版本 | -| 自研 | 不涉及 | test\get_synchronized_files.sh | https://github.com/pytorch/pytorch.git | 下载链接,用于下载pytorch的测试用例 | +| 类型 | 开源代码地址 | 文件名 | 公网IP地址/公网URL地址/域名/邮箱地址 | 用途说明 | +|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------|------------------------------------------------------------------------------|--------------------------------| +| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/op-plugin.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | .gitmodules | https://gitee.com/mirrors/googletest.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/torchair.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | .gitmodules | https://gitee.com/ascend/Tensorpipe.git | 依赖的开源代码仓 | +| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | +| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://download.pytorch.org/whl/cpu | docker配置源,用于配置torch下载连接 | +| 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | +| 自研 | 不涉及 | .github\workflows\\_build-and-test.yml | https://mirrors.huaweicloud.com/repository/pypi/simple | workflow配置文件,用于配置pip源 | +| 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch | 用于打包whl的url入参 | +| 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch/tags | 用于打包whl的download_url入参 | +| 自研 | 不涉及 | third_party\op-plugin\ci\build.sh | https://gitee.com/ascend/pytorch.git | 编译脚本根据torch_npu仓库地址拉取代码进行编译 | +| 自研 | 不涉及 | third_party\op-plugin\ci\exec_ut.sh | https://gitee.com/ascend/pytorch.git | UT脚本根据torch_npu仓库地址下拉取代码进行UT测试 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.6.0/test/nn/test_convolution.py
https://github.com/pytorch/pytorch/blob/v2.6.0/test/test_mps.py
https://github.com/pytorch/pytorch/blob/v2.6.0/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/legacy_conv2d.pt | 用于test脚本下载相关pt文件 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.6.0/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/legacy_serialized.pt | 用于test脚本下载相关pt文件 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.6.0/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/gpu_tensors.pt | 用于test脚本下载相关pt文件 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.6.0/test/onnx/test_utility_funs.py | test\url.ini | https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml | issue的链接 | +| 开源引入 | https://github.com/pytorch/pytorch/blob/v2.6.0/test/test_nn.py
https://github.com/pytorch/pytorch/blob/v2.6.0/test/test_serialization.py | test\url.ini | https://download.pytorch.org/test_data/linear.pt | 用于test脚本下载相关pt文件 | +| 自研 | 不涉及 | torch_npu\npu\config.yaml | https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph.pl | 火焰图脚本下载路径 | +| 自研 | 不涉及 | test\requirements.txt | https://download.pytorch.org/whl/cpu | 下载链接,用于下载torch-cpu版本 | +| 自研 | 不涉及 | test\requirements.txt | https://data.pyg.org/whl/torch-2.5.0+cpu.html | 下载链接,用于下载torch-scatter的cpu版本 | +| 自研 | 不涉及 | requirements.txt | https://download.pytorch.org/whl/cpu | 下载链接,用于下载torch-cpu版本 | +| 自研 | 不涉及 | test\get_synchronized_files.sh | https://github.com/pytorch/pytorch.git | 下载链接,用于下载pytorch的测试用例 | ## 公开接口声明 diff --git a/ci/docker/ARM/Dockerfile b/ci/docker/ARM/Dockerfile index 7457919d3d..d8fd1faba2 100644 --- a/ci/docker/ARM/Dockerfile +++ b/ci/docker/ARM/Dockerfile @@ -1,11 +1,11 @@ -FROM quay.io/pypa/manylinux2014_aarch64:2023-10-07-c1e05d1 +FROM pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.6 # Set pip RUN cd /usr/local/bin \ - && ln -s /opt/_internal/cpython-3.9.18/bin/pip3.9 pip3.9 \ - && ln -s /opt/_internal/cpython-3.10.13/bin/pip3.10 pip3.10 \ - && ln -s /opt/_internal/cpython-3.11.6/bin/pip3.11 pip3.11 \ - && ln -s python3.9 python3 + && ln -s /opt/_internal/cpython-3.9.21/bin/pip3.9 pip3.9 \ + && ln -s /opt/_internal/cpython-3.10.16/bin/pip3.10 pip3.10 \ + && ln -s /opt/_internal/cpython-3.11.11/bin/pip3.11 pip3.11 \ + && cd /usr/bin/ && ln -sf /opt/_internal/cpython-3.9.21/bin/python3.9 python3 # Set pip source RUN mkdir /root/.pip \ @@ -33,15 +33,15 @@ RUN if [ "$CONFIG_FOR_LCOV" = "1" ]; then \ # Install pip package(build) RUN pip3.9 install pyyaml \ - && pip3.9 install torch==2.1.0 \ + && pip3.9 install torch==2.6.0 \ && pip3.9 install numpy==1.21.3 RUN pip3.10 install pyyaml \ - && pip3.10 install torch==2.1.0 \ + && pip3.10 install torch==2.6.0 \ && pip3.10 install numpy==1.21.3 RUN pip3.11 install pyyaml \ - && pip3.11 install torch==2.1.0 \ + && pip3.11 install torch==2.6.0 \ && pip3.11 install numpy==1.23.2 WORKDIR /home diff --git a/ci/docker/X86/Dockerfile b/ci/docker/X86/Dockerfile index 0c234633ca..917b7bc26d 100644 --- a/ci/docker/X86/Dockerfile +++ b/ci/docker/X86/Dockerfile @@ -36,15 +36,15 @@ RUN if [ "$CONFIG_FOR_LCOV" = "1" ]; then \ # Install pip package(build) RUN pip3.9 install pyyaml \ - && pip3.9 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu \ + && pip3.9 install torch==2.6.0+cpu --index-url https://download.pytorch.org/whl/cpu \ && pip3.9 install numpy==1.21.3 RUN pip3.10 install pyyaml \ - && pip3.10 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu \ + && pip3.10 install torch==2.6.0+cpu --index-url https://download.pytorch.org/whl/cpu \ && pip3.10 install numpy==1.21.3 RUN pip3.11 install pyyaml \ - && pip3.11 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu \ + && pip3.11 install torch==2.6.0+cpu --index-url https://download.pytorch.org/whl/cpu \ && pip3.11 install numpy==1.23.2 WORKDIR /home diff --git a/requirements.txt b/requirements.txt index b45208bbfa..a81ac384cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/test/cpu +--extra-index-url https://download.pytorch.org/whl/cpu pyyaml setuptools diff --git a/setup.py b/setup.py index 1ca33aab47..3c9afe54c6 100644 --- a/setup.py +++ b/setup.py @@ -629,6 +629,9 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] 
+requirements = ['torch==2.6.0+cpu' if platform.machine() == 'x86_64' else 'torch==2.6.0'] +if USE_CXX11_ABI: + requirements = ['torch==2.6.0+cpu.cxx11.abi'] if platform.machine() == 'x86_64' else ['torch==2.6.0'] setup( name=os.environ.get('TORCH_NPU_PACKAGE_NAME', 'torch_npu'), @@ -653,6 +656,7 @@ setup( define_macros=[('_GLIBCXX_USE_CXX11_ABI', '1' if USE_CXX11_ABI else '0'), ('GLIBCXX_USE_CXX11_ABI', '1' if USE_CXX11_ABI else '0')] ), ], + install_requires=requirements, extras_require={ }, package_data={ diff --git a/test/requirements.txt b/test/requirements.txt index 7574cf1f85..01f180d64a 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,4 +1,4 @@ ---extra-index-url https://download.pytorch.org/whl/test/cpu +--extra-index-url https://download.pytorch.org/whl/cpu -f https://data.pyg.org/whl/torch-2.5.0+cpu.html coverage -- Gitee From a9ec81f04d81771d306f9a18a6fa0bd8578ac67d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Wed, 19 Feb 2025 07:12:06 +0000 Subject: [PATCH 025/358] =?UTF-8?q?!18095=20Add=20ACL=5FOP=5FINIT=5FMODE?= =?UTF-8?q?=20Merge=20pull=20request=20!18095=20from=20=E5=A7=9C=E6=80=A1?= =?UTF-8?q?=E6=96=87/v2.6.0=5Flazy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/core/npu/register/OptionsManager.cpp | 11 + .../csrc/core/npu/register/OptionsManager.h | 7 + .../csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 108 +-------- torch_npu/csrc/framework/LazyInitAclops.cpp | 208 ++++++++++++++++++ torch_npu/csrc/framework/LazyInitAclops.h | 14 ++ torch_npu/csrc/framework/OpCommand.cpp | 2 + torch_npu/csrc/framework/OpParamMaker.cpp | 25 +-- torch_npu/csrc/framework/OpParamMaker.h | 3 +- .../csrc/framework/interface/EnvVariables.cpp | 6 +- 9 files changed, 264 insertions(+), 120 deletions(-) create mode 100644 torch_npu/csrc/framework/LazyInitAclops.cpp create mode 100644 torch_npu/csrc/framework/LazyInitAclops.h diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 55dc7488e1..7ec743d178 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -473,6 +473,17 @@ uint32_t OptionsManager::GetP2PBufferSize() return buf_size; } +uint32_t OptionsManager::GetAclOpInitMode() +{ + const static uint32_t acl_op_init_mode = []() -> uint32_t { + char* buf_val = std::getenv("ACL_OP_INIT_MODE"); + // Default 0 + int64_t acl_op_init_mode = (buf_val != nullptr) ? 
strtol(buf_val, nullptr, 10) : 0; + return static_cast(acl_op_init_mode); + }(); + return acl_op_init_mode; +} + char* OptionsManager::GetCpuAffinityConf() { return std::getenv("CPU_AFFINITY_CONF"); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index c7fc7208d9..be7445b626 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -77,6 +77,12 @@ static std::unordered_map getTaskQueueEnableMode() return taskQueueEnableMode; } +static std::unordered_map getAclOpInitMode() +{ + std::unordered_map aclOpInitMode = {{0, "aclops init"}, {1, "aclops lazy init"}, {2, "aclops disabled"}}; + return aclOpInitMode; +} + class OptionsManager { public: static bool IsHcclZeroCopyEnable(); @@ -113,6 +119,7 @@ public: static uint32_t GetHcclBufferSize(); static uint32_t GetP2PBufferSize(); static uint32_t GetTaskQueueEnable(); + static uint32_t GetAclOpInitMode(); static char* GetCpuAffinityConf(); static bool CheckForceUncached(); static std::string GetOomSnapshotDumpPath(); diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index e53c9ba8bf..6dae2d670f 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -21,6 +21,7 @@ #include "third_party/acl/inc/acl/acl_op_compiler.h" #include "third_party/acl/inc/acl/acl_rt.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" +#include "torch_npu/csrc/framework/LazyInitAclops.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" #ifdef SUCCESS @@ -30,78 +31,11 @@ #undef FAILED #endif -#if defined(_MSC_VER) -#include -#define GetCurrentDirPath _getcwd -#define Mkdir(path, mode) _mkdir(path) -#elif defined(__unix__) -#include -#include -#include -#define GetCurrentDirPath getcwd -#define Mkdir(path, mode) mkdir(path, mode) -#else -#endif namespace { const uint32_t kMaxOpExecuteTimeOut = 547U; const size_t kMaxPathLen = 4096U; -void MakeCompileCacheDirAndSetOption() -{ - char *compile_cache_mode_val = std::getenv("ACL_OP_COMPILER_CACHE_MODE"); - std::string compile_cache_mode = - (compile_cache_mode_val == nullptr) ? 
std::string("enable") : std::string(compile_cache_mode_val); - if (compile_cache_mode != "enable" && compile_cache_mode != "disable" && compile_cache_mode != "force") { - compile_cache_mode = std::string("enable"); - } - auto compile_mode = c10_npu::option::GetOption("ACL_OP_COMPILER_CACHE_MODE"); - if (!compile_mode.has_value() || compile_mode.value() == "") { - c10_npu::option::register_options::OptionRegister::GetInstance()->Set("ACL_OP_COMPILER_CACHE_MODE", - compile_cache_mode); - } - - char *compile_cache_dir_val = std::getenv("ACL_OP_COMPILER_CACHE_DIR"); - if (compile_cache_dir_val != nullptr) { - std::string compile_cache_dir = std::string(compile_cache_dir_val); - // mode : 750 - auto ret = Mkdir(compile_cache_dir.c_str(), S_IRWXU | S_IRGRP | S_IXGRP); - if (ret == -1) { - if (errno != EEXIST) { - TORCH_NPU_WARN("make compile cache directory error: ", strerror(errno)); - return; - } - } - auto compile_dir = c10_npu::option::GetOption("ACL_OP_COMPILER_CACHE_DIR"); - if (!compile_dir.has_value() || compile_dir.value() == "") { - c10_npu::option::register_options::OptionRegister::GetInstance()->Set("ACL_OP_COMPILER_CACHE_DIR", - compile_cache_dir); - } - } -} - -void GetAndSetDefaultJitCompileByAcl() -{ - auto jit_compile = c10_npu::option::GetOption("jitCompile"); - if (jit_compile.has_value() && jit_compile.value() != "") { - return; - } - - auto opt_size = at_npu::native::AclGetCompileoptSize(ACL_OP_JIT_COMPILE); - if (!opt_size.has_value()) { - ASCEND_LOGW("Get ACL JitCompile default value size failed, use PTA default value: True"); - return; - } - TORCH_CHECK(opt_size.value() != 0, "AclGetCompileoptSize opt_size.value() = 0 !", PTA_ERROR(ErrCode::ACL)); - char value_name[opt_size.value()]; - auto ret = at_npu::native::AclGetCompileopt(ACL_OP_JIT_COMPILE, value_name, opt_size.value()); - // Get func success but get value failed, throw error - TORCH_CHECK(ret == ACL_SUCCESS, "Get ACL JitCompile default value failed.", PTA_ERROR(ErrCode::ACL)); - std::string value_str(value_name); - c10_npu::option::SetOption("jitCompile", value_str); - ASCEND_LOGI("Get ACL JitCompile default value %s and set", value_str.c_str()); -} - void SetDefaultAllowInternalFromatDisable() { auto allow_internal_format = c10_npu::option::GetOption("ALLOW_INTERNAL_FORMAT"); @@ -113,28 +47,6 @@ void SetDefaultAllowInternalFromatDisable() ASCEND_LOGI("Set ALLOW_INTERNAL_FORMAT default value disable."); } -void SetHF32DefaultValue() -{ - // The default value of the flag used to control whether HF32 is allowed on conv is True. - // The default value of the flag used to control whether HF32 is allowed on matmul is True, - // but this flag defaults to False in PyTorch 1.12 and later. - - // When the flag of matmul is False, and the flag of conv is True, - // the value of option "ACL_ALLOW_HF32" should be set to "10"; - std::string allow_hf32 = "10"; - auto ret = at_npu::native::AclSetCompileopt(aclCompileOpt::ACL_ALLOW_HF32, allow_hf32.c_str()); - if (ret == ACL_SUCCESS) { - ASCEND_LOGI("Set ACL option ACL_ALLOW_HF32 default value to %s.", allow_hf32.c_str()); - } else if (ret == ACL_ERROR_INTERNAL_ERROR) { - // Used to solve version compatibility issues, when ASCEND have not been updated. 
- ASCEND_LOGW( - "Failed to set default value of ACL option ACL_ALLOW_HF32, which is unsupported by current version."); - } else { - TORCH_CHECK(0, "Failed to set compile option ACL_ALLOW_HF32, result = ", ret, ", set value ", allow_hf32, - PTA_ERROR(ErrCode::ACL)); - } -} - #ifndef BUILD_LIBTORCH std::string GetTorchNpuFile() { @@ -236,7 +148,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) ASCEND_LOGW("Npu device %d has been set before global init.", device_id_); } - if (c10_npu::option::OptionsManager::CheckAclDumpDateEnable()) { const char *aclConfigPath = "acl.json"; NPU_CHECK_ERROR(aclmdlSetDump(aclConfigPath)); @@ -253,23 +164,18 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); } - // set ACL_PRECISION_MODE by SocVersion("allow_fp32_to_fp16" or "must_keep_origin_dtype"). - auto precision_mode = - c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910B1 ? "must_keep_origin_dtype" : "allow_fp32_to_fp16"; - NPU_CHECK_ERROR(at_npu::native::AclSetCompileopt(aclCompileOpt::ACL_PRECISION_MODE, precision_mode)); + int acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); + if (acl_op_init_mode == 0) { + at_npu::aclops::InitAclops(); + } else { + at_npu::aclops::InitializeJitCompilationMode(); + } - // set default compile cache mode and dir for users to improve op compile time - MakeCompileCacheDirAndSetOption(); - // set default jit_Compile value from Get acl defalut value - GetAndSetDefaultJitCompileByAcl(); // set default allow_internal_format value if (c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910_9391) { SetDefaultAllowInternalFromatDisable(); } - - SetHF32DefaultValue(); - NPU_CHECK_ERROR(at_npu::native::AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, 0)); NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpExecuteTimeOut(kMaxOpExecuteTimeOut)); diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp new file mode 100644 index 0000000000..2e0b30c740 --- /dev/null +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -0,0 +1,208 @@ +#include "torch_npu/csrc/framework/LazyInitAclops.h" + +#include + +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/NpuVariables.h" +#include "torch_npu/csrc/core/npu/register/OptionRegister.h" +#include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" +#include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" + +#ifndef BUILD_LIBTORCH +#include +#endif +#include +#if defined(_MSC_VER) +#include +#define GetCurrentDirPath _getcwd +#define Mkdir(path, mode) _mkdir(path) +#elif defined(__unix__) +#include +#include +#include +#define GetCurrentDirPath getcwd +#define Mkdir(path, mode) mkdir(path, mode) +#else +#endif + +namespace at_npu { +namespace aclops { + +std::atomic encounteredAclops(false); + +void SetHF32DefaultValue() +{ + // The default value of the flag used to control whether HF32 is allowed on + // conv is True. The default value of the flag used to control whether HF32 + // is allowed on matmul is True, but this flag defaults to False in + // PyTorch 1.12 and later. 
+ + // When the flag of matmul is False, and the flag of conv is True, + // the value of option "ACL_ALLOW_HF32" should be set to "10"; + std::string allow_hf32 = "10"; + auto ret = at_npu::native::AclSetCompileopt(aclCompileOpt::ACL_ALLOW_HF32, allow_hf32.c_str()); + if (ret == ACL_SUCCESS) { + ASCEND_LOGI("Set ACL option ACL_ALLOW_HF32 default value to %s.", allow_hf32.c_str()); + } else if (ret == ACL_ERROR_INTERNAL_ERROR) { + // Used to solve version compatibility issues, when ASCEND have not been + // updated. + ASCEND_LOGW( + "Failed to set default value of ACL option ACL_ALLOW_HF32, which is " + "unsupported by current version."); + } else { + TORCH_CHECK(0, "Failed to set compile option ACL_ALLOW_HF32, result = ", ret, ", set value ", allow_hf32, + PTA_ERROR(ErrCode::ACL)); + } +} + +// set default compile cache mode and dir to improve op compile time +void MakeCompileCacheDirAndSetOption() +{ + char *compile_cache_mode_val = std::getenv("ACL_OP_COMPILER_CACHE_MODE"); + std::string compile_cache_mode = + (compile_cache_mode_val == nullptr) ? std::string("enable") : std::string(compile_cache_mode_val); + if (compile_cache_mode != "enable" && compile_cache_mode != "disable" && compile_cache_mode != "force") { + compile_cache_mode = std::string("enable"); + } + auto compile_mode = c10_npu::option::GetOption("ACL_OP_COMPILER_CACHE_MODE"); + if (!compile_mode.has_value() || compile_mode.value() == "") { + c10_npu::option::register_options::OptionRegister::GetInstance()->Set( + "ACL_OP_COMPILER_CACHE_MODE", compile_cache_mode); + } + + char *compile_cache_dir_val = std::getenv("ACL_OP_COMPILER_CACHE_DIR"); + if (compile_cache_dir_val != nullptr) { + std::string compile_cache_dir = std::string(compile_cache_dir_val); + // mode : 750 + auto ret = Mkdir(compile_cache_dir.c_str(), S_IRWXU | S_IRGRP | S_IXGRP); + if (ret == -1) { + if (errno != EEXIST) { + TORCH_NPU_WARN("make compile cache directory error: ", strerror(errno)); + return; + } + } + auto compile_dir = c10_npu::option::GetOption("ACL_OP_COMPILER_CACHE_DIR"); + if (!compile_dir.has_value() || compile_dir.value() == "") { + c10_npu::option::register_options::OptionRegister::GetInstance()->Set( + "ACL_OP_COMPILER_CACHE_DIR", compile_cache_dir); + } + } +} + +bool IsJitCompileModeSetted() +{ + auto jit_compile = c10_npu::option::GetOption("jitCompile"); + if (jit_compile.has_value() && jit_compile.value() != "") { + return true; + } + return false; +} + +std::string GetJitCompileMode() +{ + auto opt_size = at_npu::native::AclGetCompileoptSize(ACL_OP_JIT_COMPILE); + if (!opt_size.has_value()) { + ASCEND_LOGW( + "Get ACL JitCompile default value size failed, use PTA " + "default value: " + "True"); + return ""; + } + TORCH_CHECK(opt_size.value() != 0, "AclGetCompileoptSize opt_size.value() = 0 !", PTA_ERROR(ErrCode::PARAM)); + + char value_name[opt_size.value()]; + auto ret = at_npu::native::AclGetCompileopt(ACL_OP_JIT_COMPILE, value_name, opt_size.value()); + // Get func success but get value failed, throw error + TORCH_CHECK(ret == ACL_SUCCESS, "Get ACL JitCompile default value failed.", PTA_ERROR(ErrCode::ACL)); + + return std::string(value_name); +} + +void InitializeJitCompilationMode() +{ + if (IsJitCompileModeSetted()) { + return; + } + std::string value_str = GetJitCompileMode(); + if (value_str != "") { + c10_npu::option::SetOption("jitCompileInit", value_str); + ASCEND_LOGI("Set jitCompileInit option to %s", value_str); + } else { + c10_npu::option::SetOption("jitCompileInit", "disable"); + ASCEND_LOGI("Set jitCompileInit 
option to default value: disable"); + } +} + +// set default jit_Compile value from Get acl defalut value +void GetAndSetDefaultJitCompileByAcl() +{ + if (IsJitCompileModeSetted()) { + return; + } + + std::string value_str = GetJitCompileMode(); + if (value_str != "") { + c10_npu::option::SetOption("jitCompile", value_str); + } +} + +void SetPrecisionMode() +{ + // set ACL_PRECISION_MODE by SocVersion("allow_fp32_to_fp16" or + // "must_keep_origin_dtype"). + auto precision_mode = c10_npu::GetSocVersion() >= c10_npu::SocVersion::Ascend910B1 ? "must_keep_origin_dtype" + : "allow_fp32_to_fp16"; + NPU_CHECK_ERROR(at_npu::native::AclSetCompileopt(aclCompileOpt::ACL_PRECISION_MODE, precision_mode)); +} + +void LazyInitAclopsCore() +{ +#ifndef BUILD_LIBTORCH + PyThreadState *gilState = nullptr; + if (PyGILState_Check()) { + gilState = PyEval_SaveThread(); + } +#endif + SetPrecisionMode(); + SetHF32DefaultValue(); + MakeCompileCacheDirAndSetOption(); + GetAndSetDefaultJitCompileByAcl(); +#ifndef BUILD_LIBTORCH + if (gilState) { + PyEval_RestoreThread(gilState); + } +#endif +} + +void LazyInitAclops() +{ + static int acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); + if (acl_op_init_mode == 0) { + return; + } + NPU_CHECK_ERROR(acl_op_init_mode != 2, "Acl op is disabled! Please check the environment variable ACL_OP_INIT_MODE."); + + if (!encounteredAclops.exchange(true) && c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { + RECORD_FUNCTION("LazyInitAclops", std::vector({})); + LazyInitAclopsCore(); + ASCEND_LOGI("Lazy init for aclops finished.") + } +} + +void InitAclopsCore() +{ + SetPrecisionMode(); + MakeCompileCacheDirAndSetOption(); + GetAndSetDefaultJitCompileByAcl(); + SetHF32DefaultValue(); +} + +void InitAclops() +{ + RECORD_FUNCTION("InitAclops", std::vector({})); + InitAclopsCore(); + ASCEND_LOGI("Init for aclops finished.") +} + +} // namespace aclops +} // namespace at_npu diff --git a/torch_npu/csrc/framework/LazyInitAclops.h b/torch_npu/csrc/framework/LazyInitAclops.h new file mode 100644 index 0000000000..b842b78652 --- /dev/null +++ b/torch_npu/csrc/framework/LazyInitAclops.h @@ -0,0 +1,14 @@ +#ifndef AT_NPU_ACOPLS_LAZYINITACLOPS_H_ +#define AT_NPU_ACOPLS_LAZYINITACLOPS_H_ + +namespace at_npu { +namespace aclops { + +void InitAclops(); +void LazyInitAclops(); +void InitializeJitCompilationMode(); + +} // namespace aclops +} // namespace at_npu + +#endif // AT_NPU_ACOPLS_LAZYINITACLOPS_H_ \ No newline at end of file diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 4464ccdc0e..54f0b8b863 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -9,6 +9,7 @@ #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" #include "torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h" +#include "torch_npu/csrc/framework/LazyInitAclops.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #ifndef BUILD_LIBTORCH @@ -132,6 +133,7 @@ OpCommand& OpCommand::Output( void OpCommand::Run() { aclCmd->SetEnginePriority(); const string &op_name = aclCmd->GetName(); + at_npu::aclops::LazyInitAclops(); #ifndef BUILD_LIBTORCH const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); #endif diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index 7e93f453f6..fab27ad75a 100644 --- 
a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -96,11 +96,12 @@ void OpCommandImpl::SetEnginePriority() } } -inline void SetDeterministicOption(bool deterministicAlgorithmsStatus) +inline void SetDeterministicOption(bool deterministicAlgorithmsStatus, bool isOpapi) { if (deterministicaclnn_oldstatus != deterministicAlgorithmsStatus) { - NPU_CHECK_ERROR( - AclSetCompileopt(aclCompileOpt::ACL_OP_DETERMINISTIC, deterministicAlgorithmsStatus ? "1" : "0")); + if (!isOpapi) { + NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_DETERMINISTIC, deterministicAlgorithmsStatus ? "1" : "0")); + } NPU_CHECK_ERROR( AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, deterministicAlgorithmsStatus ? 1 : 0)); NPU_CHECK_ERROR( @@ -111,15 +112,15 @@ inline void SetDeterministicOption(bool deterministicAlgorithmsStatus) } } -void SetDeterministic() +void SetDeterministic(bool isOpapi) { auto deterministicAlgorithmsStatus = at::globalContext().deterministicAlgorithms(); - SetDeterministicOption(deterministicAlgorithmsStatus); + SetDeterministicOption(deterministicAlgorithmsStatus, isOpapi); } void SetDeterministicOps(bool deterministicAlgorithmsStatus) { - SetDeterministicOption(deterministicAlgorithmsStatus); + SetDeterministicOption(deterministicAlgorithmsStatus, true); } void OpCommandImpl::Run( @@ -177,7 +178,7 @@ aclError OpCommandImpl::InnerRun( auto inputSize = params.inBuffer.size(); auto outputSize = params.outBuffer.size(); // open the deterministicAlgorithms config - SetDeterministic(); + SetDeterministic(false); bool reset_flag = false; if (ForceJitCompileList::GetInstance().Inlist(name) && env::CheckJitDisable()) { NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "enable")); @@ -270,20 +271,12 @@ aclError OpCommandImpl::InnerRunOpApi(const string &op_name, PROC_FUNC func) } // open the deterministicAlgorithms config SetDeterministic(); - bool reset_flag = false; - if (ForceJitCompileList::GetInstance().Inlist(op_name) && env::CheckJitDisable()) { - NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "enable")); - reset_flag = true; - } int index = 0; do { ret = func(); OPS_CHECK_ERROR(ret, op_name.c_str()); index++; } while (NpuUtils::IsOomError(ret, index) && (index < NPU_MAX_OP_EXEC_TRY_NUM)); - if (reset_flag) { - NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "disable")); - } return ret; } @@ -322,7 +315,7 @@ int ExecFunc(c10_npu::queue::QueueParas *in, aclrtStream stream) ASCEND_LOGD("Op %s Run.", cur_paras->opType); aclError ret; // open the deterministicAlgorithms config - SetDeterministic(); + SetDeterministic(false); if (cur_paras->customHandler) { ASCEND_LOGD("Exec Op %s with custom handle", cur_paras->opType); try { diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index 66c91b2ea6..a586c3f0ab 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -394,8 +394,7 @@ private: c10::SmallVector objs; }; // class OpCommandImpls -void SetDeterministic(); - +void SetDeterministic(bool isOpapi = true); void SetDeterministicOps(bool deterministicAlgorithmsStatus); static bool deterministicaclnn_oldstatus = false; diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index 5fff216758..9e0499f4c5 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -48,7 
+48,11 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { - NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); + NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "enable")); + SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? true : false); +}) + +REGISTER_OPTION_HOOK(jitCompileInit, [](const std::string &val) { SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? true : false); }) -- Gitee From b91f85e962d74bcd1af562a31e505179158eed21 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 19 Feb 2025 08:50:03 +0000 Subject: [PATCH 026/358] !18134 Update op_plugin commit id Merge pull request !18134 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 7d30b87b35..6c9130662a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 7d30b87b35973b7f376c5db54ddfa7b6b15d1aff +Subproject commit 6c9130662a2eab05371e37425d239b0c0cadf66c -- Gitee From 2c5074d1695e96c5842949ca4f0fcc0772368baa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Wed, 19 Feb 2025 10:06:15 +0000 Subject: [PATCH 027/358] =?UTF-8?q?!18090=20[PROFILING]pta=E9=80=82?= =?UTF-8?q?=E9=85=8Ddomain=E6=8E=A5=E5=8F=A3=20Merge=20pull=20request=20!1?= =?UTF-8?q?8090=20from=20=E6=A2=85=E9=A3=9E=E8=A6=81/2.6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/mstx/ms_tools_ext.h | 15 ++ .../csrc/distributed/ProcessGroupHCCL.cpp | 64 +++++--- .../framework/interface/MstxInterface.cpp | 153 ++++++++++++++++-- .../csrc/framework/interface/MstxInterface.h | 12 ++ torch_npu/csrc/profiler/mstx_mgr.cpp | 74 +++++++++ torch_npu/csrc/profiler/mstx_mgr.h | 9 ++ torch_npu/csrc/profiler/npu_profiler.h | 20 ++- 7 files changed, 302 insertions(+), 45 deletions(-) diff --git a/third_party/mstx/ms_tools_ext.h b/third_party/mstx/ms_tools_ext.h index 6c156adbc8..aac8239127 100644 --- a/third_party/mstx/ms_tools_ext.h +++ b/third_party/mstx/ms_tools_ext.h @@ -11,12 +11,27 @@ extern "C" { typedef uint64_t mstxRangeId; +struct mstxDomainRegistration_st; +typedef struct mstxDomainRegistration_st mstxDomainRegistration_t; +typedef mstxDomainRegistration_t* mstxDomainhandle_t; + ACL_FUNC_VISIBILITY void mstxMarkA(const char* message, aclrtStream stream); ACL_FUNC_VISIBILITY mstxRangeId mstxRangeStartA(const char* message, aclrtStream stream); ACL_FUNC_VISIBILITY void mstxRangeEnd(mstxRangeId id); +ACL_FUNC_VISIBILITY mstxDomainhandle_t mstxDomainCreateA(const char* name); + +ACL_FUNC_VISIBILITY void mstxDomainDestroy(mstxDomainhandle_t handle); + +ACL_FUNC_VISIBILITY void mstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream); + +ACL_FUNC_VISIBILITY mstxRangeId mstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, + aclrtStream stream); + +ACL_FUNC_VISIBILITY void mstxDomainRangeEnd(mstxDomainhandle_t handle, mstxRangeId id); + #ifdef __cplusplus } #endif diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 9843cb1020..b86993fcaf 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ 
b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2789,7 +2789,8 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllReduce( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); *is_dispatched = true; @@ -2865,7 +2866,7 @@ c10::intrusive_ptr ProcessGroupHCCL::batch_isend_irecv( } torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclBatchSendRecv", sendRecvInfo[0].count, sendRecvInfo[0].dataType, comm), - stream.stream(false)); + stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclBatchIsendIrecv(sendRecvInfo, itemNum, comm, stream.stream(false)); *is_dispatched = true; return hccl_result; @@ -2912,7 +2913,8 @@ c10::intrusive_ptr ProcessGroupHCCL::broadcast( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, numel, hcclType, root, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclBroadcast(inputDataPtr, numel, hcclType, root, comm, stream.stream(false)); *is_dispatched = true; return hccl_result; @@ -2959,7 +2961,8 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce_coalesced( auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllReduce( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); *is_dispatched = true; @@ -3025,7 +3028,8 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce( auto reduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduce", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclReduce", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduce( inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream.stream(false)); *is_dispatched = true; @@ -3083,7 +3087,8 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( auto reduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduce", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclReduce", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduce( inputDataPtr, outputDataPtr, 
numel, hcclType, reduceOp, rank, comm, stream.stream(false)); *is_dispatched = true; @@ -3205,7 +3210,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base_uneven( is_dispatched]() -> int { torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm), - stream.stream(false)); + stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduceScatterV( inputDataPtr, inputCounts.data(), @@ -3290,7 +3295,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base_uneven( is_dispatched]() -> int { torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm), - stream.stream(false)); + stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAllGatherV( inputDataPtr, inputCount, @@ -3360,7 +3365,8 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; return hccl_result; @@ -3437,7 +3443,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( is_dispatched]() -> int { torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm), - stream.stream(false)); + stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAllGatherV( inputDataPtr, inputCount, @@ -3513,7 +3519,8 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( auto numel = getNumelForHCCL(input); auto hcclType = getHcclDataType(input.scalar_type()); torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclBroadcast(inputDataPtr, numel, hcclType, root, comm, stream.stream()); *is_dispatched = true; return hccl_result; @@ -3551,7 +3558,8 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather_into_tensor_coalesced auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; return hccl_result; @@ -3596,7 +3604,8 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather_togather( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, 
stream.stream(false)); *is_dispatched = true; return hccl_result; @@ -3646,7 +3655,8 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; return hccl_result; @@ -3692,7 +3702,8 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclReduceScatter( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); *is_dispatched = true; @@ -3774,7 +3785,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( is_dispatched]() -> int { torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm), - stream.stream(false)); + stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduceScatterV( inputDataPtr, inputCounts.data(), @@ -3880,7 +3891,8 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base( auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclReduceScatter( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); *is_dispatched = true; @@ -3924,7 +3936,8 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter_tensor_coalesced auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclReduceScatter( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); *is_dispatched = true; @@ -4059,7 +4072,8 @@ c10::intrusive_ptr ProcessGroupHCCL::scatter( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, root, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclScatter", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclScatter", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = 
hcclScatter(inputDataPtr, outputDataPtr, numel, hcclType, root, comm, stream.stream(false)); *is_dispatched = true; return hccl_result; @@ -4110,7 +4124,8 @@ c10::intrusive_ptr ProcessGroupHCCL::send(std::vector& t auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, numel, hcclType, dst_rank, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclSend", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclSend", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclSend(inputDataPtr, numel, hcclType, dst_rank, comm, stream.stream(false)); *is_dispatched = true; return hccl_result; @@ -4144,7 +4159,8 @@ c10::intrusive_ptr ProcessGroupHCCL::recv(std::vector& t auto hcclType = getHcclDataType(output.scalar_type()); auto hccl_call = [outputDataPtr, numel, hcclType, src_rank, comm, stream, is_dispatched]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclRecv", numel, hcclType, comm), stream.stream(false)); + getMstxHcclMsg("HcclRecv", numel, hcclType, comm), stream.stream(false), + torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclRecv(outputDataPtr, numel, hcclType, src_rank, comm, stream.stream(false)); *is_dispatched = true; return hccl_result; @@ -4241,7 +4257,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( is_dispatched]() -> int { torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclAlltoAll", input_counts, inputhcclDataType, comm), - stream.stream(false)); + stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAlltoAll( inputDataPtr, input_counts, @@ -4340,7 +4356,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclAlltoAllV", static_cast(inputCounts.size()), inputhcclDataType, comm), - stream.stream(false)); + stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAlltoAllV( inputDataPtr, inputCounts.data(), @@ -4466,7 +4482,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall( torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclAlltoAllV", static_cast(input_counts.size()), inputhcclDataType, comm), - stream.stream(false)); + stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAlltoAllV( inputDataPtr, input_counts.data(), diff --git a/torch_npu/csrc/framework/interface/MstxInterface.cpp b/torch_npu/csrc/framework/interface/MstxInterface.cpp index 4c2c876283..40ef6dcced 100644 --- a/torch_npu/csrc/framework/interface/MstxInterface.cpp +++ b/torch_npu/csrc/framework/interface/MstxInterface.cpp @@ -19,6 +19,11 @@ REGISTER_LIBRARY(libms_tools_ext) LOAD_FUNCTION(mstxMarkA) LOAD_FUNCTION(mstxRangeStartA) LOAD_FUNCTION(mstxRangeEnd) +LOAD_FUNCTION(mstxDomainCreateA) +LOAD_FUNCTION(mstxDomainDestroy) +LOAD_FUNCTION(mstxDomainMarkA) +LOAD_FUNCTION(mstxDomainRangeStartA) +LOAD_FUNCTION(mstxDomainRangeEnd) // save python range id with cann mstx range id. 
// when mstx.range_end(id) is called, we can check if this id is invalid @@ -26,22 +31,33 @@ static std::unordered_map g_rangeIdMap; static std::mutex g_mutex; -static std::mutex g_supportMstx; +static bool IsSupportMstxFuncImpl() +{ + bool isSupport = false; + char* path = std::getenv("ASCEND_HOME_PATH"); + if (path != nullptr) { + std::string soPath = std::string(path) + "/lib64/libms_tools_ext.so"; + soPath = torch_npu::toolkit::profiler::Utils::RealPath(soPath); + isSupport = !soPath.empty(); + } + return isSupport; +} + +static bool IsSupportMstxDomainFuncImpl() +{ + bool isSupport = (MstxDomainCreateA("test") == nullptr) ? false : true; + return isSupport; +} bool IsSupportMstxFunc() { - static bool isSupport = false; - static bool isChecked = false; - std::lock_guard lock(g_supportMstx); - if (!isChecked) { - char* path = std::getenv("ASCEND_HOME_PATH"); - if (path != nullptr) { - std::string soPath = std::string(path) + "/lib64/libms_tools_ext.so"; - soPath = torch_npu::toolkit::profiler::Utils::RealPath(soPath); - isSupport = !soPath.empty(); - isChecked = true; - } - } + static bool isSupport = IsSupportMstxFuncImpl(); + return isSupport; +} + +bool IsSupportMstxDomainFunc() +{ + static bool isSupport = IsSupportMstxDomainFuncImpl(); return isSupport; } @@ -86,7 +102,7 @@ int MstxRangeStartA(const char* message, aclrtStream stream, int ptRangeId) return 0; } -void MstxRangeEnd(int ptRangdId) +void MstxRangeEnd(int ptRangeId) { using MstxRangeEndFunc = void (*)(mstxRangeId); static MstxRangeEndFunc func = nullptr; @@ -103,14 +119,119 @@ void MstxRangeEnd(int ptRangdId) } } std::lock_guard lock(g_mutex); - auto iter = g_rangeIdMap.find(ptRangdId); + auto iter = g_rangeIdMap.find(ptRangeId); if (iter == g_rangeIdMap.end()) { - ASCEND_LOGW("Failed to find mstx range id for python input range id %d", ptRangdId); + ASCEND_LOGW("Failed to find mstx range id for python input range id %d", ptRangeId); return; } func(iter->second); g_rangeIdMap.erase(iter); } +mstxDomainhandle_t MstxDomainCreateA(const char* name) +{ + using MstxDomainCreateAFunc = mstxDomainhandle_t (*)(const char*); + static MstxDomainCreateAFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return nullptr; + } + if (func == nullptr) { + func = (MstxDomainCreateAFunc)GET_FUNC(mstxDomainCreateA); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxDomainCreateA"); + noFuncFlag = true; + return nullptr; + } + } + return func(name); +} + +void MstxDomainDestroy(mstxDomainhandle_t handle) +{ + using MstxDomainDestroyFunc = void (*)(mstxDomainhandle_t); + static MstxDomainDestroyFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxDomainDestroyFunc)GET_FUNC(mstxDomainDestroy); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxDomainDestroy"); + noFuncFlag = true; + return; + } + } + func(handle); +} + +void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream) +{ + using MstxDomainMarkAFunc = void (*)(mstxDomainhandle_t, const char*, aclrtStream); + static MstxDomainMarkAFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxDomainMarkAFunc)GET_FUNC(mstxDomainMarkA); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxDomainMarkA"); + noFuncFlag = true; + return; + } + } + func(handle, message, stream); +} + +int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, 
aclrtStream stream, int ptRangeId) +{ + using MstxDomainRangeStartAFunc = mstxRangeId (*)(mstxDomainhandle_t, const char*, aclrtStream); + static MstxDomainRangeStartAFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return 0; + } + if (func == nullptr) { + func = (MstxDomainRangeStartAFunc)GET_FUNC(mstxDomainRangeStartA); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxDomainRangeStartA"); + noFuncFlag = true; + return 0; + } + } + mstxRangeId taskId = func(handle, message, stream); + std::lock_guard lock(g_mutex); + g_rangeIdMap.insert({ptRangeId, taskId}); + return 0; +} + +void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId) +{ + using MstxDomainRangeEndFunc = void (*)(mstxDomainhandle_t, mstxRangeId); + static MstxDomainRangeEndFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxDomainRangeEndFunc)GET_FUNC(mstxDomainRangeEnd); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxDomainRangeEnd"); + noFuncFlag = true; + return; + } + } + std::lock_guard lock(g_mutex); + auto iter = g_rangeIdMap.find(ptRangeId); + if (iter == g_rangeIdMap.end()) { + ASCEND_LOGW("Failed to find mstx range id for python input range id %d", ptRangeId); + return; + } + func(handle, iter->second); + g_rangeIdMap.erase(iter); +} + } } \ No newline at end of file diff --git a/torch_npu/csrc/framework/interface/MstxInterface.h b/torch_npu/csrc/framework/interface/MstxInterface.h index accd38bef2..806e8e749b 100644 --- a/torch_npu/csrc/framework/interface/MstxInterface.h +++ b/torch_npu/csrc/framework/interface/MstxInterface.h @@ -8,11 +8,23 @@ namespace native { bool IsSupportMstxFunc(); +bool IsSupportMstxDomainFunc(); + void MstxMarkA(const char* message, aclrtStream stream); int MstxRangeStartA(const char* message, aclrtStream stream, int ptRangeId); void MstxRangeEnd(int ptRangeId); + +mstxDomainhandle_t MstxDomainCreateA(const char* name); + +void MstxDomainDestroy(mstxDomainhandle_t handle); + +void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream); + +int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtStream stream, int ptRangeId); + +void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId); } } diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index 1690cbf146..7ee7793e8c 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -84,6 +84,80 @@ int MstxMgr::getRangeId() return ptRangeId_++; } +mstxDomainhandle_t MstxMgr::createDomain(const char* name) +{ + return at_npu::native::MstxDomainCreateA(name); +} + +void MstxMgr::destroyDomain(mstxDomainhandle_t domain) +{ + at_npu::native::MstxDomainDestroy(domain); +} + +void MstxMgr::domainMark(mstxDomainhandle_t domain, const char* message, const aclrtStream stream) +{ + if (!isMstxEnable()) { + return; + } + int id = ptRangeId_++; + if (stream == nullptr) { + (void)at_npu::native::MstxDomainMarkA(domain, message, nullptr); + return; + } + auto mark_call = [domain, msg_ptr = std::make_shared(message), stream]() -> int { + (void)at_npu::native::MstxDomainMarkA(domain, msg_ptr->c_str(), stream); + return 0; + }; + at_npu::native::OpCommand::RunOpApi("mstx_domain_mark_op", mark_call); +} + +int MstxMgr::domainRangeStart(mstxDomainhandle_t domain, const char* message, const aclrtStream stream) +{ + if (!isMstxEnable()) { + return 0; + } + int id = ptRangeId_++; + if (stream == 
nullptr) { + int res = at_npu::native::MstxDomainRangeStartA(domain, message, nullptr, id); + return id; + } + { + std::lock_guard lock(mtx_); + ptRangeIdsWithStream_.insert(id); + } + auto range_start_call = [domain, msg_ptr = std::make_shared(message), stream, id]() -> int { + int taskId = at_npu::native::MstxDomainRangeStartA(domain, msg_ptr->c_str(), stream, id); + return 0; + }; + at_npu::native::OpCommand::RunOpApi("mstx_domain_range_start_op", range_start_call); + return id; +} + +void MstxMgr::domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId) +{ + if (!isMstxEnable() || ptRangeId == 0) { + return; + } + bool rangeIdWithStream = false; + { + std::lock_guard lock(mtx_); + auto iter = ptRangeIdsWithStream_.find(ptRangeId); + if (iter != ptRangeIdsWithStream_.end()) { + rangeIdWithStream = true; + ptRangeIdsWithStream_.erase(iter); + } + } + if (!rangeIdWithStream) { + at_npu::native::MstxDomainRangeEnd(domain, ptRangeId); + return; + } + auto range_end_call = [domain, ptRangeId]() -> int { + at_npu::native::MstxDomainRangeEnd(domain, ptRangeId); + return 0; + }; + at_npu::native::OpCommand::RunOpApi("mstx_domain_range_end_op", range_end_call); +} + bool MstxMgr::isProfTxEnable() { return ProfilerMgr::GetInstance()->GetNpuTrace().load() && ProfilerMgr::GetInstance()->GetMsprofTx().load(); diff --git a/torch_npu/csrc/profiler/mstx_mgr.h b/torch_npu/csrc/profiler/mstx_mgr.h index 883662cb4b..c6fc6a5fe1 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.h +++ b/torch_npu/csrc/profiler/mstx_mgr.h @@ -10,6 +10,9 @@ namespace torch_npu { namespace profiler { + +const std::string DOMAIN_COMMUNICATION = "communication"; + class MstxMgr : public torch_npu::toolkit::profiler::Singleton { friend class torch_npu::toolkit::profiler::Singleton; public: @@ -19,6 +22,12 @@ public: bool isMstxEnable(); int getRangeId(); + mstxDomainhandle_t createDomain(const char* name); + void destroyDomain(mstxDomainhandle_t domain); + void domainMark(mstxDomainhandle_t domain, const char* message, const aclrtStream stream); + int domainRangeStart(mstxDomainhandle_t domain, const char* message, const aclrtStream stream); + void domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId); + private: MstxMgr(); explicit MstxMgr(const MstxMgr &obj) = delete; diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 0727d02236..b58fa182a7 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -129,21 +129,31 @@ inline bool mstxEnable() struct MstxRange { int rangeId{0}; - MstxRange(const std::string &message, aclrtStream stream) + mstxDomainhandle_t domainHandle{nullptr}; + MstxRange(const std::string &message, aclrtStream stream, const std::string &domainName = "default") { - if (message.empty()) { + if (!mstxEnable()) { return; } rangeId = MstxMgr::GetInstance()->getRangeId(); - at_npu::native::MstxRangeStartA(message.c_str(), stream, rangeId); + if (at_npu::native::IsSupportMstxDomainFunc()) { + domainHandle = MstxMgr::GetInstance()->createDomain(domainName.c_str()); + at_npu::native::MstxDomainRangeStartA(domainHandle, message.c_str(), stream, rangeId); + } else { + at_npu::native::MstxRangeStartA(message.c_str(), stream, rangeId); + } } ~MstxRange() { - if (rangeId == 0) { + if (rangeId == 0 || !mstxEnable()) { return; } - at_npu::native::MstxRangeEnd(rangeId); + if (at_npu::native::IsSupportMstxDomainFunc()) { + at_npu::native::MstxDomainRangeEnd(domainHandle, rangeId); + } else { + at_npu::native::MstxRangeEnd(rangeId); + } } }; 
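// A minimal usage sketch for the reworked MstxRange wrapper above (illustrative only, not taken
// from the patch): when the toolkit reports mstx domain support, the constructor opens the range
// inside the named domain and the destructor closes it; otherwise it falls back to the legacy
// domain-less MstxRangeStartA/MstxRangeEnd path. The `stream` variable and the message text below
// are assumptions supplied by the caller; DOMAIN_COMMUNICATION is the constant added to
// mstx_mgr.h in this same change.
{
    torch_npu::profiler::MstxRange range("hccl_all_reduce", stream,
                                         torch_npu::profiler::DOMAIN_COMMUNICATION);
    // ... enqueue work on `stream`; the range ends automatically when `range` goes out of scope.
}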
} // profiler -- Gitee From c82927d64d5c03bcd02606faa40f341b63c52721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Wed, 19 Feb 2025 11:02:19 +0000 Subject: [PATCH 028/358] =?UTF-8?q?!18150=20Change=20LazyInitAclops=20chec?= =?UTF-8?q?k=20Merge=20pull=20request=20!18150=20from=20=E5=A7=9C=E6=80=A1?= =?UTF-8?q?=E6=96=87/v2.6.0=5Flazy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/framework/LazyInitAclops.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp index 2e0b30c740..25e58d2e78 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -180,7 +180,9 @@ void LazyInitAclops() if (acl_op_init_mode == 0) { return; } - NPU_CHECK_ERROR(acl_op_init_mode != 2, "Acl op is disabled! Please check the environment variable ACL_OP_INIT_MODE."); + TORCH_CHECK(acl_op_init_mode != 2, + "Acl op is disabled! Please check the environment variable ACL_OP_INIT_MODE.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); if (!encounteredAclops.exchange(true) && c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { RECORD_FUNCTION("LazyInitAclops", std::vector({})); -- Gitee From 9254db379edf412a402327443b7717c61730287b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Wed, 19 Feb 2025 11:19:27 +0000 Subject: [PATCH 029/358] =?UTF-8?q?!18141=20Replace=20setCurrentNPUStream?= =?UTF-8?q?=20with=20NPUStreamGuard=20add=20logging=20Merge=20pull=20reque?= =?UTF-8?q?st=20!18141=20from=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUStream.cpp | 2 ++ .../npu/interface/AsyncTaskQueueInterface.cpp | 33 +++++++++++-------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 75a4001c17..a5ed2971a7 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -510,6 +510,8 @@ void setCurrentNPUStream(NPUStream stream) initNPUStreamsOnce(); auto ptr = NPUStream_internals(stream); AT_ASSERT(ptr, PTA_ERROR(ErrCode::PTR)); + ASCEND_LOGI("Exchange NPU current stream from stream = %p to stream = %p", + current_streams[ptr->device_index]->stream, ptr->stream); current_streams[ptr->device_index] = ptr; } diff --git a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp index d6b201a07c..7199eb895d 100644 --- a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp @@ -1,5 +1,6 @@ #include "AsyncTaskQueueInterface.h" #include "torch_npu/csrc/core/npu/NPUEventManager.h" +#include "torch_npu/csrc/core/npu/NPUGuard.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include #include "torch_npu/csrc/framework/utils/NpuUtils.h" @@ -120,15 +121,17 @@ void EventTask::LaunchRecordTask(c10_npu::NPUStream npuStream) { #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[RECORD_EVENT]); #endif - c10_npu::NPUStream currentStream = c10_npu::getCurrentNPUStream(); - c10_npu::setCurrentNPUStream(npuStream); - QueueParas params(RECORD_EVENT, sizeof(EventParas), &eventParam_); - 
c10_npu::NPUEventManager::GetInstance().IncreaseUnrecordedCount(eventParam_.event); - c10_npu::enCurrentNPUStream(¶ms); - c10_npu::setCurrentNPUStream(currentStream); + uint64_t prof_correlation_id = 0; + { + c10_npu::NPUStreamGuard guard(npuStream); + QueueParas params(RECORD_EVENT, sizeof(EventParas), &eventParam_); + c10_npu::NPUEventManager::GetInstance().IncreaseUnrecordedCount(eventParam_.event); + c10_npu::enCurrentNPUStream(¶ms); + prof_correlation_id = params.correlation_id; + } ASCEND_LOGI("Event: LaunchRecordTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[RECORD_EVENT], params.correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[RECORD_EVENT], prof_correlation_id); #endif } else { NPU_CHECK_ERROR(aclrtRecordEvent(eventParam_.event, npuStream)); @@ -157,14 +160,16 @@ void EventTask::LaunchWaitTask(c10_npu::NPUStream npuStream) { #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[WAIT_EVENT]); #endif - c10_npu::NPUStream currentStream = c10_npu::getCurrentNPUStream(); - c10_npu::setCurrentNPUStream(npuStream); - QueueParas params(WAIT_EVENT, sizeof(EventParas), &eventParam_); - c10_npu::enCurrentNPUStream(¶ms); - c10_npu::setCurrentNPUStream(currentStream); + uint64_t prof_correlation_id = 0; + { + c10_npu::NPUStreamGuard guard(npuStream); + QueueParas params(WAIT_EVENT, sizeof(EventParas), &eventParam_); + c10_npu::enCurrentNPUStream(¶ms); + prof_correlation_id = params.correlation_id; + } ASCEND_LOGI("Event: LaunchWaitTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[WAIT_EVENT], params.correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[WAIT_EVENT], prof_correlation_id); #endif } else { NPU_CHECK_ERROR(aclrtStreamWaitEvent(npuStream, eventParam_.event)); @@ -217,4 +222,4 @@ aclError LaunchLazyDestroyEventTask(aclrtEvent event, c10::DeviceIndex device_in return ACL_ERROR_NONE; } } // namespace queue -} // namespace c10 \ No newline at end of file +} // namespace c10 -- Gitee From 560714adab14faadcffed1e37f8939de68a3f9e3 Mon Sep 17 00:00:00 2001 From: fanglanyue Date: Wed, 19 Feb 2025 12:00:48 +0000 Subject: [PATCH 030/358] =?UTF-8?q?!18121=20=E3=80=90Profiling=E3=80=91?= =?UTF-8?q?=E4=BF=AE=E6=94=B9dyno=5Fipc=E5=BC=95=E5=85=A5=E6=A8=A1?= =?UTF-8?q?=E5=9D=97=20Merge=20pull=20request=20!18121=20from=20fanglanyue?= =?UTF-8?q?/ctrl=5Fipc=5Fv2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/profiler/CMakeLists.txt | 2 +- .../csrc/profiler/dyno/DynoLogNpuMonitor.cpp | 33 --- .../csrc/profiler/dyno/DynoLogNpuMonitor.h | 28 --- torch_npu/csrc/profiler/dyno/MonitorBase.h | 15 -- torch_npu/csrc/profiler/dyno/NpuIpcClient.cpp | 134 ------------ torch_npu/csrc/profiler/dyno/NpuIpcClient.h | 100 --------- torch_npu/csrc/profiler/dyno/NpuIpcEndPoint.h | 201 ------------------ .../profiler/dyno/PyDynamicMonitorProxy.h | 34 --- torch_npu/csrc/profiler/dyno/utils.cpp | 88 -------- torch_npu/csrc/profiler/dyno/utils.h | 24 --- torch_npu/csrc/profiler/init.cpp | 5 - .../_dynamic_profiler_monitor.py | 8 +- 12 files changed, 8 insertions(+), 664 deletions(-) delete mode 100644 
torch_npu/csrc/profiler/dyno/DynoLogNpuMonitor.cpp delete mode 100644 torch_npu/csrc/profiler/dyno/DynoLogNpuMonitor.h delete mode 100644 torch_npu/csrc/profiler/dyno/MonitorBase.h delete mode 100644 torch_npu/csrc/profiler/dyno/NpuIpcClient.cpp delete mode 100644 torch_npu/csrc/profiler/dyno/NpuIpcClient.h delete mode 100644 torch_npu/csrc/profiler/dyno/NpuIpcEndPoint.h delete mode 100644 torch_npu/csrc/profiler/dyno/PyDynamicMonitorProxy.h delete mode 100644 torch_npu/csrc/profiler/dyno/utils.cpp delete mode 100644 torch_npu/csrc/profiler/dyno/utils.h diff --git a/torch_npu/csrc/profiler/CMakeLists.txt b/torch_npu/csrc/profiler/CMakeLists.txt index 52d68a84c6..9a39c2d4d8 100644 --- a/torch_npu/csrc/profiler/CMakeLists.txt +++ b/torch_npu/csrc/profiler/CMakeLists.txt @@ -1,4 +1,4 @@ -FILE(GLOB _PROF_SRCS *.cpp dyno/*.h dyno/*.cpp) +FILE(GLOB _PROF_SRCS *.cpp) LIST(APPEND PROF_SRCS ${_PROF_SRCS}) diff --git a/torch_npu/csrc/profiler/dyno/DynoLogNpuMonitor.cpp b/torch_npu/csrc/profiler/dyno/DynoLogNpuMonitor.cpp deleted file mode 100644 index acd11ac202..0000000000 --- a/torch_npu/csrc/profiler/dyno/DynoLogNpuMonitor.cpp +++ /dev/null @@ -1,33 +0,0 @@ -#include "DynoLogNpuMonitor.h" -#include "utils.h" - -namespace torch_npu { -namespace profiler { - -bool DynoLogNpuMonitor::Init() -{ - if (isInitialized_) { - ASCEND_LOGW("DynoLog npu monitor is initialized !"); - return true; - } - bool res = ipcClient_.RegisterInstance(npuId_); - if (res) { - isInitialized_ = true; - ASCEND_LOGI("DynoLog npu monitor initialized success !"); - } - return res; -} - -std::string DynoLogNpuMonitor::Poll() -{ - std::string res = ipcClient_.IpcClientNpuConfig(); - if (res.empty()) { - ASCEND_LOGI("Request for dynolog server is empty !"); - return ""; - } - ASCEND_LOGI("Received NPU configuration successfully"); - return res; -} - -} // namespace profiler -} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/profiler/dyno/DynoLogNpuMonitor.h b/torch_npu/csrc/profiler/dyno/DynoLogNpuMonitor.h deleted file mode 100644 index 63840255f9..0000000000 --- a/torch_npu/csrc/profiler/dyno/DynoLogNpuMonitor.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once -#include -#include "MonitorBase.h" -#include "NpuIpcClient.h" - -namespace torch_npu { -namespace profiler { - -class DynoLogNpuMonitor : public MonitorBase, public torch_npu::toolkit::profiler::Singleton { - friend class torch_npu::toolkit::profiler::Singleton; - -public: - DynoLogNpuMonitor() = default; - bool Init() override; - std::string Poll() override; - void SetNpuId(int id) override - { - npuId_ = id; - } - -private: - bool isInitialized_ = false; - int32_t npuId_ = 0; - IpcClient ipcClient_; -}; - -} // namespace profiler -} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/dyno/MonitorBase.h b/torch_npu/csrc/profiler/dyno/MonitorBase.h deleted file mode 100644 index 1c7885b3e5..0000000000 --- a/torch_npu/csrc/profiler/dyno/MonitorBase.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once -#include - -namespace torch_npu { -namespace profiler { - -class MonitorBase { -public: - virtual bool Init() = 0; - virtual std::string Poll() = 0; - virtual void SetNpuId(int id) = 0; -}; - -} // namespace profiler -} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/dyno/NpuIpcClient.cpp b/torch_npu/csrc/profiler/dyno/NpuIpcClient.cpp deleted file mode 100644 index e5a4211aa8..0000000000 --- a/torch_npu/csrc/profiler/dyno/NpuIpcClient.cpp +++ /dev/null @@ -1,134 +0,0 @@ -#include "NpuIpcClient.h" -namespace torch_npu { -namespace 
profiler { - -bool torch_npu::profiler::IpcClient::RegisterInstance(int32_t id) -{ - NpuContext context{ - .npu = id, - .pid = getpid(), - .jobId = JOB_ID, - }; - std::unique_ptr message = Message::ConstructMessage(context, "ctxt"); - try { - if (!SyncSendMessage(*message, std::string(DYNO_IPC_NAME))) { - ASCEND_LOGW("Failed to send register ctxt for pid %d with dyno", context.pid); - return false; - } - } catch (const std::exception &e) { - ASCEND_LOGW("Error when SyncSendMessage %s !", e.what()); - return false; - } - ASCEND_LOGI("Resigter pid %d for dynolog success !", context.pid); - return true; -} -std::string IpcClient::IpcClientNpuConfig() -{ - auto size = pids_.size(); - auto *req = (NpuRequest *)malloc(sizeof(NpuRequest) + sizeof(int32_t) * size); - req->type = DYNO_IPC_TYPE; - req->pidSize = size; - req->jobId = JOB_ID; - for (int i = 0; i < size; i++) { - req->pids[i] = pids_[i]; - } - std::unique_ptr message = Message::ConstructMessage(*req, "req", size); - if (!SyncSendMessage(*message, std::string(DYNO_IPC_NAME))) { - ASCEND_LOGW("Failed to send config to dyno server fail !"); - free(req); - req = nullptr; - return ""; - } - free(req); - message = PollRecvMessage(MAX_IPC_RETRIES, MAX_SLEEP_US); - if (!message) { - ASCEND_LOGW("Failed to receive on-demand config !"); - return ""; - } - std::string res = std::string((char *)message->buf.get(), message->metadata.size); - return res; -} -std::unique_ptr IpcClient::ReceiveMessage() -{ - std::lock_guard wguard(dequeLock_); - if (msgDynoDeque_.empty()) { - return nullptr; - } - std::unique_ptr message = std::move(msgDynoDeque_.front()); - msgDynoDeque_.pop_front(); - return message; -} -bool IpcClient::SyncSendMessage(const Message &message, const std::string &destName, int numRetry, int seepTimeUs) -{ - if (destName.empty()) { - ASCEND_LOGW("Can not send to empty socket name !"); - return false; - } - int i = 0; - std::vector npuPayLoad{ NpuPayLoad(sizeof(struct Metadata), (void *)&message.metadata), - NpuPayLoad(message.metadata.size, message.buf.get()) }; - try { - auto ctxt = ep_.BuildSendNpuCtxt(destName, npuPayLoad, std::vector()); - while (!ep_.TrySendMessage(*ctxt) && i < numRetry) { - i++; - usleep(seepTimeUs); - seepTimeUs *= 2; - } - } catch (const std::exception &e) { - ASCEND_LOGW("Error when SyncSendMessage %s !", e.what()); - return false; - } - return i < numRetry; -} -bool IpcClient::Recv() -{ - try { - Metadata recvMetadata; - std::vector PeekNpuPayLoad{ NpuPayLoad(sizeof(struct Metadata), &recvMetadata) }; - auto peekCtxt = ep_.BuildNpuRcvCtxt(PeekNpuPayLoad); - bool successFlag = false; - try { - successFlag = ep_.TryPeekMessage(*peekCtxt); - } catch (std::exception &e) { - ASCEND_LOGW("ERROR when TryPeekMessage: %s !", e.what()); - return false; - } - if (successFlag) { - std::unique_ptr npuMessage = std::make_unique(Message()); - npuMessage->metadata = recvMetadata; - npuMessage->buf = std::unique_ptr(new unsigned char[recvMetadata.size]); - npuMessage->src = std::string(ep_.GetName(*peekCtxt)); - std::vector npuPayLoad{ NpuPayLoad(sizeof(struct Metadata), (void *)&npuMessage->metadata), - NpuPayLoad(recvMetadata.size, npuMessage->buf.get()) }; - auto recvCtxt = ep_.BuildNpuRcvCtxt(npuPayLoad); - try { - successFlag = ep_.TryRcvMessage(*recvCtxt); - } catch (std::exception &e) { - ASCEND_LOGW("Error when TryRecvMsg: %s !", e.what()); - return false; - } - if (successFlag) { - std::lock_guard wguard(dequeLock_); - msgDynoDeque_.push_back(std::move(npuMessage)); - return true; - } - } - } catch (std::exception 
&e) { - ASCEND_LOGW("Error in Recv(): %s !", e.what()); - return false; - } - return false; -} -std::unique_ptr IpcClient::PollRecvMessage(int maxRetry, int sleeTimeUs) -{ - for (int i = 0; i < maxRetry; i++) { - if (Recv()) { - return ReceiveMessage(); - } - usleep(sleeTimeUs); - } - return nullptr; -} - -} // namespace profiler -} // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/profiler/dyno/NpuIpcClient.h b/torch_npu/csrc/profiler/dyno/NpuIpcClient.h deleted file mode 100644 index b152517b6d..0000000000 --- a/torch_npu/csrc/profiler/dyno/NpuIpcClient.h +++ /dev/null @@ -1,100 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include "NpuIpcEndPoint.h" -#include "utils.h" - -namespace torch_npu { -namespace profiler { - -constexpr int TYPE_SIZE = 32; -constexpr int JOB_ID = 0; -constexpr const char *DYNO_IPC_NAME = "dynolog"; -constexpr const int DYNO_IPC_TYPE = 3; -constexpr const int MAX_IPC_RETRIES = 5; -constexpr const int MAX_SLEEP_US = 10000; -struct NpuRequest { - int type; - int pidSize; - int64_t jobId; - int32_t pids[0]; -}; -struct NpuContext { - int32_t npu; - pid_t pid; - int64_t jobId; -}; -struct Metadata { - size_t size = 0; - char type[TYPE_SIZE] = ""; -}; -struct Message { - Metadata metadata; - std::unique_ptr buf; - std::string src; - template static std::unique_ptr ConstructMessage(const T &data, const std::string &type) - { - std::unique_ptr ipcNpuMessage = std::make_unique(Message()); - if (type.size() + 1 > sizeof(ipcNpuMessage->metadata.type)) { - throw std::runtime_error("Type string is too long to fit in metadata.type" + PROF_ERROR(ErrCode::PARAM)); - } - memcpy(ipcNpuMessage->metadata.type, type.c_str(), type.size() + 1); -#if __cplusplus >= 201703L - if constexpr (std::is_same::value == true) { - ipcNpuMessage->metadata.size = data.size(); - ipcNpuMessage->buf = std::make_unique(ipcNpuMessage->metadata.size); - memcpy(ipcNpuMessage->buf.get(), data.c_str(), sizeof(data)); - return ipcNpuMessage; - } -#endif - static_assert(std::is_trivially_copyable::value); - ipcNpuMessage->metadata.size = sizeof(data); - ipcNpuMessage->buf = std::make_unique(ipcNpuMessage->metadata.size); - memcpy(ipcNpuMessage->buf.get(), &data, sizeof(data)); - return ipcNpuMessage; - } - - template - static std::unique_ptr ConstructMessage(const T &data, const std::string &type, int n) - { - std::unique_ptr ipcNpuMessage = std::make_unique(Message()); - if (type.size() + 1 > sizeof(ipcNpuMessage->metadata.type)) { - throw std::runtime_error("Type string is too long to fit in metadata.type" + PROF_ERROR(ErrCode::PARAM)); - } - memcpy(ipcNpuMessage->metadata.type, type.c_str(), type.size() + 1); - static_assert(std::is_trivially_copyable::value); - static_assert(std::is_trivially_copyable::value); - ipcNpuMessage->metadata.size = sizeof(data) + sizeof(U) * n; - ipcNpuMessage->buf = std::make_unique(ipcNpuMessage->metadata.size); - memcpy(ipcNpuMessage->buf.get(), &data, ipcNpuMessage->metadata.size); - return ipcNpuMessage; - } -}; -class IpcClient { -public: - IpcClient(const IpcClient &) = delete; - IpcClient &operator = (const IpcClient &) = delete; - IpcClient() = default; - bool RegisterInstance(int32_t npu); - std::string IpcClientNpuConfig(); - -private: - std::vector pids_ = GetPids(); - NpuIpcEndPoint<0> ep_{ "dynoconfigclient" + GenerateUuidV4() }; - std::mutex dequeLock_; - std::deque> msgDynoDeque_; - std::unique_ptr ReceiveMessage(); - bool SyncSendMessage(const Message &message, const 
std::string &destName, int numRetry = 10, - int seepTimeUs = 10000); - bool Recv(); - std::unique_ptr PollRecvMessage(int maxRetry, int sleeTimeUs); -}; - -} // namespace profiler -} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/dyno/NpuIpcEndPoint.h b/torch_npu/csrc/profiler/dyno/NpuIpcEndPoint.h deleted file mode 100644 index 4f3b35ffd1..0000000000 --- a/torch_npu/csrc/profiler/dyno/NpuIpcEndPoint.h +++ /dev/null @@ -1,201 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "utils.h" - -namespace torch_npu { -namespace profiler { - -using fileDesT = int; -constexpr const char STR_END_CHAR = '\0'; -constexpr int SOCKET_FD_CHMOD = 0666; - -struct NpuPayLoad { - size_t size; - void *data; - NpuPayLoad(size_t size, void *data) : size(size), data(data) {} -}; - -template struct NpuIpcEndPointCtxt { - struct sockaddr_un messageName; - size_t messageLen; - fileDesT *fileDesPtr; - struct msghdr msghdr; - std::vector iov; - char ancillaryBuf[CMSG_SPACE(MaxNumFileDes * sizeof(fileDesT))]; - explicit NpuIpcEndPointCtxt(size_t num) : iov(std::vector(num)){}; -}; - -template class NpuIpcEndPoint final { - using Ctxt = NpuIpcEndPointCtxt; - -public: - constexpr static size_t addressMaxLen = 108 - 2; // Max unix socket path length - explicit NpuIpcEndPoint(const std::string &addressName) - { - socketFd = socket(AF_UNIX, SOCK_DGRAM, 0); - if (socketFd == -1) { - throw std::runtime_error(std::strerror(errno) + PROF_ERROR(ErrCode::PARAM)); - } - struct sockaddr_un address; - size_t addressLen = SetSocketAdress(addressName, address); - if (address.sun_path[0] != STR_END_CHAR) { - unlink(address.sun_path); - } - int res = bind(socketFd, (const struct sockaddr *)&address, addressLen); - if (res == -1) { - throw std::runtime_error("Bind socket failed." 
+ PROF_ERROR(ErrCode::PARAM)); - } - if (address.sun_path[0] != STR_END_CHAR) { - chmod(address.sun_path, SOCKET_FD_CHMOD); - } - } - ~NpuIpcEndPoint() - { - close(socketFd); - } - [[nodiscard]] auto BuildSendNpuCtxt(const std::string &desAddrName, const std::vector &npuPayLoad, - const std::vector &fileDes) - { - if (fileDes.size() > MaxNumFileDes) { - throw std::runtime_error("Request to fill more than max connections " + PROF_ERROR(ErrCode::PARAM)); - } - if (desAddrName.empty()) { - throw std::runtime_error("Can not send to dest point, because dest socket name is empty " + - PROF_ERROR(ErrCode::PARAM)); - } - auto ctxt = BuildNpuCtxt_(npuPayLoad, fileDes.size()); - ctxt->msghdr.msg_namelen = SetSocketAdress(desAddrName, ctxt->messageName); - if (!fileDes.empty()) { - if (sizeof(ctxt->fileDesPtr) < fileDes.size() * sizeof(fileDesT)) { - throw std::runtime_error("Memcpy failed when fileDes size large than ctxt fileDesPtr " + - PROF_ERROR(ErrCode::PARAM)); - } - memcpy(ctxt->fileDesPtr, fileDes.data(), fileDes.size() * sizeof(fileDesT)); - } - return ctxt; - } - - [[nodiscard]] bool TrySendMessage(Ctxt const & ctxt, bool retryOnConnRefused = true) - { - ssize_t retCode = sendmsg(socketFd, &ctxt.msghdr, MSG_DONTWAIT); - if (retCode > 0) { - return true; - } - if ((errno == EAGAIN || errno == EWOULDBLOCK) && retCode == -1) { - return false; - } - if (retryOnConnRefused && errno == ECONNREFUSED && retCode == -1) { - return false; - } - throw std::runtime_error("TrySendMessage occur " + std::string(std::strerror(errno)) + " " + - PROF_ERROR(ErrCode::PARAM)); - } - - [[nodiscard]] auto BuildNpuRcvCtxt(const std::vector &npuPayLoad) - { - return BuildNpuCtxt_(npuPayLoad, MaxNumFileDes); - } - - [[nodiscard]] bool TryRcvMessage(Ctxt &ctxt) noexcept - { - auto retCode = recvmsg(socketFd, &ctxt.msghdr, MSG_DONTWAIT); - if (retCode > 0) { - return true; - } - if (retCode == 0) { - return false; - } - if (errno == EWOULDBLOCK || errno == EAGAIN) { - return false; - } - throw std::runtime_error("TryRcvMessage occur " + std::string(std::strerror(errno)) + " " + - PROF_ERROR(ErrCode::PARAM)); - } - - [[nodiscard]] bool TryPeekMessage(Ctxt &ctxt) - { - ssize_t ret = recvmsg(socketFd, &ctxt.msghdr, MSG_DONTWAIT | MSG_PEEK); - if (ret > 0) { - return true; - } - if (ret == 0) { - return false; - } - if (errno == EAGAIN || errno == EWOULDBLOCK) { - return false; - } - throw std::runtime_error("TryPeekMessage occur " + std::string(std::strerror(errno))); - } - - const char *GetName(Ctxt const & ctxt) const noexcept - { - if (ctxt.messageName.sun_path[0] != STR_END_CHAR) { - throw std::runtime_error("GetName() want to got abstract socket, but got " + - std::string(ctxt.messageName.sun_path)); - } - return ctxt.messageName.sun_path + 1; - } - - std::vector GetFileDes(const Ctxt &ctxt) const - { - struct cmsghdr *cmg = CMSG_FIRSTHDR(&ctxt.msghdl); - unsigned numFileDes = (cmg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(fileDesT); - return { ctxt.fileDesPtr, ctxt.fileDesPtr + numFileDes }; - } - -protected: - fileDesT socketFd; - size_t SetSocketAdress(const std::string &srcSocket, struct sockaddr_un &destSocket) - { - if (srcSocket.size() > addressMaxLen) { - throw std::runtime_error("Abstract UNIX Socket path cannot be larger than addressMaxLen"); - } - destSocket.sun_family = AF_UNIX; - destSocket.sun_path[0] = STR_END_CHAR; - if (srcSocket.empty()) { - return sizeof(sa_family_t); - } - srcSocket.copy(destSocket.sun_path + 1, srcSocket.size()); - destSocket.sun_path[srcSocket.size() + 1] = STR_END_CHAR; - 
return sizeof(sa_family_t) + srcSocket.size() + 2; - } - - auto BuildNpuCtxt_(const std::vector &npuPayLoad, unsigned numFileDes) - { - auto ctxt = std::make_unique(npuPayLoad.size()); - std::memset(&ctxt->msghdr, 0, sizeof(ctxt->msghdr)); - for (auto i = 0; i < npuPayLoad.size(); i++) { - ctxt->iov[i] = {npuPayLoad[i].data, npuPayLoad[i].size}; - } - ctxt->msghdr.msg_name = &ctxt->messageName; - ctxt->msghdr.msg_namelen = sizeof(decltype(ctxt->messageName)); - ctxt->msghdr.msg_iov = ctxt->iov.data(); - ctxt->msghdr.msg_iovlen = npuPayLoad.size(); - ctxt->fileDesPtr = nullptr; - if (numFileDes == 0) { - return ctxt; - } - const size_t fileDesSize = sizeof(fileDesT) * numFileDes; - ctxt->msghdr.msg_control = ctxt->ancillaryBuf; - ctxt->msghdr.msg_controllen = CMSG_SPACE(fileDesSize); - - struct cmsghdr *cmsg = CMSG_FIRSTHDR(&ctxt->msghdr); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(fileDesSize); - ctxt->fileDesPtr = (fileDesT *)CMSG_DATA(cmsg); - return ctxt; - } -}; - -} // namespace profiler -} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/dyno/PyDynamicMonitorProxy.h b/torch_npu/csrc/profiler/dyno/PyDynamicMonitorProxy.h deleted file mode 100644 index 0eb5754539..0000000000 --- a/torch_npu/csrc/profiler/dyno/PyDynamicMonitorProxy.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once -#include "MonitorBase.h" -#include "DynoLogNpuMonitor.h" - -namespace torch_npu { -namespace profiler { - -class PyDynamicMonitorProxy { -public: - PyDynamicMonitorProxy() = default; - bool InitDyno(int npuId) - { - try { - monitor_ = DynoLogNpuMonitor::GetInstance(); - monitor_->SetNpuId(npuId); - bool res = monitor_->Init(); - return res; - } catch (const std::exception &e) { - ASCEND_LOGE("Error when init dyno %s !", e.what()); - return false; - } - } - - std::string PollDyno() - { - return monitor_->Poll(); - }; - -private: - MonitorBase *monitor_ = nullptr; -}; - -} // namespace profiler -} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/dyno/utils.cpp b/torch_npu/csrc/profiler/dyno/utils.cpp deleted file mode 100644 index f4f2078825..0000000000 --- a/torch_npu/csrc/profiler/dyno/utils.cpp +++ /dev/null @@ -1,88 +0,0 @@ -#include "utils.h" - -namespace torch_npu { -namespace profiler { - -int32_t GetProcessId() -{ - return static_cast(getpid()); -} - -std::pair GetParentPidAndCommand(int32_t pid) -{ - std::string fileName = "/proc/" + std::to_string(pid) + "/stat"; - std::ifstream statFile(fileName); - if (!statFile) { - return std::make_pair(0, ""); - } - int32_t parentPid = 0; - std::string command; - std::string line; - if (std::getline(statFile, line)) { - int ret = sscanf(line.c_str(), "%*d (%[^)]) %*c %d", command.data(), &parentPid); - if (ret == 2) { - ASCEND_LOGI("Success to get parent pid %d", parentPid); - return std::make_pair(parentPid, command); - } - } - ASCEND_LOGW("Failed to parse /proc/%d/stat", pid); - return std::make_pair(0, ""); -} - -std::vector> GetPidCommandPairsofAncestors() -{ - std::vector> process_pids_and_cmds; - process_pids_and_cmds.reserve(MaxParentPids + 1); - int32_t current_pid = GetProcessId(); - for (int i = 0; i <= MaxParentPids && (i == 0 || current_pid > 1); i++) { - std::pair parent_pid_and_cmd = GetParentPidAndCommand(current_pid); - process_pids_and_cmds.push_back(std::make_pair(current_pid, parent_pid_and_cmd.second)); - current_pid = parent_pid_and_cmd.first; - } - return process_pids_and_cmds; -} - -std::vector GetPids() -{ - const auto &pids = GetPidCommandPairsofAncestors(); - 
std::vector res; - res.reserve(pids.size()); - for (const auto &pidPair : pids) { - res.push_back(pidPair.first); - } - return res; -} -std::string GenerateUuidV4() -{ - static std::random_device randomDevice; - static std::mt19937 gen(randomDevice()); - static std::uniform_int_distribution<> dis(0, 15); - static std::uniform_int_distribution<> dis2(8, 11); - - std::stringstream stringStream; - stringStream << std::hex; - for (int i = 0; i < 8; i++) { - stringStream << dis(gen); - } - stringStream << "-"; - for (int j = 0; j < 4; j++) { - stringStream << dis(gen); - } - stringStream << "-4"; - for (int k = 0; k < 3; k++) { - stringStream << dis(gen); - } - stringStream << "-"; - stringStream << dis2(gen); - for (int m = 0; m < 3; m++) { - stringStream << dis(gen); - } - stringStream << "-"; - for (int n = 0; n < 12; n++) { - stringStream << dis(gen); - } - return stringStream.str(); -} - -} // namespace profiler -} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/dyno/utils.h b/torch_npu/csrc/profiler/dyno/utils.h deleted file mode 100644 index 22669a63ee..0000000000 --- a/torch_npu/csrc/profiler/dyno/utils.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include "torch_npu/csrc/core/npu/npu_log.h" -#include "torch_npu/csrc/core/npu/NPUException.h" - -namespace torch_npu { -namespace profiler { - -constexpr int MaxParentPids = 5; -int32_t GetProcessId(); -std::string GenerateUuidV4(); -std::vector GetPids(); -std::pair GetParentPidAndCommand(int32_t pid); -std::vector> GetPidCommandPairsofAncestors(); - -} // namespace profiler -} // namespace torch_npu diff --git a/torch_npu/csrc/profiler/init.cpp b/torch_npu/csrc/profiler/init.cpp index 4ae750c6b2..07e2964405 100644 --- a/torch_npu/csrc/profiler/init.cpp +++ b/torch_npu/csrc/profiler/init.cpp @@ -18,7 +18,6 @@ #include "torch_npu/csrc/toolkit/profiler/common/utils.h" #include "torch_npu/csrc/framework/interface/LibAscendHal.h" #include "torch_npu/csrc/core/npu/NPUException.h" -#include "torch_npu/csrc/profiler/dyno/PyDynamicMonitorProxy.h" namespace torch_npu { namespace profiler { @@ -65,10 +64,6 @@ PyObject* profiler_initExtension(PyObject* _unused, PyObject *unused) { py::class_(m, "NpuProfilerConfig") .def(py::init()); - py::class_(m, "PyDynamicMonitorProxy") - .def(py::init<>()) - .def("init_dyno", &PyDynamicMonitorProxy::InitDyno, py::arg("npuId")) - .def("poll_dyno", &PyDynamicMonitorProxy::PollDyno); m.def("_supported_npu_activities", []() { std::set activities { NpuActivityType::CPU, diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py index 1cc58c1a23..0a27e3cb70 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py @@ -5,7 +5,6 @@ import time import json import struct import multiprocessing -from torch_npu._C._profiler import PyDynamicMonitorProxy from ._dynamic_profiler_config_context import ConfigContext from ._dynamic_profiler_utils import DynamicProfilerUtils from ._dynamic_profiler_monitor_shm import DynamicProfilerShareMemory @@ -176,6 +175,13 @@ def worker_dyno_func(params_dict): max_size = params_dict.get("max_size") dynamic_profiler_utils = params_dict.get("dynamic_profiler_utils") + try: + from IPCMonitor import PyDynamicMonitorProxy + except Exception as e: + dynamic_profiler_utils.stdout_log(f"Import IPCMonitor module failed: {e}!", + 
dynamic_profiler_utils.LoggerLevelEnum.WARNING) + return + py_dyno_monitor = PyDynamicMonitorProxy() ret = py_dyno_monitor.init_dyno(rank_id) if not ret: -- Gitee From 0bd1c60896e76a526a3fc5690cde0b07626c1e77 Mon Sep 17 00:00:00 2001 From: cheng <3218750885@qq.com> Date: Wed, 19 Feb 2025 12:56:55 +0000 Subject: [PATCH 031/358] !18081 Adjusting Communication Operator Levels Merge pull request !18081 from cheng/v2.6.0 --- torch_npu/profiler/analysis/_profiler_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/profiler/analysis/_profiler_config.py b/torch_npu/profiler/analysis/_profiler_config.py index dea0fbe7d1..355d4ab95c 100644 --- a/torch_npu/profiler/analysis/_profiler_config.py +++ b/torch_npu/profiler/analysis/_profiler_config.py @@ -33,7 +33,7 @@ class ProfilerConfig: } LEVEL_TRACE_PRUNE_CONFIG = { Constant.LEVEL_NONE: [], - Constant.LEVEL0: ['CANN', 'AscendCL', 'Runtime', 'GE', 'Node', 'Model', 'Hccl', 'acl_to_npu'], + Constant.LEVEL0: ['CANN', 'AscendCL', 'Runtime', 'GE', 'Node', 'Model', 'Hccl', 'acl_to_npu', 'Communication@'], Constant.LEVEL1: [], Constant.LEVEL2: [] } -- Gitee From 1503d139df09f37cf5d60b6b0399f603e74b3064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Wed, 19 Feb 2025 15:14:26 +0000 Subject: [PATCH 032/358] =?UTF-8?q?!18169=20[PROFILING]pta=20mstx=20suppor?= =?UTF-8?q?t=20dataloader=20and=20save=20ckpt=20duration=20Merge=20pull=20?= =?UTF-8?q?request=20!18169=20from=20=E6=A2=85=E9=A3=9E=E8=A6=81/2.6=5Ftx?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../.pytorch-disabled-tests.json | 10 +++ torch_npu/__init__.py | 2 + torch_npu/profiler/_add_mstx_patch.py | 61 +++++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 torch_npu/profiler/_add_mstx_patch.py diff --git a/test/unsupported_test_cases/.pytorch-disabled-tests.json b/test/unsupported_test_cases/.pytorch-disabled-tests.json index 28efab1990..8e513fc559 100644 --- a/test/unsupported_test_cases/.pytorch-disabled-tests.json +++ b/test/unsupported_test_cases/.pytorch-disabled-tests.json @@ -30745,6 +30745,16 @@ "test_fuzz_symbolize (__main__.TestExperimentalUtils)": ["", [""]], "test_profiler_strides (__main__.TestProfiler)": ["", [""]], "test_schedule_function_count (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_basic_work_in_main_thread_False (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_basic_work_in_main_thread_True (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_close_in_scope_work_in_main_thread_False (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_close_in_scope_work_in_main_thread_True (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_complex_work_in_main_thread_False (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_complex_work_in_main_thread_True (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_multiple_preexisting_work_in_main_thread_False (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_multiple_preexisting_work_in_main_thread_True (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_open_in_scope_work_in_main_thread_False (__main__.TestProfiler)": ["", [""]], + "test_source_multithreaded_open_in_scope_work_in_main_thread_True (__main__.TestProfiler)": ["", [""]], "test_profiler_experimental_tree_with_memory_and_stack (__main__.TestProfilerTree)": ["", [""]], 
"test_profiler_experimental_tree_with_stack_and_modules (__main__.TestProfilerTree)": ["", [""]], "test_profiler_experimental_tree_with_stack_and_torch_dispatch (__main__.TestProfilerTree)": ["", [""]], diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index d46af43d83..665895c4b3 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -75,6 +75,7 @@ from torch_npu.npu._stream_check import apply_sanitizer_patch import torch_npu.utils.custom_ops import torch_npu.distributed.rpc import torch_npu.op_plugin +from torch_npu.profiler._add_mstx_patch import _apply_mstx_patch from torch_npu.distributed.rpc.backend_registry import _rpc_backend_registry from torch_npu.utils import _cann_package_check, _add_intercept_methods from torch_npu.utils import _register_ops_under_dtensor_rules @@ -166,6 +167,7 @@ def _apply_class_patches(): _apply_sharded_grad_scaler_patch() add_perf_dump_patch() _apply_distributed_methods_patch() + _apply_mstx_patch() def _apply_distributed_methods_patch(): diff --git a/torch_npu/profiler/_add_mstx_patch.py b/torch_npu/profiler/_add_mstx_patch.py new file mode 100644 index 0000000000..d8c257427b --- /dev/null +++ b/torch_npu/profiler/_add_mstx_patch.py @@ -0,0 +1,61 @@ +import functools +import torch +from torch.utils.data import DataLoader +import torch_npu + +original_save = torch.serialization.save +original_iter = DataLoader.__iter__ +original_singlenext = torch.utils.data.dataloader._SingleProcessDataLoaderIter.__next__ +original_multinext = torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__next__ + + +class _MstxState: + def __init__(self): + self.dataloader_range_id = None + self.save_range_id = None + +mstx_state = _MstxState() + + +def _custom_dataloader_iter(self): + global mstx_state + + out_iter = original_iter(self) + + def dataloader_wrapper(func): + def wrapper(*args, **kwargs): + mstx_state.dataloader_range_id = torch_npu.npu.mstx.range_start("dataloader") + out = func(*args, **kwargs) + if mstx_state.dataloader_range_id is not None: + torch_npu.npu.mstx.range_end(mstx_state.dataloader_range_id) + mstx_state.dataloader_range_id = None + return out + + return wrapper + + if self.num_workers == 0: + torch.utils.data.dataloader._SingleProcessDataLoaderIter.__next__ = dataloader_wrapper(original_singlenext) + else: + torch.utils.data.dataloader._MultiProcessingDataLoaderIter.__next__ = dataloader_wrapper(original_multinext) + + return out_iter + + +def _custom_save(func): + global mstx_state + + @functools.wraps(func) + def save_wrapper(*args, **kwargs): + mstx_state.save_range_id = torch_npu.npu.mstx.range_start("save_checkpoint") + out = func(*args, **kwargs) + if mstx_state.save_range_id is not None: + torch_npu.npu.mstx.range_end(mstx_state.save_range_id) + mstx_state.save_range_id = None + return out + + return save_wrapper + + +def _apply_mstx_patch(): + DataLoader.__iter__ = _custom_dataloader_iter + torch.serialization.save = _custom_save(original_save) \ No newline at end of file -- Gitee From 366b1e4b60dd9f6771be187063c60fd3575b5aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=A4=8F=E5=A4=8F?= Date: Thu, 20 Feb 2025 01:09:03 +0000 Subject: [PATCH 033/358] =?UTF-8?q?!18132=20Decoupling=20meta.=20Merge=20p?= =?UTF-8?q?ull=20request=20!18132=20from=20=E7=8E=8B=E5=A4=8F=E5=A4=8F/v2.?= =?UTF-8?q?6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/__init__.py | 2 +- torch_npu/meta/__init__.py | 1 - torch_npu/meta/_meta_registrations.py | 1057 
------------------------- 3 files changed, 1 insertion(+), 1059 deletions(-) delete mode 100644 torch_npu/meta/__init__.py delete mode 100644 torch_npu/meta/_meta_registrations.py diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 665895c4b3..867aa53b6d 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -83,8 +83,8 @@ from torch_npu.utils.exposed_api import public_npu_functions from torch_npu.utils._error_code import ErrCode, pta_error, _except_handler from torch_npu.asd.asd import _asd_patch from torch_npu._C._distributed_c10d import ParallelStore +from torch_npu.op_plugin.meta import _meta_registrations from .version import __version__ as __version__ -from .meta import _meta_registrations from . import _op_plugin_docs del _op_plugin_docs diff --git a/torch_npu/meta/__init__.py b/torch_npu/meta/__init__.py deleted file mode 100644 index a9a2c5b3bb..0000000000 --- a/torch_npu/meta/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__all__ = [] diff --git a/torch_npu/meta/_meta_registrations.py b/torch_npu/meta/_meta_registrations.py deleted file mode 100644 index 019ce61117..0000000000 --- a/torch_npu/meta/_meta_registrations.py +++ /dev/null @@ -1,1057 +0,0 @@ -import math -import torch -from torch.library import Library, impl -from torch.fx.node import has_side_effect -from torch_npu.utils._error_code import ErrCode, ops_error - -''' -Registering Meta implementations for custom ops -''' -BIT_NUMBER = 128 -UINT8_BIT_NUMBER = 8 -INPUTS_DIM_LIMIT_QUANTCONV2D = 4 -ATTR_DIM_LIMIT_QUANTCONV2D = 2 -#meta register implementation -m = Library("npu", "IMPL", "Meta") -m_aten = Library("aten", "IMPL", "Meta") - - -@impl(m_aten, "matmul_backward") -def matmul_backward_meta(grad, self, other, mask): - return (torch.empty_like(self), torch.empty_like(other)) - - -@impl(m, "npu_incre_flash_attention") -def npu_incre_flash_attention_forward(query, key, value, *, padding_mask=None, atten_mask=None, pse_shift=None, actual_seq_lengths=None, - antiquant_scale=None, antiquant_offset=None, block_table=None, - dequant_scale1=None, quant_scale1=None, dequant_scale2=None, quant_scale2=None, - quant_offset2=None, kv_padding_size=None, num_heads=1, scale_value=1.0, input_layout="BSH", - num_key_value_heads=0, block_size=0, inner_precise=1): - if quant_scale2 is not None: - return torch.empty_like(query, dtype=torch.int8) - elif query.dtype == torch.int8: - return torch.empty_like(query, dtype=torch.half) - else: - return torch.empty_like(query) - - -@impl(m, "npu_prompt_flash_attention") -def npu_prompt_flash_attention_forward(query, key, value, *, padding_mask=None, atten_mask=None, pse_shift=None, actual_seq_lengths=None, deq_scale1=None, quant_scale1=None, deq_scale2=None, quant_scale2=None, quant_offset2=None, num_heads=1, scale_value=1.0, pre_tokens=2147473647, next_tokens=0, input_layout="BSH", num_key_value_heads=0, actual_seq_lengths_kv=None, sparse_mode=0): - if quant_scale2 is not None: - return torch.empty_like(query, dtype=torch.int8) - elif query.dtype == torch.int8: - return torch.empty_like(query, dtype=torch.half) - else: - return torch.empty_like(query, dtype=query.dtype) - - -@impl(m, "npu_fusion_attention") -def npu_fusion_attention_forward(query, key, value, head_num, input_layout, pse=None, padding_mask=None, - atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, - inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False): - B = query.size(0) - N = head_num - S1 = 
query.size(2) - S2 = key.size(2) - - if input_layout == "BSH": - B = query.size(0) - S1 = query.size(1) - S2 = key.size(1) - - if input_layout == "SBH": - B = query.size(1) - S1 = query.size(0) - S2 = key.size(0) - - seed = 0 - offset = 0 - numels = 0 - attention_score = torch.empty_like(query, dtype=query.dtype, device='meta') - softmax_max = torch.empty([B, head_num, S1, 8], dtype=torch.float32, device='meta') - softmax_sum = torch.empty([B, head_num, S1, 8], dtype=torch.float32, device='meta') - softmax_out = torch.empty([0], dtype=query.dtype, device='meta') - return (torch.empty_like(attention_score), - torch.empty_like(softmax_max), - torch.empty_like(softmax_sum), - torch.empty_like(softmax_out), - seed, - offset, - numels) - - -@impl(m, "npu_fusion_attention_grad") -def npu_fusion_attention_backward(query, key, value, dy, head_num, input_layout, *, pse=None, padding_mask=None, atten_mask=None, - softmax_max=None, softmax_sum=None, softmax_in=None, attention_in=None, scale_value=1.0, - keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, inner_precise=0, seed=0, offset=0, - numels=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False): - dq = torch.empty_like(query, dtype=query.dtype, device='meta') - dk = torch.empty_like(key, dtype=query.dtype, device='meta') - dv = torch.empty_like(value, dtype=query.dtype, device='meta') - dpse = torch.empty([0], dtype=query.dtype, device='meta') - return (torch.empty_like(dq), torch.empty_like(dk), torch.empty_like(dv), torch.empty_like(dpse)) - - -@impl(m, "npu_rotary_mul") -def npu_rotary_mul_meta(embedding, cosine, sine): - return torch.empty_like(embedding) - - -@impl(m, "npu_rotary_mul_backward") -def npu_rotary_mul_backward(grad, embedding, cosine, sine): - dx = torch.empty_like(embedding, dtype=embedding.dtype, device='meta') - dr1 = torch.empty_like(cosine, dtype=embedding.dtype, device='meta') - dr2 = torch.empty_like(sine, dtype=embedding.dtype, device='meta') - return (dx, dr1, dr2) - - -@impl(m, "fast_gelu") -def fast_gelu_meta(self): - return torch.empty_like(self) - - -@impl(m, "npu_fast_gelu_backward") -def npu_fast_gelu_backward_meta(grad, self): - return torch.empty_like(self) - - -@impl(m, "npu_fast_gelu") -def npu_fast_gelu_meta(self): - return torch.empty_like(self) - - -@impl(m, "npu_gelu") -def npu_gelu_meta(self, *, approximate="none"): - return torch.empty_like(self) - - -@impl(m, "npu_gelu_backward") -def npu_gelu_backward_meta(grad, self, *, approximate="none"): - return torch.empty_like(self) - - -@impl(m, "npu_dtype_cast") -def npu_dtype_cast_meta(self, dtype): - return torch.empty_like(self, dtype=dtype) - - -@impl(m, "npu_dtype_cast_backward") -def npu_dtype_cast_backward_meta(self, dtype): - return torch.empty_like(self, dtype=dtype) - - -@impl(m, "npu_bmmV2") -def npu_bmmV2_meta(self, mat2, output_sizes): - dim1 = self.size(0) - dim2 = self.size(1) - dim3 = mat2.size(2) - return self.new_empty((dim1, dim2, dim3)) - - -@impl(m, "npu_transpose") -def npu_transpose_meta(self, perm, require_contiguous=True): - output = self.permute(perm) - return torch.empty_like(output, dtype=self.dtype) - - -@impl(m, "npu_deep_norm") -def npu_deep_norm_meta(self, gx, beta, gamma, alpha=0.3, epsilon=1e-6): - rstd_dim = self.dim() - gamma.dim() - ret = [] - for i in range(self.dim()): - if i < rstd_dim: - ret.append(self.size(i)) - else: - ret.append(1) - rstd = torch.empty(ret, dtype=torch.float32, device='meta') - return (torch.empty_like(rstd), 
torch.empty_like(rstd), torch.empty_like(self, dtype=self.dtype)) - - -@impl(m, "npu_rms_norm") -def npu_rms_norm_meta(self, gamma, epsilon=1e-6): - rstd_dim = self.dim() - gamma.dim() - ret = [] - for i in range(self.dim()): - if i < rstd_dim: - ret.append(self.size(i)) - else: - ret.append(1) - rstd = torch.empty(ret, dtype=torch.float32, device='meta') - return (torch.empty_like(self, dtype=self.dtype), torch.empty_like(rstd)) - - -@impl(m, "npu_add_rms_norm") -def npu_add_rms_norm_meta(x1, x2, gamma, epsilon=1e-6): - rstd_dim = x1.dim() - gamma.dim() - ret = [] - for i in range(x1.dim()): - if i < rstd_dim: - ret.append(x1.size(i)) - else: - ret.append(1) - rstd = torch.empty(ret, dtype=torch.float32, device='meta') - return (torch.empty_like(x1, dtype=x1.dtype), torch.empty_like(rstd), torch.empty_like(x1, dtype=x1.dtype)) - - -@impl(m, "npu_rms_norm_backward") -def npu_rms_norm_backward_meta(dy, self, gamma, rstd): - return (torch.empty_like(self, dtype=self.dtype), torch.empty_like(gamma, dtype=gamma.dtype)) - - -@impl(m, "scatter_update") -def scatter_update_meta(self, indices, updates, axis): - return torch.empty_like(self) - - -@impl(m, "scatter_update_") -def scatter_update__meta(self, indices, updates, axis): - return self - - -@impl(m, "_npu_dropout") -def _npu_dropout_meta(self, p): - mask = math.floor(math.floor((self.numel() + BIT_NUMBER - 1) / BIT_NUMBER) * BIT_NUMBER / UINT8_BIT_NUMBER) - return (torch.empty_like(self, dtype=self.dtype), torch.empty(mask, dtype=torch.uint8, device='meta')) - - -@impl(m, "npu_quant_scatter") -def npu_quant_scatter_meta(self, indices, updates, quant_scales, quant_zero_points=None, axis=0, quant_axis=1, - reduce='update'): - return torch.empty_like(self) - - -@impl(m, "npu_quant_scatter_") -def npu_quant_scatter__meta(self, indices, updates, quant_scales, quant_zero_points=None, axis=0, quant_axis=1, - reduce='update'): - return self - - -@impl(m, "npu_scatter_list_") -def scatter_list__meta(self, indices, updates, mask, reduce='update', axis=-2): - return self - - -@impl(m, "npu_scatter_list") -def scatter_list_meta(self, indices, updates, mask, reduce='update', axis=-2): - var_list = [] - for item in self: - var_list.append(torch.empty_like(item)) - return var_list - - -@impl(m, "npu_scatter_nd_update") -def scatter_nd_update_meta(self, indices, updates): - return torch.empty_like(self, dtype=self.dtype) - - -@impl(m, "npu_scatter_nd_update_") -def scatter_nd_update__meta(self, indices, updates): - return self - - -@impl(m, "npu_geglu") -def npu_geglu_meta(self, dim, approximate, activate_left=False): - return (torch.empty_like(self, dtype=self.dtype), torch.empty_like(self, dtype=self.dtype)) - - -@impl(m, "npu_geglu_grad") -def npu_geglu_backward_meta(grad_output, self, gelu, dim, approximate, activate_left=False): - return (torch.empty_like(self, dtype=self.dtype), torch.empty_like(self, dtype=self.dtype)) - - -@impl(m, "npu_dropout_backward") -def npu_dropout_backward_meta(grad_output, mask, p): - return torch.empty_like(grad_output, dtype=grad_output.dtype) - - -@impl(m, "npu_masked_softmax_with_rel_pos_bias") -def npu_masked_softmax_with_rel_pos_bias_meta(x, atten_mask, relative_pos_bias, scale_value=1.0, inner_precision_mode=0): - return torch.empty_like(x, dtype=x.dtype) - - -@impl(m, "npu_moe_distribute_dispatch") -def npu_moe_distribute_dispatch_meta(x, expert_ids, group_ep, ep_world_size, ep_rank_id, moe_expert_num, scales=None, group_tp="", tp_world_size=0, - tp_rank_id=0, expert_shard_type=0, shared_expert_rank_num=0, 
quant_mode=0, global_bs=0): - n = x.size(0) - h = x.size(1) - k = expert_ids.size(1) - - shared_front = 0 - outDtype = x.dtype - if expert_shard_type == 0: - shared_front = 1 - - local_moe_expert_num = 0 - global_bs_real = 0 - if global_bs == 0: - global_bs_real = n * ep_world_size - else: - global_bs_real = global_bs - a = 0 - if shared_front == 1: - if ep_rank_id < shared_expert_rank_num: - local_moe_expert_num = 1 - a = global_bs_real // shared_expert_rank_num - else: - local_moe_expert_num = moe_expert_num // (ep_world_size - shared_expert_rank_num) - a = global_bs_real * local_moe_expert_num - else: - if ep_rank_id >= ep_world_size - shared_expert_rank_num: - local_moe_expert_num = 1 - a = global_bs_real // shared_expert_rank_num - else: - local_moe_expert_num = moe_expert_num // (ep_world_size - shared_expert_rank_num) - a = global_bs_real * local_moe_expert_num - - if scales is not None or quant_mode != 0: - outDtype = torch.int8 - local_moe_expert_num = int(local_moe_expert_num) - expand_x = x.new_empty(tuple([a * tp_world_size, h]), dtype=outDtype) - dynamic_scales = x.new_empty(tuple([a * tp_world_size]), dtype=torch.float32) - expand_idx = x.new_empty(tuple([n * k]), dtype=torch.int32) - expert_token_nums = x.new_empty(tuple([local_moe_expert_num]), dtype=torch.int64) - ep_recv_counts = x.new_empty(tuple([moe_expert_num + shared_expert_rank_num]), dtype=torch.int32) - tp_recv_counts = x.new_empty(tuple([tp_world_size]), dtype=torch.int32) - return (expand_x, dynamic_scales, expand_idx, expert_token_nums, ep_recv_counts, tp_recv_counts) - - -@impl(m, "npu_moe_distribute_combine") -def npu_moe_distribute_combine_meta(expand_x, expert_ids, expand_idx, ep_send_counts, expert_scales, group_ep, ep_world_size, ep_rank_id, moe_expert_num, - tp_send_counts=None, group_tp="", tp_world_size=0, tp_rank_id=0, expert_shard_type=0, shared_expert_rank_num=0, global_bs=0): - dim_list = [] - dim_list.append(expert_ids.size(0)) - dim_list.append(expand_x.size(1)) - - return expand_x.new_empty(tuple(dim_list), dtype=expand_x.dtype) - - -@impl(m, "npu_ffn") -def npu_ffn_meta(x, weight1, weight2, activation, *, expert_tokens=None, expert_tokens_index=None, bias1=None, - bias2=None, scale=None, offset=None, deq_scale1=None, deq_scale2=None, antiquant_scale1=None, - antiquant_scale2=None, antiquant_offset1=None, antiquant_offset2=None, inner_precise=0, - output_dtype=None): - dim_list = [] - for i in range(0, x.dim() - 1): - dim_list.append(x.size(i)) - dim_list.append(weight2.size(weight2.dim() - 1)) - if x.dtype == torch.int8: - if output_dtype is not None and output_dtype == torch.bfloat16: - return x.new_empty(tuple(dim_list), dtype=torch.bfloat16) - else: - return x.new_empty(tuple(dim_list), dtype=torch.float16) - else: - return x.new_empty(tuple(dim_list)) - - -@impl(m, "npu_grouped_matmul") -@impl(m, "npu_grouped_matmul.List") -def npu_grouped_matmul_meta(x, weight, *, bias=None, scale=None, offset=None, antiquant_scale=None, - antiquant_offset=None, per_token_scale=None, group_list=None, - activation_input=None, activation_quant_scale=None, activation_quant_offset=None, - split_item=0, group_type=-1, group_list_type=0, act_type=0, output_dtype=None): - y = [] - num_x = len(x) - singleWeight = len(weight) == 1 and len(weight[0].shape) == 3 - n = weight[0].shape[2] if singleWeight else weight[0].shape[1] - if num_x > 0 and output_dtype is None: - output_dtype = x[0].dtype - if split_item == 0: - for i in range(num_x): - ni = n if singleWeight else weight[i].shape[1] - 
y.append(x[i].new_empty((*x[i].shape[:-1], ni), dtype=output_dtype)) - elif split_item == 1: - num_group_list = group_list.shape[0] if isinstance(group_list, torch.Tensor) else len(group_list) - pre_offset = group_list[0] - y.append(x[0].new_empty((pre_offset, n), dtype=output_dtype)) - for i in range(1, num_group_list): - ni = n if singleWeight else weight[i].shape[1] - cur_offset = group_list[i] - y.append(x[0].new_empty((cur_offset - pre_offset, ni), dtype=output_dtype)) - pre_offset = cur_offset - elif split_item == 2: - dim_m = 0 - for i in range(num_x): - dim_m += x[i].shape[0] - y.append(x[0].new_empty((dim_m, n), dtype=output_dtype)) - elif split_item == 3: - y.append(x[0].new_empty((x[0].shape[0], n), dtype=output_dtype)) - - return y - - -@impl(m, "npu_group_norm_silu") -def group_norm_silu_meta(self, gemma, beta, group, eps=0.00001): - N = self.size(1) - if gemma is None or beta is None: - return (torch.empty_like(self, dtype=self.dtype), self.new_empty((N, group), dtype=self.dtype), self.new_empty((N, group), dtype=self.dtype)) - else: - return (torch.empty_like(self, dtype=self.dtype), gemma.new_empty((N, group), dtype=gemma.dtype), beta.new_empty((N, group), dtype=beta.dtype)) - - -@impl(m, "npu_mm_all_reduce_base") -def npu_mm_all_reduce_base_forward(x1, x2, hcom, reduce_op='sum', bias=None, antiquant_scale=None, - antiquant_offset=None, x3=None, dequant_scale=None, pertoken_scale=None, - comm_quant_scale_1=None, comm_quant_scale_2=None, antiquant_group_size=0, comm_turn=0): - dim_list = [] - for i in range(x1.dim()): - dim_list.append(x1.size(i)) - dim_list[-1] = x2.size(1) - if dequant_scale is not None: - if dequant_scale.dtype == torch.bfloat16: - return x1.new_empty(tuple(dim_list), dtype=torch.bfloat16) - else: - return x1.new_empty(tuple(dim_list), dtype=torch.float16) - else: - return x1.new_empty(tuple(dim_list)) - - - -@impl(m, "npu_weight_quant_batchmatmul") -def npu_weight_quant_batchmatmul_meta(x, weight, antiquant_scale, antiquant_offset=None, quant_scale=None, quant_offset=None, bias=None, antiquant_group_size=0, inner_precise=0): - dim_m = x.size(0) - if weight.dtype == torch.int32 and weight.is_contiguous(): - dim_n = weight.size(1) * 8 - else: - dim_n = weight.size(1) - if quant_scale is not None: - return x.new_empty((dim_m, dim_n), dtype=torch.int8) - return x.new_empty((dim_m, dim_n), dtype=x.dtype) - - -def bias_shape_check(x2, bias, batch_val, is_a4w4, transpose_x2): - bias_dim_num = bias.dim() - if is_a4w4: - torch._check( - bias_dim_num == 1, - lambda: "bias_dim_num should be 1 when x1's dtype is int32, please check bias dim num " + ops_error(ErrCode.VALUE), - ) - else: - torch._check( - bias_dim_num == 1 or bias_dim_num == 3, - lambda: "bias_dim_num should be 1 or 3 when x1's dtype is int8, please check bias dim num " + ops_error(ErrCode.VALUE), - ) - x2_dim_num = x2.dim() - x2_n_dim = x2.size(x2_dim_num - 1) * 8 if (is_a4w4 and not transpose_x2) else x2.size(x2_dim_num - 1) - bias_first_dim = bias.size(0) - if bias_dim_num == 1: - torch._check( - bias_first_dim == x2_n_dim, - lambda: "bias_first_dim should be equal to x2 n dim, please check bias 1st dim value " + ops_error(ErrCode.VALUE), - ) - return - bias_second_dim = bias.size(1) - bias_third_dim = bias.size(2) - torch._check( - bias_first_dim == batch_val, - lambda: "infered batch value should be equal to bias batch dim value, please check bias batch dim value" + ops_error(ErrCode.VALUE), - ) - torch._check( - bias_second_dim == 1, - lambda: "bias_second_dim should be 1, please check bias 
second dim value " + ops_error(ErrCode.VALUE), - ) - torch._check( - bias_third_dim == x2_n_dim, - lambda: "bias_third_dim should be equal to x2_n_dim, please check bias third dim value " + ops_error(ErrCode.VALUE), - ) - - -def quant_matmul_shape_check(*args): - x1, x2, scale, offset, pertoken_scale, is_a4w4, transpose_x2 = args - X_MAX_DIM = 6 - X_MIN_DIM = 2 - INT4_IN_INT32 = 8 - x1_dim_num = x1.dim() - x2_dim_num = x2.dim() - x1_m_dim = x1.size(x1_dim_num - 2) - x1_k_dim = x1.size(x1_dim_num - 1) - x2_k_dim = x2.size(x2_dim_num - 2) - x2_n_dim = x2.size(x2_dim_num - 1) * INT4_IN_INT32 if (is_a4w4 and not transpose_x2) else x2.size(x2_dim_num - 1) - torch._check( - x1_dim_num >= X_MIN_DIM and x1_dim_num <= X_MAX_DIM, - lambda: "x1 dim num should be 2 ~ 6, please check x1 dim num" + ops_error(ErrCode.VALUE), - ) - if is_a4w4 and not transpose_x2: - torch._check( - x1_k_dim * INT4_IN_INT32 == x2_k_dim, - lambda: "k dim of x2 should be 8 multiple of k dim of x1, please check k dim of x1 and x2" + ops_error(ErrCode.VALUE), - ) - else: - torch._check( - x1_k_dim == x2_k_dim, - lambda: "k dim of x1 and x2 need be same, please check k dim of x1 and x2" + ops_error(ErrCode.VALUE), - ) - - if is_a4w4: - torch._check( - x2_dim_num == X_MIN_DIM, - lambda: "x2 dim num should be 2 when x1's dtype is int32, please check x2 dim num" + ops_error(ErrCode.VALUE), - ) - else: - torch._check( - x2_dim_num >= X_MIN_DIM and x2_dim_num <= X_MAX_DIM, - lambda: "x2 dim num should be 2 ~ 6 when x1's dtype is int8, please check x2 dim num" + ops_error(ErrCode.VALUE), - ) - - if offset is not None: - offset_dim_num = offset.dim() - torch._check( - offset_dim_num == 1, - lambda: "the offset dim num must be 1, please check offset dim num " + ops_error(ErrCode.VALUE), - ) - offset_first_dim = offset.size(0) - torch._check( - offset_first_dim == 1 or offset_first_dim == x2_n_dim, - lambda: "the offset 1st dim value must be 1 or x2 n dim value, please check offset 1st dim value" + ops_error(ErrCode.VALUE), - ) - if pertoken_scale is not None: - pertoken_scale_dim_num = pertoken_scale.dim() - torch._check( - pertoken_scale_dim_num == 1, - lambda: "the pertoken_scale dim num must be 1, please check scale dim num" + ops_error(ErrCode.VALUE), - ) - pertoken_scale_first_dim = pertoken_scale.size(0) - torch._check( - pertoken_scale_first_dim == x1_m_dim, - lambda: "the pertoken_scale 1st dim value must be x1 m dim value, please check scale 1st dim value " + ops_error(ErrCode.VALUE), - ) - - scale_dim_num = scale.dim() - torch._check( - scale_dim_num == 1, - lambda: "the scale dim num must be 1, please check scale dim num" + ops_error(ErrCode.VALUE), - ) - scale_first_dim = scale.size(0) - torch._check( - scale_first_dim == 1 or scale_first_dim == x2_n_dim, - lambda: "the scale 1st dim value must be 1 or x2 n dim value, please check scale 1st dim value " + ops_error(ErrCode.VALUE), - ) - - -def quant_matmul_bias_dtype_check(bias, pertoken_scale, output_dtype): - bias_dtype_supported_list = [torch.int32, torch.bfloat16, torch.float32, torch.float16] - torch._check( - bias.dtype in bias_dtype_supported_list, - lambda: "bias's type supported for int32, bfloat16, float16 and float32, but bias.dtype is " + str(bias.dtype) + ops_error(ErrCode.TYPE), - ) - if bias.dtype == torch.bfloat16: - torch._check( - output_dtype == torch.bfloat16, - lambda: "When bias dtype is bfloat16, output_dtype must be bfloat16, but it is " + - str(output_dtype) + ops_error(ErrCode.TYPE), - ) - if output_dtype == torch.int32: - torch._check( - bias.dtype 
== torch.int32, - lambda: "When output_dtype dtype is int32, bias_dtype must be int32, but it is " + - str(bias.dtype) + ops_error(ErrCode.TYPE), - ) - if pertoken_scale is not None: - if bias.dtype == torch.float16: - torch._check( - output_dtype == torch.float16, - lambda: "When bias dtype is float16 and pertoken is given, output_dtype must be float16, but it is " + - str(output_dtype) + ops_error(ErrCode.TYPE), - ) - else: - torch._check( - bias.dtype != torch.float16, - lambda: "Bias dtype cannot be float16 when pertoken not given." + ops_error(ErrCode.TYPE), - ) - if bias.dtype == torch.float32: - torch._check( - output_dtype == torch.bfloat16, - lambda: "When bias dtype is float32 and pertoken not given, output_dtype must be bfloat16, but it is " + - str(output_dtype) + ops_error(ErrCode.TYPE), - ) - - -def quant_matmul_dtype_check(*args): - x1, x2, scale, offset, pertoken_scale, bias, output_dtype, is_a4w4 = args - torch._check( - x1.dtype == x2.dtype, - lambda: "x1's type and x2's type should be same, but x1.dtype is " + str(x1.dtype) + " and x2.dtype is " + - str(x2.dtype) + ops_error(ErrCode.TYPE), - ) - input_dtype_supported_list = [torch.int8, torch.int32] - torch._check( - x1.dtype in input_dtype_supported_list, - lambda: "input's type supported for int8 and int32, but now is " + str(x1.dtype) + ops_error(ErrCode.TYPE), - ) - scale_dtype_supported_list = [torch.float32, torch.int64, torch.bfloat16] - torch._check( - scale.dtype in scale_dtype_supported_list, - lambda: "scale's type supported for float32, int64 and bfloat16, but scale.dtype is " + str(scale.dtype) + ops_error(ErrCode.TYPE), - ) - if offset is not None: - torch._check( - offset.dtype == torch.float32, - lambda: "offset's type supported for float32, but offset.dtype is " + str(offset.dtype) + ops_error(ErrCode.TYPE), - ) - if pertoken_scale is not None: - torch._check( - pertoken_scale.dtype == torch.float32, - lambda: "pertoken_scale's type supported for float32, but pertoken_scale.dtype is " + - str(offset.dtype) + ops_error(ErrCode.TYPE), - ) - if bias is not None: - quant_matmul_bias_dtype_check(bias, pertoken_scale, output_dtype) - - -def quant_matmul_scale_offset_out_check(scale, offset, pertoken_scale, output_dtype, is_a4w4): - if scale.dtype == torch.bfloat16: - torch._check( - output_dtype in [torch.bfloat16, torch.int32], - lambda: "When scale's dtype is bfloat16, output_dtype must be bfloat16 or int32, but output_dtype is " + - str(output_dtype) + ops_error(ErrCode.TYPE), - ) - if output_dtype == torch.bfloat16: - torch._check( - scale.dtype == torch.bfloat16 or scale.dtype == torch.float32, - lambda: "When output_dtype is bfloat16, scale's dtype must be bfloat16 or float32, but scale's dtype is " + - str(scale.dtype) + ops_error(ErrCode.TYPE), - ) - if output_dtype == torch.int32: - torch._check( - scale.dtype in [torch.bfloat16, torch.float32], - lambda: "When output_dtype is int32, scale's dtype must be bfloat16 or float32, but scale's dtype is " + - str(scale.dtype) + ops_error(ErrCode.TYPE), - ) - if offset is not None: - torch._check( - output_dtype is None or output_dtype == torch.int8, - lambda: "offset only exists when output_dtype is int8, but output_dtype is " + str(output_dtype) + ops_error(ErrCode.TYPE), - ) - if pertoken_scale is not None: - if output_dtype == torch.float16: - torch._check( - scale.dtype == torch.float32, - lambda: "When output_dtype is float16 and pertoken_scale is not none, scale's dtype must be float32, but scale's dtype is " + - str(scale.dtype) + 
ops_error(ErrCode.TYPE), - ) - torch._check( - output_dtype == torch.float16 or output_dtype == torch.bfloat16, - lambda: "When pertoken_scale is not none, output_dtype must be float16 or bfloat16, but output_dtype is " + - str(output_dtype) + ops_error(ErrCode.TYPE), - ) - if is_a4w4: - torch._check( - output_dtype == torch.float16, - lambda: "When input's dtype is int32, output_dtype must be float16, but output_dtype is " + - str(output_dtype) + ops_error(ErrCode.TYPE), - ) - - -@impl(m, "npu_quant_matmul") -def npu_quant_matmul_meta(x1, x2, scale, *, offset=None, pertoken_scale=None, bias=None, output_dtype=None): - INT4_IN_INT32 = 8 - batch_val = 1 - x1_dim_num = x1.dim() - x2_dim_num = x2.dim() - out_dim_num = max(x1_dim_num, x2_dim_num) - shape_long = x1 if x1_dim_num > x2_dim_num else x2 - shape_short = x2 if x1_dim_num > x2_dim_num else x1 - vaild_offset = out_dim_num - min(x1_dim_num, x2_dim_num) - is_a4w4 = x1.dtype == torch.int32 and x2.dtype == torch.int32 - dim_list = [] - for i in range(0, out_dim_num - 2): - short_dim = 1 if i < vaild_offset else shape_short.size(i - vaild_offset) - long_dim = shape_long.size(i) - torch._check( - not (short_dim > 1 and long_dim > 1 and short_dim != long_dim), - lambda: "the batch shape cannot be broadcast" + ops_error(ErrCode.VALUE), - ) - cur_batch_val = max(short_dim, long_dim) - batch_val = batch_val * cur_batch_val - dim_list.append(cur_batch_val) - dimm = x1.size(x1.dim() - 2) - transpose_x2 = x1.size(x1.dim() - 1) == x2.size(x2.dim() - 2) - - dimn = x2.size(x2.dim() - 1) * INT4_IN_INT32 if (is_a4w4 and not transpose_x2) else x2.size(x2.dim() - 1) - dim_list.append(dimm) - dim_list.append(dimn) - quant_matmul_shape_check(x1, x2, scale, offset, pertoken_scale, is_a4w4, transpose_x2) - if bias is not None: - if bias.dim() == 3: - torch._check( - len(dim_list) == 3, - lambda:"when bias dim is 3, out dim need to be 3" + ops_error(ErrCode.TYPE), - ) - bias_shape_check(x2, bias, batch_val, is_a4w4, transpose_x2) - quant_matmul_dtype_check(x1, x2, scale, offset, pertoken_scale, bias, output_dtype, is_a4w4) - quant_matmul_scale_offset_out_check(scale, offset, pertoken_scale, output_dtype, is_a4w4) - if output_dtype == torch.float16: - return shape_long.new_empty(tuple(dim_list), dtype=torch.float16) - elif output_dtype == torch.bfloat16: - return shape_long.new_empty(tuple(dim_list), dtype=torch.bfloat16) - elif output_dtype == torch.int32: - return shape_long.new_empty(tuple(dim_list), dtype=torch.int32) - elif output_dtype is None or output_dtype == torch.int8: - return shape_long.new_empty(tuple(dim_list), dtype=torch.int8) - else: - raise RuntimeError("Not supportted output dtype is " + str(output_dtype)) - - -@impl(m, "npu_trans_quant_param") -def npu_trans_quant_param_meta(scale, offset=None): - scale_dim_num = scale.dim() - torch._check( - scale_dim_num == 1 or (scale_dim_num == 2 and scale.size(0) == 1), - lambda: "the scale shape support only (1, ) and (1, n)" + ops_error(ErrCode.VALUE), - ) - output_shape = scale.size() - if scale_dim_num == 1: - scale_first_dim = scale.size(0) - dim_max = scale_first_dim - if offset is not None: - offset_first_dim = offset.size(0) - dim_max = max(dim_max, offset_first_dim) - if offset_first_dim != 1 and scale_first_dim != 1: - torch._check( - offset_first_dim == scale_first_dim, - lambda: "offset first dim should be equal to scale first dim if none of them are equal to one" + ops_error(ErrCode.VALUE), - ) - output_shape = (dim_max) - else: - if offset is not None: - torch._check( - scale.size() == 
offset.size(), - lambda: "when the input shape of scale is (1, n), shape of scale and offset should be equal" + ops_error(ErrCode.VALUE), - ) - return scale.new_empty(output_shape, dtype=torch.int64) - - -@impl(m, "npu_quantize") -def npu_quantize_meta(self, scales, zero_points, dtype, axis=1, div_mode=True): - if dtype == torch.quint8: - return torch.empty_like(self, dtype=torch.uint8) - elif dtype == torch.qint8: - return torch.empty_like(self, dtype=torch.int8) - elif dtype == torch.qint32: - return torch.empty_like(self, dtype=torch.int32) - elif dtype == torch.quint4x2: - dim_num = self.dim() - if self.size(dim_num - 1) % 8: - raise RuntimeError("If dtype is quint4x2, last dim must be divided by 8" + - ops_error(ErrCode.NOT_SUPPORT)) - output_shape = [] - for dim in range(dim_num - 1): - output_shape.append(self.size(dim)) - output_shape.append(self.size(dim_num - 1) // 8) - return self.new_empty(output_shape, dtype=torch.int32) - return torch.empty_like(self, dtype=torch.int8) - - -@impl(m, "npu_group_quant") -def npu_group_quant_meta(x, scale, group_index, *, offset=None, dst_dtype=None): - if dst_dtype == torch.quint8: - return torch.empty_like(x, dtype=torch.uint8) - elif dst_dtype == torch.qint8: - return torch.empty_like(x, dtype=torch.int8) - elif dst_dtype == torch.quint4x2: - dim_num = x.dim() - if x.size(dim_num - 1) % 8: - raise RuntimeError("If dst_dtype is quint4x2, last dim must be divisible by 8" + - ops_error(ErrCode.NOT_SUPPORT)) - output_shape = [] - for dim in range(dim_num - 1): - output_shape.append(x.size(dim)) - output_shape.append(x.size(dim_num - 1) // 8) - return x.new_empty(output_shape, dtype=torch.int32) - return torch.empty_like(x, dtype=torch.int8) - - -@impl(m, "npu_dynamic_quant") -def npu_dynamic_quant(input_dummy, *, smooth_scales=None, group_index=None, dst_type=torch.int8): - dim_num = input_dummy.dim() - scale_shape = [] - for dim in range(dim_num - 1): - scale_shape.append(input_dummy.size(dim)) - scale = input_dummy.new_empty(scale_shape, dtype=torch.float32) - if dst_type == torch.quint4x2: - if input_dummy.size(dim_num - 1) % 8: - raise RuntimeError("If dst_dtype is quint4x2, last dim must be divisible by 8" + - ops_error(ErrCode.PARAM)) - scale_shape.append(input_dummy.size(dim_num - 1) // 8) - output = input_dummy.new_empty(scale_shape, dtype=torch.int32) - else: - output = torch.empty_like(input_dummy, dtype=torch.int8) - return (output, scale) - - -@impl(m, "npu_dynamic_quant_asymmetric") -def npu_dynamic_quant_asymmetric(input_dummy, *, smooth_scales=None, group_index=None, dst_type=torch.int8): - dim_num = input_dummy.dim() - scale_offset_shape = [] - for dim in range(dim_num - 1): - scale_offset_shape.append(input_dummy.size(dim)) - scale = input_dummy.new_empty(scale_offset_shape, dtype=torch.float32) - offset = input_dummy.new_empty(scale_offset_shape, dtype=torch.float32) - if dst_type == torch.quint4x2: - if input_dummy.size(dim_num - 1) % 8: - raise RuntimeError("If dst_dtype is quint4x2, last dim must be divisible by 8" + - ops_error(ErrCode.PARAM)) - scale_offset_shape.append(input_dummy.size(dim_num - 1) // 8) - output = input_dummy.new_empty(scale_offset_shape, dtype=torch.int32) - else: - output = torch.empty_like(input_dummy, dtype=torch.int8) - return (output, scale, offset) - - -@impl(m, "npu_moe_compute_expert_tokens") -def npu_moe_compute_expert_tokens_meta(sorted_experts, num_experts=1): - out = torch.zeros(num_experts, dtype=torch.int32, device='meta') - return torch.empty_like(out) - - -@impl(m, "npu_anti_quant") -def 
npu_anti_quant_meta(x, scale, *, offset=None, dst_dtype=None, src_dtype=None): - if dst_dtype is None: - dst_dtype = torch.float16 - - if x.dtype == torch.int32: - x_shape = x.size() - if len(x_shape) == 0: - raise RuntimeError("Not supported for x is scalar when x dtype is int32" + ops_error(ErrCode.NOT_SUPPORT)) - y_shape = (*(x_shape[:-1]), x_shape[-1] * 8) - y = x.new_empty(y_shape, dtype=dst_dtype) - return torch.empty_like(y) - else: - return torch.empty_like(x, dtype=dst_dtype) - - -@impl(m, "npu_apply_rotary_pos_emb") -def npu_apply_rotary_pos_emb_meta(query, key, cos, sin, layout=1): - return (torch.empty_like(query, dtype=query.dtype), torch.empty_like(key, dtype=key.dtype)) - - -@impl(m, "npu_quant_conv2d") -def npu_quant_conv2d(input_, weight, scale, strides, pads, dilations, - groups=1, offset_x=0, round_mode='rint', output_dtype=None, bias=None, offset=None): - - input_shape = input_.size() - weight_shape = weight.size() - scale_shape = scale.size() - - input_dim = input_.dim() - weight_dim = weight.dim() - scale_dim = scale.dim() - - def check_basic_inputs_dim_shape(): - - torch._check( - input_dim == weight_dim and weight_dim == INPUTS_DIM_LIMIT_QUANTCONV2D, - lambda: "input dim or weight dim is not equal to 4, but now input dim is " + str(input_dim) + ", and weight dim is " - + str(weight_dim) + ops_error(ErrCode.VALUE), - ) - - torch._check( - scale_dim == 1, - lambda: "scale dim is not equal to 1, but now scale dim is " + str(scale_dim) + ops_error(ErrCode.VALUE), - ) - - torch._check( - input_shape[1] == weight_shape[1], - lambda: "input cin should equal to weight cin, but now input cin is " + str(input_shape[1]) + ", and weight cin is " - + str(weight_shape[1]) + ops_error(ErrCode.VALUE), - ) - - torch._check( - scale_shape[0] == weight_shape[0], - lambda: "scale shape should equal to cout, but now scale shape is " + str(scale_shape[0]) + ", and cout is " + - str(weight_shape[0]) + ops_error(ErrCode.VALUE), - ) - - def check_basic_inputs_dtype(): - torch._check( - input_.dtype == torch.int8 and weight.dtype == torch.int8, - lambda: "input's dtype and weight's dtype should be int8, but input.dtype is " + str(input_.dtype) + ", and weight.dtype is " + - str(weight.dtype) + ops_error(ErrCode.TYPE), - ) - - torch._check( - scale.dtype == torch.int64, - lambda: "scale's dtype should be int64, but scale.dtype is " + str(scale.dtype) + ops_error(ErrCode.TYPE), - ) - - torch._check( - output_dtype == torch.float16, - lambda: "output dtype should be float16, but now dtype is " + str(output_dtype) + ops_error(ErrCode.TYPE), - ) - - def check_bias_dim_shape_dtype(): - bias_dim = bias.dim() - bias_shape = bias.size() - torch._check( - bias_dim == 1, - lambda: "bias dim is not equal to 1, but now bias dim is " + str(bias_dim) + ops_error(ErrCode.VALUE), - ) - - torch._check( - bias.dtype == torch.int32, - lambda: "bias' dtype should be int32, but bias.dtype is " + str(input_.dtype) + ops_error(ErrCode.VALUE), - ) - - torch._check( - bias_shape[0] == weight_shape[0], - lambda: "bias shape should equal to cout, but now bias shape is " + str(bias_shape[0]) + ", and cout is " + - str(weight_shape[0]) + ops_error(ErrCode.VALUE), - ) - - def check_attrs(): - pads_dim = len(pads) - strides_dim = len(strides) - dilations_dim = len(dilations) - torch._check( - pads_dim == ATTR_DIM_LIMIT_QUANTCONV2D and strides_dim == ATTR_DIM_LIMIT_QUANTCONV2D and - dilations_dim == ATTR_DIM_LIMIT_QUANTCONV2D, - lambda: "attrs's dim should be 2, but pads dim is " + str(pads_dim) + ", strides dim is " - + 
str(strides_dim) + ", dilations dim is " + str(dilations_dim) + ops_error(ErrCode.VALUE), - ) - torch._check( - pads[0] >= 0 and pads[1] >= 0, - lambda: "pads's value should large or equal to 0, but pads is " + str(pads[0]) + ", " - + str(pads[1]) + ops_error(ErrCode.VALUE), - ) - torch._check( - strides[0] > 0 and strides[1] > 0, - lambda: "strides's value should large than 0, but strides is " + str(strides[0]) + ", " - + str(strides[1]) + ops_error(ErrCode.VALUE), - ) - torch._check( - dilations[0] > 0 and dilations[1] > 0, - lambda: "dilations's value should large than 0, but dilations is " + str(dilations[0]) + ", " - + str(dilations[1]) + ops_error(ErrCode.VALUE), - ) - torch._check( - groups == 1, - lambda: "groups should be 1, but now " + str(groups) + ops_error(ErrCode.VALUE), - ) - torch._check( - offset_x <= 127 and offset_x >= -128, - lambda: "offset_x should be [-128,127], but offset_x is " + str(offset_x) + ops_error(ErrCode.VALUE), - ) - torch._check( - round_mode == 'rint', - lambda: "round_mode should be rint, but round_mode is " + str(round_mode) + ops_error(ErrCode.VALUE), - ) - - check_basic_inputs_dim_shape() - check_basic_inputs_dtype() - if bias is not None: - check_bias_dim_shape_dtype() - check_attrs() - - nout = input_shape[0] - cout = weight_shape[0] - hout = (input_shape[2] + pads[0] * 2 - dilations[0] * (weight_shape[2] - 1) - 1) // strides[0] + 1 - wout = (input_shape[3] + pads[1] * 2 - dilations[1] * (weight_shape[3] - 1) - 1) // strides[1] + 1 - - torch._check( - hout > 0 and wout > 0, - lambda: "ho, wo should larger than 0, but now ho is " + str(hout) + ", and wo is " + str(wout) + ops_error(ErrCode.VALUE), - ) - - output_dim_list = [nout, cout, hout, wout] - - return scale.new_empty(tuple(output_dim_list), dtype=output_dtype) - - -@impl(m, "npu_linear") -def npu_linear_meta(input_, weight, bias=None): - dimm = input_.size(0) - dimn = weight.size(0) - return input_.new_empty((dimm, dimn)) - - -@impl(m, "npu_moe_finalize_routing") -def npu_moe_finalize_routing_meta(expanded_permuted_rows, skip1, skip2_optional, bias, scales, expanded_src_to_dst_row, - expert_for_source_row): - if scales is None: - return torch.empty_like(expanded_permuted_rows, dtype=expanded_permuted_rows.dtype) - dimm = scales.size(0) - dimn = expanded_permuted_rows.size(1) - return expanded_permuted_rows.new_empty((dimm, dimn)) - - -has_side_effect(torch.ops.npu.npu_prefetch.default) - - -@impl(m, "npu_prefetch") -def npu_prefetch_meta(self, dependency, max_size, offset=0): - torch._check( - max_size > 0, - lambda: f"The max_size should be greater than zero, but got {max_size}.", - ) - torch._check( - offset >= 0, - lambda: f"The offset should be nonnegative, but got {offset}.", - ) - - -@impl(m, "npu_swiglu") -def npu_swiglu_meta(x, dim=-1): - output_size = [] - for i in range(x.dim()): - output_size.append(x.size(i)) - output_size[dim] = math.floor(output_size[dim] / 2) - return torch.empty(output_size, dtype=x.dtype, device=x.device) - - -@impl(m, "npu_swiglu_backward") -def npu_swiglugrad_meta(y, x, dim=-1): - return torch.empty_like(x) - - -def rope_quant_kvcache(x, cos, k_cache, v_cache, size_splits, kv_output=False): - torch._check( - x.dim() == 3 or x.dim() == 2, - lambda: f"The x's dim should be 2 or 3, but got {x.dim()}.", - ) - torch._check( - k_cache.dim() == 4, - lambda: f"The k_cache's dim should be 4, but got {k_cache.dim()}.", - ) - num_size_splits = len(size_splits) - torch._check( - num_size_splits == 3, - lambda: f"The size_splits should be 3, but got 
{num_size_splits}.", - ) - torch._check( - size_splits[0] >= 0, - lambda: f"size_splits[0] should not less than 0, but got {size_splits[0]}.", - ) - batch = x.size(0) - seqlen = x.size(1) - k_headdim = k_cache.size(2) - hidden_size = k_cache.size(3) - q_headdim = 0 - if hidden_size != 0: - q_headdim = size_splits[0] // hidden_size - out_q_size = [batch, seqlen, q_headdim, hidden_size] if x.dim() == 3 else [batch, q_headdim, hidden_size] - out_k_size = [0] - out_v_size = [0] - if kv_output: - out_k_size = [batch, seqlen, k_headdim, hidden_size] if x.dim() == 3 else [batch, k_headdim, hidden_size] - out_v_size = [batch, seqlen, k_headdim, hidden_size] if x.dim() == 3 else [batch, k_headdim, hidden_size] - return (torch.empty(out_q_size, dtype=cos.dtype, device=x.device), - torch.empty(out_k_size, dtype=cos.dtype, device=x.device), - torch.empty(out_v_size, dtype=cos.dtype, device=x.device), - k_cache, v_cache) - - -@impl(m, "npu_dequant_rope_quant_kvcache") -def npu_dequant_rope_quant_kvcache_meta(x, cos, sin, k_cache, v_cache, indices, scale_k, scale_v, size_splits, *, - offset_k=None, offset_v=None, weight_scale=None, activation_scale=None, - bias=None, quant_mode=0, input_layout="BSND", kv_output=False, - cache_mode="contiguous"): - torch._check( - x.dtype == torch.int32, - lambda: f"The x's dtype should be Int32, but got {x.dtype}.", - ) - return rope_quant_kvcache(x, cos, k_cache, v_cache, size_splits, kv_output=kv_output) - - -@impl(m, "npu_rope_quant_kvcache") -def npu_rope_quant_kvcache_meta(x, cos, sin, k_cache, v_cache, indices, scale_k, scale_v, size_splits, *, offset_k=None, - offset_v=None, quant_mode=0, input_layout="BSND", kv_output=False, cache_mode="contiguous"): - return rope_quant_kvcache(x, cos, k_cache, v_cache, size_splits, kv_output=kv_output) -- Gitee From a760a5cbd4964d9047fb2e0ec1e32295ccf0dd00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Thu, 20 Feb 2025 08:27:45 +0000 Subject: [PATCH 034/358] =?UTF-8?q?!18157=20Check=20if=20the=20output=20of?= =?UTF-8?q?=20allgather=20has=20empty=20tensor=20Merge=20pull=20request=20?= =?UTF-8?q?!18157=20from=20=E5=8F=B6=E5=AD=90=E5=87=A1/v2.6.0=5Frsv=5Fagv?= =?UTF-8?q?=5Fupdate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index b86993fcaf..dd0352d05e 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1829,6 +1829,16 @@ bool check_same_size(const std::vector& input_tensors) return true; } +bool has_empty_tensor(const std::vector& tensors) +{ + for (const auto& tensor : tensors) { + if (tensor.data_ptr() == nullptr) { + return true; + } + } + return false; +} + std::vector cast_to_origin_format(const std::vector& inputTensors) { std::vector inputTensors_; @@ -3402,7 +3412,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( } }, c10d::OpType::ALLGATHER); - } else if (hcclAllGatherVExist()) { + } else if (hcclAllGatherVExist() && !has_empty_tensor(outputTensors.back())) { std::vector lastOutputTensors = outputTensors.back(); std::vector outputCounts; std::vector outputSpl; -- Gitee From 0a5d52a727ed2b812c245b7995be8bfe6a5df808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Thu, 20 Feb 2025 09:47:31 +0000 Subject: [PATCH 035/358] 
=?UTF-8?q?!18183=20Fix=20jitCompile=20value=20Mer?= =?UTF-8?q?ge=20pull=20request=20!18183=20from=20=E5=A7=9C=E6=80=A1?= =?UTF-8?q?=E6=96=87/v2.6.0=5Ffix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/framework/interface/EnvVariables.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index 9e0499f4c5..bcf973d26d 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -48,7 +48,7 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { - NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, "enable")); + NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? true : false); }) -- Gitee From aaa08ddc72a65c2928af201a9254073ebbd6bcef Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 20 Feb 2025 11:20:06 +0000 Subject: [PATCH 036/358] !18187 Update op_plugin commit id Merge pull request !18187 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 6c9130662a..c7857a1db3 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 6c9130662a2eab05371e37425d239b0c0cadf66c +Subproject commit c7857a1db3fcc482d89b12254802c39400c0d976 -- Gitee From 899eda6179356e35962a0234ff103d5f5727fdf6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 20 Feb 2025 15:50:11 +0000 Subject: [PATCH 037/358] !18191 Update op_plugin commit id Merge pull request !18191 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c7857a1db3..debe9126f7 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c7857a1db3fcc482d89b12254802c39400c0d976 +Subproject commit debe9126f75b3d0a2a3bb393eb71cdd102086fb5 -- Gitee From b727bb852ec5607116113bdbddc0d4303971dbe1 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 20 Feb 2025 23:38:14 +0000 Subject: [PATCH 038/358] !18202 Update torchair commit id Merge pull request !18202 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index e5cc5f4981..40fbfc8ad6 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit e5cc5f4981971c0f649f11f999f61ade9de9aa67 +Subproject commit 40fbfc8ad6cb47716c4566f2f8cfc4abd4c56e5a -- Gitee From f2566ea0245005248ed987c448ee71b59cc4288e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 21 Feb 2025 07:31:26 +0000 Subject: [PATCH 039/358] !18207 Update op_plugin commit id Merge pull request !18207 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index debe9126f7..1c84d0c134 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 
debe9126f75b3d0a2a3bb393eb71cdd102086fb5 +Subproject commit 1c84d0c134f34e23e3ff2e27f89c97e4c8360913 -- Gitee From 3958e1f00bc93afc3a2ca5413508a74195fff4cd Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 21 Feb 2025 08:45:08 +0000 Subject: [PATCH 040/358] !18210 Update op_plugin commit id Merge pull request !18210 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1c84d0c134..5c2673aa4d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1c84d0c134f34e23e3ff2e27f89c97e4c8360913 +Subproject commit 5c2673aa4dbf101ba72d435fbbd28e1a907d2474 -- Gitee From 11c5119db92fa9743cd33042e8dc3acbf91aa1f7 Mon Sep 17 00:00:00 2001 From: Gallium Date: Sat, 22 Feb 2025 01:42:23 +0000 Subject: [PATCH 041/358] !18163 dynamic_profiler adapt dynolog params Merge pull request !18163 from Gallium/dynolog_v2.6.0 --- .../_dynamic_profiler_config_context.py | 124 +++++++++++++----- 1 file changed, 89 insertions(+), 35 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index 0cb049d5db..8570db61dc 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -32,16 +32,11 @@ class ConfigContext: self.parse(json_data) def parse(self, json_data: dict): - activities = json_data.get('activities') self.is_valid = json_data.get("is_valid", False) - if activities and isinstance(activities, list): - for entry in activities: - activity = getattr(ProfilerActivity, entry.upper(), None) - if activity: - self.activity_set.add(activity) + self._parse_activity(json_data) self._parse_prof_dir(json_data) self._meta_data = json_data.get('metadata', {}) - self._analyse = json_data.get('analyse', False) + self._parse_analysis(json_data) self._async_mode = json_data.get('async_mode', False) self._parse_report_shape(json_data) self._parse_profiler_memory(json_data) @@ -50,39 +45,14 @@ class ConfigContext: self._parse_with_modules(json_data) self._parse_active(json_data) self._parse_start_step(json_data) - exp_config = json_data.get('experimental_config') - if not exp_config: - self.experimental_config = None - else: - profiler_level = exp_config.get('profiler_level', 'Level0') - profiler_level = getattr(ProfilerLevel, profiler_level, profiler_level) - aic_metrics = exp_config.get('aic_metrics', 'AiCoreNone') - aic_metrics = getattr(AiCMetrics, aic_metrics, aic_metrics) - l2_cache = exp_config.get('l2_cache', False) - op_attr = exp_config.get('op_attr', False) - gc_detect_threshold = exp_config.get('gc_detect_threshold', None) - data_simplification = exp_config.get('data_simplification', True) - record_op_args = exp_config.get('record_op_args', False) - export_type = exp_config.get('export_type', 'text') - msprof_tx = exp_config.get('msprof_tx', False) - self.experimental_config = _ExperimentalConfig( - profiler_level=profiler_level, - aic_metrics=aic_metrics, - l2_cache=l2_cache, - op_attr=op_attr, - gc_detect_threshold=gc_detect_threshold, - data_simplification=data_simplification, - record_op_args=record_op_args, - export_type=export_type, - msprof_tx=msprof_tx - ) - self._parse_ranks(json_data) + self._parse_exp_cfg(json_data) + self._parse_ranks(json_data) def _parse_start_step(self, json_data: dict): if not self._is_dyno: self._start_step = 
json_data.get("start_step", self.DEFAULT_START_STEP) else: - start_step = json_data.get("PROFILE_START_ITERATION_ROUNDUP", self.DEFAULT_START_STEP) + start_step = json_data.get("PROFILE_START_STEP", self.DEFAULT_START_STEP) try: self._start_step = int(start_step) except ValueError: @@ -180,6 +150,90 @@ class ConfigContext: if isinstance(rank, int) and rank >= 0: self._rank_set.add(rank) + def _parse_activity(self, json_data: dict): + if not self._is_dyno: + activities = json_data.get('activities') + else: + activities = json_data.get('PROFILE_ACTIVITIES').split(",") + if activities and isinstance(activities, list): + for entry in activities: + activity = getattr(ProfilerActivity, entry.upper(), None) + if activity: + self.activity_set.add(activity) + else: + DynamicProfilerUtils.out_log("Set activity failed, activity must be CPU OR NPU!", + DynamicProfilerUtils.LoggerLevelEnum.WARNING) + + def _parse_analysis(self, json_data: dict): + if not self._is_dyno: + self._analyse = json_data.get("analyse", False) + else: + self._analyse = json_data.get("PROFILE_ANALYSE", False) + + def _parse_dyno_exp_cfg(self, json_data: dict): + profiler_level = json_data.get('PROFILE_PROFILER_LEVEL', 'Level0') + profiler_level = getattr(ProfilerLevel, profiler_level, profiler_level) + aic_metrics = json_data.get('PROFILE_AIC_METRICS', 'AiCoreNone') + aic_metrics = getattr(AiCMetrics, aic_metrics, aic_metrics) + l2_cache = json_data.get('PROFILE_L2_CACHE', 'false') + l2_cache = self.BOOL_MAP.get(l2_cache.lower(), False) + op_attr = json_data.get('PROFILE_OP_ATTR', 'false') + op_attr = self.BOOL_MAP.get(op_attr.lower(), False) + gc_detect_threshold = json_data.get('PROFILE_GC_DETECT_THRESHOLD', None) + data_simplification = json_data.get('PROFILE_DATA_SIMPLIFICATION', 'true') + data_simplification = self.BOOL_MAP.get(data_simplification.lower(), True) + record_op_args = json_data.get('PROFILE_RECORD_SHAPES', 'false') + record_op_args = self.BOOL_MAP.get(record_op_args.lower(), False) + export_type = json_data.get('PROFILE_EXPORT_TYPE', 'text').lower() + msprof_tx = False + + self.experimental_config = _ExperimentalConfig( + profiler_level=profiler_level, + aic_metrics=aic_metrics, + l2_cache=l2_cache, + op_attr=op_attr, + gc_detect_threshold=gc_detect_threshold, + data_simplification=data_simplification, + record_op_args=record_op_args, + export_type=export_type, + msprof_tx=msprof_tx + ) + + def _parse_cfg_json_exp_cfg(self, json_data: dict): + exp_config = json_data.get('experimental_config') + if not exp_config: + self.experimental_config = None + return + profiler_level = exp_config.get('profiler_level', 'Level0') + profiler_level = getattr(ProfilerLevel, profiler_level, profiler_level) + aic_metrics = exp_config.get('aic_metrics', 'AiCoreNone') + aic_metrics = getattr(AiCMetrics, aic_metrics, aic_metrics) + l2_cache = exp_config.get('l2_cache', False) + op_attr = exp_config.get('op_attr', False) + gc_detect_threshold = exp_config.get('gc_detect_threshold', None) + data_simplification = exp_config.get('data_simplification', True) + record_op_args = exp_config.get('record_op_args', False) + export_type = exp_config.get('export_type', 'text') + msprof_tx = exp_config.get('msprof_tx', False) + + self.experimental_config = _ExperimentalConfig( + profiler_level=profiler_level, + aic_metrics=aic_metrics, + l2_cache=l2_cache, + op_attr=op_attr, + gc_detect_threshold=gc_detect_threshold, + data_simplification=data_simplification, + record_op_args=record_op_args, + export_type=export_type, + msprof_tx=msprof_tx + ) + 
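# Editor's sketch (illustrative, not part of patch !18163): _parse_exp_cfg below
# dispatches on self._is_dyno. The on-disk JSON config nests options under
# 'experimental_config' with native booleans, while a dynolog request arrives as
# a flat map of PROFILE_* string values, so booleans pass through BOOL_MAP
# (assumed here to map "true"/"false"; its real definition lives elsewhere in ConfigContext).
json_style = {"experimental_config": {"profiler_level": "Level1", "l2_cache": True}}
dyno_style = {"PROFILE_PROFILER_LEVEL": "Level1", "PROFILE_L2_CACHE": "true"}
BOOL_MAP = {"true": True, "false": False}
assert BOOL_MAP.get(dyno_style["PROFILE_L2_CACHE"].lower(), False) == json_style["experimental_config"]["l2_cache"]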
+ def _parse_exp_cfg(self, json_data: dict): + if not self._is_dyno: + self._parse_cfg_json_exp_cfg(json_data) + else: + self._parse_dyno_exp_cfg(json_data) + def valid(self) -> bool: if not self.is_valid: return False -- Gitee From c6eec1b1b0b72eaf7dd2d165cdd5b1df020c095c Mon Sep 17 00:00:00 2001 From: louyujing <7927276+louyujing@user.noreply.gitee.com> Date: Sat, 22 Feb 2025 01:44:55 +0000 Subject: [PATCH 042/358] =?UTF-8?q?!18020=20=E3=80=90transfer=5Fto=5Fnpu?= =?UTF-8?q?=E3=80=91Adapt=20cuda=20default=5Fstream=20Merge=20pull=20reque?= =?UTF-8?q?st=20!18020=20from=20louyujing/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/contrib/transfer_to_npu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/contrib/transfer_to_npu.py b/torch_npu/contrib/transfer_to_npu.py index 30c31ec9c2..1c9d0f5a03 100644 --- a/torch_npu/contrib/transfer_to_npu.py +++ b/torch_npu/contrib/transfer_to_npu.py @@ -36,7 +36,7 @@ torch_cuda_fn_white_list = [ 'get_device_properties', 'get_device_name', 'get_device_capability', 'list_gpu_processes', 'set_device', 'synchronize', 'mem_get_info', 'memory_stats', 'memory_summary', 'memory_allocated', 'max_memory_allocated', 'reset_max_memory_allocated', 'memory_reserved', 'max_memory_reserved', 'reset_max_memory_cached', - 'reset_peak_memory_stats' + 'reset_peak_memory_stats', 'default_stream' ] torch_distributed_fn_white_list = ['__init__'] device_kwargs_list = ['device', 'device_type', 'map_location', 'device_id'] -- Gitee From cdb308cd202a6c59734d45fc7ad1b27c64f1dfa2 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Sat, 22 Feb 2025 02:09:57 +0000 Subject: [PATCH 043/358] !18016 Add Profile log on branch v2.6.0 Merge pull request !18016 from Mrtutu/add_log_v2.6.0 --- test/profiler/test_npu_profiler.py | 2 +- .../profiler/analysis/_profiling_parser.py | 4 + .../analysis/prof_common_func/_log.py | 114 ++++++++++++++++++ .../prof_common_func/_task_manager.py | 2 +- .../analysis/prof_parse/_cann_file_parser.py | 25 +++- .../prof_parse/_fwk_cann_relation_parser.py | 18 ++- .../analysis/prof_parse/_fwk_file_parser.py | 21 +++- .../analysis/prof_view/_base_parser.py | 2 +- .../prof_view/_communication_parser.py | 16 ++- .../analysis/prof_view/_integrate_parser.py | 9 +- .../analysis/prof_view/_kernel_view_parser.py | 18 ++- .../prof_view/_memory_prepare_parser.py | 10 +- .../prof_view/_memory_timeline_parser.py | 7 +- .../analysis/prof_view/_memory_view_parser.py | 9 +- .../prof_view/_operator_view_parser.py | 9 +- .../analysis/prof_view/_stack_view_parser.py | 9 +- .../prof_view/_trace_step_time_parser.py | 9 +- .../analysis/prof_view/_trace_view_parser.py | 9 +- .../prof_view/cann_parse/_cann_analyze.py | 6 +- .../prof_view/cann_parse/_cann_export.py | 5 + .../prepare_parse/_fwk_pre_parser.py | 14 ++- .../prepare_parse/_relation_parser.py | 10 +- .../prof_db_parse/_communication_db_parser.py | 13 +- .../prof_view/prof_db_parse/_db_parser.py | 12 +- .../prof_db_parse/_fwk_api_db_parser.py | 8 +- .../prof_db_parse/_gc_record_db_parser.py | 10 +- .../prof_db_parse/_memory_db_parser.py | 9 +- .../prof_db_parse/_step_info_db_parser.py | 5 +- .../_trace_step_time_db_parser.py | 7 +- torch_npu/profiler/profiler_interface.py | 4 +- 30 files changed, 308 insertions(+), 88 deletions(-) create mode 100644 torch_npu/profiler/analysis/prof_common_func/_log.py diff --git a/test/profiler/test_npu_profiler.py b/test/profiler/test_npu_profiler.py index 03a1a82848..54f3526477 100644 --- 
a/test/profiler/test_npu_profiler.py +++ b/test/profiler/test_npu_profiler.py @@ -295,7 +295,7 @@ class TestNpuProfiler(TestCase): prof.stop() result_dir = os.path.join(self.results_work_path, "profiling_data") torch_npu.profiler.profiler.analyse(result_dir) - work_names = os.listdir(result_dir) + work_names = [p for p in os.listdir(result_dir) if p.endswith("ascend_pt")] os.environ["ASCEND_WORK_PATH"] = "" # only one device valid_work_name = len(work_names) == 1 and work_names[0].endswith("ascend_pt") diff --git a/torch_npu/profiler/analysis/_profiling_parser.py b/torch_npu/profiler/analysis/_profiling_parser.py index 1bfcac8fbe..db63d97fcd 100644 --- a/torch_npu/profiler/analysis/_profiling_parser.py +++ b/torch_npu/profiler/analysis/_profiling_parser.py @@ -6,6 +6,7 @@ from .prof_common_func._constant import Constant, print_info_msg, print_error_ms from .prof_common_func._cann_package_manager import CannPackageManager from .prof_common_func._path_manager import ProfilerPathManager from .prof_common_func._task_manager import ConcurrentTasksManager +from .prof_common_func._log import ProfilerLogger from .prof_config._parser_config import ParserConfig from .prof_parse._cann_file_parser import CANNFileParser from ._profiler_config import ProfilerConfig @@ -25,6 +26,8 @@ class ProfilingParser: self._output_path = os.path.join(profiler_path, Constant.OUTPUT_DIR) PathManager.remove_path_safety(self._output_path) PathManager.make_dir_safety(self._output_path) + ProfilerLogger.init(self._profiler_path, "ProfilingParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def simplify_data(profiler_path: str, simplify_flag: bool): @@ -91,6 +94,7 @@ class ProfilingParser: self.run_parser() except Exception as err: print_error_msg(f"Failed to parsing profiling data. {err}") + self.logger.error("Failed to parsing profiling data, error: %s", str(err), exc_info=True) if self._analysis_type == Constant.TENSORBOARD_TRACE_HANDLER: self.simplify_data(self._profiler_path, ProfilerConfig().data_simplification) end_time = datetime.utcnow() diff --git a/torch_npu/profiler/analysis/prof_common_func/_log.py b/torch_npu/profiler/analysis/prof_common_func/_log.py new file mode 100644 index 0000000000..15ba7a80f9 --- /dev/null +++ b/torch_npu/profiler/analysis/prof_common_func/_log.py @@ -0,0 +1,114 @@ +import os +import logging +from logging.handlers import RotatingFileHandler +from datetime import datetime, timezone +from typing import Optional + +from torch_npu.utils._path_manager import PathManager + + +class ProfilerLogger: + """ + Profiler Logger class for managing log operations. + + This class provides a centralized logging mechanism for the profiler, + writing logs to file with rotation support. 
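Editor's note: a minimal usage sketch for the new logger, built only from the classmethods this patch adds (init / get_instance / set_level / destroy) and assuming ProfilerLogger is imported as in the parser diffs below (from ..prof_common_func._log import ProfilerLogger); the output directory shown is a placeholder, not taken from the patch.

    import logging
    ProfilerLogger.init("/tmp/profiling_output", "DemoParser")  # writes <output>/logs/profiler_<ts>_<pid>_DemoParser.log
    logger = ProfilerLogger.get_instance()                      # singleton logging.Logger
    logger.info("start parsing kernel data")
    ProfilerLogger.set_level(logging.DEBUG)                     # also applied to the rotating file handler
    ProfilerLogger.destroy()                                    # closes handlers and resets the singleton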
+ + Attributes: + LOG_FORMAT: The format string for log messages + DATE_FORMAT: The format string for timestamps in log messages + DEFAULT_LOGGER_NAME: Default name for the logger instance + DEFAULT_LOG_DIR: Default directory name for log files + MAX_BYTES: Maximum size of each log file + BACKUP_COUNT: Number of backup files to keep + """ + + LOG_FORMAT = "[%(asctime)s] [%(levelname)s] [%(name)s:%(lineno)d] %(message)s" + DATE_FORMAT = "%Y-%m-%d-%H:%M:%S" + DEFAULT_LOGGER_NAME = "AscendProfiler" + DEFAULT_LOG_LEVEL = logging.INFO + DEFAULT_LOG_DIR = "logs" + # 10MB per file + MAX_BYTES = 10 * 1024 * 1024 + # Keep 3 backup files + BACKUP_COUNT = 3 + # logger instance + _instance = None + + @classmethod + def get_instance(cls) -> logging.Logger: + """Get the singleton logger instance.""" + if cls._instance is None: + raise RuntimeError("Logger not initialized. Call init first.") + return cls._instance + + @classmethod + def init(cls, output_dir: str, custom_name: Optional[str] = None) -> None: + """ + Initialize the logger with rotating file handler. + + Args: + output_dir (str): Directory where log files will be stored + + Raises: + RuntimeError: If logger initialization fails + """ + if cls._instance is not None: + return + + # Create logs directory + log_dir = os.path.join(output_dir, cls.DEFAULT_LOG_DIR) + PathManager.make_dir_safety(log_dir) + + # Create logger + logger = logging.getLogger(cls.DEFAULT_LOGGER_NAME) + logger.setLevel(cls.DEFAULT_LOG_LEVEL) + logger.propagate = False + + # Create formatters + formatter = logging.Formatter(fmt=cls.LOG_FORMAT, datefmt=cls.DATE_FORMAT) + + # Add rotating file handler + timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d%H%M%S") + log_file = os.path.join( + log_dir, + ( + f"profiler_{timestamp}_{os.getpid()}_{custom_name}.log" + if custom_name + else f"profiler_{timestamp}_{os.getpid()}.log" + ), + ) + file_handler = RotatingFileHandler( + filename=log_file, + maxBytes=cls.MAX_BYTES, + backupCount=cls.BACKUP_COUNT, + encoding="utf-8", + ) + file_handler.setFormatter(formatter) + file_handler.setLevel(cls.DEFAULT_LOG_LEVEL) + logger.addHandler(file_handler) + + cls._instance = logger + logger.info("Profiler logger initialized at: %s", log_file) + + @classmethod + def set_level(cls, level: int) -> None: + """ + Set the logging level for both file and console handlers. + + Args: + level (int): Logging level (e.g., logging.DEBUG, logging.INFO) + """ + logger = cls.get_instance() + logger.setLevel(level) + for handler in logger.handlers: + handler.setLevel(level) + + @classmethod + def destroy(cls) -> None: + """Close and cleanup the logger.""" + if cls._instance: + for handler in cls._instance.handlers[:]: + handler.close() + cls._instance.removeHandler(handler) + cls._instance = None diff --git a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py index 11577e39af..7b884d6a15 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_task_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_task_manager.py @@ -157,7 +157,7 @@ class ConcurrentTasksManager: def finalize(self): for task_info in self.task_infos.values(): if task_info.status != TaskStatus.Succeed: - print_error_msg("Task %s has not run successfully." 
% task_info.task.name) + print_error_msg(f"Task [{task_info.task.__class__.__name__}] run failed.") self.__stop_task(task_info) if self.progress_bar: diff --git a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py index 21a9831a60..b70b3e049d 100644 --- a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py @@ -12,6 +12,7 @@ from ..prof_common_func._constant import Constant, print_warn_msg from ..prof_common_func._constant import convert_us2ns from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger __all__ = [] @@ -66,9 +67,12 @@ class CANNFileParser: } def __init__(self, profiler_path: str): + self._profiler_path = profiler_path self._cann_path = ProfilerPathManager.get_cann_path(profiler_path) self._file_dict = {} self._file_dispatch() + ProfilerLogger.init(profiler_path, "CANNFileParser") + self.logger = ProfilerLogger.get_instance() @classmethod def _json_load(cls, data: str) -> list: @@ -98,6 +102,7 @@ class CANNFileParser: @classmethod def combine_acl_to_npu(cls, timeline_data: list) -> dict: + logger = ProfilerLogger.get_instance() flow_dict, event_dict = {}, {} for data in timeline_data: if data.get("cat") == cls.HOST_TO_DEVICE and data.get("ph") == cls.START_FLOW: @@ -112,6 +117,13 @@ class CANNFileParser: ts = data.get("ts") unique_id = f"{pid}-{tid}-{ts}" event_dict[unique_id] = data + + if not flow_dict: + logger.error("There is no HostToDevice flow events in msprof timeline.") + + if not event_dict: + logger.error("There is no kernel events in msprof timeline.") + acl_to_npu_dict = {} for flow in flow_dict.values(): start_event = flow.get("start") @@ -123,8 +135,11 @@ class CANNFileParser: unique_id = f"{pid}-{tid}-{ts}" kernel_event = event_dict.get(unique_id) if not kernel_event: + logger.warning("The kernel event of unique_id(pid: %d, tid: %d, ts: %d) is not exist in msprof timeline.", + pid, tid, ts) continue acl_to_npu_dict.setdefault(convert_us2ns(start_event.get("ts", 0)), []).append(EventBean(kernel_event)) + return acl_to_npu_dict def get_timeline_all_data(self) -> list: @@ -133,6 +148,9 @@ class CANNFileParser: for msprof_file in msprof_file_list: data = self._json_load(FileManager.file_read_all(msprof_file, "rt")) timeline_data.extend(data) + + if not timeline_data: + self.logger.error("Get timeline all data failed, the timeline data is empty.") return timeline_data def get_analyze_communication_data(self, file_type: Enum) -> dict: @@ -153,21 +171,24 @@ class CANNFileParser: def get_localtime_diff(self) -> float: localtime_diff = 0 if not self._cann_path: + self.logger.error("Get localtime diff failed, the CANN path is not exist.") return localtime_diff start_info_path = ProfilerPathManager.get_start_info_path(self._cann_path) if not start_info_path: + self.logger.error("Get localtime diff failed, the start info path is not exist.") return localtime_diff try: info_json = ast.literal_eval(FileManager.file_read_all(start_info_path, "rt")) localtime_diff = convert_us2ns(info_json.get(Constant.CANN_BEGIN_TIME, 0)) - int( info_json.get(Constant.CANN_BEGIN_MONOTONIC, 0)) - except Exception: - print_warn_msg("Failed to get CANN localtime diff.") + except Exception as e: + self.logger.error("Failed to get CANN localtime diff, error: %s", str(e), exc_info=True) return localtime_diff def 
del_summary_and_timeline_data(self): device_path = ProfilerPathManager.get_device_path(self._cann_path) if not device_path: + self.logger.error("Delete summary and timeline data failed, the device path is not exist.") return summary_path = os.path.join(device_path, "summary") timeline_path = os.path.join(device_path, "timeline") diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py index 21df8a5a69..de8b466067 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py @@ -1,6 +1,7 @@ from ._fwk_file_parser import FwkFileParser from ..prof_bean._torch_op_node import TorchOpNode from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._cann_file_parser import CANNFileParser __all__ = [] @@ -9,6 +10,8 @@ __all__ = [] class FwkCANNRelationParser: def __init__(self, profiler_path: str): self._profiler_path = profiler_path + ProfilerLogger.init(self._profiler_path, "FwkCANNRelationParser") + self.logger = ProfilerLogger.get_instance() @classmethod def combine_kernel_dict(cls, acl_to_npu_dict: dict, dequeue_data_list: list): @@ -45,28 +48,31 @@ class FwkCANNRelationParser: def get_kernel_dict(self) -> dict: acl_to_npu_dict = CANNFileParser(self._profiler_path).get_acl_to_npu_data() if not acl_to_npu_dict: + print_error_msg("Failed to get acl to npu flow events.") return acl_to_npu_dict dequeue_data_list = FwkFileParser(self._profiler_path).get_dequeue_data() return self.combine_kernel_dict(acl_to_npu_dict, dequeue_data_list) def get_step_range(self, root_node: TorchOpNode, kernel_dict: dict): if not kernel_dict: - print_error_msg("Failed to get acl to npu flow events.") + self.logger.error("Get step range failed, the kernel dict is empty.") return [] - step_node_list = [] - for level1_node in root_node.child_node_list: - if level1_node.is_profiler_step(): - step_node_list.append(level1_node) + # Get ProfilerStep#x node + step_node_list = [node for node in root_node.child_node_list if node.is_profiler_step()] if not step_node_list: + self.logger.error("Get step range failed, the step node list is empty.") return [] + + # Gather flow events start time in each step node if not FwkFileParser(self._profiler_path).has_task_queue_data(): acl_start_time_list = sorted(list(kernel_dict.keys())) self._update_step_node_info(step_node_list, acl_start_time_list) + # Get step range on device by flow events step_range = [] for step_node in step_node_list: step_id = step_node.event.name.split("#")[-1] if not step_node.corr_id_total: - print_error_msg("Some step lost the correlation id information.") + self.logger.error("There is no flow events in %s range.", step_node.event.name) return [] corr_id_list = sorted(step_node.corr_id_total) min_index, max_index = 0, len(corr_id_list) - 1 diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py index dbc6c63208..f8010197a8 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py @@ -4,13 +4,14 @@ from collections import defaultdict from ..prof_bean._torch_op_bean import TorchOpBean from ..prof_common_func._binary_decoder import BinaryDecoder -from ..prof_common_func._constant import Constant, DbConstant, contact_2num, print_warn_msg +from 
..prof_common_func._constant import Constant, contact_2num from ..prof_common_func._file_manager import FileManager from ..prof_common_func._file_tag import FileTag from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._tlv_decoder import TLVDecoder from ..prof_common_func._trace_event_manager import TraceEventManager from ..prof_common_func._tree_builder import TreeBuilder +from ..prof_common_func._log import ProfilerLogger from ..prof_config._fwk_file_parser_config import FwkFileParserConfig from ._python_trace_parser import PythonTraceParser @@ -23,6 +24,8 @@ class FwkFileParser: self._profiler_path = profiler_path self._file_list = {} self._file_dispatch() + ProfilerLogger.init(self._profiler_path, "FwkFileParser") + self.logger = ProfilerLogger.get_instance() def get_file_data_by_tag(self, file_tag: int) -> list: file_path = self._file_list.get(file_tag) @@ -41,6 +44,7 @@ class FwkFileParser: enqueue_data_list = [] op_mark_data = self.get_file_data_by_tag(FileTag.OP_MARK) if not op_mark_data: + self.logger.error("Get enqueue data failed, the op mark data is empty.") return enqueue_data_list op_mark_data.sort(key=lambda x: x.time_ns) tid_op_dict = defaultdict(lambda: defaultdict(list)) @@ -52,7 +56,8 @@ class FwkFileParser: continue start_op_list = tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - print_warn_msg("Enquque data match failed") + self.logger.warning("Enquque data match failed, the tid: %d, origin_name: %s is not exist.", + op_mark.tid, op_mark.origin_name) continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -65,6 +70,7 @@ class FwkFileParser: dequeue_data_list = [] op_mark_data = self.get_file_data_by_tag(FileTag.OP_MARK) if not op_mark_data: + self.logger.error("Get dequeue data failed, the op mark data is empty.") return dequeue_data_list op_mark_data.sort(key=lambda x: x.time_ns) tid_op_dict = defaultdict(lambda: defaultdict(list)) @@ -76,7 +82,8 @@ class FwkFileParser: continue start_op_list = tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - print_warn_msg("Dequque data match failed") + self.logger.warning("Dequque data match failed, the tid: %d, origin_name: %s is not exist.", + op_mark.tid, op_mark.origin_name) continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -103,7 +110,8 @@ class FwkFileParser: if op_mark.is_enqueue_end: start_op_list = enqueue_tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - print_warn_msg("Enquque data match failed") + self.logger.warning("Enquque data match failed, the tid: %d, origin_name: %s is not exist.", + op_mark.tid, op_mark.origin_name) continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -114,7 +122,8 @@ class FwkFileParser: if op_mark.is_dequeue_end: start_op_list = dequeue_tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - print_warn_msg("Dequque data match failed") + self.logger.warning("Dequque data match failed, the tid: %d, origin_name: %s is not exist.", + op_mark.tid, op_mark.origin_name) continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -126,6 +135,7 @@ class FwkFileParser: def get_torch_op_tree_node(self, only_fwk: bool = False) -> list: torch_op_list = self.get_file_data_by_tag(FileTag.TORCH_OP) if not torch_op_list: + self.logger.error("Get torch op tree node failed, the torch op data is empty.") return [] enqueue_data_list = [] if not only_fwk: @@ -136,6 +146,7 
@@ class FwkFileParser: def get_fwk_trace_data(self): torch_op_data = self.get_file_data_by_tag(FileTag.TORCH_OP) if not torch_op_data: + self.logger.error("Get fwk trace data failed, the torch op data is empty.") return [] enqueue_data_list, dequeue_data_list = self.get_task_queue_data() pid = torch_op_data[0].pid diff --git a/torch_npu/profiler/analysis/prof_view/_base_parser.py b/torch_npu/profiler/analysis/prof_view/_base_parser.py index cb270a54f1..26dc595cd7 100644 --- a/torch_npu/profiler/analysis/prof_view/_base_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_base_parser.py @@ -30,7 +30,7 @@ class BaseParser(ConcurrentTask, ABC): self._output_path = None deps, mode = self._init_param(name) super(BaseParser, self).__init__(name, deps, mode) - + def _init_param(self, name: str) -> any: self._profiler_path = self._param_dict.get("profiler_path") self._output_path = self._param_dict.get("output_path") diff --git a/torch_npu/profiler/analysis/prof_view/_communication_parser.py b/torch_npu/profiler/analysis/prof_view/_communication_parser.py index e1751abfab..fff6d265d6 100644 --- a/torch_npu/profiler/analysis/prof_view/_communication_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_communication_parser.py @@ -3,11 +3,12 @@ from collections import defaultdict from ._base_parser import BaseParser from ..prof_bean._torch_op_node import TorchOpNode -from ..prof_common_func._constant import Constant, print_error_msg, print_warn_msg +from ..prof_common_func._constant import Constant, print_warn_msg from ..prof_common_func._file_manager import FileManager from ..prof_parse._cann_file_parser import CANNFileParser from ..prof_parse._cann_file_parser import CANNDataEnum from ..prof_common_func._constant import convert_us2ns +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser __all__ = [] @@ -45,6 +46,8 @@ class CommunicationParser(BaseParser): self._root_node = TorchOpNode() self._kernel_dict = {} self.step_list = [] + ProfilerLogger.init(self._profiler_path, "CommunicationParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def combine_size_distribution(op_dict: dict, total_dict: dict): @@ -63,8 +66,8 @@ class CommunicationParser(BaseParser): try: self._init_step_list(deps_data) self.generate_view() - except Exception: - print_error_msg("Failed to generate communication.json or communication_matrix.json.") + except Exception as e: + self.logger.error("Failed to generate communication.json or communication_matrix.json, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None @@ -272,7 +275,10 @@ class CommunicationParser(BaseParser): def _init_step_list(self, deps_data: dict): torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) if torch_op_node: - self.step_list = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], deps_data.get( - Constant.RELATION_PARSER, {})) + kernels_dict = deps_data.get(Constant.RELATION_PARSER, {}) + if not kernels_dict: + self.logger.error("Init step list failed, the kernel dict is empty.") + self.step_list = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], kernels_dict) + if not self.step_list: self.step_list = [{"step_id": None, "start_ts": 0, "end_ts": float('inf'), "comm_ops": {}}] diff --git a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py index 01c6a00538..b1344bc0a2 100644 --- 
a/torch_npu/profiler/analysis/prof_view/_integrate_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_integrate_parser.py @@ -1,6 +1,7 @@ from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._cann_file_parser import CANNFileParser, CANNDataEnum from .._profiler_config import ProfilerConfig @@ -21,13 +22,15 @@ class IntegrateParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) + ProfilerLogger.init(self._profiler_path, "IntegrateParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: ProfilerConfig().load_info(self._profiler_path) self.generate_view() - except Exception: - print_error_msg("Failed to generate data_preprocess.csv or l2_cache.csv.") + except Exception as e: + self.logger.error("Failed to generate data_preprocess.csv or l2_cache.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py index f35f7c6e6d..b06d7d3d72 100644 --- a/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_kernel_view_parser.py @@ -1,7 +1,8 @@ from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg, convert_ns2us_str +from ..prof_common_func._constant import Constant, convert_ns2us_str from ..prof_common_func._csv_headers import CsvHeaders from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger from ..prof_bean._op_summary_bean import OpSummaryBean from ..prof_parse._cann_file_parser import CANNFileParser, CANNDataEnum from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser @@ -16,6 +17,8 @@ class KernelViewParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] + ProfilerLogger.init(self._profiler_path, "KernelViewParser") + self.logger = ProfilerLogger.get_instance() @classmethod def _project_map_for_headers(cls, input_headers: list): @@ -36,8 +39,8 @@ class KernelViewParser(BaseParser): ProfilerConfig().load_info(self._profiler_path) self._init_step_range(deps_data) self.generate_view() - except Exception: - print_error_msg("Failed to generate kernel_details.csv.") + except Exception as e: + self.logger.error("Failed to generate kernel_details.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None @@ -68,8 +71,13 @@ class KernelViewParser(BaseParser): def _init_step_range(self, deps_data: dict): torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) if torch_op_node: - step_range = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], deps_data.get( - Constant.RELATION_PARSER, {})) + kernel_dict = deps_data.get(Constant.RELATION_PARSER, {}) + if not kernel_dict: + self.logger.error("Kernel view get step range failed, the kernel dict is empty.") + return + step_range = FwkCANNRelationParser(self._profiler_path).get_step_range(torch_op_node[0], kernel_dict) + if not step_range: + self.logger.error("Kernel view get step range failed, the step range is empty.") for step_data in step_range: step_id = 
step_data.get(Constant.STEP_ID) step_start = convert_ns2us_str(step_data.get(Constant.START_TS, 0)) diff --git a/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py index ad7d96e002..b5914dd50e 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_prepare_parser.py @@ -16,7 +16,6 @@ from collections import defaultdict from warnings import warn from math import ceil -import os from ._base_parser import BaseParser from ..prof_common_func._file_tag import FileTag @@ -24,8 +23,9 @@ from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_parse._fwk_file_parser import FwkFileParser from ..prof_bean._memory_use_bean import MemoryUseBean from ..prof_bean._op_mark_bean import OpMarkBean -from ..prof_common_func._constant import Constant, print_error_msg, print_warn_msg +from ..prof_common_func._constant import Constant, print_warn_msg from ..prof_common_func._constant import convert_ns2us_float, convert_ns2us_str +from ..prof_common_func._log import ProfilerLogger from .._profiler_config import ProfilerConfig __all__ = [] @@ -46,6 +46,8 @@ class MemoryPrepareParser(BaseParser): self._enqueue_record_dict = {} # {corrid: enqueue} self._dequeue_pids = set() self._dequeue_tids = set() + ProfilerLogger.init(self._profiler_path, "MemoryPrepareParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def _find_torch_ops_by_binary_search(ts: int, torch_ops: list): @@ -63,8 +65,8 @@ class MemoryPrepareParser(BaseParser): try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self.generate_view() - except Exception: - print_error_msg("Failed to generate pytorch memory data.") + except Exception as e: + self.logger.error("Failed to generate pytorch memory data, error: %s", str(e), exc_info=True) return Constant.FAIL, {} if self._incomplete_num > 0: print_warn_msg(f"{self._incomplete_num} memory record(s) are incomplete.") diff --git a/torch_npu/profiler/analysis/prof_view/_memory_timeline_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_timeline_parser.py index 79ba8768ed..986ec5935a 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_timeline_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_timeline_parser.py @@ -15,6 +15,7 @@ from torch.profiler._utils import traverse_dfs from ._base_parser import BaseParser from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger from ..prof_common_func._constant import Constant, print_warn_msg, print_error_msg from ..prof_parse._event_tree_parser import ( EventTree, @@ -1074,6 +1075,8 @@ class MemoryTimelineParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self._device = self._param_dict.get("device") + ProfilerLogger.init(self._profiler_path, "MemoryTimelineParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -1087,7 +1090,7 @@ class MemoryTimelineParser(BaseParser): mem_timeline.export_memory_timeline_json_raw(self._output_path, self._device) else: mem_timeline.export_memory_timeline_json(self._output_path, self._device) - except Exception: - print_error_msg(f"Failed to generate {self._output_path}.") + except Exception as e: + self.logger.error("Failed to generate %s, error: %s", self._output_path, str(e), exc_info=True) return 
Constant.FAIL, None return Constant.SUCCESS, None \ No newline at end of file diff --git a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py index ca70813380..fa834e543b 100644 --- a/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_memory_view_parser.py @@ -7,11 +7,12 @@ from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_parse._fwk_file_parser import FwkFileParser from ..prof_common_func._file_manager import FileManager from ..prof_common_func._constant import convert_ns2us_str -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_bean._npu_mem_bean import NpuMemoryBean from ..prof_bean._ge_op_memory_bean import GeOpMemoryBean from ..prof_bean._ge_memory_record_bean import GeMemoryRecordBean from ..prof_parse._cann_file_parser import CANNFileParser, CANNDataEnum +from ..prof_common_func._log import ProfilerLogger __all__ = [] @@ -33,6 +34,8 @@ class MemoryViewParser(BaseParser): self.ge_record_list = [] self.memory_data = [] self.component_list = [] + ProfilerLogger.init(self._profiler_path, "MemoryViewParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def _get_data_from_file(file_set: set, file_type_bean: any, bean_list: bool = False) -> list: @@ -69,8 +72,8 @@ class MemoryViewParser(BaseParser): self.memory_data = deps_data.get(Constant.MEMORY_PREPARE, {}).get("memory_data", {}).get(Constant.Text, []) self.pta_record_list = deps_data.get(Constant.MEMORY_PREPARE, {}).get("pta_record_list", []) self.generate_view() - except Exception: - print_error_msg("Failed to generate operator_memory.csv or memory_record.csv.") + except Exception as e: + self.logger.error("Failed to generate operator_memory.csv or memory_record.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py index 35d3e3edc2..f87e8dc8b8 100644 --- a/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_operator_view_parser.py @@ -1,10 +1,11 @@ from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_common_func._file_manager import FileManager from ..prof_common_func._constant import convert_ns2us_float from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._tree_builder import TreeBuilder +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._fwk_file_parser import FwkFileParser __all__ = [] @@ -21,14 +22,16 @@ class OperatorViewParser(BaseParser): self._torch_op_node = [] self._root_node = None self._kernel_dict = {} + ProfilerLogger.init(self._profiler_path, "OperatorViewParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self._kernel_dict = deps_data.get(Constant.RELATION_PARSER, {}) self.generate_view() - except Exception: - print_error_msg("Failed to generate operator_details.csv.") + except Exception as e: + self.logger.error("Failed to generate operator_details.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git 
a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py index 6601f56b0a..2f793a8af8 100644 --- a/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_stack_view_parser.py @@ -3,11 +3,12 @@ import os from ..prof_common_func._constant import convert_ns2us_float from ._base_parser import BaseParser from ..prof_bean._torch_op_node import TorchOpNode -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_common_func._constant import print_warn_msg from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._tree_builder import TreeBuilder from ..prof_common_func._file_manager import FileManager +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser from ..prof_parse._fwk_file_parser import FwkFileParser from ....utils._path_manager import PathManager @@ -22,13 +23,15 @@ class StackViewParser(BaseParser): self._root_node = None self._kernel_dict = {} self._metric = param_dict.get("metric") + ProfilerLogger.init(self._profiler_path, "StackViewParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: self._torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) self.generate_view() - except Exception: - print_error_msg("Failed to export stack.") + except Exception as e: + self.logger.error("Failed to export stack, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py index a3fc0f8187..4eb7a1488f 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_step_time_parser.py @@ -1,8 +1,9 @@ from enum import Enum from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_common_func._file_manager import FileManager from ..prof_common_func._constant import convert_ns2us_float +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._cann_file_parser import CANNFileParser from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser from ..prof_parse._fwk_file_parser import FwkFileParser @@ -30,6 +31,8 @@ class TraceStepTimeParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) self.step_range = [] + ProfilerLogger.init(self._profiler_path, "TraceStepTimeParser") + self.logger = ProfilerLogger.get_instance() @classmethod def is_float_num(cls, num): @@ -131,8 +134,8 @@ class TraceStepTimeParser(BaseParser): try: self._init_step_range(deps_data) self.generate_view() - except Exception: - print_error_msg("Failed to generate step_trace_time.csv.") + except Exception as e: + self.logger.error("Failed to generate step_trace_time.csv, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py index 1c4d252fb8..f90100e869 100644 --- a/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py +++ b/torch_npu/profiler/analysis/prof_view/_trace_view_parser.py @@ -1,11 +1,12 @@ import os 
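# A minimal sketch of the logging pattern this patch wires into each view parser,
# collected in one place for readability. It assumes only the APIs already visible in
# the hunks above and below (BaseParser.__init__(name, param_dict),
# ProfilerLogger.init(path, tag), ProfilerLogger.get_instance(), Constant.FAIL and
# Constant.SUCCESS) and that the file lives under torch_npu/profiler/analysis/prof_view/;
# the class name ExampleViewParser and the generate_view() body are illustrative only
# and are not part of the patch.
from ._base_parser import BaseParser
from ..prof_common_func._constant import Constant
from ..prof_common_func._log import ProfilerLogger


class ExampleViewParser(BaseParser):
    def __init__(self, name: str, param_dict: dict):
        super().__init__(name, param_dict)
        # Register a per-parser tag against the shared profiler log directory.
        ProfilerLogger.init(self._profiler_path, "ExampleViewParser")
        self.logger = ProfilerLogger.get_instance()

    def run(self, deps_data: dict):
        try:
            self.generate_view()
        except Exception as e:
            # exc_info=True records the traceback that the old print_error_msg() calls dropped.
            self.logger.error("Failed to generate example view, error: %s", str(e), exc_info=True)
            return Constant.FAIL, None
        return Constant.SUCCESS, None

    def generate_view(self):
        raise NotImplementedError("placeholder for the real view generation")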
from ._base_parser import BaseParser -from ..prof_common_func._constant import Constant, print_error_msg +from ..prof_common_func._constant import Constant from ..prof_common_func._file_manager import FileManager from ..prof_common_func._path_manager import ProfilerPathManager from ..prof_common_func._trace_event_manager import TraceEventManager from ..prof_common_func._tree_builder import TreeBuilder +from ..prof_common_func._log import ProfilerLogger from ..prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser from .._profiler_config import ProfilerConfig from ..prof_parse._cann_file_parser import CANNFileParser @@ -26,6 +27,8 @@ class TraceViewParser(BaseParser): self._trace_data = [] self._torch_op_node = [] self._root_node = None + ProfilerLogger.init(self._profiler_path, "TraceViewParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def _prune_trace_by_level(json_data: list) -> list: @@ -51,8 +54,8 @@ class TraceViewParser(BaseParser): self._root_node = torch_op_node[0] self._torch_op_node = torch_op_node[1:] self.generate_view() - except Exception: - print_error_msg("Failed to generate trace_view.json.") + except Exception as e: + self.logger.error("Failed to generate trace_view.json, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py index cf8f05bfca..bb8e917e3c 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_analyze.py @@ -21,6 +21,7 @@ from torch_npu.utils._error_code import ErrCode, prof_error from ...prof_common_func._constant import print_warn_msg, Constant, print_error_msg from ...prof_common_func._path_manager import ProfilerPathManager from .._base_parser import BaseParser +from ...prof_common_func._log import ProfilerLogger from ..._profiler_config import ProfilerConfig __all__ = [] @@ -33,6 +34,8 @@ class CANNAnalyzeParser(BaseParser): super().__init__(name, param_dict) self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") + ProfilerLogger.init(self._profiler_path, "CANNAnalyzeParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -56,7 +59,8 @@ class CANNAnalyzeParser(BaseParser): if completed_analysis.returncode != self.COMMAND_SUCCESS: print_warn_msg("Failed to analyze CANN TEXT Profiling data.") - except Exception: + except Exception as e: print_error_msg("Failed to analyze CANN Profiling data.") + self.logger.error("Failed to analyze CANN Profiling data, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py index db6fb176a9..cff2628575 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py @@ -25,6 +25,8 @@ from ...prof_common_func._constant import Constant, print_warn_msg, print_error_ from ...prof_common_func._path_manager import ProfilerPathManager from .._base_parser import BaseParser from ..._profiler_config import ProfilerConfig +from ...prof_common_func._log import ProfilerLogger + __all__ = [] @@ -39,6 +41,8 @@ class CANNExportParser(BaseParser): super().__init__(name, param_dict) 
self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) self.msprof_path = shutil.which("msprof") + ProfilerLogger.init(self._profiler_path, "CANNExportParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -67,6 +71,7 @@ class CANNExportParser(BaseParser): except Exception as err: print_error_msg(f"Failed to export CANN Profiling data. Error msg: {err}") + self.logger.error("Failed to export CANN Profiling data, error: %s", str(err), exc_info=True) return Constant.FAIL, None end_time = datetime.utcnow() print_info_msg(f"CANN profiling data parsed in a total time of {end_time - start_time}") diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py index 2d3e3bbd16..6cc6f23516 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_fwk_pre_parser.py @@ -15,8 +15,9 @@ import os -from ...prof_common_func._constant import print_error_msg, Constant +from ...prof_common_func._constant import Constant from ...prof_common_func._file_manager import FileManager +from ...prof_common_func._log import ProfilerLogger from ...prof_parse._fwk_file_parser import FwkFileParser from .._base_parser import BaseParser @@ -27,6 +28,8 @@ class TracePreParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) + ProfilerLogger.init(self._profiler_path, "TracePreParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -34,7 +37,8 @@ class TracePreParser(BaseParser): trace_file_path = os.path.join(self._output_path, Constant.TRACE_VIEW_TEMP) if os.path.isdir( self._output_path) else self._output_path FileManager.create_prepare_trace_json_by_path(trace_file_path, fwk_trace_data) - except Exception: + except Exception as e: + self.logger.error("Failed to create prepare trace json, error: %s", str(e), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None @@ -43,11 +47,13 @@ class TreeBuildParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) + ProfilerLogger.init(self._profiler_path, "TracePreParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: torch_op_node = FwkFileParser(self._profiler_path).get_torch_op_tree_node() - except Exception: - print_error_msg("Failed to build torch op tree.") + except Exception as e: + self.logger.error("Failed to build torch op tree, error: %s", str(e), exc_info=True) return Constant.FAIL, [] return Constant.SUCCESS, torch_op_node diff --git a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py index 8932df64fa..e6eb02ddb8 100644 --- a/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prepare_parse/_relation_parser.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
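# Each converted handler above passes exc_info=True; that flag is what preserves the
# traceback of the swallowed exception, which the old print_error_msg() calls did not
# record. A standalone illustration with the stdlib logging module (ProfilerLogger is
# assumed to sit on top of a handler along these lines):
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger("profiler_logging_demo")

try:
    {}["missing_key"]  # provoke a KeyError on purpose
except Exception as e:
    # Without exc_info=True only the one-line message below is kept;
    # with it, the full traceback is appended to the same log record.
    logger.error("Failed to generate example view, error: %s", str(e), exc_info=True)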
- -from ...prof_common_func._constant import Constant, print_error_msg +from ...prof_common_func._constant import Constant +from ...prof_common_func._log import ProfilerLogger from ...prof_parse._fwk_cann_relation_parser import FwkCANNRelationParser from .._base_parser import BaseParser @@ -23,11 +23,13 @@ __all__ = [] class RelationParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) + ProfilerLogger.init(self._profiler_path, "RelationParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: kernel_dict = FwkCANNRelationParser(self._profiler_path).get_kernel_dict() - except Exception: - print_error_msg("Failed to get acl to npu flow dict.") + except Exception as e: + self.logger.error("Failed to get acl to npu flow dict, error: %s", str(e), exc_info=True) return Constant.FAIL, {} return Constant.SUCCESS, kernel_dict diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py index a77d1defbc..29bf8259a3 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py @@ -18,9 +18,10 @@ from enum import Enum from ...prof_parse._cann_file_parser import CANNDataEnum, CANNFileParser from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager -from ...prof_common_func._constant import convert_us2ns, print_error_msg, print_warn_msg +from ...prof_common_func._constant import convert_us2ns from ...prof_common_func._db_manager import DbManager from .._communication_parser import CommunicationParser +from ...prof_common_func._log import ProfilerLogger __all__ = [] @@ -74,13 +75,15 @@ class CommunicationDbParser(CommunicationParser): self.cann_comm_db_curs = None self.analysis_db_conn = None self.analysis_db_curs = None + ProfilerLogger.init(self._profiler_path, "CommunicationDbParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: self._init_step_list(deps_data) self.generate_view() - except Exception: - print_error_msg("Failed to generate communication table.") + except Exception as e: + self.logger.error("Failed to generate communication table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self.cann_comm_db_conn, self.cann_comm_db_curs) DbManager.destroy_db_connect(self.analysis_db_conn, self.analysis_db_curs) return Constant.FAIL, None @@ -110,7 +113,7 @@ class CommunicationDbParser(CommunicationParser): band_width_data, matrix_data, time_data = [], [], [] conn, curs = DbManager.create_connect_db(db_path) if not (conn and curs): - print_warn_msg(f"Failed to connect to db file: {db_path}") + self.logger.warning("Failed to connect to db file: %s", db_path) return band_width_data, matrix_data, time_data self.cann_comm_db_conn = conn self.cann_comm_db_curs = curs @@ -219,7 +222,7 @@ class CommunicationDbParser(CommunicationParser): db_path = os.path.join(output_path, DbConstant.DB_ANALYSIS) conn, curs = DbManager.create_connect_db(db_path) if not (conn and curs): - print_warn_msg(f"Failed to connect to db file: {db_path}") + self.logger.warning("Failed to connect to db file: %s", db_path) return self.analysis_db_conn = conn self.analysis_db_curs = curs diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py index 
ad05147ab6..d4e63b210b 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py @@ -2,13 +2,13 @@ import os import re import shutil import json - from ...prof_common_func._utils import collect_env_vars from ...prof_common_func._path_manager import ProfilerPathManager from ...prof_common_func._file_manager import FileManager -from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_error_msg, print_warn_msg +from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_warn_msg from ...prof_common_func._db_manager import DbManager from ...prof_common_func._host_info import get_host_info +from ...prof_common_func._log import ProfilerLogger from .._base_parser import BaseParser from ..._profiler_config import ProfilerConfig @@ -22,6 +22,8 @@ class DbParser(BaseParser): self._ascend_db_path = os.path.join(self._output_path, DbConstant.DB_ASCEND_PYTORCH_PROFILER) self._conn = None self._cur = None + ProfilerLogger.init(self._profiler_path, "DbParser") + self.logger = ProfilerLogger.get_instance() def run(self, depth_data: dict): try: @@ -37,8 +39,8 @@ class DbParser(BaseParser): self.save_env_vars_info_to_db() self.save_profiler_metadata_to_db() DbManager.destroy_db_connect(self._conn, self._cur) - except RuntimeError: - print_error_msg("Failed to generate ascend_pytorch_profiler db file.") + except RuntimeError as e: + self.logger.error("Failed to generate ascend_pytorch_profiler db file, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self._conn, self._cur) return Constant.FAIL, "" return Constant.SUCCESS, self._ascend_db_path @@ -90,7 +92,7 @@ class DbParser(BaseParser): try: profiler_metadata = json.loads(profiler_metadata) except json.JSONDecodeError as e: - print_warn_msg(f"profiler_metadata.json parse failed. {e}") + self.logger.warning("profiler_metadata.json parse failed. 
%s", str(e)) return data = [ [str(key), json.dumps(value)] for key, value in profiler_metadata.items() diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py index eede0fa6a4..6572f4a472 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py @@ -1,9 +1,7 @@ -import os - from enum import Enum from ...prof_common_func._db_manager import DbManager from ...prof_common_func._id_manager import Str2IdManager, ConnectionIdManager, CallChainIdManager -from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_error_msg +from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager from .._base_parser import BaseParser from ...prof_parse._fwk_file_parser import FwkFileParser @@ -71,8 +69,8 @@ class FwkApiDbParser(BaseParser): fwk_api_data = FwkFileParser(self._profiler_path).get_fwk_api() self.get_api_data_for_db(fwk_api_data) self.save_api_data_to_db() - except Exception: - print_error_msg("Failed to generate framework api table.") + except Exception as e: + logging.error("Failed to generate framework api table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self._conn, self._cur) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py index f264f39baa..76a67b41bf 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py @@ -12,9 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from ...prof_common_func._db_manager import DbManager -from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_error_msg +from ...prof_common_func._log import ProfilerLogger +from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager from ...prof_parse._fwk_file_parser import FwkFileParser from .._base_parser import BaseParser @@ -29,6 +29,8 @@ class GCRecordDbParser(BaseParser): self._cur = None self._db_path = "" self._gc_record_data = [] + ProfilerLogger.init(self._profiler_path, "GCRecordDbParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: @@ -36,8 +38,8 @@ class GCRecordDbParser(BaseParser): self.init_db_connect() self._gc_record_data = FwkFileParser(self._profiler_path).get_gc_record_db_data() self.save_gc_record_data_to_db() - except Exception: - print_error_msg("Failed to generate gc record table.") + except Exception as e: + self.logger.error("Failed to generate gc record table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self._conn, self._cur) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py index 30e9d93c0a..09ca81a73d 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py @@ -8,7 +8,8 @@ from ...prof_common_func._db_manager import DbManager from ...prof_common_func._id_manager import Str2IdManager from ...prof_common_func._path_manager import ProfilerPathManager from ...prof_parse._cann_file_parser import CANNFileParser, CANNDataEnum -from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_error_msg +from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager +from ...prof_common_func._log import ProfilerLogger from .._base_parser import BaseParser __all__ = [] @@ -65,6 +66,8 @@ class MemoryDbParser(BaseParser): self._pta_record_list = [] self._ge_record_list = [] self._record_list = [] + ProfilerLogger.init(self._profiler_path, "MemoryDbParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def _combine_record(last_record, cur_record): @@ -86,8 +89,8 @@ class MemoryDbParser(BaseParser): self._pta_memory_bean_list = deps_data.get(Constant.MEMORY_PREPARE, {}).get("pta_record_list", []) self.init_pta_memory_data() self.save_memory_data_to_db() - except Exception: - print_error_msg("Failed to generate memory_record table or op_memory table.") + except Exception as e: + self.logger.error("Failed to generate memory_record table or op_memory table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self._conn, self._cur) return Constant.FAIL, None return Constant.SUCCESS, None diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py index fb8d6c980c..fc871036a3 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
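# The db-format parsers above and below also share a failure contract on top of the
# same logging pattern: on any error, log it with the traceback and tear the SQLite
# connection down through DbManager before returning Constant.FAIL, so no half-written
# ascend_pytorch_profiler db is left open. A compact sketch of that contract, assuming
# only the calls already used in the diff (DbManager.create_connect_db(path) returning
# (conn, curs), DbManager.destroy_db_connect(conn, curs), Constant.DB_PARSER in
# deps_data); ExampleDbParser and save_data_to_db() are illustrative only.
from .._base_parser import BaseParser
from ...prof_common_func._constant import Constant
from ...prof_common_func._db_manager import DbManager
from ...prof_common_func._log import ProfilerLogger


class ExampleDbParser(BaseParser):
    def __init__(self, name: str, param_dict: dict):
        super().__init__(name, param_dict)
        self._conn = None
        self._cur = None
        ProfilerLogger.init(self._profiler_path, "ExampleDbParser")
        self.logger = ProfilerLogger.get_instance()

    def run(self, deps_data: dict):
        try:
            db_path = deps_data.get(Constant.DB_PARSER, "")
            self._conn, self._cur = DbManager.create_connect_db(db_path)
            if not (self._conn and self._cur):
                self.logger.warning("Failed to connect to db file: %s", db_path)
                return Constant.FAIL, None
            self.save_data_to_db()  # illustrative payload
            DbManager.destroy_db_connect(self._conn, self._cur)
        except Exception as e:
            self.logger.error("Failed to generate example table, error: %s", str(e), exc_info=True)
            DbManager.destroy_db_connect(self._conn, self._cur)
            return Constant.FAIL, None
        return Constant.SUCCESS, None

    def save_data_to_db(self):
        raise NotImplementedError("placeholder for the real table writes")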
+import logging from .._base_parser import BaseParser from ...prof_bean._torch_op_node import TorchOpNode @@ -36,8 +37,8 @@ class StepInfoDbParser(BaseParser): self._db_path = deps_data.get(Constant.DB_PARSER, "") torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) step_range = self.get_step_range(torch_op_node[0] if torch_op_node else None) - except Exception: - print_error_msg("Failed to get step info from db.") + except Exception as e: + logging.error("Failed to get step info from db, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self.db_conn, self.db_curs) return Constant.FAIL, [] return Constant.SUCCESS, step_range diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py index 96eb06f802..6b7e1bd37e 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py @@ -14,9 +14,10 @@ # limitations under the License. import os +import logging from enum import Enum from .._base_parser import BaseParser -from ...prof_common_func._constant import Constant, print_error_msg, print_warn_msg +from ...prof_common_func._constant import Constant, print_warn_msg from ...prof_common_func._constant import DbConstant, TableColumnsManager from ...prof_common_func._db_manager import DbManager from ...prof_common_func._constant import convert_ns2us_float @@ -84,8 +85,8 @@ class TraceStepTimeDbParser(BaseParser): self._init_step_range(deps_data) self._init_task_info_from_db() self.generate_view() - except Exception: - print_error_msg("Failed to generate step_trace_time table.") + except Exception as e: + logging.error("Failed to generate step_trace_time table, error: %s", str(e), exc_info=True) DbManager.destroy_db_connect(self.task_db_con, self.task_db_curs) DbManager.destroy_db_connect(self.analysis_db_con, self.analysis_db_curs) return Constant.FAIL, None diff --git a/torch_npu/profiler/profiler_interface.py b/torch_npu/profiler/profiler_interface.py index 3b1127ed0d..56107b57ac 100644 --- a/torch_npu/profiler/profiler_interface.py +++ b/torch_npu/profiler/profiler_interface.py @@ -134,8 +134,8 @@ class _ProfInterface: def analyse(self, analysis_type: str = Constant.TENSORBOARD_TRACE_HANDLER, output_path: str = None, **kwargs): try: NpuProfiler.analyse(self.prof_path, analysis_type, output_path, **kwargs) - except Exception: - print_warn_msg("Profiling data parsing failed.") + except Exception as e: + print_warn_msg(f"Profiling data parsing failed, error: {e}") def check_gc_detect_enable(self): return ProfilerActivity.CPU in self.activities and self.experimental_config.with_gc -- Gitee From bfd4f363df23fb31c43bc4020cb2c57e7a706519 Mon Sep 17 00:00:00 2001 From: Gallium Date: Sat, 22 Feb 2025 02:51:03 +0000 Subject: [PATCH 044/358] !18058 add streamId to mstx msg Merge pull request !18058 from Gallium/v2.6.0 --- .../csrc/distributed/ProcessGroupHCCL.cpp | 174 +++++++++++------- .../csrc/distributed/ProcessGroupHCCL.hpp | 3 +- 2 files changed, 106 insertions(+), 71 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index dd0352d05e..769fcb1516 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2118,8 +2118,24 @@ std::string ProcessGroupHCCL::getHcclCommNameWithoutInit(int rankid, std::vector return 
name_str; } +std::string mapToJson(const std::unordered_map& map) +{ + std::stringstream ss; + ss << "{"; + bool first = true; + for (const auto& pair : map) { + if (!first) { + ss << ","; + } + ss << pair.first << ": " << pair.second; + first = false; + } + ss << "}"; + return ss.str(); +} + std::string ProcessGroupHCCL::getMstxHcclMsg( - const std::string &opName, uint64_t dataCnt, HcclDataType dataType, HcclComm comm) + const std::string &opName, uint64_t dataCnt, HcclDataType dataType, HcclComm comm, int64_t streamId) { const static std::map dataTypes = { {HCCL_DATA_TYPE_INT8, "int8"}, @@ -2139,25 +2155,29 @@ std::string ProcessGroupHCCL::getMstxHcclMsg( if (!torch_npu::profiler::mstxEnable()) { return ""; } - std::string hccl_message_str = "comm:" + opName + ","; + std::unordered_map msgDict; + msgDict["opName"] = opName; + std::string hccl_message_str = "comm:" + opName + "-"; auto nameIter = commNames.find(comm); if (nameIter == commNames.end()) { char commName[MAX_GROUP_NAME_LEN]; HCCL_CHECK_ERROR(at_npu::hccl::HcclGetCommNameFace(comm, commName)); std::string name(commName); commNames.insert({comm, name}); - hccl_message_str += name; + msgDict["commName"] = name; } else { - hccl_message_str += nameIter->second; + msgDict["commName"] = nameIter->second; } - hccl_message_str += ","; + hccl_message_str += "-"; std::string data_type_str = "na"; auto iter = dataTypes.find(dataType); if (iter != dataTypes.end()) { data_type_str = iter->second; } - hccl_message_str = hccl_message_str + data_type_str + "," + std::to_string(dataCnt); - return hccl_message_str; + msgDict["dataType"] = data_type_str; + msgDict["dataCnt"] = std::to_string(dataCnt); + msgDict["streamId"] = std::to_string(streamId); + return mapToJson(msgDict); } void ProcessGroupHCCL::silenceCheck(at::Tensor &input, c10d::OpType opType) @@ -2785,6 +2805,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( std::vector tensors_cp = {tensors[0]}; std::string functionName = __FUNCTION__; + auto streamId = getStreamId(false, -1); return collective( tensors_cp, tensors_cp, @@ -2797,9 +2818,9 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(input); auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllReduce( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -2842,6 +2863,7 @@ c10::intrusive_ptr ProcessGroupHCCL::batch_isend_irecv( } std::vector tensors_tmp = {tensors[0]}; + auto streamId = getStreamId(false, -1); return collective( tensors_tmp, tensors_tmp, @@ -2856,7 +2878,7 @@ c10::intrusive_ptr ProcessGroupHCCL::batch_isend_irecv( numel_list.push_back(getNumelForHCCL(tensors[i])); type_list.push_back(getHcclDataType(tensors[i].scalar_type())); } - auto hccl_call = [tensor_ptr_list, numel_list, type_list, remote_rank_list, op_type, itemNum, comm, stream, is_dispatched]() -> int { + auto hccl_call = [tensor_ptr_list, numel_list, type_list, remote_rank_list, op_type, itemNum, comm, stream, 
is_dispatched, streamId]() -> int { HcclSendRecvItem sendRecvInfo[itemNum]; HcclSendRecvType currType; for (size_t i = 0; i < op_type.size(); ++i) { @@ -2875,7 +2897,7 @@ c10::intrusive_ptr ProcessGroupHCCL::batch_isend_irecv( }; } torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclBatchSendRecv", sendRecvInfo[0].count, sendRecvInfo[0].dataType, comm), + getMstxHcclMsg("HcclBatchSendRecv", sendRecvInfo[0].count, sendRecvInfo[0].dataType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclBatchIsendIrecv(sendRecvInfo, itemNum, comm, stream.stream(false)); *is_dispatched = true; @@ -2910,7 +2932,7 @@ c10::intrusive_ptr ProcessGroupHCCL::broadcast( if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) { at_npu::native::OpHook::GetInstance().PreHook("broadcast", tensors); } - + auto streamId = getStreamId(false, -1); return collective( tensors, tensors, @@ -2921,9 +2943,9 @@ c10::intrusive_ptr ProcessGroupHCCL::broadcast( auto inputDataPtr = input.data_ptr(); auto numel = getNumelForHCCL(input); auto hcclType = getHcclDataType(input.scalar_type()); - auto hccl_call = [inputDataPtr, numel, hcclType, root, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, numel, hcclType, root, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclBroadcast(inputDataPtr, numel, hcclType, root, comm, stream.stream(false)); *is_dispatched = true; @@ -2957,6 +2979,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce_coalesced( check_npu_tensors_same_device(tensors); std::vector tensors_cp = tensors; std::string functionName = __FUNCTION__; + auto streamId = getStreamId(false, -1); return collectiveCoalesced( tensors_cp, tensors_cp, @@ -2969,9 +2992,9 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce_coalesced( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(input); auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllReduce( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -3024,6 +3047,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce( std::string functionName = __FUNCTION__; uint64_t rank = opts.rootRank; std::vector tensors_cp = {tensors[0]}; + auto streamId = getStreamId(false, -1); return collective( tensors_cp, tensors_cp, @@ -3036,9 +3060,9 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(input); auto reduceOp = getHcclReduceOp(opts.reduceOp, input); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream, is_dispatched, streamId]() -> int { 
torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduce", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclReduce", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduce( inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream.stream(false)); @@ -3083,6 +3107,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( std::vector inputTensors = {inputTensor}; std::vector outputTensors = {outputTensor}; std::string functionName = __FUNCTION__; + auto streamId = getStreamId(false, -1); return collective( inputTensors, outputTensors, @@ -3095,9 +3120,9 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(input); auto reduceOp = getHcclReduceOp(opts.reduceOp, input); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduce", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclReduce", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduce( inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream.stream(false)); @@ -3194,7 +3219,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base_uneven( auto inputTensors_ = cast_to_origin_format(inputTensors); auto outputTensors_ = cast_to_origin_format(outputTensors); - + auto streamId = getStreamId(false, -1); return collective( inputTensors_, outputTensors_, @@ -3217,9 +3242,10 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base_uneven( numel, comm, stream, - is_dispatched]() -> int { + is_dispatched, + streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm), + getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduceScatterV( inputDataPtr, @@ -3281,7 +3307,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base_uneven( auto inputTensors_ = cast_to_origin_format(inputTensors); auto outputTensors_ = cast_to_origin_format(outputTensors); - + auto streamId = getStreamId(false, -1); return collective( inputTensors_, outputTensors_, @@ -3302,9 +3328,10 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base_uneven( numel, comm, stream, - is_dispatched]() -> int { + is_dispatched, + streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm), + getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAllGatherV( inputDataPtr, @@ -3359,7 +3386,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( auto outputFlattened = flatten_for_scatter_gather(byte_alignment_outputTensors, byte_alignment_inputTensors_, size_); check_npu_tensors_different_devices(outputFlattened); - + auto streamId = getStreamId(false, -1); return collective( byte_alignment_inputTensors_, outputFlattened, @@ -3373,9 +3400,9 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(input); auto hcclType = 
getHcclDataType(input.scalar_type()); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; @@ -3430,6 +3457,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( } std::vector inputFlattened = {at::flatten(inputTensors[0])}; std::vector outputFlattened = {at::cat(flattenedOutputTensors, 0)}; + auto streamId = getStreamId(false, -1); return collective( inputFlattened, outputFlattened, @@ -3450,9 +3478,10 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( numel, comm, stream, - is_dispatched]() -> int { + is_dispatched, + streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm), + getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAllGatherV( inputDataPtr, @@ -3503,6 +3532,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( const auto num_devices = outputTensors.size(); const auto num_reduces = outputTensors[0].size(); std::vector> works; + auto streamId = getStreamId(false, -1); // Need to add a method like startCoalescing(); for (const auto i : c10::irange(num_reduces)) { std::vector inputs_multi_dev(num_devices); @@ -3514,12 +3544,10 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( outputs_multi_dev[j].copy_(inputTensors[j]); } } - auto broadcastOpts = c10d::BroadcastOptions{ static_cast(i / num_devices), static_cast(i % num_devices), opts.timeout}; - auto work = collective( outputs_multi_dev, outputs_multi_dev, [&](at::Tensor& input, at::Tensor& output, HcclComm comm, c10_npu::NPUStream& stream, std::shared_ptr is_dispatched) { RECORD_FUNCTION("HcclBroadcast", std::vector({input})); @@ -3529,7 +3557,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( auto numel = getNumelForHCCL(input); auto hcclType = getHcclDataType(input.scalar_type()); torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclBroadcast(inputDataPtr, numel, hcclType, root, comm, stream.stream()); *is_dispatched = true; @@ -3554,6 +3582,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather_into_tensor_coalesced const c10d::AllgatherOptions& opts) { auto inputTensors_ = cast_to_origin_format(inputs); + auto streamId = getStreamId(false, -1); return collectiveCoalesced( inputTensors_, outputs, @@ -3566,9 +3595,9 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather_into_tensor_coalesced auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(input); auto hcclType = getHcclDataType(input.scalar_type()); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", 
numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; @@ -3599,7 +3628,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather_togather( } auto inputTensors_ = cast_to_origin_format(inputTensors); - + auto streamId = getStreamId(false, -1); return collective( inputTensors_, outputTensors, @@ -3612,9 +3641,9 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather_togather( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(input); auto hcclType = getHcclDataType(input.scalar_type()); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; @@ -3650,7 +3679,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base( } auto inputTensors_ = cast_to_origin_format(inputTensors); - + auto streamId = getStreamId(false, -1); return collective( inputTensors_, outputTensors, @@ -3663,9 +3692,9 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(input); auto hcclType = getHcclDataType(input.scalar_type()); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; @@ -3690,7 +3719,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) { at_npu::native::OpHook::GetInstance().PreHook("reduce_scatter", outputTensors, inputTensors); } - + auto streamId = getStreamId(false, -1); bool same_size = check_same_size(inputTensors.back()); if (same_size) { auto inputFlattened = flatten_for_scatter_gather(inputTensors, outputTensors, size_); @@ -3710,9 +3739,9 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(output); auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm, streamId), stream.stream(false), 
torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclReduceScatter( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -3792,9 +3821,10 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( numel, comm, stream, - is_dispatched]() -> int { + is_dispatched, + streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm), + getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduceScatterV( inputDataPtr, @@ -3883,7 +3913,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base( if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) { at_npu::native::OpHook::GetInstance().PreHook("_reduce_scatter_base", outputs, inputs); } - + auto streamId = getStreamId(false, -1); std::string functionName = __FUNCTION__; return collective( inputs, @@ -3899,9 +3929,9 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(output); auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclReduceScatter( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -3930,6 +3960,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter_tensor_coalesced const c10d::ReduceScatterOptions& opts) { std::string functionName = __FUNCTION__; + auto streamId = getStreamId(false, -1); return collectiveCoalesced( inputTensors, outputTensors, @@ -3944,9 +3975,9 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter_tensor_coalesced auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(output); auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclReduceScatter( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -4066,7 +4097,7 @@ c10::intrusive_ptr ProcessGroupHCCL::scatter( inputTensors.push_back(empty); inputFlattened = flatten_for_scatter_gather(inputTensors, outputTensors, size_); } - + auto streamId = getStreamId(false, -1); return collective( inputFlattened, outputTensors, @@ -4080,9 +4111,9 @@ c10::intrusive_ptr ProcessGroupHCCL::scatter( auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(output); auto hcclType = getHcclDataType(input.scalar_type()); - auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, root, comm, stream, is_dispatched]() -> int { 
+ auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, root, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclScatter", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclScatter", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclScatter(inputDataPtr, outputDataPtr, numel, hcclType, root, comm, stream.stream(false)); *is_dispatched = true; @@ -4123,7 +4154,7 @@ c10::intrusive_ptr ProcessGroupHCCL::send(std::vector& t if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) { at_npu::native::OpHook::GetInstance().PreHook("send", tensors); } - + auto streamId = getStreamId(true, dstRank); auto tensors_ = cast_to_origin_format(tensors); auto ret = pointToPoint( tensors_, @@ -4132,9 +4163,9 @@ c10::intrusive_ptr ProcessGroupHCCL::send(std::vector& t auto inputDataPtr = input.data_ptr(); auto numel = getNumelForHCCL(input); auto hcclType = getHcclDataType(input.scalar_type()); - auto hccl_call = [inputDataPtr, numel, hcclType, dst_rank, comm, stream, is_dispatched]() -> int { + auto hccl_call = [inputDataPtr, numel, hcclType, dst_rank, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclSend", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclSend", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclSend(inputDataPtr, numel, hcclType, dst_rank, comm, stream.stream(false)); *is_dispatched = true; @@ -4155,7 +4186,7 @@ c10::intrusive_ptr ProcessGroupHCCL::recv(std::vector& t if (C10_UNLIKELY(at_npu::native::env::CheckOpHookEnable())) { at_npu::native::OpHook::GetInstance().PreHook("recv", tensors); } - + auto streamId = getStreamId(true, srcRank); auto tensors_ = create_base_format_tensors(tensors); auto ret = pointToPoint( tensors_, @@ -4167,9 +4198,9 @@ c10::intrusive_ptr ProcessGroupHCCL::recv(std::vector& t auto outputDataPtr = output.data_ptr(); auto numel = getNumelForHCCL(output); auto hcclType = getHcclDataType(output.scalar_type()); - auto hccl_call = [outputDataPtr, numel, hcclType, src_rank, comm, stream, is_dispatched]() -> int { + auto hccl_call = [outputDataPtr, numel, hcclType, src_rank, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclRecv", numel, hcclType, comm), stream.stream(false), + getMstxHcclMsg("HcclRecv", numel, hcclType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclRecv(outputDataPtr, numel, hcclType, src_rank, comm, stream.stream(false)); *is_dispatched = true; @@ -4241,7 +4272,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( DIST_ERROR(ErrCode::PARAM)); uint64_t output_counts = static_cast(outputTensor.numel() / ranks); uint64_t input_counts = static_cast(inputTensor.numel() / ranks); - + auto streamId = getStreamId(false, -1); check_npu_tensors_different_devices(inputTensors); check_npu_tensors_different_devices(outputTensors); return collective( @@ -4264,9 +4295,10 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( outputhcclDataType, comm, stream, - is_dispatched]() -> int { + is_dispatched, + streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAlltoAll", input_counts, inputhcclDataType, comm), + getMstxHcclMsg("HcclAlltoAll", input_counts, inputhcclDataType, comm, streamId), stream.stream(false), 
torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAlltoAll( inputDataPtr, @@ -4340,7 +4372,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( inputSpl.push_back(inputSpl[i - 1] + inputCounts[i - 1]); } } - + auto streamId = getStreamId(false, -1); check_npu_tensors_different_devices(inputTensors); check_npu_tensors_different_devices(outputTensors); return collective( @@ -4362,10 +4394,11 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( outputhcclDataType, comm, stream, - is_dispatched]() -> int { + is_dispatched, + streamId]() -> int { torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclAlltoAllV", static_cast(inputCounts.size()), - inputhcclDataType, comm), + inputhcclDataType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAlltoAllV( inputDataPtr, @@ -4468,7 +4501,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall( check_npu_tensors_different_devices(in_tensors); check_npu_tensors_different_devices(out_tensors); - + auto streamId = getStreamId(false, -1); return collective( input_tensors_, output_tensors_, @@ -4488,10 +4521,11 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall( outputhcclDataType, comm, stream, - is_dispatched]() -> int { + is_dispatched, + streamId]() -> int { torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclAlltoAllV", static_cast(input_counts.size()), - inputhcclDataType, comm), + inputhcclDataType, comm, streamId), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAlltoAllV( inputDataPtr, diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index af29da05f9..8c1414687c 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -783,7 +783,8 @@ private: static std::string getMstxHcclMsg(const std::string &opName, uint64_t dataCnt, HcclDataType hcclType, - HcclComm comm); + HcclComm comm, + int64_t streamId); std::unordered_map> silenceCheckCache_; -- Gitee From b6628faa9301f563334ef50606aada0d96c8ba2a Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Sat, 22 Feb 2025 07:20:11 +0000 Subject: [PATCH 045/358] !18214 Update torchair commit id Merge pull request !18214 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 40fbfc8ad6..491325ac8f 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 40fbfc8ad6cb47716c4566f2f8cfc4abd4c56e5a +Subproject commit 491325ac8fb4535c2d4e5ef5cb4689dd900b05d8 -- Gitee From 98325ac318586282645e3aa38bf19683a580be18 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Sat, 22 Feb 2025 07:20:11 +0000 Subject: [PATCH 046/358] !18214 Update torchair commit id Merge pull request !18214 from torchair_robot/v2.6.0 -- Gitee From 2223f490d060d7a8e81fdd3e4536fe2589e3c8a1 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Sat, 22 Feb 2025 08:48:47 +0000 Subject: [PATCH 047/358] !18040 [Profiler] Enable data simplify, when collecting db format data, there is no profiler_metadata.json under Ascend_pt Merge pull request !18040 from hhz886/v2.6.0 --- torch_npu/profiler/analysis/_profiling_parser.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch_npu/profiler/analysis/_profiling_parser.py b/torch_npu/profiler/analysis/_profiling_parser.py index db63d97fcd..e68bf1914e 100644 --- 
a/torch_npu/profiler/analysis/_profiling_parser.py +++ b/torch_npu/profiler/analysis/_profiling_parser.py @@ -43,9 +43,6 @@ class ProfilingParser: target_path = os.path.join(host_path, rm_dir) PathManager.remove_path_safety(target_path) if simplify_flag: - if Constant.Db in ProfilerConfig().export_type: - profiler_metadata_path = os.path.join(profiler_path, Constant.PROFILER_META_DATA) - PathManager.remove_file_safety(profiler_metadata_path) fwk_path = ProfilerPathManager.get_fwk_path(profiler_path) PathManager.remove_path_safety(fwk_path) if not cann_path: -- Gitee From 3d82704f3c7069eb730482cb8158d3acbb01e693 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 22 Feb 2025 09:45:09 +0000 Subject: [PATCH 048/358] !18230 Update op_plugin commit id Merge pull request !18230 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5c2673aa4d..09cd291390 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5c2673aa4dbf101ba72d435fbbd28e1a907d2474 +Subproject commit 09cd2913903dcb7e17b9218616cd9368aa71873a -- Gitee From 1600fa8e8f4ca020d10fe14d8f03c4eb5373e5b9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 24 Feb 2025 02:45:12 +0000 Subject: [PATCH 049/358] !18251 Update op_plugin commit id Merge pull request !18251 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 09cd291390..3e81de7f79 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 09cd2913903dcb7e17b9218616cd9368aa71873a +Subproject commit 3e81de7f79c3ef7fbe594371069ea6a62e2b981c -- Gitee From c3d6e5f0f79486e48a6c5f5b372fee243402c888 Mon Sep 17 00:00:00 2001 From: will-devil Date: Tue, 25 Feb 2025 01:36:52 +0000 Subject: [PATCH 050/358] !18253 [Task] Adapt to the aten::_to_copy instead of the aten::to.dtype_layout. 
Merge pull request !18253 from will-devil/copy_26 --- torch_npu/csrc/aten/common/ToKernelNpu.cpp | 73 ++++++++++++------- torch_npu/csrc/aten/npu_native_functions.yaml | 2 +- 2 files changed, 49 insertions(+), 26 deletions(-) diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp index baad15eeb8..c4b9a941af 100644 --- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp @@ -58,50 +58,72 @@ static inline at::Tensor to_impl_npu( return r; } -at::Tensor NPUNativeFunctions::to( - const at::Tensor &self, +at::Tensor NPUNativeFunctions::_to_copy( + const at::Tensor& self, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, bool non_blocking, - bool copy, c10::optional optional_memory_format) { + c10::TensorOptions options_ = c10::TensorOptions() + .dtype(dtype) + .layout(layout) + .device(device); + + auto options = self.options().merge_in(options_); + + if (layout.has_value()) { + TORCH_CHECK( + self.layout() == layout.value(), + "to(options) doesn't support converting to a different layout, " + "but got self.layout being ", + self.layout(), + " and options.layout set as ", + layout.value(), OPS_ERROR(ErrCode::NOT_SUPPORT)); + } + + if (device.has_value()) { + options = options.device(ensure_has_index(device.value())); + } + if (optional_memory_format.has_value()) { TORCH_CHECK( optional_memory_format.value() == c10::MemoryFormat::Preserve || optional_memory_format.value() == c10::MemoryFormat::Contiguous, "Only contiguous_format or preserve_format is supported.", OPS_ERROR(ErrCode::NOT_SUPPORT)); + options = options.memory_format(optional_memory_format.value()); + } else { + options = options.memory_format(c10::MemoryFormat::Contiguous); } - - c10::TensorOptions options_ = c10::TensorOptions().dtype(dtype).layout(layout).device(device); - TORCH_CHECK( - !(options_.has_memory_format() && optional_memory_format.has_value()), - "Cannot set memory_format both in c10::TensorOptions and explicit argument; please delete " - "the redundant setter.", OPS_ERROR(ErrCode::PARAM)); - auto options = - options_.merge_in(c10::TensorOptions().memory_format(optional_memory_format)); - TORCH_CHECK( options.requires_grad_opt() == c10::nullopt, "to(options) expects unset requires_grad flag, but got " "options.requires_grad set as ", options.requires_grad(), OPS_ERROR(ErrCode::PARAM)); - TORCH_CHECK( - !options.has_layout() || self.layout() == options.layout(), - "to(options) doesn't support converting to a different layout, " - "but got self.layout being ", - self.layout(), - " and options.layout set as ", - options.layout(), OPS_ERROR(ErrCode::NOT_SUPPORT)); - - if (options.has_device()) { - options = options.device(ensure_has_index(options.device())); + bool pin_out = non_blocking && torch_npu::utils::is_npu(self) && options.device().is_cpu() && + (options.layout() == c10::kStrided); + + c10::MemoryFormat memory_format = options.memory_format_opt().value_or(c10::MemoryFormat::Contiguous); + if (memory_format == c10::MemoryFormat::Preserve) { + if (self.is_non_overlapping_and_dense()) { + // Copy all strides + auto r = at::empty_strided( + self.sizes(), self.strides(), options.memory_format(c10::nullopt).pinned_memory(pin_out)); + r.copy_(self, non_blocking); + return r; + } else { + memory_format = self.suggest_memory_format(); + } } - auto specified_options = self.options().merge_in(options); - return to_impl_npu(self, specified_options, non_blocking, copy); + + // See Note [Explicit 
nullopt c10::MemoryFormat argument] + auto r = at::empty( + self.sizes(), options.memory_format(memory_format).pinned_memory(pin_out), c10::nullopt); + r.copy_(self, non_blocking); + return r; } at::Tensor NPUNativeFunctions::to( @@ -110,7 +132,8 @@ at::Tensor NPUNativeFunctions::to( at::ScalarType dtype, bool non_blocking, bool copy, - c10::optional optional_memory_format) { + c10::optional optional_memory_format) +{ device = ensure_has_index(device); return to_impl_npu( self, diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index 94d19e37a5..144157eeb5 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -48,7 +48,7 @@ supported: - squeeze.dim - to.device - to.dtype - - to.dtype_layout + - _to_copy - to.other - tril_indices - triu_indices -- Gitee From a330367dd8def7af3ee0c0620106778b2ac0363c Mon Sep 17 00:00:00 2001 From: wgb Date: Tue, 25 Feb 2025 01:37:38 +0000 Subject: [PATCH 051/358] !18258 Fix the judgment condition of device_check Merge pull request !18258 from wgb/v2.6.0 --- codegen/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codegen/utils.py b/codegen/utils.py index b24ebef164..39f013437c 100644 --- a/codegen/utils.py +++ b/codegen/utils.py @@ -342,7 +342,7 @@ return {self_arg_name}; device_check = " // No device check\n" # Backends that require device guards presumably also require device checks. - if self.backend_index.device_guard and op_name not in DEVICE_NOCHECK_SET: + if self.backend_index.device_guard and str(f.func.name) not in DEVICE_NOCHECK_SET: device_check_args = itertools.chain( f.func.arguments.out, f.func.arguments.flat_positional ) -- Gitee From 78f5243029e0a856448b2f70783c3fb3562ad80c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 25 Feb 2025 05:00:11 +0000 Subject: [PATCH 052/358] !18274 Update op_plugin commit id Merge pull request !18274 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 3e81de7f79..1cb70e1337 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 3e81de7f79c3ef7fbe594371069ea6a62e2b981c +Subproject commit 1cb70e1337ddd3ee6427cbdf361241504576194d -- Gitee From 1a8638606a10721531ba2d8055a0fbc1923d9a2e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 25 Feb 2025 13:45:13 +0000 Subject: [PATCH 053/358] !18307 Update op_plugin commit id Merge pull request !18307 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1cb70e1337..a680a4054c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1cb70e1337ddd3ee6427cbdf361241504576194d +Subproject commit a680a4054ca840abcfa41d85a0ee01a19c64ae0d -- Gitee From 1f1f0f9d7bba7e34ffeb9a1d276b4e48791f8dfe Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 25 Feb 2025 15:45:13 +0000 Subject: [PATCH 054/358] !18316 Update op_plugin commit id Merge pull request !18316 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index a680a4054c..8209d3e480 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit a680a4054ca840abcfa41d85a0ee01a19c64ae0d +Subproject commit 8209d3e48059d57a9aa6ead6128d6166f187776f -- Gitee 
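A note on the aten::_to_copy adaptation in [PATCH 050/358] above: registering _to_copy in npu_native_functions.yaml in place of to.dtype_layout means options-based conversions are now served by the new kernel, which keeps the source strides when c10::MemoryFormat::Preserve is requested on a dense, non-overlapping tensor and otherwise returns a contiguous (or suggested-format) result. A minimal sketch of a call that exercises this path, assuming the stock ATen C++ surface; the helper name is illustrative and nothing below is code from this patch series:

    #include <ATen/ATen.h>

    // Converts a tensor to half precision through Tensor::to(TensorOptions, ...).
    // On torch_npu this lands in the NPUNativeFunctions::_to_copy kernel added above
    // and produces a contiguous result because no memory format is requested.
    at::Tensor to_half_copy(const at::Tensor& src)
    {
        return src.to(src.options().dtype(at::kHalf), /*non_blocking=*/false, /*copy=*/true);
    }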
From b495f2f33fa72b3367eaa027bf57e3b473174e94 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 25 Feb 2025 15:45:13 +0000 Subject: [PATCH 055/358] !18316 Update op_plugin commit id Merge pull request !18316 from pta-robot/v2.6.0 -- Gitee From 7e68e3bff47d315bbee096caa4be1ed86345f7b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Wed, 26 Feb 2025 01:22:38 +0000 Subject: [PATCH 056/358] =?UTF-8?q?!18264=20add=20interface=20for=20LCCL?= =?UTF-8?q?=20backend=20Merge=20pull=20request=20!18264=20from=20=E9=97=AB?= =?UTF-8?q?=E9=B9=8F=E5=85=A8/v2.6.0=5FLCCL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- third_party/hccl/inc/hccl/lcal_api.h | 34 +++ .../csrc/core/npu/interface/LcclInterface.cpp | 131 ++++++++++ .../csrc/core/npu/interface/LcclInterface.h | 33 +++ torch_npu/csrc/distributed/LCCLUtils.cpp | 238 ++++++++++++++++++ torch_npu/csrc/distributed/LCCLUtils.hpp | 34 +++ .../csrc/distributed/ProcessGroupLCCL.hpp | 174 +++++++++++++ 6 files changed, 644 insertions(+) create mode 100644 third_party/hccl/inc/hccl/lcal_api.h create mode 100644 torch_npu/csrc/core/npu/interface/LcclInterface.cpp create mode 100644 torch_npu/csrc/core/npu/interface/LcclInterface.h create mode 100644 torch_npu/csrc/distributed/LCCLUtils.cpp create mode 100644 torch_npu/csrc/distributed/LCCLUtils.hpp create mode 100644 torch_npu/csrc/distributed/ProcessGroupLCCL.hpp diff --git a/third_party/hccl/inc/hccl/lcal_api.h b/third_party/hccl/inc/hccl/lcal_api.h new file mode 100644 index 0000000000..f1b5c74541 --- /dev/null +++ b/third_party/hccl/inc/hccl/lcal_api.h @@ -0,0 +1,34 @@ +#ifndef LCAL_API_H +#define LCAL_API_H + +#include "hccl/hccl.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +typedef void *LcalCommPtr; + +int LcalCommInitRankLocal(int rankSize, int rank, LcalCommPtr *comms); + +int LcalCommInit(int rank, int rankSize, LcalCommPtr *comms); + +int LcalCommInitAll(uint32_t ndev, int32_t *devices, LcalCommPtr *comms); + +int LcclAllReduce(void *sendBuf, void *recvBuf, int64_t count, HcclDataType dataType, HcclReduceOp op, + LcalCommPtr comm, aclrtStream stream); + +int LcclAllGather(void *sendBuf, void *recvBuf, int64_t sendCount, HcclDataType dataType, LcalCommPtr comm, + aclrtStream stream); + +int LcclReduceScatter(void *sendBuf, void *recvBuf, int64_t recvCount, HcclDataType dataType, HcclReduceOp op, + LcalCommPtr comm, aclrtStream stream); + +int LcclBroadcast(void *buf, int64_t count, HcclDataType dataType, int root, LcalCommPtr comm, aclrtStream stream); + +int LcclCommDestroy(LcalCommPtr comm); + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // LCAL_API_H diff --git a/torch_npu/csrc/core/npu/interface/LcclInterface.cpp b/torch_npu/csrc/core/npu/interface/LcclInterface.cpp new file mode 100644 index 0000000000..dccfe342ce --- /dev/null +++ b/torch_npu/csrc/core/npu/interface/LcclInterface.cpp @@ -0,0 +1,131 @@ +#include "LcclInterface.h" + +#include +#include + +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/register/FunctionLoader.h" + +namespace at_npu { +namespace lccl { + +#undef LOAD_FUNCTION +#define LOAD_FUNCTION(funcName) \ + REGISTER_FUNCTION(liblcal, funcName) +#undef GET_FUNC +#define GET_FUNC(funcName) \ + GET_FUNCTION(liblcal, funcName) + +REGISTER_LIBRARY(liblcal) +LOAD_FUNCTION(LcalCommInitRankLocal) +LOAD_FUNCTION(LcalCommInit) +LOAD_FUNCTION(LcclAllReduce) +LOAD_FUNCTION(LcclAllGather) +LOAD_FUNCTION(LcclReduceScatter) 
+LOAD_FUNCTION(LcclBroadcast) +LOAD_FUNCTION(LcclCommDestroy) + +int LcclCommInitRankLocal(int rankSize, int rank, LcclComm *comms) +{ + typedef int(*lcalCommInitRankLocal)(int, int, LcclComm *); + static lcalCommInitRankLocal func = nullptr; + if (func == nullptr) { + func = (lcalCommInitRankLocal)GET_FUNC(LcclCommInitRankLocal); + if (func == nullptr) { + TORCH_CHECK(func, "Failed to find function ", "lcalCommInitRankLocal", PTA_ERROR(ErrCode::NOT_FOUND)); + return -1; + } + } + return func(rankSize, rank, comms); +} + +int LcclCommInit(int rank, int rankSize, LcclComm *comms) +{ + typedef int(*lcalCommInit)(int, int, LcclComm *); + static lcalCommInit func = nullptr; + if (func == nullptr) { + func = (lcalCommInit)GET_FUNC(LcclCommInit); + if (func == nullptr) { + TORCH_CHECK(func, "Failed to find function ", "lcalCommInit", PTA_ERROR(ErrCode::NOT_FOUND)); + return -1; + } + } + return func(rank, rankSize, comms); +} + +int LcclAllReduce(void *sendBuf, void *recvBuf, int64_t count, LcclDataType dataType, LcclReduceOp op, + LcclComm comm, aclrtStream stream) +{ + typedef int(*lcclAllReduce)(void *, void *, int64_t, LcclDataType, LcclReduceOp, LcclComm, aclrtStream); + static lcclAllReduce func = nullptr; + if (func == nullptr) { + func = (lcclAllReduce)GET_FUNC(LcclAllReduce); + if (func == nullptr) { + TORCH_CHECK(func, "Failed to find function ", "lcclAllReduce", PTA_ERROR(ErrCode::NOT_FOUND)); + return -1; + } + } + return func(sendBuf, recvBuf, count, dataType, op, comm, stream); +} + +int LcclAllGather(void *sendBuf, void *recvBuf, int64_t sendCount, LcclDataType dataType, LcclComm comm, + aclrtStream stream) +{ + typedef int(*lcclAllGather)(void *, void *, int64_t, LcclDataType, LcclComm, aclrtStream); + static lcclAllGather func = nullptr; + if (func == nullptr) { + func = (lcclAllGather)GET_FUNC(LcclAllGather); + if (func == nullptr) { + TORCH_CHECK(func, "Failed to find function ", "lcclAllGather", PTA_ERROR(ErrCode::NOT_FOUND)); + return -1; + } + } + return func(sendBuf, recvBuf, sendCount, dataType, comm, stream); +} + +int LcclReduceScatter(void *sendBuf, void *recvBuf, int64_t recvCount, LcclDataType dataType, LcclReduceOp op, + LcclComm comm, aclrtStream stream) +{ + typedef int(*lcclReduceScatter)(void *, void *, int64_t, LcclDataType, LcclReduceOp, LcclComm, aclrtStream); + static lcclReduceScatter func = nullptr; + if (func == nullptr) { + func = (lcclReduceScatter)GET_FUNC(LcclReduceScatter); + if (func == nullptr) { + TORCH_CHECK(func, "Failed to find function ", "lcclReduceScatter", PTA_ERROR(ErrCode::NOT_FOUND)); + return -1; + } + } + return func(sendBuf, recvBuf, recvCount, dataType, op, comm, stream); +} + +int LcclBroadcast(void *buf, int64_t count, LcclDataType dataType, int root, LcclComm comm, + aclrtStream stream) +{ + typedef int(*lcclBroadcast)(void *, int64_t, LcclDataType, int, LcclComm, aclrtStream); + static lcclBroadcast func = nullptr; + if (func == nullptr) { + func = (lcclBroadcast)GET_FUNC(LcclBroadcast); + if (func == nullptr) { + TORCH_CHECK(func, "Failed to find function ", "lcclBroadcast", PTA_ERROR(ErrCode::NOT_FOUND)); + return -1; + } + } + return func(buf, count, dataType, root, comm, stream); +} + +int LcclCommDestroy(LcclComm comm) +{ + typedef int(*lcclCommDestroy)(LcclComm); + static lcclCommDestroy func = nullptr; + if (func == nullptr) { + func = (lcclCommDestroy)GET_FUNC(LcclCommDestroy); + if (func == nullptr) { + TORCH_CHECK(func, "Failed to find function ", "lcclCommDestroy", PTA_ERROR(ErrCode::NOT_FOUND)); + return -1; + } + } + 
return func(comm); +} + +} +} diff --git a/torch_npu/csrc/core/npu/interface/LcclInterface.h b/torch_npu/csrc/core/npu/interface/LcclInterface.h new file mode 100644 index 0000000000..9ca5ea83a2 --- /dev/null +++ b/torch_npu/csrc/core/npu/interface/LcclInterface.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include "hccl/hccl.h" +#include "hccl/lcal_api.h" + +namespace at_npu { +namespace lccl { + +using LcclDataType = HcclDataType; +using LcclReduceOp = HcclReduceOp; +using LcclComm = LcalCommPtr; + +int LcclCommInitRankLocal(int rankSize, int rank, LcclComm *comms); + +int LcclCommInit(int rank, int rankSize, LcclComm *comms); + +int LcclAllReduce(void *sendBuf, void *recvBuf, int64_t count, LcclDataType dataType, LcclReduceOp op, + LcclComm comm, aclrtStream stream); + +int LcclAllGather(void *sendBuf, void *recvBuf, int64_t sendCount, LcclDataType dataType, LcclComm comm, + aclrtStream stream); + +int LcclReduceScatter(void *sendBuf, void *recvBuf, int64_t recvCount, LcclDataType dataType, LcclReduceOp op, + LcclComm comm, aclrtStream stream); + +int LcclBroadcast(void *buf, int64_t count, LcclDataType dataType, int root, LcclComm comm, + aclrtStream stream); + +int LcclCommDestroy(LcclComm comm); + +} +} diff --git a/torch_npu/csrc/distributed/LCCLUtils.cpp b/torch_npu/csrc/distributed/LCCLUtils.cpp new file mode 100644 index 0000000000..b4e62d9f29 --- /dev/null +++ b/torch_npu/csrc/distributed/LCCLUtils.cpp @@ -0,0 +1,238 @@ +#include "LCCLUtils.hpp" + +#include "torch_npu/csrc/core/NPUBridge.h" +#include "torch_npu/csrc/core/npu/DeviceUtils.h" +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/NPUFormat.h" +#include "torch_npu/csrc/framework/FormatHelper.h" + +namespace c10d_npu { + +// LCCL DataType mapping +std::map kScalarTypeToLcclDataType = { + {at::kByte, HCCL_DATA_TYPE_UINT8}, + {at::kChar, HCCL_DATA_TYPE_INT8}, + {at::kShort, HCCL_DATA_TYPE_INT16}, + {at::kInt, HCCL_DATA_TYPE_INT32}, + {at::kLong, HCCL_DATA_TYPE_INT64}, + {at::kHalf, HCCL_DATA_TYPE_FP16}, + {at::kFloat, HCCL_DATA_TYPE_FP32}, + {at::kDouble, HCCL_DATA_TYPE_FP64}, + {at::kBool, HCCL_DATA_TYPE_UINT8}, + {at::kBFloat16, HCCL_DATA_TYPE_BFP16}, +}; + +std::map kLcclDataTypeToStringMap = { + {HCCL_DATA_TYPE_UINT8, "at::kByte/at::kBool"}, + {HCCL_DATA_TYPE_INT8, "at::kChar"}, + {HCCL_DATA_TYPE_INT16, "at::kShort"}, + {HCCL_DATA_TYPE_INT32, "at::kInt"}, + {HCCL_DATA_TYPE_INT64, "at::kLong"}, + {HCCL_DATA_TYPE_FP16, "at::kHalf"}, + {HCCL_DATA_TYPE_FP32, "at::kFloat"}, + {HCCL_DATA_TYPE_FP64, "at::kDouble"}, + {HCCL_DATA_TYPE_BFP16, "at::kBFloat16"}, +}; + +// LCCL unsupported ReduceOp +std::map unsupportedOp = { + {c10d::ReduceOp::AVG, "AVG"}, + {c10d::ReduceOp::BAND, "BAND"}, + {c10d::ReduceOp::BOR, "BOR"}, + {c10d::ReduceOp::BXOR, "BXOR"} +}; + +// LCCL ReduceOp mapping +std::map lcclOp = { + {c10d::ReduceOp::MIN, HCCL_REDUCE_MIN}, + {c10d::ReduceOp::MAX, HCCL_REDUCE_MAX}, + {c10d::ReduceOp::SUM, HCCL_REDUCE_SUM}, + {c10d::ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, +}; + +// Helper function that gets the data type and issues error if not supported +at_npu::lccl::LcclDataType getLcclDataType(at::ScalarType type) +{ + try { + return kScalarTypeToLcclDataType.at(type); + } catch (std::out_of_range& e) { + throw std::runtime_error("Unsupported data type for LCCL process group" + DIST_ERROR(ErrCode::NOT_SUPPORT)); + } +} + +std::string getLcclDataTypeSerialString(at_npu::lccl::LcclDataType type) +{ + const auto& iter = kLcclDataTypeToStringMap.find(type); + if (iter != 
kLcclDataTypeToStringMap.end()) { + return iter->second; + } else { + TORCH_NPU_WARN_ONCE("Cannot serialize undefined LCCL data type."); + return ""; + } +} + +// AllGather & Broadcast support all data type, no need do more check. +void checkSupportedDataType(at_npu::lccl::LcclDataType type, std::string functionName) +{ + static std::set supportedDataTypes = { + HCCL_DATA_TYPE_INT8, + HCCL_DATA_TYPE_INT16, + HCCL_DATA_TYPE_INT32, + HCCL_DATA_TYPE_FP16, + HCCL_DATA_TYPE_FP32, + HCCL_DATA_TYPE_BFP16, + HCCL_DATA_TYPE_INT64}; + TORCH_CHECK(supportedDataTypes.count(type) != 0, "LCCL "+functionName+": Unsupported data type ", + getLcclDataTypeSerialString(type), DIST_ERROR(ErrCode::NOT_SUPPORT)); +} + +at_npu::lccl::LcclReduceOp getLcclReduceOp(const c10d::ReduceOp reduceOp, at::Tensor& input) +{ + if (reduceOp == c10d::ReduceOp::SUM && input.scalar_type() == at::kBool) { + // For bool tensors, map sum to max, which both represent a bitwise or. + // This is to prevent overflow issues with sum, since we use uint8 to + // represent a bool (see lcclDataType mapping). + return HCCL_REDUCE_MAX; + } + + if (unsupportedOp.find(reduceOp) != unsupportedOp.end()) { + TORCH_CHECK(false, "Cannot use ReduceOp." + unsupportedOp[reduceOp] + " with LCCL", + DIST_ERROR(ErrCode::NOT_SUPPORT)); + } else if (lcclOp.find(reduceOp) == lcclOp.end()) { + TORCH_CHECK(false, "Unhandled ReduceOp", DIST_ERROR(ErrCode::NOT_FOUND)); + } + return lcclOp[reduceOp]; +} + +// use tensor numel when the format is ACL_FORMAT_ND or ACL_FORMAT_NCHW +uint64_t getNumelForLCCL(const at::Tensor& self) +{ + aclFormat format = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_.npu_format_; + if (!at_npu::native::FormatHelper::IsBaseFormatType(format)) { + if (self.storage().data_ptr().get() != self.data_ptr()) { + TORCH_NPU_WARN_ONCE( + "The storage data_ptr is different from tensor data_ptr." + "Maybe this tensor is not suitable for LCCL."); + } + auto sizes = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_.storage_sizes_; + int64_t n = 1; + for (auto s : sizes) { + n *= s; + } + return n; + } else { + return self.numel(); + } +} + +// Get the list of devices from list of tensors +std::vector getDeviceList(const std::vector& tensors) +{ + std::vector res; + res.reserve(tensors.size()); + for (auto& tensor : tensors) { + res.push_back(tensor.device()); + } + return res; +} + +// Get the deviceList String from the list of devices +std::string getKeyFromDevices(const std::vector& devices) +{ + std::string deviceList; + for (auto& device : devices) { + if (deviceList.empty()) { + deviceList = std::to_string(device.index()); + } else { + deviceList += "," + std::to_string(device.index()); + } + } + return deviceList; +} + +// Check that all `tensors' have the same type and shape and are distributed across distinct NPUs. 
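+// In this backend each process drives a single NPU, so the list must hold exactly
+// one dense, contiguous NPU tensor; anything else trips the DIST_ERROR checks below.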
+void checkTensors(const std::vector& tensors) +{ + if (tensors.size() == 0) { + TORCH_CHECK(false, "Tensor list must be nonempty", DIST_ERROR(ErrCode::PARAM)); + } + // HCCL support one NPU per process only + if (tensors.size() != 1) { + TORCH_CHECK(false, "Tensor list mustn't be larger than the number of available NPUs", DIST_ERROR(ErrCode::VALUE)); + } + + const auto& first = tensors.front(); + + if (!torch_npu::utils::is_npu(first) || first.is_sparse()) { + TORCH_CHECK(false, "Tensors must be NPU and dense", DIST_ERROR(ErrCode::TYPE)); + } + if (!first.is_contiguous(first.suggest_memory_format())) { + TORCH_CHECK(false, "Tensors must be contiguous", DIST_ERROR(ErrCode::TYPE)); + } +} + +bool CheckTensorsSameSize(const std::vector& input_tensors) +{ + for (const auto& input_tensor : input_tensors) { + if (!input_tensors[0].is_same_size(input_tensor)) { + return false; + } + } + return true; +} + +std::vector castOriginFormat(const std::vector& inputTensors) +{ + std::vector inputTensors_; + inputTensors_.resize(inputTensors.size()); + size_t index = 0; + for (auto& tensor : inputTensors) { + if (at_npu::native::FormatHelper::IsBaseFormatType(tensor)) { + inputTensors_[index] = tensor; + } else { + auto origin_format = torch_npu::NPUBridge::GetNpuStorageImpl(tensor)->npu_desc_.origin_format_; + inputTensors_[index] = at_npu::native::npu_format_cast(tensor, origin_format); + } + index++; + } + return inputTensors_; +} + +// Flatten each list in `tensor_lists' for a gather or scatter operation, and +// ensure compatibility with the corresponding tensor in `other'. +std::vector FlattenForScatterGather(std::vector>& tensor_lists, + std::vector& other, size_t world_size) +{ + if (tensor_lists.size() != other.size()) { + TORCH_CHECK(false, "Tensor list operands to scatter/gather must have the same length", DIST_ERROR(ErrCode::VALUE)); + } + const auto num_devices = tensor_lists.size(); + + std::vector flattened; + flattened.resize(num_devices); + + for (auto i = size_t{}; i < num_devices; ++i) { + if (tensor_lists[i].size() != world_size * num_devices) { + TORCH_CHECK(false, "Tensor list input to scatter/gather must match number of collective participants", + DIST_ERROR(ErrCode::PARAM)); + } + + // Only check device match for the first tensor in the list; the call to newLikeFlat() below will check the rest. + if (tensor_lists[i].front().get_device() != other[i].get_device()) { + TORCH_CHECK(false, "Corresponding input/output tensors to scatter/gather must all on the same device", + DIST_ERROR(ErrCode::PARAM)); + } + + for (const auto& t : tensor_lists[i]) { + if (t.numel() != other[i].numel()) { + TORCH_CHECK(false, "All tensor operands to scatter/gather must have the same size", + DIST_ERROR(ErrCode::PARAM)); + } + } + // Flatten the tensors (from all ranks) into a single big tensor. 
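+ // c10d::newLikeFlat allocates a tensor with a leading dimension of
+ // tensor_lists[i].size(), so all chunks for this device end up in one
+ // contiguous buffer that a single collective call can consume.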
+ flattened[i] = c10d::newLikeFlat(tensor_lists, i); + } + return flattened; +} + +} diff --git a/torch_npu/csrc/distributed/LCCLUtils.hpp b/torch_npu/csrc/distributed/LCCLUtils.hpp new file mode 100644 index 0000000000..a282a5e8c0 --- /dev/null +++ b/torch_npu/csrc/distributed/LCCLUtils.hpp @@ -0,0 +1,34 @@ +#pragma once + +#include +#include +#include + +#include "torch_npu/csrc/core/npu/interface/LcclInterface.h" + +namespace c10d_npu { + +at_npu::lccl::LcclDataType getLcclDataType(at::ScalarType type); + +std::string getLcclDataTypeSerialString(at_npu::lccl::LcclDataType type); + +void checkSupportedDataType(at_npu::lccl::LcclDataType type, std::string functionName); + +at_npu::lccl::LcclReduceOp getLcclReduceOp(const c10d::ReduceOp reduceOp, at::Tensor& input); + +uint64_t getNumelForLCCL(const at::Tensor& self); + +std::string getKeyFromDevices(const std::vector& devices); + +std::vector getDeviceList(const std::vector& tensors); + +void checkTensors(const std::vector& tensors); + +bool CheckTensorsSameSize(const std::vector& input_tensors); + +std::vector castOriginFormat(const std::vector& inputTensors); + +std::vector FlattenForScatterGather(std::vector>& tensor_lists, + std::vector& other, size_t world_size); + +} diff --git a/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp new file mode 100644 index 0000000000..197c2c4874 --- /dev/null +++ b/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp @@ -0,0 +1,174 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "torch_npu/csrc/distributed/LCCLUtils.hpp" +#include "torch_npu/csrc/npu/Event.h" + + +namespace c10d_npu { + +const std::string LCCL_BACKEND_NAME = "lccl"; + +class ProcessGroupLCCL : public c10d::Backend { +public: + class WorkLCCL : public c10d::Work, public std::enable_shared_from_this { + public: + // Constructor takes a list of NPU devices to adapt framework, But LCCL support one device only!!! + explicit WorkLCCL(const std::vector& devices); + + ~WorkLCCL() override; + // Checks if request has completed. In this specific case of LCCL, it checks + // if the LCCL operation has completed on the NPU in its own LCCL stream. + // Non-blocking operation. + bool isCompleted() override; + + bool isSuccess() const override; + + bool wait(std::chrono::milliseconds timeout) override; + + // Let current stream wait on the completing of the LCCL work + // Throws on exceptions. Blocking operation, which will wait for work completion. + void synchronize() override; + + // Helper function that checks if the LCCL have finished execution on the NPUs + bool finishedNPUExecution(); + std::vector result() override; + + protected: + // The cached list of NPU devices to operate on. LCCL support one device per rank only + std::vector devices_; + + // The LCCL communicators used for this work item. + std::vector lcclComms_; + + // multiple runtime devices. These start npu events are needed by desync debugging if enabled. + std::shared_ptr> lcclStartEvents_; + + // The end npu events of LCCL operator tracking this work item on multiple npu devices. + std::shared_ptr> lcclEndEvents_; + + // Clone of blockingWait_ from ProcessGroupLCCL. + bool blockingWait_ = false; + + // Clone of opTimeout_ from ProcessGroupLCCL. + std::chrono::milliseconds opTimeout_; + + // Time point representing when the work started. 
+ std::chrono::time_point workStartTime_; + + private: + // Helper function for synchronize + void synchronizeInternal(std::chrono::milliseconds timeout); + + // Checks for LCCL errors and sets an appropriate exception_ptr. + void checkAndSetException(); + + // Checks for LCCL errors and throws an appropriate exception. + void checkAndThrowException(); + + // Just checks whether NPU execution has completed, without modifying + // exception_ptr. + bool finishedNPUExecutionInternal() const; + + // Get a Future object that will be marked as completed internally. + c10::intrusive_ptr getFuture() override; + + // Store a reference to LCCL collective's outputs, used by result and to + // give a more descriptive message when representing the Work as a string. + std::shared_ptr> outputs_; + + // Reference to the store so that we can write aborted communicators to the store. + c10::intrusive_ptr store_; + + // The future returned by getFuture. + c10::intrusive_ptr future_; + + std::vector lazy_destroy_tensors_; + friend class ProcessGroupLCCL; + }; + + ProcessGroupLCCL( + const c10::intrusive_ptr& store, + int rank, + int size); + + ~ProcessGroupLCCL() override; + + const std::string getBackendName() const + { + return LCCL_BACKEND_NAME; + } + + c10::intrusive_ptr allreduce( + std::vector& tensors, + const c10d::AllreduceOptions& opts = c10d::AllreduceOptions()) override; + + c10::intrusive_ptr allgather( + std::vector>& outputTensors, + std::vector& inputTensors, + const c10d::AllgatherOptions& opts = c10d::AllgatherOptions()) override; + + c10::intrusive_ptr broadcast( + std::vector& tensors, + const c10d::BroadcastOptions& opts = c10d::BroadcastOptions()) override; + + c10::intrusive_ptr reduce_scatter( + std::vector& outputTensors, + std::vector>& inputTensors, + const c10d::ReduceScatterOptions& opts = c10d::ReduceScatterOptions()) override; + + static const int64_t kProcessGroupLCCLOpTimeoutMillis; + +protected: + // Helper that either looks up the cached LCCL communicators or creates + // a new set of LCCL communicators as a cache entry + std::vector& getLCCLComm( + const std::string& devicesKey, + const std::vector& devices); + + c10::intrusive_ptr store_; + + // Whether or not wait() and synchronize() are blocking operations that wait + // for the operation to complete. + bool blockingWait_ = false; + + // Timeout for operations. This is only used when blockingWait_ is enabled. + std::chrono::milliseconds opTimeout_; + + // The NPU streams used by LCCL kernels + std::unordered_map> lcclStreams_; + std::unordered_map> devLCCLCommMap_; + // The NPU events used to sync LCCL streams + std::unordered_map> lcclEvents_; + // Mutex to guard maps like devLCCLCommMap_. 
+ std::mutex mutex_; +private: + template + c10::intrusive_ptr collective( + std::vector& input, + std::vector& output, + Fn fn, + c10d::OpType opType); + + template + c10::intrusive_ptr collective( + std::vector& input, + std::vector& output, + Fn fn, + PreProcess pre, + PostProcess post, + c10d::OpType opType); +}; + +} // namespace c10d_npu -- Gitee From 66e0181b99b0906f5ce78e2f2bbb8f8e560990fa Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 26 Feb 2025 02:45:14 +0000 Subject: [PATCH 057/358] !18321 Update op_plugin commit id Merge pull request !18321 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 8209d3e480..b0b44b9206 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 8209d3e48059d57a9aa6ead6128d6166f187776f +Subproject commit b0b44b92060a5231463db27b0d4a26aefd6d9aea -- Gitee From c3a20ecf2fa46495dcf973401902acb84e9fef5c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 26 Feb 2025 07:45:20 +0000 Subject: [PATCH 058/358] !18336 Update op_plugin commit id Merge pull request !18336 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index b0b44b9206..90fb74479f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit b0b44b92060a5231463db27b0d4a26aefd6d9aea +Subproject commit 90fb74479f8a65f4bb648bf82ef663fee4b7a3d8 -- Gitee From 6c6716ec683591384741099444adea7ac03c0ec0 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 26 Feb 2025 07:45:20 +0000 Subject: [PATCH 059/358] !18336 Update op_plugin commit id Merge pull request !18336 from pta-robot/v2.6.0 -- Gitee From 9cf9857ad22a9eac10d673154dee6f52d41efc2e Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Wed, 26 Feb 2025 09:38:26 +0000 Subject: [PATCH 060/358] !18344 Fixed the failed tests. 
Merge pull request !18344 from yuhaiyan/v2.6.0-dev1 --- test/contrib/test_ensemble_dropout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/contrib/test_ensemble_dropout.py b/test/contrib/test_ensemble_dropout.py index 2ba7fe93b9..dabd322d3e 100644 --- a/test/contrib/test_ensemble_dropout.py +++ b/test/contrib/test_ensemble_dropout.py @@ -32,11 +32,11 @@ class TestEnsembleDropout(unittest.TestCase): model = NpuMNIST().to("npu") x = torch.randn(2, 10, 16, 16).to("npu") NpuFairseqDropout.enable_dropout_ensemble(model) - dropout = NpuFairseqDropout(p=1) + dropout = NpuFairseqDropout(p=0.5) output = model(x, dropout) NpuCachedDropout.enable_dropout_ensemble(model) - dropout = NpuCachedDropout(p=1) + dropout = NpuCachedDropout(p=0.5) output = model(x, dropout) -- Gitee From 73e6ddfb6830367c0d941f07ea6642e56a72c5ae Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 26 Feb 2025 10:45:20 +0000 Subject: [PATCH 061/358] !18357 Update op_plugin commit id Merge pull request !18357 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 90fb74479f..331c28cfc9 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 90fb74479f8a65f4bb648bf82ef663fee4b7a3d8 +Subproject commit 331c28cfc9c2be0db6558960cc45241936d1f90b -- Gitee From 56894986532e5f21717c3106b71663a2178ef925 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 27 Feb 2025 09:31:00 +0000 Subject: [PATCH 062/358] =?UTF-8?q?!18368=20Simplified=20distributed=20use?= =?UTF-8?q?=20case=20with=20dist.ReduceOp.AVG=20Merge=20pull=20request=20!?= =?UTF-8?q?18368=20from=20=E7=8E=8B=E8=B6=85/v2.6.0=5Freducetest?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/distributed/test_allreduce.py | 28 ++-------------- test/distributed/test_reduce.py | 33 ++----------------- test/distributed/test_reduce_scatter.py | 8 ++--- test/distributed/test_reduce_scatter_base.py | 4 +-- .../distributed/test_reduce_scatter_tensor.py | 8 ++--- 5 files changed, 14 insertions(+), 67 deletions(-) diff --git a/test/distributed/test_allreduce.py b/test/distributed/test_allreduce.py index 533b490e22..5e9a26cbc8 100644 --- a/test/distributed/test_allreduce.py +++ b/test/distributed/test_allreduce.py @@ -71,7 +71,7 @@ class HcomAllReduceTest(TestCase): def test_dist_all_reduce(self): # CI currently supports only 2 devices ranks = [2] - dtype_list = [np.float32, np.int32] + dtype_list = [np.float32] format_list = [0, 2, 3] shape_format = [ [i, j, [2, 3, 16]] for i in dtype_list for j in format_list @@ -94,24 +94,11 @@ class HcomAllReduceTest(TestCase): self._test_multiprocess(HcomAllReduceTest._test_all_reduce, HcomAllReduceTest._init_dist_hccl, expected, input1, world_size) - @skipIfUnsupportMultiNPU(2) - def test_dist_all_reduce_uint8(self): - ranks = [2] - shape_format = [[np.uint8, 2, [2, 3, 16]], [np.uint8, 2, [1]]] - for world_size in ranks: - for shape in shape_format: - if len(shape[2]) == 1: - continue - exp_input, input1 = create_common_tensor(shape, 0, 10) - expected = self._construct_excepted_result(exp_input, world_size) - self._test_multiprocess(HcomAllReduceTest._test_all_reduce, - HcomAllReduceTest._init_dist_hccl, expected, input1, world_size) - @skipIfUnsupportMultiNPU(2) def test_dist_all_reduce_avg(self): # CI currently supports only 2 devices ranks = [2] - dtype_list = [np.float32, np.int32] + dtype_list = [np.int32] 
shape_format = [ [i, 2, [3, 16]] for i in dtype_list ] @@ -123,17 +110,6 @@ class HcomAllReduceTest(TestCase): HcomAllReduceTest._init_dist_hccl, expected, input1, world_size, dist.ReduceOp.AVG) - @skipIfUnsupportMultiNPU(2) - def test_dist_all_reduce_int64_avg(self): - # CI currently supports only 2 devices - ranks = [2] - shape_format = [np.int64, 2, [1]] - for world_size in ranks: - exp_input, input1 = create_common_tensor(shape_format, -10, 10) - expected = self._construct_excepted_result(exp_input, world_size, np.int64, dist.ReduceOp.AVG) - self._test_multiprocess(HcomAllReduceTest._test_all_reduce, - HcomAllReduceTest._init_dist_hccl, expected, input1, world_size, dist.ReduceOp.AVG) - @skipIfUnsupportMultiNPU(2) def test_dist_all_reduce_uint8_avg(self): ranks = [2] diff --git a/test/distributed/test_reduce.py b/test/distributed/test_reduce.py index e6c29b99d6..eda3ace6f4 100644 --- a/test/distributed/test_reduce.py +++ b/test/distributed/test_reduce.py @@ -76,7 +76,7 @@ class HcclReduceTest(TestCase): @skipIfUnsupportMultiNPU(2) def test_reduce_dist(self): ranks = [2, 4, 8] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.float32, np.float16] format_list = [0, 2, 3, 29] shape_format = [ [i, j, [12, 56, 256]] for i in dtype_list for j in format_list @@ -92,23 +92,6 @@ class HcclReduceTest(TestCase): self._test_multiprocess(HcclReduceTest._test_reduce, HcclReduceTest._init_dist_hccl, expected, input1, world_size) - @skipIfUnsupportMultiNPU(2) - def test_reduce_int64_dist(self): - ranks = [2] - dtype_list = [np.int64] - format_list = [0, 2] - shape_format = [ - [i, j, [12, 56, 256]] for i in dtype_list for j in format_list - ] - for world_size in ranks: - if torch.npu.device_count() < world_size: - continue - for shape in shape_format: - exp_input, input1 = create_common_tensor(shape, -10, 10) - expected = self._construct_excepted_result(exp_input, world_size) - self._test_multiprocess(HcclReduceTest._test_reduce, - HcclReduceTest._init_dist_hccl, expected, input1, world_size) - @skipIfUnsupportMultiNPU(2) def test_reduce_uint8_dist(self): ranks = [2] @@ -129,7 +112,7 @@ class HcclReduceTest(TestCase): @skipIfUnsupportMultiNPU(2) def test_reduce_dist_avg(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8, np.int64] + dtype_list = [np.int32, np.int8, np.int64] shape_format = [[i, 2, [3, 16]] for i in dtype_list] for world_size in ranks: if torch.npu.device_count() < world_size: @@ -142,18 +125,6 @@ class HcclReduceTest(TestCase): self._test_multiprocess(HcclReduceTest._test_reduce, HcclReduceTest._init_dist_hccl, expected, input1, world_size, dist.ReduceOp.AVG) - @skipIfUnsupportMultiNPU(2) - def test_reduce_uint8_dist_avg(self): - ranks = [2] - dtype_list = [np.uint8] - shape_format = [[i, 2, [3, 16]] for i in dtype_list] - for world_size in ranks: - for shape in shape_format: - exp_input, input1 = create_common_tensor(shape, 0, 10) - expected = self._construct_excepted_result(exp_input, world_size, shape[0], dist.ReduceOp.AVG) - self._test_multiprocess(HcclReduceTest._test_reduce, - HcclReduceTest._init_dist_hccl, expected, input1, world_size, dist.ReduceOp.AVG) - if __name__ == '__main__': run_tests() diff --git a/test/distributed/test_reduce_scatter.py b/test/distributed/test_reduce_scatter.py index 40172ef04d..40a28c0533 100644 --- a/test/distributed/test_reduce_scatter.py +++ b/test/distributed/test_reduce_scatter.py @@ -71,7 +71,7 @@ class HcclReduceScatterTest(HcclReduceScatterTestBase): @skipIfUnsupportMultiNPU(2) def 
test_reduce_scatter(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.float32, np.float16] format_list = [0, 2, 3, 29] shape_format = [ [i, j, [4, 9]] for i in dtype_list for j in format_list] + \ @@ -93,7 +93,7 @@ class HcclReduceScatterTest(HcclReduceScatterTestBase): def test_reduce_scatter_with_different_shape(self): ranks = [2] format_list = [0, 2, 3, 29] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.int32, np.int8] def get_random_input(dim=1, max_value=10, dtype=np.float32): shape_list = list() @@ -117,7 +117,7 @@ class HcclReduceScatterTest(HcclReduceScatterTestBase): @skipIfUnsupportMultiNPU(2) def test_reduce_scatter_avg(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.int32, np.int8] shape_format = [[i, 2, [4, 9]] for i in dtype_list] for world_size in ranks: @@ -135,7 +135,7 @@ class HcclReduceScatterTest(HcclReduceScatterTestBase): @skipIfUnsupportMultiNPU(2) def test_reduce_scatter_with_different_shape_avg(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.float32, np.float16] def get_random_input(dim=1, max_value=10, dtype=np.float32): shape_list = list() diff --git a/test/distributed/test_reduce_scatter_base.py b/test/distributed/test_reduce_scatter_base.py index 36d3e0880c..4b45a89a47 100644 --- a/test/distributed/test_reduce_scatter_base.py +++ b/test/distributed/test_reduce_scatter_base.py @@ -31,7 +31,7 @@ class HcclReduceScatterBaseTest(HcclReduceScatterTestBase): @skipIfUnsupportMultiNPU(2) def test_reduce_scatter_base(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.float32, np.float16] format_list = [0, 2, 3, 29] shape_format = [ [i, j, [4, 9]] for i in dtype_list for j in format_list] + \ @@ -69,7 +69,7 @@ class HcclReduceScatterBaseTest(HcclReduceScatterTestBase): @skipIfUnsupportMultiNPU(2) def test_reduce_scatter_base_avg(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8, np.int64] + dtype_list = [np.int32, np.int8] shape_format = [[i, 2, [4, 9]] for i in dtype_list] for world_size in ranks: for shape in shape_format: diff --git a/test/distributed/test_reduce_scatter_tensor.py b/test/distributed/test_reduce_scatter_tensor.py index adfafe786a..c58236ba1a 100644 --- a/test/distributed/test_reduce_scatter_tensor.py +++ b/test/distributed/test_reduce_scatter_tensor.py @@ -31,7 +31,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): @skipIfUnsupportMultiNPU(2) def test_reduce_scatter_tensor(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.float32, np.float16] format_list = [0, 2, 3, 29] shape_format = [ [i, j, [4, 9]] for i in dtype_list for j in format_list] + \ @@ -63,7 +63,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): @skipIfUnsupportMultiNPU(2) def test_reduce_scatter_tensor_uneven(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.int32, np.int8] format_list = [0, 2, 3, 29] shape_format = [ [i, j, [4, 9]] for i in dtype_list for j in format_list] + \ @@ -83,7 +83,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): @skipIfUnsupportMultiNPU(2) def test_reduce_scatter_tensor_avg(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.int32, np.int8] shape_format = [[i, 2, [4, 9]] for i in dtype_list] for world_size in ranks: for shape in 
shape_format: @@ -100,7 +100,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): @skipIfUnsupportMultiNPU(2) def test_reduce_scatter_tensor_uneven_avg(self): ranks = [2] - dtype_list = [np.float32, np.float16, np.int32, np.int8] + dtype_list = [np.float32, np.float16] shape_format = [[i, 2, [4, 9]] for i in dtype_list] for world_size in ranks: for shape in shape_format: -- Gitee From a8eca8ffd91e318379358a428d8856eedeceb9ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 27 Feb 2025 09:32:12 +0000 Subject: [PATCH 063/358] =?UTF-8?q?!18298=20StressDetect=20support=20AmlAi?= =?UTF-8?q?coreDetectOnline.=20Merge=20pull=20request=20!18298=20from=20?= =?UTF-8?q?=E7=8E=8B=E8=B6=85/v2.6.0=5Fstressmove1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 3 +- third_party/acl/inc/aml/aml_fwk_detect.h | 41 +++++++++++++++++++ third_party/acl/libs/aml_fwk_detect.cpp | 3 ++ third_party/acl/libs/build_stub.sh | 2 + .../csrc/core/npu/interface/MlInterface.cpp | 39 ++++++++++++++++++ .../csrc/core/npu/interface/MlInterface.h | 17 ++++++++ torch_npu/csrc/npu/Stress_detect.cpp | 14 ++++++- 7 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 third_party/acl/inc/aml/aml_fwk_detect.h create mode 100644 third_party/acl/libs/aml_fwk_detect.cpp create mode 100644 torch_npu/csrc/core/npu/interface/MlInterface.cpp create mode 100644 torch_npu/csrc/core/npu/interface/MlInterface.h diff --git a/setup.py b/setup.py index 3c9afe54c6..c1ece00691 100644 --- a/setup.py +++ b/setup.py @@ -519,7 +519,8 @@ class BdistWheelBuild(bdist_wheel): torch_dependencies = ["libc10.so", "libtorch.so", "libtorch_cpu.so", "libtorch_python.so"] cann_dependencies = ["libhccl.so", "libascendcl.so", "libacl_op_compiler.so", "libge_runner.so", - "libgraph.so", "libacl_tdt_channel.so", "libfmk_parser.so", "libascend_protobuf.so"] + "libgraph.so", "libacl_tdt_channel.so", "libfmk_parser.so", "libascend_protobuf.so", + "libascend_ml.so"] other_dependencies = ["libtorch_npu.so", "libnpu_profiler.so", "libgomp.so.1", "libatb.so"] dependencies = torch_dependencies + cann_dependencies + other_dependencies diff --git a/third_party/acl/inc/aml/aml_fwk_detect.h b/third_party/acl/inc/aml/aml_fwk_detect.h new file mode 100644 index 0000000000..2eb355b338 --- /dev/null +++ b/third_party/acl/inc/aml/aml_fwk_detect.h @@ -0,0 +1,41 @@ +/** +* @file aml_fwk_detect.h +* +* Copyright (C) Huawei Technologies Co., Ltd. 2023-2024. All Rights Reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/ + +#ifndef INC_FWK_AML_FWK_DETECT_H_ +#define INC_FWK_AML_FWK_DETECT_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef int32_t AmlStatus; + +typedef enum AmlDetectRunMode { + AML_DETECT_RUN_MODE_ONLINE = 0, + AML_DETECT_RUN_MODE_OFFLINE = 1, + AML_DETECT_RUN_MODE_MAX, +} AmlDetectRunMode; + +typedef struct AmlAicoreDetectAttr { + AmlDetectRunMode mode; + void *workspace; + uint64_t workspaceSize; + uint8_t reserve[64]; +} AmlAicoreDetectAttr; + +AmlStatus AmlAicoreDetectOnline(int32_t deviceId, const AmlAicoreDetectAttr *attr); + +#ifdef __cplusplus +} +#endif + +#endif // INC_FWK_AML_FWK_DETECT_H_ diff --git a/third_party/acl/libs/aml_fwk_detect.cpp b/third_party/acl/libs/aml_fwk_detect.cpp new file mode 100644 index 0000000000..80edf2835c --- /dev/null +++ b/third_party/acl/libs/aml_fwk_detect.cpp @@ -0,0 +1,3 @@ +#include "aml/aml_fwk_detect.h" + +AmlStatus AmlAicoreDetectOnline(int32_t deviceId, const AmlAicoreDetectAttr *attr) {return 0;} diff --git a/third_party/acl/libs/build_stub.sh b/third_party/acl/libs/build_stub.sh index 34a08d33c1..8ce79b6e2f 100644 --- a/third_party/acl/libs/build_stub.sh +++ b/third_party/acl/libs/build_stub.sh @@ -16,3 +16,5 @@ gcc -fPIC -shared -o libgraph.so -I../inc graph.cpp operator_factory.cpp operato gcc -fPIC -shared -o libacl_tdt_channel.so -I../inc acl_tdt.cpp +gcc -fPIC -shared -o libascend_ml.so -I../inc aml_fwk_detect.cpp + diff --git a/torch_npu/csrc/core/npu/interface/MlInterface.cpp b/torch_npu/csrc/core/npu/interface/MlInterface.cpp new file mode 100644 index 0000000000..b992b4a188 --- /dev/null +++ b/torch_npu/csrc/core/npu/interface/MlInterface.cpp @@ -0,0 +1,39 @@ +#include "MlInterface.h" +#include "torch_npu/csrc/core/npu/register/FunctionLoader.h" +#include "torch_npu/csrc/core/npu/NPUException.h" + +namespace c10_npu { + +namespace amlapi { +#undef LOAD_FUNCTION +#define LOAD_FUNCTION(funcName) \ + REGISTER_FUNCTION(libascend_ml, funcName) +#undef GET_FUNC +#define GET_FUNC(funcName) \ + GET_FUNCTION(libascend_ml, funcName) + +REGISTER_LIBRARY(libascend_ml) +LOAD_FUNCTION(AmlAicoreDetectOnline) + +bool IsExistAmlAicoreDetectOnline() +{ + const static bool isExist = []() -> bool { + static auto func = GET_FUNC(AmlAicoreDetectOnline); + return func != nullptr; + }(); + return isExist; +} + +AmlStatus AmlAicoreDetectOnlineFace(int32_t deviceId, const AmlAicoreDetectAttr *attr) +{ + typedef AmlStatus (*amlAicoreDetectOnline)(int32_t, const AmlAicoreDetectAttr *); + static amlAicoreDetectOnline func = nullptr; + if (func == nullptr) { + func = (amlAicoreDetectOnline) GET_FUNC(AmlAicoreDetectOnline); + } + TORCH_CHECK(func, "Failed to find function ", "AmlAicoreDetectOnline", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(deviceId, attr); +} + +} // namespace amlapi +} // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/interface/MlInterface.h b/torch_npu/csrc/core/npu/interface/MlInterface.h new file mode 100644 index 0000000000..6af498629a --- /dev/null +++ b/torch_npu/csrc/core/npu/interface/MlInterface.h @@ -0,0 +1,17 @@ +#pragma once +#include "third_party/acl/inc/aml/aml_fwk_detect.h" + +namespace c10_npu { +namespace amlapi { +/** + * This API is used to check whether AmlAicoreDetectOnline exist. +*/ +bool IsExistAmlAicoreDetectOnline(); + +/** + * This API is used to call AmlAicoreDetectOnline. 
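+ * A typical call pattern, mirroring the Stress_detect.cpp change in this patch
+ * (illustrative only):
+ *   AmlAicoreDetectAttr attr;
+ *   attr.mode = AML_DETECT_RUN_MODE_ONLINE;
+ *   attr.workspace = workspaceAddr;
+ *   attr.workspaceSize = workspaceSize;
+ *   AmlStatus ret = AmlAicoreDetectOnlineFace(deviceId, &attr);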
+*/ +AmlStatus AmlAicoreDetectOnlineFace(int32_t deviceId, const AmlAicoreDetectAttr *attr); + +} // namespace amlapi +} // namespace c10_npu diff --git a/torch_npu/csrc/npu/Stress_detect.cpp b/torch_npu/csrc/npu/Stress_detect.cpp index e778324b0e..3fcade819b 100644 --- a/torch_npu/csrc/npu/Stress_detect.cpp +++ b/torch_npu/csrc/npu/Stress_detect.cpp @@ -1,6 +1,7 @@ #include "Stress_detect.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/core/npu/interface/MlInterface.h" std::atomic StressDetector::task_in_progress(false); std::atomic StressDetector::stop_thread(false); @@ -38,7 +39,18 @@ void StressDetector::worker_thread() } // Execute the task - int ret = c10_npu::acl::AclStressDetect(device_id, workspaceAddr, workspaceSize); + int ret = -1; + if (c10_npu::amlapi::IsExistAmlAicoreDetectOnline()) { + AmlAicoreDetectAttr attr; + attr.mode = AML_DETECT_RUN_MODE_ONLINE; + attr.workspace = workspaceAddr; + attr.workspaceSize = workspaceSize; + ret = c10_npu::amlapi::AmlAicoreDetectOnlineFace(device_id, &attr); + ASCEND_LOGI("Stress detect with AmlAicoreDetectOnline, result is %d.", ret); + } else { + ret = c10_npu::acl::AclStressDetect(device_id, workspaceAddr, workspaceSize); + ASCEND_LOGI("Stress detect with StressDetect, result is %d.", ret); + } // Task complete, free memory aclrtFree(workspaceAddr); -- Gitee From 81a999632a1fb64f4eb3c87b639cf318e56c95cb Mon Sep 17 00:00:00 2001 From: aiyang2 Date: Thu, 27 Feb 2025 10:32:01 +0000 Subject: [PATCH 064/358] !18376 add mla interface Merge pull request !18376 from aiyang2/v2.6.0 --- test/allowlist_for_publicAPI.json | 1 + 1 file changed, 1 insertion(+) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 9ed3179d1d..daa7fc494f 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2823,6 +2823,7 @@ "npu_random_choice_with_mask", "npu_rms_norm", "npu_fused_infer_attention_score", + "npu_mla_prolog", "npu_convert_weight_to_int4pack", "npu_ffn", "npu_geglu", -- Gitee From 8ba9045cb5eab3c96782973dd6edcda6a5ae349c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Thu, 27 Feb 2025 11:38:57 +0000 Subject: [PATCH 065/358] =?UTF-8?q?!18350=20[Feature]=20Add=20npu=5Fadvanc?= =?UTF-8?q?e=5Fstep=5Fflashattn.=20Merge=20pull=20request=20!18350=20from?= =?UTF-8?q?=20=E5=88=98=E5=98=89=E5=B7=8D/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/allowlist_for_publicAPI.json | 1 + 1 file changed, 1 insertion(+) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index daa7fc494f..5d0c43090b 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2813,6 +2813,7 @@ "empty_with_format", "npu_alloc_float_status", "npu_apply_adam", + "npu_advance_step_flashattn", "npu_bert_apply_adam", "npu_clear_float_status", "npu_cross_entropy_loss", -- Gitee From fb278d17a9aa142017fe241798ffe28933ef68a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=85=E9=A3=9E=E8=A6=81?= <1332490378@qq.com> Date: Thu, 27 Feb 2025 12:48:05 +0000 Subject: [PATCH 066/358] =?UTF-8?q?!18288=20[PROFILING]update=20mstx=20com?= =?UTF-8?q?m=20data=20format;=20inner=20mstx=20data=20add=20device=20times?= =?UTF-8?q?tamp=20Merge=20pull=20request=20!18288=20from=20=E6=A2=85?= =?UTF-8?q?=E9=A3=9E=E8=A6=81/comm2.6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit
---
 torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
index 769fcb1516..9b24ccca55 100644
--- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
+++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp
@@ -2127,7 +2127,7 @@ std::string mapToJson(const std::unordered_map<std::string, std::string>& map)
         if (!first) {
             ss << ",";
         }
-        ss << pair.first << ": " << pair.second;
+        ss << "\"" << pair.first << "\"" << ": " << "\"" << pair.second << "\"";
         first = false;
     }
     ss << "}";
@@ -2157,25 +2157,23 @@ std::string ProcessGroupHCCL::getMstxHcclMsg(
     }
     std::unordered_map<std::string, std::string> msgDict;
     msgDict["opName"] = opName;
-    std::string hccl_message_str = "comm:" + opName + "-";
     auto nameIter = commNames.find(comm);
     if (nameIter == commNames.end()) {
         char commName[MAX_GROUP_NAME_LEN];
         HCCL_CHECK_ERROR(at_npu::hccl::HcclGetCommNameFace(comm, commName));
         std::string name(commName);
         commNames.insert({comm, name});
-        msgDict["commName"] = name;
+        msgDict["groupName"] = name;
     } else {
-        msgDict["commName"] = nameIter->second;
+        msgDict["groupName"] = nameIter->second;
    }
-    hccl_message_str += "-";
     std::string data_type_str = "na";
     auto iter = dataTypes.find(dataType);
     if (iter != dataTypes.end()) {
         data_type_str = iter->second;
     }
     msgDict["dataType"] = data_type_str;
-    msgDict["dataCnt"] = std::to_string(dataCnt);
+    msgDict["count"] = std::to_string(dataCnt);
     msgDict["streamId"] = std::to_string(streamId);
     return mapToJson(msgDict);
 }
-- 
Gitee

From b790efc765ea8e77c533f5b05a59862bdbdf874a Mon Sep 17 00:00:00 2001
From: pta-robot
Date: Thu, 27 Feb 2025 13:45:15 +0000
Subject: [PATCH 067/358] !18384 Update op_plugin commit id

Merge pull request !18384 from pta-robot/v2.6.0
---
 third_party/op-plugin | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/op-plugin b/third_party/op-plugin
index 331c28cfc9..90d2339fb8 160000
--- a/third_party/op-plugin
+++ b/third_party/op-plugin
@@ -1 +1 @@
-Subproject commit 331c28cfc9c2be0db6558960cc45241936d1f90b
+Subproject commit 90d2339fb8bff4ca5424f221f4d5a09de8243c59
-- 
Gitee

From cc91c66d4c9b8947d5ffbfee67568530f91836d4 Mon Sep 17 00:00:00 2001
From: czy1255959842
Date: Fri, 28 Feb 2025 03:51:00 +0000
Subject: [PATCH 068/358] !18246 Add mstx dotting to obtain PTA memory pool data

Merge pull request !18246 from czy1255959842/v2.6.0
---
 third_party/mstx/ms_tools_ext.h               | 74 +++++++++++--
 .../csrc/core/npu/NPUCachingAllocator.cpp     | 26 ++++-
 .../csrc/core/npu/NPUWorkspaceAllocator.cpp   | 7 ++
 .../framework/interface/MstxInterface.cpp    | 100 ++++++++++++++++--
 .../csrc/framework/interface/MstxInterface.h  | 19 +++-
 torch_npu/csrc/profiler/mstx_mgr.cpp          | 83 ++++++++++++++-
 torch_npu/csrc/profiler/mstx_mgr.h            | 18 ++--
 torch_npu/csrc/profiler/npu_profiler.h        | 2 +-
 8 files changed, 293 insertions(+), 36 deletions(-)

diff --git a/third_party/mstx/ms_tools_ext.h b/third_party/mstx/ms_tools_ext.h
index aac8239127..622896997a 100644
--- a/third_party/mstx/ms_tools_ext.h
+++ b/third_party/mstx/ms_tools_ext.h
@@ -13,7 +13,61 @@ typedef uint64_t mstxRangeId;
 struct mstxDomainRegistration_st;
 typedef struct mstxDomainRegistration_st mstxDomainRegistration_t;
-typedef mstxDomainRegistration_t* mstxDomainhandle_t;
+typedef mstxDomainRegistration_t* mstxDomainHandle_t;
+
+struct mstxMemHeap_st;
+typedef struct mstxMemHeap_st mstxMemHeap_t;
+typedef mstxMemHeap_t* mstxMemHeapHandle_t;
+
+struct mstxMemRegion_st;
+typedef struct mstxMemRegion_st mstxMemRegion_t; +typedef mstxMemRegion_t* mstxMemRegionHandle_t; + +typedef struct mstxMemVirtualRangeDesc_t { + uint32_t deviceId; + const void* ptr; + uint64_t size; +} mstxMemVirtualRangeDesc_t; + +typedef enum mstxMemHeapUsageType { + MSTX_MEM_HEAP_USAGE_TYPE_SUB_ALLOCATOR = 0, +} mstxMemHeapUsageType; + +typedef enum mstxMemType { + MSTX_MEM_TYPE_VIRTUAL_ADDRESS = 0, +} mstxMemType; + +typedef struct mstxMemHeapDesc_t { + mstxMemHeapUsageType usage; + mstxMemType type; + const void* typeSpecificDesc; +} mstxMemHeapDesc_t; + +typedef struct mstxMemRegionsRegisterBatch_t { + mstxMemHeapHandle_t heap; + mstxMemType regionType; + size_t regionCount; + const void* regionDescArray; + mstxMemRegionHandle_t* regionHandleArrayOut; +} mstxMemRegionsRegisterBatch_t; + +typedef enum mstxMemRegionRefType { + MSTX_MEM_REGION_REF_TYPE_POINTER = 0, + MSTX_MEM_REGION_REF_TYPE_HANDLE +} mstxMemRegionRefType; + +typedef struct mstxMemRegionRef_t { + mstxMemRegionRefType refType; + union { + const void* pointer; + mstxMemRegionHandle_t handle; + }; +} mstxMemRegionRef_t; + +typedef struct mstxMemRegionsUnregisterBatch_t { + size_t refCount; + const mstxMemRegionRef_t* refArray; +} mstxMemRegionsUnregisterBatch_t; ACL_FUNC_VISIBILITY void mstxMarkA(const char* message, aclrtStream stream); @@ -21,16 +75,24 @@ ACL_FUNC_VISIBILITY mstxRangeId mstxRangeStartA(const char* message, aclrtStream ACL_FUNC_VISIBILITY void mstxRangeEnd(mstxRangeId id); -ACL_FUNC_VISIBILITY mstxDomainhandle_t mstxDomainCreateA(const char* name); +ACL_FUNC_VISIBILITY mstxDomainHandle_t mstxDomainCreateA(const char* name); -ACL_FUNC_VISIBILITY void mstxDomainDestroy(mstxDomainhandle_t handle); +ACL_FUNC_VISIBILITY void mstxDomainDestroy(mstxDomainHandle_t handle); -ACL_FUNC_VISIBILITY void mstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream); +ACL_FUNC_VISIBILITY void mstxDomainMarkA(mstxDomainHandle_t handle, const char* message, aclrtStream stream); -ACL_FUNC_VISIBILITY mstxRangeId mstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, +ACL_FUNC_VISIBILITY mstxRangeId mstxDomainRangeStartA(mstxDomainHandle_t handle, const char* message, aclrtStream stream); -ACL_FUNC_VISIBILITY void mstxDomainRangeEnd(mstxDomainhandle_t handle, mstxRangeId id); +ACL_FUNC_VISIBILITY void mstxDomainRangeEnd(mstxDomainHandle_t handle, mstxRangeId id); + +ACL_FUNC_VISIBILITY mstxMemHeapHandle_t mstxMemHeapRegister(mstxDomainHandle_t domain, const mstxMemHeapDesc_t* desc); + +ACL_FUNC_VISIBILITY void mstxMemHeapUnregister(mstxDomainHandle_t domain, mstxMemHeapHandle_t heap); + +ACL_FUNC_VISIBILITY void mstxMemRegionsRegister(mstxDomainHandle_t domain, const mstxMemRegionsRegisterBatch_t* desc); + +ACL_FUNC_VISIBILITY void mstxMemRegionsUnregister(mstxDomainHandle_t domain, const mstxMemRegionsUnregisterBatch_t* desc); #ifdef __cplusplus } diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index ecedd1e444..866d2e9310 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1261,6 +1261,9 @@ class DeviceCachingAllocator { stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{block->device, block->ptr, block->size}; + 
torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), block->device, @@ -1320,6 +1323,8 @@ class DeviceCachingAllocator { stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), block->device, @@ -1669,7 +1674,11 @@ class DeviceCachingAllocator { for_each_selected_stat_type(stat_types, [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], mapped_range.size); }); - +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{to_map->device, mapped_range.ptr, mapped_range.size}; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); +#endif record_trace( TraceEntry::SEGMENT_MAP, int64_t(mapped_range.ptr), @@ -2048,6 +2057,11 @@ class DeviceCachingAllocator { // p.block came from new, not cudaMalloc. It should not be nullptr here. TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{p.block->device, p.block->ptr, p.block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); +#endif record_trace( TraceEntry::SEGMENT_ALLOC, int64_t(p.block->ptr), @@ -2165,7 +2179,10 @@ class DeviceCachingAllocator { if (block->size >= CachingAllocatorConfig::max_split_size()) update_stat(stats.oversize_segments, -1); - +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); +#endif ASCEND_LOGD("pta_memory acl_free: free_size = %zu", block->size); pool->blocks.erase(block); @@ -2223,7 +2240,10 @@ class DeviceCachingAllocator { for_each_selected_stat_type(stat_types, [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], -unmapped.size); }); - +#ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); +#endif record_trace( TraceEntry::SEGMENT_UNMAP, int64_t(unmapped.ptr), diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index bda4007059..9ab1e5c032 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -53,6 +53,8 @@ public: NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeDeviceWithTimeout()); NPU_CHECK_ERROR(aclrtFree(block->data_ptr)); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = 
torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block->data_ptr); record_mem_size_decrement(block->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { @@ -89,6 +91,9 @@ public: ASCEND_LOGD("NPUWorkspaceAllocator malloc by AclrtMallocAlign32: size=%zu", block->size); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{device, block->data_ptr, block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); record_mem_size_increment(block->size); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), @@ -131,6 +136,8 @@ public: ASCEND_LOGI("NPUWorkspaceAllocator free by aclrtFree: size=%zu", block_pair.second->size); NPU_CHECK_ERROR(aclrtFree(block_pair.second->data_ptr)); #ifndef BUILD_LIBTORCH + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block_pair.second->data_ptr); record_mem_size_decrement(block_pair.second->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { diff --git a/torch_npu/csrc/framework/interface/MstxInterface.cpp b/torch_npu/csrc/framework/interface/MstxInterface.cpp index 40ef6dcced..4024a63e27 100644 --- a/torch_npu/csrc/framework/interface/MstxInterface.cpp +++ b/torch_npu/csrc/framework/interface/MstxInterface.cpp @@ -24,6 +24,10 @@ LOAD_FUNCTION(mstxDomainDestroy) LOAD_FUNCTION(mstxDomainMarkA) LOAD_FUNCTION(mstxDomainRangeStartA) LOAD_FUNCTION(mstxDomainRangeEnd) +LOAD_FUNCTION(mstxMemHeapRegister) +LOAD_FUNCTION(mstxMemHeapUnregister) +LOAD_FUNCTION(mstxMemRegionsRegister) +LOAD_FUNCTION(mstxMemRegionsUnregister) // save python range id with cann mstx range id. 
// when mstx.range_end(id) is called, we can check if this id is invalid @@ -128,9 +132,9 @@ void MstxRangeEnd(int ptRangeId) g_rangeIdMap.erase(iter); } -mstxDomainhandle_t MstxDomainCreateA(const char* name) +mstxDomainHandle_t MstxDomainCreateA(const char* name) { - using MstxDomainCreateAFunc = mstxDomainhandle_t (*)(const char*); + using MstxDomainCreateAFunc = mstxDomainHandle_t (*)(const char*); static MstxDomainCreateAFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -147,9 +151,9 @@ mstxDomainhandle_t MstxDomainCreateA(const char* name) return func(name); } -void MstxDomainDestroy(mstxDomainhandle_t handle) +void MstxDomainDestroy(mstxDomainHandle_t handle) { - using MstxDomainDestroyFunc = void (*)(mstxDomainhandle_t); + using MstxDomainDestroyFunc = void (*)(mstxDomainHandle_t); static MstxDomainDestroyFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -166,9 +170,9 @@ void MstxDomainDestroy(mstxDomainhandle_t handle) func(handle); } -void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream) +void MstxDomainMarkA(mstxDomainHandle_t handle, const char* message, aclrtStream stream) { - using MstxDomainMarkAFunc = void (*)(mstxDomainhandle_t, const char*, aclrtStream); + using MstxDomainMarkAFunc = void (*)(mstxDomainHandle_t, const char*, aclrtStream); static MstxDomainMarkAFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -185,9 +189,9 @@ void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream func(handle, message, stream); } -int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtStream stream, int ptRangeId) +int MstxDomainRangeStartA(mstxDomainHandle_t handle, const char* message, aclrtStream stream, int ptRangeId) { - using MstxDomainRangeStartAFunc = mstxRangeId (*)(mstxDomainhandle_t, const char*, aclrtStream); + using MstxDomainRangeStartAFunc = mstxRangeId (*)(mstxDomainHandle_t, const char*, aclrtStream); static MstxDomainRangeStartAFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -207,9 +211,9 @@ int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtS return 0; } -void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId) +void MstxDomainRangeEnd(mstxDomainHandle_t handle, int ptRangeId) { - using MstxDomainRangeEndFunc = void (*)(mstxDomainhandle_t, mstxRangeId); + using MstxDomainRangeEndFunc = void (*)(mstxDomainHandle_t, mstxRangeId); static MstxDomainRangeEndFunc func = nullptr; static bool noFuncFlag = false; if (noFuncFlag) { @@ -233,5 +237,81 @@ void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId) g_rangeIdMap.erase(iter); } +mstxMemHeapHandle_t MstxMemHeapRegister(mstxDomainHandle_t domain, mstxMemHeapDesc_t const* desc) +{ + using MstxMemHeapRegisterFunc = mstxMemHeapHandle_t (*)(mstxDomainHandle_t, mstxMemHeapDesc_t const*); + static MstxMemHeapRegisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return nullptr; + } + if (func == nullptr) { + func = (MstxMemHeapRegisterFunc)GET_FUNC(mstxMemHeapRegister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemHeapRegister"); + noFuncFlag = true; + return nullptr; + } + } + return func(domain, desc); +} + +void MstxMemHeapUnregister(mstxDomainHandle_t domain, mstxMemHeapHandle_t heap) +{ + using MstxMemHeapUnregisterFunc = void (*)(mstxDomainHandle_t, mstxMemHeapHandle_t); + static MstxMemHeapUnregisterFunc func = nullptr; + static bool noFuncFlag = 
false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxMemHeapUnregisterFunc)GET_FUNC(mstxMemHeapUnregister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemHeapUnregister"); + noFuncFlag = true; + return; + } + } + func(domain, heap); +} + +void MstxMemRegionsRegister(mstxDomainHandle_t domain, mstxMemRegionsRegisterBatch_t const* desc) +{ + using MstxMemRegionsRegisterFunc = void (*)(mstxDomainHandle_t, mstxMemRegionsRegisterBatch_t const*); + static MstxMemRegionsRegisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxMemRegionsRegisterFunc)GET_FUNC(mstxMemRegionsRegister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemRegionsRegister"); + noFuncFlag = true; + return; + } + } + func(domain, desc); +} + +void MstxMemRegionsUnregister(mstxDomainHandle_t domain, mstxMemRegionsUnregisterBatch_t const* desc) +{ + using MstxMemRegionsUnregisterFunc = void (*)(mstxDomainHandle_t, mstxMemRegionsUnregisterBatch_t const*); + static MstxMemRegionsUnregisterFunc func = nullptr; + static bool noFuncFlag = false; + if (noFuncFlag) { + return; + } + if (func == nullptr) { + func = (MstxMemRegionsUnregisterFunc)GET_FUNC(mstxMemRegionsUnregister); + if (func == nullptr) { + ASCEND_LOGW("Failed to get func mstxMemRegionsUnregister"); + noFuncFlag = true; + return; + } + } + func(domain, desc); +} + } } \ No newline at end of file diff --git a/torch_npu/csrc/framework/interface/MstxInterface.h b/torch_npu/csrc/framework/interface/MstxInterface.h index 806e8e749b..ba0781f587 100644 --- a/torch_npu/csrc/framework/interface/MstxInterface.h +++ b/torch_npu/csrc/framework/interface/MstxInterface.h @@ -16,15 +16,24 @@ int MstxRangeStartA(const char* message, aclrtStream stream, int ptRangeId); void MstxRangeEnd(int ptRangeId); -mstxDomainhandle_t MstxDomainCreateA(const char* name); +mstxDomainHandle_t MstxDomainCreateA(const char* name); -void MstxDomainDestroy(mstxDomainhandle_t handle); +void MstxDomainDestroy(mstxDomainHandle_t handle); -void MstxDomainMarkA(mstxDomainhandle_t handle, const char* message, aclrtStream stream); +void MstxDomainMarkA(mstxDomainHandle_t handle, const char* message, aclrtStream stream); -int MstxDomainRangeStartA(mstxDomainhandle_t handle, const char* message, aclrtStream stream, int ptRangeId); +int MstxDomainRangeStartA(mstxDomainHandle_t handle, const char* message, aclrtStream stream, int ptRangeId); + +void MstxDomainRangeEnd(mstxDomainHandle_t handle, int ptRangeId); + +mstxMemHeapHandle_t MstxMemHeapRegister(mstxDomainHandle_t domain, const mstxMemHeapDesc_t* desc); + +void MstxMemHeapUnregister(mstxDomainHandle_t domain, mstxMemHeapHandle_t heap); + +void MstxMemRegionsRegister(mstxDomainHandle_t domain, const mstxMemRegionsRegisterBatch_t* desc); + +void MstxMemRegionsUnregister(mstxDomainHandle_t domain, const mstxMemRegionsUnregisterBatch_t* desc); -void MstxDomainRangeEnd(mstxDomainhandle_t handle, int ptRangeId); } } diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index 7ee7793e8c..0c7ff91d3e 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -84,17 +84,20 @@ int MstxMgr::getRangeId() return ptRangeId_++; } -mstxDomainhandle_t MstxMgr::createDomain(const char* name) +mstxDomainHandle_t MstxMgr::createDomain(const char* name) { + if (!isMsleaksEnable() && !isMstxEnable()) { + return nullptr; + } return 
at_npu::native::MstxDomainCreateA(name); } -void MstxMgr::destroyDomain(mstxDomainhandle_t domain) +void MstxMgr::destroyDomain(mstxDomainHandle_t domain) { at_npu::native::MstxDomainDestroy(domain); } -void MstxMgr::domainMark(mstxDomainhandle_t domain, const char* message, const aclrtStream stream) +void MstxMgr::domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream) { if (!isMstxEnable()) { return; @@ -111,7 +114,7 @@ void MstxMgr::domainMark(mstxDomainhandle_t domain, const char* message, const a at_npu::native::OpCommand::RunOpApi("mstx_domain_mark_op", mark_call); } -int MstxMgr::domainRangeStart(mstxDomainhandle_t domain, const char* message, const aclrtStream stream) +int MstxMgr::domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream) { if (!isMstxEnable()) { return 0; @@ -133,7 +136,7 @@ int MstxMgr::domainRangeStart(mstxDomainhandle_t domain, const char* message, co return id; } -void MstxMgr::domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId) +void MstxMgr::domainRangeEnd(mstxDomainHandle_t domain, int ptRangeId) { if (!isMstxEnable() || ptRangeId == 0) { return; @@ -158,6 +161,76 @@ void MstxMgr::domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId) at_npu::native::OpCommand::RunOpApi("mstx_domain_range_end_op", range_end_call); } +mstxMemHeapHandle_t MstxMgr::memHeapRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) +{ + if (!isMsleaksEnable() || desc==nullptr) { + return nullptr; + } + mstxMemHeapDesc_t heapDesc; + heapDesc.typeSpecificDesc = reinterpret_cast(desc); + return at_npu::native::MstxMemHeapRegister(domain, &heapDesc); +} + +void MstxMgr::memHeapUnregister(mstxDomainHandle_t domain, void* ptr) +{ + if (!isMsleaksEnable() || ptr == nullptr) { + return; + } + at_npu::native::MstxMemHeapUnregister(domain, reinterpret_cast(ptr)); +} + +void MstxMgr::memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) +{ + if (!isMsleaksEnable() || desc == nullptr) { + return; + } + mstxMemRegionsRegisterBatch_t batch; + batch.regionCount = 1; + batch.regionDescArray = reinterpret_cast(desc); + at_npu::native::MstxMemRegionsRegister(domain, &batch); +} + +void MstxMgr::memRegionsUnregister(mstxDomainHandle_t domain, void* ptr) +{ + if (!isMsleaksEnable() || ptr == nullptr) { + return; + } + mstxMemRegionsUnregisterBatch_t unregisterBatch; + unregisterBatch.refCount = 1; + mstxMemRegionRef_t regionRef[1] = {}; + regionRef[0].refType = MSTX_MEM_REGION_REF_TYPE_POINTER; + regionRef[0].pointer = ptr; + unregisterBatch.refArray = regionRef; + at_npu::native::MstxMemRegionsUnregister(domain, &unregisterBatch); +} + + +bool MstxMgr::isMsleaksEnable() +{ + static bool isEnable = isMsleaksEnableImpl(); + return isEnable; +} + +bool MstxMgr::isMsleaksEnableImpl() +{ + bool ret = false; + const char* envVal = std::getenv("LD_PRELOAD"); + if (envVal == nullptr) { + return ret; + } + static const std::string soName = "libascend_hal_hook.so"; + std::stringstream ss(envVal); + std::string path; + while (std::getline(ss, path, ':')) { + path = torch_npu::toolkit::profiler::Utils::RealPath(path); + if ((path.size() > soName.size()) && (path.substr(path.size() - soName.size()) == soName)) { + ret = true; + break; + } + } + return ret; +} + bool MstxMgr::isProfTxEnable() { return ProfilerMgr::GetInstance()->GetNpuTrace().load() && ProfilerMgr::GetInstance()->GetMsprofTx().load(); diff --git a/torch_npu/csrc/profiler/mstx_mgr.h b/torch_npu/csrc/profiler/mstx_mgr.h index 
c6fc6a5fe1..bea6f59bea 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.h +++ b/torch_npu/csrc/profiler/mstx_mgr.h @@ -12,6 +12,7 @@ namespace torch_npu { namespace profiler { const std::string DOMAIN_COMMUNICATION = "communication"; +const std::string DOMAIN_MSLEAKS = "msleaks"; class MstxMgr : public torch_npu::toolkit::profiler::Singleton { friend class torch_npu::toolkit::profiler::Singleton; @@ -22,11 +23,15 @@ public: bool isMstxEnable(); int getRangeId(); - mstxDomainhandle_t createDomain(const char* name); - void destroyDomain(mstxDomainhandle_t domain); - void domainMark(mstxDomainhandle_t domain, const char* message, const aclrtStream stream); - int domainRangeStart(mstxDomainhandle_t domain, const char* message, const aclrtStream stream); - void domainRangeEnd(mstxDomainhandle_t domain, int ptRangeId); + mstxDomainHandle_t createDomain(const char* name); + void destroyDomain(mstxDomainHandle_t domain); + void domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); + int domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); + void domainRangeEnd(mstxDomainHandle_t domain, int ptRangeId); + mstxMemHeapHandle_t memHeapRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc); + void memHeapUnregister(mstxDomainHandle_t domain, void* ptr); + void memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc); + void memRegionsUnregister(mstxDomainHandle_t domain, void* ptr); private: MstxMgr(); @@ -35,6 +40,8 @@ private: explicit MstxMgr(MstxMgr &&obj) = delete; MstxMgr& operator=(MstxMgr &&obj) = delete; + bool isMsleaksEnable(); + bool isMsleaksEnableImpl(); bool isProfTxEnable(); bool isMsptiTxEnable(); bool isMsptiTxEnableImpl(); @@ -43,6 +50,5 @@ private: std::unordered_set ptRangeIdsWithStream_; std::mutex mtx_; }; - } } // namespace torch_npu \ No newline at end of file diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index b58fa182a7..074a41440e 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -129,7 +129,7 @@ inline bool mstxEnable() struct MstxRange { int rangeId{0}; - mstxDomainhandle_t domainHandle{nullptr}; + mstxDomainHandle_t domainHandle{nullptr}; MstxRange(const std::string &message, aclrtStream stream, const std::string &domainName = "default") { if (!mstxEnable()) { -- Gitee From 46068b23ae4208aba199b713c357958183751a9d Mon Sep 17 00:00:00 2001 From: liushiyu0214 Date: Fri, 28 Feb 2025 07:51:57 +0000 Subject: [PATCH 069/358] =?UTF-8?q?!18398=20=E5=AF=BC=E5=85=A5dequant=5Fbi?= =?UTF-8?q?as=E5=A4=B4=E6=96=87=E4=BB=B6=20Merge=20pull=20request=20!18398?= =?UTF-8?q?=20from=20liushiyu0214/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_public_bindings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index a3802878bd..3c7c93b958 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -546,6 +546,7 @@ class TestPublicBindings(TestCase): "torch_npu.dynamo.torchair.ge_concrete_graph.ge_converter.experimental.hcom_allgather", "torch_npu.dynamo.torchair._ge_concrete_graph.ge_converter.custom.npu_selu_backward", "torch_npu.dynamo.torchair._ge_concrete_graph.ge_ir_by_protoc_3_13_pb2", + "torch_npu.dynamo.torchair._ge_concrete_graph.ge_converter.custom.npu_dequant_bias", "torch_npu.utils.collect_hccl_info", 
"torch_npu.op_plugin.meta._meta_registrations", -- Gitee From f9ba0ab4bdbdc11b9362982a7af7447cd47f5f4a Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 28 Feb 2025 09:36:00 +0000 Subject: [PATCH 070/358] !18361 Update torchair commit id Merge pull request !18361 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 491325ac8f..018466a50b 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 491325ac8fb4535c2d4e5ef5cb4689dd900b05d8 +Subproject commit 018466a50b704ffe0df5a38dd77e2d50a2d27afe -- Gitee From b32e93d81476b786789c2b483cfff40d1bb1d628 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 28 Feb 2025 09:45:17 +0000 Subject: [PATCH 071/358] !18405 Update op_plugin commit id Merge pull request !18405 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 90d2339fb8..0b7b338ae5 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 90d2339fb8bff4ca5424f221f4d5a09de8243c59 +Subproject commit 0b7b338ae57f14f08b25df6acb56a7c84b2d6905 -- Gitee From 26bea296372eaa1d9f1f9588eb7e1cf4301aa487 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 28 Feb 2025 10:15:15 +0000 Subject: [PATCH 072/358] !18413 Update op_plugin commit id Merge pull request !18413 from pta-robot/v2.6.0 -- Gitee From 7950fc1950ce27ad9d95f42512199d7af2cebc65 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 28 Feb 2025 12:15:16 +0000 Subject: [PATCH 073/358] !18417 Update op_plugin commit id Merge pull request !18417 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 0b7b338ae5..49a89f3b13 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 0b7b338ae57f14f08b25df6acb56a7c84b2d6905 +Subproject commit 49a89f3b1374c86f2dc4b0cbb645b1580b19a9d7 -- Gitee From aa53b255dba0d8b2ac74e41147ae29595588099d Mon Sep 17 00:00:00 2001 From: zhangqiongwen Date: Sat, 1 Mar 2025 08:11:11 +0000 Subject: [PATCH 074/358] =?UTF-8?q?!18332=20CP=EF=BC=9Anpu=5Ffusion=5Fatte?= =?UTF-8?q?ntion=20register=20sharding=20strategy=20Merge=20pull=20request?= =?UTF-8?q?=20!18332=20from=20zhangqiongwen/v2.6.0=5Ffa=5Fstrategy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/distributed/test_register_sharding.py | 67 +++++++++ torch_npu/distributed/tensor/__init__.py | 1 + torch_npu/distributed/tensor/_attention.py | 81 +++++++++++ torch_npu/distributed/tensor/_matrix_ops.py | 144 ++++++++++++++++++++ 4 files changed, 293 insertions(+) create mode 100644 torch_npu/distributed/tensor/_attention.py diff --git a/test/distributed/test_register_sharding.py b/test/distributed/test_register_sharding.py index d61b340d86..f5492c0416 100644 --- a/test/distributed/test_register_sharding.py +++ b/test/distributed/test_register_sharding.py @@ -182,6 +182,73 @@ class TestRegisterSharding(DTensorTestBase): self.assertTrue(dist_x.grad.placements[0].is_replicate()) self.assertEqual(dist_x.grad.full_tensor(), x.grad) + @with_comms + def test_npu_fussion_attention_forward(self): + scale = 0.08838 + query = self.trans_BNSD2BSH(torch.randn(1, 32, 128, 128, device=self.device_type, dtype=torch.float32)) + key = 
self.trans_BNSD2BSH(torch.randn(1, 32, 128, 128, device=self.device_type, dtype=torch.float32)) + value = self.trans_BNSD2BSH(torch.randn(1, 32, 128, 128, device=self.device_type, dtype=torch.float32)) + attention_score, softmax_max, softmax_sum, softmax_out, seed, offset, numels = torch_npu.npu_fusion_attention( + query, key, value, head_num=32, input_layout="BSH", scale=scale) + + device_mesh = self.build_device_mesh() + dist_query = distribute_tensor(query, device_mesh, [Replicate()]) + dist_key = distribute_tensor(key, device_mesh, [Replicate()]) + dist_value = distribute_tensor(value, device_mesh, [Replicate()]) + dist_attention_score, dist_softmax_max, dist_softmax_sum, dist_softmax_out, seed, offset, numels = torch_npu.npu_fusion_attention( + dist_query, dist_key, dist_value, head_num=32, input_layout="BSH", scale=scale) + + self.assertEqual(dist_attention_score.full_tensor(), attention_score) + self.assertEqual(dist_softmax_max.full_tensor(), softmax_max) + self.assertEqual(dist_softmax_sum.full_tensor(), softmax_sum) + self.assertEqual(dist_softmax_out.full_tensor(), softmax_out) + + @with_comms + def test_npu_fussion_attention_grad(self): + scale = 0.08838 + query = torch.randn(1, 32, 128, 128, device=self.device_type, dtype=torch.float32) + key = torch.randn(1, 32, 128, 128, device=self.device_type, dtype=torch.float32) + value = torch.randn(1, 32, 128, 128, device=self.device_type, dtype=torch.float32) + dy = torch.randn(1, 32, 128, 128, device=self.device_type, dtype=torch.float32) + + # get attention_in + query = torch.matmul(query, key.transpose(2, 3)).mul(scale) + softmax_res, x_max, x_sum = self.tsoftmax(query.to(torch.float32)) + attention_in = torch.matmul(softmax_res, value) + + query = self.trans_BNSD2BSH(query) + key = self.trans_BNSD2BSH(key) + value = self.trans_BNSD2BSH(value) + dy = self.trans_BNSD2BSH(dy) + + x_max = x_max.expand(1, 32, 128, 8).npu() + x_sum = x_sum.expand(1, 32, 128, 8).npu() + out = self.trans_BNSD2BSH(attention_in) + + dq, dk, dv, dpse = torch_npu.npu_fusion_attention_grad( + query, key, value, dy, head_num=32, input_layout="BSH", + softmax_max=x_max, softmax_sum=x_sum, attention_in=attention_in, scale_value=scale) + + device_mesh = self.build_device_mesh() + dist_query = distribute_tensor(query, device_mesh, [Replicate()]) + dist_key = distribute_tensor(key, device_mesh, [Replicate()]) + dist_value = distribute_tensor(value, device_mesh, [Replicate()]) + dist_dy = distribute_tensor(dy, device_mesh, [Replicate()]) + dist_xmax = distribute_tensor(x_max, device_mesh, [Replicate()]) + dist_xsum = distribute_tensor(x_sum, device_mesh, [Replicate()]) + dist_attention_in = distribute_tensor(out, device_mesh, [Replicate()]) + dist_dq, dist_dk, dist_dv, dist_dpse = torch_npu.npu_fusion_attention_grad( + dist_query, dist_key, dist_value, dist_dy, head_num=32, input_layout="BSH", + softmax_max=dist_xmax, softmax_sum=dist_xsum, attention_in=dist_attention_in, scale_value=scale) + + self.assertEqual(dist_dq.full_tensor(), dq) + self.assertEqual(dist_dk.full_tensor(), dk) + self.assertEqual(dist_dv.full_tensor(), dv) + if dist_dpse is not None: + self.assertEqual(dist_dpse.full_tensor(), dpse) + else: + self.assertEqual(dist_dpse, dpse) + if __name__ == "__main__": run_tests() diff --git a/torch_npu/distributed/tensor/__init__.py b/torch_npu/distributed/tensor/__init__.py index 903a9a1ea6..3b1aecbd8e 100644 --- a/torch_npu/distributed/tensor/__init__.py +++ b/torch_npu/distributed/tensor/__init__.py @@ -1 +1,2 @@ import 
torch_npu.distributed.tensor._matrix_ops +import torch_npu.distributed.tensor._attention diff --git a/torch_npu/distributed/tensor/_attention.py b/torch_npu/distributed/tensor/_attention.py new file mode 100644 index 0000000000..dcf7c6b064 --- /dev/null +++ b/torch_npu/distributed/tensor/_attention.py @@ -0,0 +1,81 @@ +import itertools +from typing import Any, Dict, Tuple + +import torch +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.tensor import distribute_module, DTensor, Replicate + +import torch_npu + +npu = torch.ops.npu + + +def _npu_fusion_attention_handler( + op_call: torch._ops.OpOverload, + args: Tuple[object, ...], + kwargs: Dict[str, object], +) -> object: + def npu_attention_input_fn( + mesh: DeviceMesh, *args: Tuple[Any, ...], **kwargs: Dict[str, Any] + ) -> Tuple[Tuple[Any, ...], Dict[str, Any]]: + all_args = [] + + for arg in itertools.chain(args, kwargs.values()): + if isinstance(arg, torch.Tensor) and not isinstance(arg, DTensor): + arg = DTensor.from_local(arg, mesh, [Replicate()], run_check=False) + + all_args.append(arg) + + new_args = tuple(all_args[0: len(args)]) + new_kwargs = dict(zip(kwargs.keys(), all_args[len(args):])) + + return new_args, new_kwargs + + runtime_schema_info = ( + DTensor._op_dispatcher.sharding_propagator.op_to_schema_info.get(op_call, None) + ) + + if runtime_schema_info is not None and runtime_schema_info.needs_pytree: + try: + from torch.utils import _cxx_pytree as pytree + except ImportError: + from torch.utils import _pytree as pytree # type: ignore[no-redef] + from typing import Sequence + + tree_args, args_spec = pytree.tree_flatten(args) + args_list: Sequence[object] = tree_args + else: + args_list, args_spec = args, None + + args, kwargs = npu_attention_input_fn(args_list[0].device_mesh, *args, **kwargs) + + # extract local tensor and sharding infos to a OpInfo + op_info = DTensor._op_dispatcher.unwrap_to_op_info(op_call, args, kwargs) + + # sharding propagation + DTensor._op_dispatcher.sharding_propagator.propagate(op_info) + output_sharding = op_info.output_sharding + + if op_call == npu.npu_fusion_attention.default: + local_results = torch_npu.npu_fusion_attention( + *op_info.local_args, **op_info.local_kwargs + ) + elif op_call == npu.npu_fusion_attention_grad.default: + local_results = torch_npu.npu_fusion_attention_grad( + *op_info.local_args, **op_info.local_kwargs + ) + else: + raise NotImplementedError( + "_npu_fusion_attention_handler only supports npu_fusion_attention and npu_fusion_attention_grad now." 
+ ) + + return DTensor._op_dispatcher.wrap(local_results, output_sharding.output_spec) + + +customized_ops = { + npu.npu_fusion_attention.default: _npu_fusion_attention_handler, + npu.npu_fusion_attention_grad.default: _npu_fusion_attention_handler, +} + +old_handlers = DTensor._op_dispatcher._custom_op_handlers +DTensor._op_dispatcher._custom_op_handlers = {**old_handlers, **customized_ops} diff --git a/torch_npu/distributed/tensor/_matrix_ops.py b/torch_npu/distributed/tensor/_matrix_ops.py index d540c9c531..2705272c4e 100644 --- a/torch_npu/distributed/tensor/_matrix_ops.py +++ b/torch_npu/distributed/tensor/_matrix_ops.py @@ -148,3 +148,147 @@ def custom_npu_dtype_cast_backward_sharding( acceptable_shardings = [] acceptable_shardings.append(strategy) return acceptable_shardings + + +@register_sharding(npu.npu_fusion_attention.default) +# pylint:disable=huawei-too-many-arguments +def custom_npu_fusion_attention_sharding(query, key, value, head_num, input_layout, pse=None, padding_mask=None, + atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, + inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, + sparse_mode=0, gen_mask_parallel=True, sync=False): + acceptable_shardings = [] + + # add all replicate strategy + replcate_strategy = ( + [ + Replicate(), # Tensor attention_score + Replicate(), # Tensor softmax_max + Replicate(), # Tensor softmax_sum + Replicate(), # Tensor softmax_out + None, # int seed + None, # int offset + None # int numels + ], + [ + Replicate(), # Tensor query + Replicate(), # Tensor key + Replicate(), # Tensor value + None, # int head_num + None, # str input_layout + None if pse is None else Replicate(), # Tensor? pse + None if padding_mask is None else Replicate(), # Tensor? padding_mask + None if atten_mask is None else Replicate(), # Tensor? atten_mask + None # other + ] + ) + + # add sharding strategy + for strategy_index, default_sharding in enumerate(query.placements): + pse_sharding = None if pse is None else pse.placements[strategy_index] + padding_mask_sharding = None if padding_mask is None else padding_mask.placements[strategy_index] + atten_mask_sharding = None if atten_mask is None else atten_mask.placements[strategy_index] + + sharding_strategy = ( + [ + default_sharding, # Tensor attention_score + default_sharding, # Tensor softmax_max + default_sharding, # Tensor softmax_sum + default_sharding, # Tensor softmax_out + None, # int seed + None, # int offset + None # int numels + ], + [ + default_sharding, # Tensor query + default_sharding, # Tensor key + default_sharding, # Tensor value + None, # int head_num + None, # str input_layout + pse_sharding, # Tensor? pse + padding_mask_sharding, # Tensor? padding_mask + atten_mask_sharding, # Tensor? 
atten_mask + None # other + ] + ) + + acceptable_shardings.append(sharding_strategy) + + acceptable_shardings.append(replcate_strategy) + + return acceptable_shardings + + +@register_sharding(npu.npu_fusion_attention_grad.default) +# pylint:disable=huawei-too-many-arguments +def custom_npu_fusion_attention_grad_sharding(query, key, value, dy, head_num, input_layout, *, pse=None, + padding_mask=None, atten_mask=None, softmax_max=None, softmax_sum=None, + softmax_in=None, attention_in=None, scale_value=1.0, keep_prob=1.0, + pre_tockens=2147483647, next_tockens=2147483647, inner_precise=0, seed=0, + offset=0, numels=0, prefix=None, actual_seq_qlen=None, + actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False): + acceptable_shardings = [] + + # add all replicate strategy + replcate_strategy = ( + [ + Replicate(), # Tensor grad_query + Replicate(), # Tensor grad_key + Replicate(), # Tensor grad_value + Replicate(), # Tensor grad_dy + ], + [ + Replicate(), # Tensor query + Replicate(), # Tensor key + Replicate(), # Tensor value + Replicate(), # Tensor dy + None, # int head_num + None, # str input_layout + None if pse is None else Replicate(), # Tensor? pse + None if padding_mask is None else Replicate(), # Tensor? padding_mask + None if atten_mask is None else Replicate(), # Tensor? atten_mask + None if softmax_max is None else Replicate(), # Tensor? softmax_max + None if softmax_sum is None else Replicate(), # Tensor? softmax_sum + None if softmax_in is None else Replicate(), # Tensor? softmax_in + None if attention_in is None else Replicate(), # Tensor? attention_in + None # other + ] + ) + acceptable_shardings.append(replcate_strategy) + + # add sharding strategy + for strategy_index, default_sharding in enumerate(query.placements): + pse_sharding = None if pse is None else pse.placements[strategy_index] + padding_mask_sharding = None if padding_mask is None else padding_mask.placements[strategy_index] + atten_mask_sharding = None if atten_mask is None else atten_mask.placements[strategy_index] + asoftmax_max_sharding = None if softmax_max is None else softmax_max.placements[strategy_index] + softmax_sum_sharding = None if softmax_sum is None else softmax_sum.placements[strategy_index] + softmax_in_sharding = None if softmax_in is None else softmax_in.placements[strategy_index] + attention_in_sharding = None if attention_in is None else attention_in.placements[strategy_index] + + sharding_strategy = ( + [ + default_sharding, # Tensor grad_query + default_sharding, # Tensor grad_key + default_sharding, # Tensor grad_value + default_sharding, # Tensor grad_dy + ], + [ + default_sharding, # Tensor query + default_sharding, # Tensor key + default_sharding, # Tensor value + default_sharding, # Tensor dy + None, # int head_num + None, # str input_layout + pse_sharding, # Tensor? pse + padding_mask_sharding, # Tensor? padding_mask + atten_mask_sharding, # Tensor? atten_mask + asoftmax_max_sharding, # Tensor? softmax_max + softmax_sum_sharding, # Tensor? softmax_sum + softmax_in_sharding, # Tensor? softmax_sum + attention_in_sharding, # Tensor? 
attention_in + None # other + ] + ) + acceptable_shardings.append(sharding_strategy) + + return acceptable_shardings -- Gitee From 73812ba3c9bfcf2849b953b0cf1a87d3545e1dbe Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Mon, 3 Mar 2025 01:17:38 +0000 Subject: [PATCH 075/358] !18293 log check Merge pull request !18293 from SCh-zx/v2.6.0 --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 5 ++--- torch_npu/csrc/core/npu/NPUQueue.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 866d2e9310..355d9e8df9 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1047,9 +1047,8 @@ class DeviceCachingAllocator { } if (!block_found) { - ASCEND_LOGE( - "Get a block from the existing pool failed. Try to free cached blocks and reallocate. This error log " - "can be ignored."); + ASCEND_LOGE("Get a block from the existing pool failed. %s", + "Try to free cached blocks and reallocate. This error log can be ignored."); // Free all non-split cached blocks and retry alloc. c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, true); block_found = (release_cached_blocks(true, context) && alloc_block(params, true, context, lock)); diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 3b39831372..f3f681630c 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -462,18 +462,18 @@ void Repository::Enqueue(void* cur_paras) { if (type == c10_npu::queue::EXECUTE_OPAPI) { auto cur_paras = static_cast(queueParam->paramVal); auto op_name = cur_paras->opType; - ASCEND_LOGE("Task queue thread is exit, cann't call Enqueue() for executing and op name is=%s.", op_name); + ASCEND_LOGE("Task queue thread is exit, can't call Enqueue() for executing and op name is=%s.", op_name); } else if (type == c10_npu::queue::COMPILE_AND_EXECUTE) { auto cur_paras = static_cast(queueParam->paramVal); auto op_name = cur_paras->opType; - ASCEND_LOGW("Task queue thread is exit, cann't call Enqueue() for executing and op name is=%s.", op_name); + ASCEND_LOGW("Task queue thread is exit, can't call Enqueue() for executing and op name is=%s.", op_name); } else if (type == c10_npu::queue::ASYNC_MEMCPY) { auto cur_paras = static_cast(queueParam->paramVal); - ASCEND_LOGW("Task queue thread is exit, cann't call Enqueue() for copy, srclen=%zu, dstlen is %zu, kind=%d", + ASCEND_LOGW("Task queue thread is exit, can't call Enqueue() for copy, srclen=%zu, dstlen is %zu, kind=%d", cur_paras->srcLen, cur_paras->dstLen, cur_paras->kind); } else { auto cur_paras = static_cast(queueParam->paramVal); - ASCEND_LOGW("Task queue thread is exit, cann't call Enqueue() for event, event is=%p", cur_paras->event); + ASCEND_LOGW("Task queue thread is exit, can't call Enqueue() for event, event is=%p", cur_paras->event); } return; } -- Gitee From 90d201f9a5f9c62d8830d16074ad37c30f686d36 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 3 Mar 2025 09:30:30 +0000 Subject: [PATCH 076/358] !18448 Update op_plugin commit id Merge pull request !18448 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 49a89f3b13..f4116cbb8a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 49a89f3b1374c86f2dc4b0cbb645b1580b19a9d7 +Subproject 
commit f4116cbb8a474663ca35a191bb61805b33243580 -- Gitee From 258d9a682d3c8bd219462ba8547056fe1cc9d825 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 3 Mar 2025 11:00:23 +0000 Subject: [PATCH 077/358] !18466 Update op_plugin commit id Merge pull request !18466 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f4116cbb8a..69c3dfd3d7 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f4116cbb8a474663ca35a191bb61805b33243580 +Subproject commit 69c3dfd3d70ef2dc5f402dd36ec657db70941853 -- Gitee From 5a2273783c119d78212829b204a41e7e59b575a7 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Mon, 3 Mar 2025 13:26:48 +0000 Subject: [PATCH 078/358] !18456 Modify the Time of Fine-Grained Core Binding Merge pull request !18456 from SCh-zx/v2.6.0 --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 6dae2d670f..6b7a065882 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -189,6 +189,8 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); + SetThreadAffinity(device_id_); + GetAffinityInfo(); init_flag_ = true; -- Gitee From 5a7e3f7692bde2f7adcab21b2d609fc863a00f1c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 3 Mar 2025 14:00:19 +0000 Subject: [PATCH 079/358] !18480 Update op_plugin commit id Merge pull request !18480 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 69c3dfd3d7..40b464594a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 69c3dfd3d70ef2dc5f402dd36ec657db70941853 +Subproject commit 40b464594a25d6ad0ea7ad89ab2d4a980985872a -- Gitee From 513aee69e587ca75ac83c1282a8c17dbe12e81ff Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Tue, 4 Mar 2025 11:36:37 +0000 Subject: [PATCH 080/358] !18331 cancel del fwk data on branch v2.6.0 Merge pull request !18331 from Mrtutu/cancel_del_fwk_v2.6.0 --- torch_npu/profiler/analysis/_profiling_parser.py | 2 -- torch_npu/profiler/profiler_interface.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/profiler/analysis/_profiling_parser.py b/torch_npu/profiler/analysis/_profiling_parser.py index e68bf1914e..5848840ecd 100644 --- a/torch_npu/profiler/analysis/_profiling_parser.py +++ b/torch_npu/profiler/analysis/_profiling_parser.py @@ -43,8 +43,6 @@ class ProfilingParser: target_path = os.path.join(host_path, rm_dir) PathManager.remove_path_safety(target_path) if simplify_flag: - fwk_path = ProfilerPathManager.get_fwk_path(profiler_path) - PathManager.remove_path_safety(fwk_path) if not cann_path: return cann_rm_dirs = ['analyze', 'mindstudio_profiler_log', 'mindstudio_profiler_output'] diff --git a/torch_npu/profiler/profiler_interface.py b/torch_npu/profiler/profiler_interface.py index 56107b57ac..4232bbb2a7 100644 --- a/torch_npu/profiler/profiler_interface.py +++ b/torch_npu/profiler/profiler_interface.py @@ -32,6 +32,7 @@ from .analysis.prof_common_func._constant import print_warn_msg from .analysis.prof_common_func._file_manager import FileManager from .analysis.prof_common_func._utils import collect_env_vars, no_exception_func from 
.analysis.prof_common_func._path_manager import ProfilerPathManager +from .analysis.prof_common_func._log import ProfilerLogger from ..utils._path_manager import PathManager from .analysis.prof_common_func._cann_package_manager import CannPackageManager @@ -127,6 +128,7 @@ class _ProfInterface: self._dump_profiler_info() self._dump_metadata() ProfPathCreator().is_prof_inited = False + ProfilerLogger.destroy() def delete_prof_dir(self): ProfPathCreator().delete_prof_dir() -- Gitee From 3b7661814c8fff8da1a22690779a7d6cd5ee66aa Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Tue, 4 Mar 2025 11:42:33 +0000 Subject: [PATCH 081/358] !18504 Add security note for url. Merge pull request !18504 from yuhaiyan/v2.6.0-dev2 --- SECURITYNOTE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index d7cf1140cf..ccc955b2b8 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -78,6 +78,8 @@ torch_npu支持源码编译安装,在编译时会下载依赖第三方库并 | 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | | 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://download.pytorch.org/whl/cpu | docker配置源,用于配置torch下载连接 | | 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.huaweicloud.com/repository/pypi/simple | docker配置文件,用于配置pip源 | +| 自研 | 不涉及 | ci\docker\X86\Dockerfile | https://mirrors.wlnmp.com/centos/Centos7-aliyun-altarch.repo | docker配置文件,用于配置yum源 | +| 自研 | 不涉及 | ci\docker\ARM\Dockerfile | https://mirrors.wlnmp.com/centos/Centos7-aliyun-altarch.repo | docker配置文件,用于配置yum源 | | 自研 | 不涉及 | .github\workflows\\_build-and-test.yml | https://mirrors.huaweicloud.com/repository/pypi/simple | workflow配置文件,用于配置pip源 | | 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch | 用于打包whl的url入参 | | 自研 | 不涉及 | setup.cfg | https://gitee.com/ascend/pytorch/tags | 用于打包whl的download_url入参 | -- Gitee From 943435524f23399daaf910a0743a8a8a98ae9f8c Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 4 Mar 2025 13:45:38 +0000 Subject: [PATCH 082/358] !18510 Update op_plugin commit id Merge pull request !18510 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 40b464594a..38fc7950da 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 40b464594a25d6ad0ea7ad89ab2d4a980985872a +Subproject commit 38fc7950da764a457c0ccf09f0e583fe269c80ca -- Gitee From 895147c9f2f58b2a87a5eff28bdb5df1369dacc2 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 4 Mar 2025 15:45:22 +0000 Subject: [PATCH 083/358] !18516 Update op_plugin commit id Merge pull request !18516 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 38fc7950da..bf39018eca 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 38fc7950da764a457c0ccf09f0e583fe269c80ca +Subproject commit bf39018eca64bd007321653bb34a60aff09c4f7b -- Gitee From 8478ce78e8153d1cd2528c1de2716272ee6fe642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Wed, 5 Mar 2025 02:22:44 +0000 Subject: [PATCH 084/358] =?UTF-8?q?!18473=20support=20LCCL=20backend=20Mer?= =?UTF-8?q?ge=20pull=20request=20!18473=20from=20=E9=97=AB=E9=B9=8F?= =?UTF-8?q?=E5=85=A8/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/__init__.py | 11 + torch_npu/csrc/distributed/Init.cpp | 
7 +
 .../csrc/distributed/ProcessGroupLCCL.cpp     | 459 ++++++++++++++++++
 3 files changed, 477 insertions(+)
 create mode 100644 torch_npu/csrc/distributed/ProcessGroupLCCL.cpp

diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py
index 867aa53b6d..5872eddcc0 100644
--- a/torch_npu/__init__.py
+++ b/torch_npu/__init__.py
@@ -231,6 +231,17 @@ torch.distributed.Backend.register_backend("hccl", lambda dist_backend_opts, pg_
                                             _new_process_group_hccl_helper(dist_backend_opts, pg_options), extended_api=True, devices=["npu"])

+def _new_process_group_lccl_helper(dist_backend_opts, pg_options):
+    store = dist_backend_opts.store
+    group_rank = dist_backend_opts.group_rank
+    group_size = dist_backend_opts.group_size
+    return torch_npu._C._distributed_c10d.ProcessGroupLCCL(store, group_rank, group_size)
+
+
+torch.distributed.Backend.register_backend("lccl", lambda dist_backend_opts, pg_options:
+                                            _new_process_group_lccl_helper(dist_backend_opts, pg_options), extended_api=True, devices=["npu"])
+
+
 # set default device type for gradient checkpointing
 DefaultDeviceType.set_device_type("npu")
 del DefaultDeviceType
diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp
index d48a44c294..3d7e8f0c9e 100644
--- a/torch_npu/csrc/distributed/Init.cpp
+++ b/torch_npu/csrc/distributed/Init.cpp
@@ -19,6 +19,7 @@
 #include "torch_npu/csrc/distributed/rpc/init.h"
 #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp"
+#include "torch_npu/csrc/distributed/ProcessGroupLCCL.hpp"
 #include "torch_npu/csrc/distributed/reducer.hpp"
 #include "torch_npu/csrc/distributed/Init.h"
 #include "torch_npu/csrc/distributed/ParallelTcpStore.hpp"
@@ -442,6 +443,12 @@ PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) {
         .def_readwrite("hccl_config", &::c10d_npu::ProcessGroupHCCL::Options::hccl_config)
         .def_readwrite("group_id", &::c10d_npu::ProcessGroupHCCL::Options::group_id);
+
+    // bind for ProcessGroupLCCL
+    auto processGroupLCCL = intrusive_ptr_no_gil_destructor_class_<::c10d_npu::ProcessGroupLCCL>(
+        module, "ProcessGroupLCCL", dist.attr("Backend"))
+        .def(py::init<const c10::intrusive_ptr<::c10d::Store>&, int, int>(),
+             py::call_guard<py::gil_scoped_release>());
     auto cDist = py::module_::import("torch._C._distributed_c10d");
     auto parallelStore = intrusive_ptr_no_gil_destructor_class_<::c10d::ParallelTcpStore>(
diff --git a/torch_npu/csrc/distributed/ProcessGroupLCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupLCCL.cpp
new file mode 100644
index 0000000000..03c979088a
--- /dev/null
+++ b/torch_npu/csrc/distributed/ProcessGroupLCCL.cpp
@@ -0,0 +1,459 @@
+#include "ProcessGroupLCCL.hpp"
+
+#include "torch_npu/csrc/core/NPUBridge.h"
+#include "torch_npu/csrc/core/npu/DeviceUtils.h"
+#include "torch_npu/csrc/core/npu/NPUGuard.h"
+#include "torch_npu/csrc/framework/FormatHelper.h"
+#include "torch_npu/csrc/framework/OpCommand.h"
+
+
+namespace c10d_npu {
+
+namespace {
+constexpr int64_t kSynchronizeBusyWaitMillis = 10;
+
+void syncStreams(const std::vector<at::Device> &devices, std::vector<c10_npu::NPUEvent> &lcclEvents,
+                 std::vector<c10_npu::NPUStream> &lcclStreams)
+{
+    for (size_t i = 0; i < devices.size(); ++i) {
+        c10_npu::NPUStream &lcclStream = lcclStreams[i];
+        c10_npu::NPUEvent &lcclEvent = lcclEvents[i];
+        lcclEvent.record(c10_npu::getCurrentNPUStream(devices[i].index()));
+        ASCEND_LOGI("Event: record lccl group is successfully executed, event=%p", lcclEvent.event());
+        lcclEvent.block(lcclStream);
+        ASCEND_LOGI("Event: block lccl group is successfully executed, event=%p", lcclEvent.event());
+    }
+}
+} // namespace
+
+ProcessGroupLCCL::WorkLCCL::WorkLCCL(const std::vector<at::Device> &devices)
+    : devices_(devices), workStartTime_(std::chrono::steady_clock::now())
+{
+    // Creates the npu event wrappers
+    // Note: The actual events are lazily created when first recorded to with
+    // DEFAULT_FLAGS = npuEventDisableTiming.
+    lcclEndEvents_ = std::make_shared<std::vector<c10_npu::NPUEvent>>(devices.size());
+    lcclComms_.resize(devices.size());
+}
+
+ProcessGroupLCCL::WorkLCCL::~WorkLCCL()
+{}
+
+bool ProcessGroupLCCL::WorkLCCL::isCompleted()
+{
+    checkAndSetException();
+    return exception() || finishedNPUExecutionInternal();
+}
+
+bool ProcessGroupLCCL::WorkLCCL::isSuccess() const
+{
+    if (exception()) {
+        // Already detected an exception.
+        return false;
+    }
+    return finishedNPUExecutionInternal();
+}
+
+void ProcessGroupLCCL::WorkLCCL::synchronizeInternal(std::chrono::milliseconds timeout)
+{
+    for (const auto i: c10::irange(devices_.size())) {
+        auto currentStream = c10_npu::getCurrentNPUStream(devices_[i].index());
+        // Block the current stream on the LCCL stream
+        (*lcclEndEvents_)[i].block(currentStream);
+        ASCEND_LOGI("Event: block lccl work is successfully executed, event=%p", (*lcclEndEvents_)[i].event());
+    }
+
+    // In case of blocking, wait for the operation to complete.
+    if (blockingWait_) {
+        // Wait for the operation to complete.
+        while (!isCompleted()) {
+            auto currentTimepoint = std::chrono::steady_clock::now();
+            if (std::chrono::duration_cast<std::chrono::milliseconds>(currentTimepoint - workStartTime_) > opTimeout_) {
+                throw std::runtime_error("Operation has exceeded timeout limit!");
+            }
+            checkAndThrowException();
+            std::this_thread::sleep_for(std::chrono::milliseconds(kSynchronizeBusyWaitMillis));
+        }
+        checkAndThrowException();
+    }
+}
+
+// Same as calling synchronize().
+bool ProcessGroupLCCL::WorkLCCL::wait(std::chrono::milliseconds timeout)
+{
+    synchronizeInternal(timeout);
+    // Always return true, because abort API is not implemented.
+    return true;
+}
+
+void ProcessGroupLCCL::WorkLCCL::synchronize()
+{
+    // Call Synchronize without a timeout. We use this method to avoid adding a
+    // timeout argument to the public synchronize API.
+    synchronizeInternal(kNoTimeout);
+}
+
+bool ProcessGroupLCCL::WorkLCCL::finishedNPUExecution()
+{
+    checkAndSetException();
+    return finishedNPUExecutionInternal();
+}
+
+std::vector<at::Tensor> ProcessGroupLCCL::WorkLCCL::result()
+{
+    return *outputs_;
+}
+
+void ProcessGroupLCCL::WorkLCCL::checkAndThrowException()
+{
+    // Set the appropriate exception if found.
+    checkAndSetException();
+
+    // Throw an exception, only if we have a valid exception.
+    if (exception()) {
+        std::rethrow_exception(exception());
+    }
+}
+
+void ProcessGroupLCCL::WorkLCCL::checkAndSetException()
+{
+    if (exception()) {
+        // We already have an exception.
+        return;
+    }
+}
+
+// check if LCCL task is finished
+bool ProcessGroupLCCL::WorkLCCL::finishedNPUExecutionInternal() const
+{
+    // If in the Finalize, should not query event
+    if (!c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) {
+        return false;
+    }
+    try {
+        for (const auto i: c10::irange(devices_.size())) {
+            // Checking the work's corresponding ASCEND events' status
+            if (!(*lcclEndEvents_)[i].query()) {
+                return false;
+            }
+        }
+    } catch (const std::exception &e) {
+        if (std::string(e.what()).find("driver shutting down") == std::string::npos) {
+            throw std::runtime_error(DIST_ERROR(ErrCode::INTERNAL));
+        }
+        LOG(INFO) << "[Rank " << rank_ << "] Event query failed with exception: " << e.what();
+    }
+
+    return true;
+}
+
+c10::intrusive_ptr<c10::ivalue::Future> ProcessGroupLCCL::WorkLCCL::getFuture()
+{
+    return future_;
+}
+
+const int64_t ProcessGroupLCCL::kProcessGroupLCCLOpTimeoutMillis = 10 * 1000;
+
+ProcessGroupLCCL::ProcessGroupLCCL(const c10::intrusive_ptr<c10d::Store> &store, int rank, int size)
+    : c10d::Backend(rank, size), blockingWait_(false), store_(store),
+      opTimeout_(ProcessGroupLCCL::kProcessGroupLCCLOpTimeoutMillis)
+{}
+
+std::vector<at_npu::lccl::LcclComm> &ProcessGroupLCCL::getLCCLComm(
+    const std::string &devicesKey,
+    const std::vector<at::Device> &devices)
+{
+    // Sanity check
+    if (devicesKey.empty()) {
+        throw std::runtime_error("Not able to create/get the LCCL Communicator since "
+                                 "the NPU devices are not known" +
+            DIST_ERROR(ErrCode::PARAM));
+    }
+
+    {
+        std::lock_guard<std::mutex> lock(mutex_);
+        if (devLCCLCommMap_.find(devicesKey) != devLCCLCommMap_.end()) {
+            // Reuse the cached communicator if there is one.
+            return devLCCLCommMap_[devicesKey];
+        }
+    }
+
+    std::vector<at_npu::lccl::LcclComm> lcclComms;
+    lcclComms.resize(devices.size());
+
+    c10_npu::OptionalNPUGuard npuGuard;
+    std::vector<c10_npu::NPUStream> streamVal;
+    streamVal.reserve(devices.size());
+
+    for (size_t i = 0; i < devices.size(); ++i) {
+        npuGuard.set_index(devices[i].index());
+        auto ret = at_npu::lccl::LcclCommInitRankLocal(size_, rank_, &lcclComms[i]);
+        TORCH_CHECK(ret == 0, "init lccl comm failed, error code:", ret, PTA_ERROR(ErrCode::INTERNAL));
+
+        // Creates the LCCL streams
+        streamVal.push_back(c10_npu::getNPUStreamFromPool(devices[i].index()));
+    }
+
+    lcclStreams_.emplace(devicesKey, std::move(streamVal));
+
+    // Note: these events are created with the (default) cudaEventDisableTiming
+    // flag. This flag provides the best performance when used with
+    // StreamWaitEvent() and EventQuery(). Since we here don't measure the
+    // performance using npuEvent, this should be set.
+    lcclEvents_.emplace(std::piecewise_construct, std::make_tuple(devicesKey), std::make_tuple(devices.size()));
+
+    // Hold the lock before modifying the cache.
+ std::lock_guard lock(mutex_); + devLCCLCommMap_.emplace(devicesKey, std::move(lcclComms)); + return devLCCLCommMap_[devicesKey]; +} + +template +c10::intrusive_ptr ProcessGroupLCCL::collective(std::vector &inputs, + std::vector &outputs, Fn fn, PreProcess pre, + PostProcess post, c10d::OpType opType) +{ + const auto devices = getDeviceList(inputs); + auto key = getKeyFromDevices(devices); + std::vector lcclComms; + lcclComms = getLCCLComm(key, devices); + // Used many times below, so we stash the unordered_map lookup + auto &lcclStreams = lcclStreams_[key]; + // First let LCCL streams wait for input tensors allocation streams + syncStreams(devices, lcclEvents_[key], lcclStreams); + // Work itself will create the events on all NPUs of tensors + auto work = c10::make_intrusive(devices); + // Store references to outputs to be used by WorkLCCL::result and operator<<. + work->outputs_ = std::make_shared>(outputs); + + c10_npu::OptionalNPUGuard npuGuard; + pre(lcclStreams, work); + + for (const auto i: c10::irange(inputs.size())) { + npuGuard.set_index(devices[i].index()); + c10_npu::NPUStream &lcclStream = lcclStreams[i]; + + // Both `inputs' and `outputs' are created on a worker stream and used in + // different lcclStreams. Hence, both must record the lcclStream to + // prevent being freed before the collective finishes. + // + // We only record `inputs' here, and leave recording `outputs' to `fn' for + // operations where `inputs' and `outputs' are not the same. + // + // See [Sync Streams]. + c10_npu::NPUCachingAllocator::recordStream(inputs[i].storage().data_ptr(), lcclStream); + } + { + for (const auto i: c10::irange(inputs.size())) { + npuGuard.set_index(devices[i].index()); + // to avoid to much task pushed to the stream, leading to stream overflow + // insert sync point fluxLimit(key, i) + + c10_npu::NPUStream &lcclStream = lcclStreams[i]; + auto ret = fn(inputs[i], outputs[i], lcclComms[i], lcclStream); + TORCH_CHECK(ret == 0, "LCCL function error:", opTypeToString(opType).c_str(), ", error code is", ret, "\n"); + } + } + post(lcclStreams, work); + { + c10_npu::NPUMultiStreamGuard guard(lcclStreams); + work->future_ = c10::make_intrusive(c10::ListType::create(c10::TensorType::get()), devices); + work->future_->markCompleted(at::IValue(*work->outputs_)); + } + + for (size_t i = 0; i < inputs.size(); ++i) { + c10_npu::NPUStream &lcclStream = lcclStreams_[key][i]; + (*(work->lcclEndEvents_))[i].record(lcclStream); + ASCEND_LOGI("Event: record lccl work is successfully executed, event=%p", (*(work->lcclEndEvents_))[i].event()); + work->lcclComms_[i] = lcclComms[i]; + } + work->blockingWait_ = blockingWait_; + work->opTimeout_ = opTimeout_; + return work; +} + + +template +c10::intrusive_ptr ProcessGroupLCCL::collective(std::vector &inputs, + std::vector &outputs, Fn fn, + c10d::OpType opType) +{ + return collective( + inputs, outputs, fn, + [](std::vector &, c10::intrusive_ptr &) { + }, + [](std::vector &, c10::intrusive_ptr &) { + }, + opType); +} + +c10::intrusive_ptr ProcessGroupLCCL::allreduce(std::vector &tensors, + const c10d::AllreduceOptions &opts) +{ + checkTensors(tensors); + std::vector tensors_cp = {tensors[0]}; + std::string functionName = __FUNCTION__; + return collective( + tensors_cp, tensors_cp, + [&](at::Tensor &input, at::Tensor &output, at_npu::lccl::LcclComm comm, c10_npu::NPUStream &stream) { + auto lcclType = getLcclDataType(input.scalar_type()); + checkSupportedDataType(lcclType, functionName); + RECORD_FUNCTION("LcclAllreduce", std::vector({input})); + + auto 
inputDataPtr = input.data_ptr(); + auto outputDataPtr = output.data_ptr(); + auto numel = getNumelForLCCL(input); + auto lcclReduceOp = getLcclReduceOp(opts.reduceOp, input); + auto lccl_call = [inputDataPtr, outputDataPtr, numel, lcclType, lcclReduceOp, stream, comm]() -> int { + auto lccl_result = at_npu::lccl::LcclAllReduce(inputDataPtr, outputDataPtr, numel, lcclType, + lcclReduceOp, comm, stream.stream(false)); + return lccl_result; + }; + at_npu::native::OpCommand::RunOpApi("LcclAllreduce", lccl_call); + return 0; + }, + c10d::OpType::ALLREDUCE); +} + +c10::intrusive_ptr ProcessGroupLCCL::allgather(std::vector> &outputTensors, + std::vector &inputTensors, + const c10d::AllgatherOptions &opts) +{ + checkTensors(inputTensors); + + auto inputTensors_ = castOriginFormat(inputTensors); + bool same_size = CheckTensorsSameSize(outputTensors.back()); + if (same_size) { + auto outputFlattened = FlattenForScatterGather(outputTensors, inputTensors, size_); + checkTensors(outputFlattened); + + return collective( + inputTensors_, outputFlattened, + [&](at::Tensor &input, at::Tensor &output, at_npu::lccl::LcclComm comm, c10_npu::NPUStream &stream) { + RECORD_FUNCTION("LcclAllgather", std::vector({input})); + c10_npu::NPUCachingAllocator::recordStream(output.storage().data_ptr(), stream); + auto inputDataPtr = input.data_ptr(); + auto outputDataPtr = output.data_ptr(); + auto numel = getNumelForLCCL(input); + auto lcclType = getLcclDataType(input.scalar_type()); + auto lccl_call = [inputDataPtr, outputDataPtr, numel, lcclType, comm, stream]() -> int { + auto lccl_result = at_npu::lccl::LcclAllGather(inputDataPtr, outputDataPtr, numel, lcclType, comm, + stream.stream(false)); + return lccl_result; + }; + at_npu::native::OpCommand::RunOpApi("LcclAllgather", lccl_call); + return 0; + }, + [&](std::vector &, c10::intrusive_ptr &) { + }, + [&](std::vector &lcclStreams, c10::intrusive_ptr &work) { + // Copy the flattened output tensors to the outputs. + + for (const auto i: c10::irange(outputTensors.size())) { + c10_npu::NPUStreamGuard guard(lcclStreams[i]); + for (const auto j: c10::irange(outputTensors[0].size())) { + // See [Sync Streams]. 
+ c10_npu::NPUCachingAllocator::recordStream(outputTensors[i][j].storage().data_ptr(), + lcclStreams[i]); + + outputTensors[i][j].copy_(outputFlattened[i][j], true); + } + } + }, + c10d::OpType::ALLGATHER); + } else { + TORCH_CHECK(false, "lccl doesn't support to all_gather different shape"); + } +} + +c10::intrusive_ptr ProcessGroupLCCL::broadcast(std::vector &tensors, + const c10d::BroadcastOptions &opts) +{ + checkTensors(tensors); + + return collective( + tensors, tensors, + [&](at::Tensor &input, at::Tensor &output, at_npu::lccl::LcclComm comm, c10_npu::NPUStream &stream) { + RECORD_FUNCTION("LcclBroadcast", std::vector({input})); + const auto root = opts.rootRank * tensors.size() + opts.rootTensor; + + auto inputDataPtr = input.data_ptr(); + auto numel = getNumelForLCCL(input); + auto lcclType = getLcclDataType(input.scalar_type()); + auto lccl_call = [inputDataPtr, numel, lcclType, root, comm, stream]() -> int { + auto lccl_result = + at_npu::lccl::LcclBroadcast(inputDataPtr, numel, lcclType, root, comm, stream.stream(false)); + return lccl_result; + }; + at_npu::native::OpCommand::RunOpApi("LcclBroadcast", lccl_call); + return 0; + }, + c10d::OpType::BROADCAST); +} + +c10::intrusive_ptr ProcessGroupLCCL::reduce_scatter(std::vector &outputTensors, + std::vector> &inputTensors, + const c10d::ReduceScatterOptions &opts) +{ + checkTensors(outputTensors); + + bool same_size = CheckTensorsSameSize(inputTensors.back()); + if (same_size) { + auto inputFlattened = FlattenForScatterGather(inputTensors, outputTensors, size_); + checkTensors(inputFlattened); + std::string functionName = __FUNCTION__; + return collective( + inputFlattened, outputTensors, + [&](at::Tensor &input, at::Tensor &output, at_npu::lccl::LcclComm comm, c10_npu::NPUStream &stream) { + auto lcclType = getLcclDataType(input.scalar_type()); + checkSupportedDataType(lcclType, functionName); + RECORD_FUNCTION("LcclReduceScatter", std::vector({input})); + c10_npu::NPUCachingAllocator::recordStream(output.storage().data_ptr(), stream); + auto inputDataPtr = input.data_ptr(); + auto outputDataPtr = output.data_ptr(); + auto numel = getNumelForLCCL(output); + auto lcclReduceOp = getLcclReduceOp(opts.reduceOp, input); + auto lccl_call = [inputDataPtr, outputDataPtr, numel, lcclType, lcclReduceOp, stream, comm]() -> int { + auto lccl_result = at_npu::lccl::LcclReduceScatter(inputDataPtr, outputDataPtr, numel, lcclType, + lcclReduceOp, comm, stream.stream(false)); + return lccl_result; + }; + at_npu::native::OpCommand::RunOpApi("LcclReduceScatter", lccl_call); + return 0; + }, + [&](std::vector &lcclStreams, c10::intrusive_ptr &work) { + // Copy the input tensors to the flattened inputs. + for (const auto i: c10::irange(inputTensors.size())) { + c10_npu::NPUStreamGuard guard(lcclStreams[i]); + for (const auto j: c10::irange(inputTensors[0].size())) { + // See [Sync Streams]. 
+ c10_npu::NPUCachingAllocator::recordStream(inputTensors[i][j].storage().data_ptr(), + lcclStreams[i]); + inputFlattened[i][j].copy_(inputTensors[i][j], true); + } + } + }, + [&](std::vector &, c10::intrusive_ptr &) { + }, + c10d::OpType::REDUCE_SCATTER); + } else { + TORCH_CHECK(false, "lccl doesn't support to reduce_scatter different shape"); + } +} + +ProcessGroupLCCL::~ProcessGroupLCCL() +{ + { + // Destroy all LCCL Communicators on Process Group Destruction + std::lock_guard lock(mutex_); + for (auto &it: devLCCLCommMap_) { + auto &lcclComms = it.second; + + for (const auto &lcclComm: lcclComms) { + at_npu::lccl::LcclCommDestroy(lcclComm); + } + } + } +} + +} // namespace c10d_npu -- Gitee From 65699148ec0e69eafd8ed109e5d6469ab506b18f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Wed, 5 Mar 2025 02:29:25 +0000 Subject: [PATCH 085/358] =?UTF-8?q?!18495=20SilentCheck:=20model=20status?= =?UTF-8?q?=20only=20use=20weight=20Merge=20pull=20request=20!18495=20from?= =?UTF-8?q?=20=E7=8E=8B=E8=B6=85/v2.6.0=5Fsilentperf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/_logging/_internal.py | 1 + torch_npu/utils/_step.py | 81 ++++++++++----------------------- 2 files changed, 24 insertions(+), 58 deletions(-) diff --git a/torch_npu/_logging/_internal.py b/torch_npu/_logging/_internal.py index 951b1854d4..c27a2aa3c6 100644 --- a/torch_npu/_logging/_internal.py +++ b/torch_npu/_logging/_internal.py @@ -35,3 +35,4 @@ def _logging_patch(): def _add_logging_module(): torch._logging._internal.register_log("memory", "torch_npu.memory") torch._logging._internal.register_log("dispatch", "torch_npu.dispatch") + torch._logging._internal.register_log("silent", "torch_npu.silent_check") diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index bd748bd0ac..9935214e9d 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -44,31 +44,25 @@ class PerfDumpState: perf_dump_state = PerfDumpState() perf_dump_enable = False -IS_IN_BACKWARD = 0 +IS_IN_BACKWARD = False +loggerSilent = logging.getLogger("torch_npu.silent_check") def input_hook(idx, asd_flag): def hook(grad): global IS_IN_BACKWARD - - if idx != "": - IS_IN_BACKWARD = IS_IN_BACKWARD & 1 # 011 & 001 = 001 - if torch_npu._C._get_silent_check_version() == 3: - _silent_fault_detector_v3.silent_fault_check(idx, asd_flag, grad) - else: - _silent_fault_detector_v2.silent_fault_check(idx, asd_flag, grad) - else: - IS_IN_BACKWARD = IS_IN_BACKWARD & 2 # 011 & 010 = 010 - - if not IS_IN_BACKWARD: - torch_npu._C._npu_set_call_state("forward") + loggerSilent.info(f"input_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to False. 
idx is {idx}, flag is {asd_flag}") + IS_IN_BACKWARD = False + torch_npu._C._npu_set_call_state("forward") + _silent_fault_detector_v2.silent_fault_check(idx, asd_flag, grad) return return hook def output_hook(grad): global IS_IN_BACKWARD - IS_IN_BACKWARD = 3 # 011 + loggerSilent.info(f"output_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to True.") + IS_IN_BACKWARD = True torch_npu._C._npu_set_call_state("backward") return grad @@ -91,10 +85,9 @@ class SilentCheckState: self.is_training = False self.first_module_id = "" self.first_weight = None + self.first_weight_id = None self.last_weight = None - self.last_tensor = None - self.last_tensor_id = None - self.first_tensor_id = None + self.last_weight_id = None def init_module_info(self, module_id, training): self.first_module_id = module_id @@ -123,52 +116,31 @@ class SilentCheckState: for param_name, param in module._parameters.items(): if isinstance(param, torch.Tensor) and param.requires_grad: self.first_weight = param + self.first_weight_id = id(param) break - def register_input_hook_before_call(self, asd_flag, *args): - # Search the first tensor (if the first tensor is input) - if self.is_training and not self.input_hook_flag: - for x in args: - if isinstance(x, torch.Tensor) and x.requires_grad: - x.register_hook(input_hook(self.first_module_id, asd_flag)) - self.input_hook_flag = True - break - - def register_input_hook_after_call(self, output): - # Search the first tensor (if the first tensor is output of an inner module) - if not self.input_hook_flag: - if isinstance(output, torch.Tensor) and output.requires_grad: - output.register_hook(input_hook(self.first_module_id, asd_enable)) - self.input_hook_flag = True - self.first_tensor_id = id(output) - def search_last_weight(self, module): # Search the last weight (only in inner module) if not self.init_marks.get(self.first_module_id, False) and _is_inner_module(module): for param_name, param in module._parameters.items(): if isinstance(param, torch.Tensor) and param.requires_grad: self.last_weight = param - - def search_last_tensor(self, output): - # Search the last tensor - if isinstance(output, torch.Tensor) and output.requires_grad: - self.last_tensor_id = id(output) - self.last_tensor = output + self.last_weight_id = id(param) def init_all_hook(self, asd_flag): if self.is_training: - # Otherwise, there is only one weight in the outer module - if self.first_tensor_id != self.last_tensor_id: - if self.last_tensor is not None: - self.last_tensor.register_hook(output_hook) - if self.last_weight_hook_handles.get(self.first_module_id, None) is None: - if self.last_weight is not None: + if self.last_weight is not None and self.first_weight is not None: + # Otherwise, there is only one weight in the outer module + if self.first_weight_id != self.last_weight_id: + loggerSilent.info(f"init_all_hook: module init, first_module_id is {self.first_module_id}.") + if self.last_weight_hook_handles.get(self.first_module_id, None) is None: last_weight_handle = self.last_weight.register_hook(output_hook) self.last_weight_hook_handles[self.first_module_id] = last_weight_handle - if self.weight_hook_handles.get(self.first_module_id, None) is None: - if self.first_weight is not None: - first_weight_handle = self.first_weight.register_hook(input_hook("", asd_flag)) + if self.weight_hook_handles.get(self.first_module_id, None) is None: + first_weight_handle = self.first_weight.register_hook(input_hook(self.first_module_id, asd_flag)) self.weight_hook_handles[self.first_module_id] = 
first_weight_handle + else: + loggerSilent.info(f"init_all_hook: module only have one weight, first_module_id is {self.first_module_id}.") self.init_marks[self.first_module_id] = True @@ -303,23 +275,14 @@ def _custom_call(self, *args, **kwargs): asd_enable = 0 warnings.warn(f"Warning: Module has unsupported dtype tensor, silent check will be closed.") - # Search the first tensor (if the first tensor is input) - silent_check.register_input_hook_before_call(asd_enable, *args) - tmp = original_call(self, *args, **kwargs) if asd_enable and silent_check.is_training and not IS_IN_BACKWARD: # Search the first weight silent_check.search_first_weight(self) - # Search the first tensor (if the first tensor is output of an inner module) - silent_check.register_input_hook_after_call(tmp) - # Search the last weight (only in inner module) silent_check.search_last_weight(self) - - # Search the last tensor - silent_check.search_last_tensor(tmp) if perf_dump_enable: if hasattr(self, "visited") and self.visited: @@ -369,6 +332,8 @@ def add_perf_dump_patch(): asd_enable = 0 elif torch_npu._C._get_silent_check_version() == 2: warnings.warn(f"Warning: CANN version lower than 8.0.0 and currently does not support silent check 3.0 version. It will switch to 2.0 version. The asd_detect is {asd_enable}") + else: + loggerSilent.info(f"Silent check 3.0 version will be enabled. The asd_detect is {asd_enable}") if perf_dump_enable or asd_enable: Module.__call__ = _custom_call -- Gitee From d69c71d5b423b493c3d145b8eb5751ab00e6d20e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 5 Mar 2025 03:30:21 +0000 Subject: [PATCH 086/358] !18520 Update op_plugin commit id Merge pull request !18520 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bf39018eca..afa134ef2f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bf39018eca64bd007321653bb34a60aff09c4f7b +Subproject commit afa134ef2ff35c049320b0b4a129bdc3553dda81 -- Gitee From 9bf778fceceb9d0d485737bea22883a8ac3f3da6 Mon Sep 17 00:00:00 2001 From: czy1255959842 Date: Wed, 5 Mar 2025 09:25:37 +0000 Subject: [PATCH 087/358] !18444 Modify msleaks enable judgment logic Merge pull request !18444 from czy1255959842/v2.6.0 --- torch_npu/csrc/profiler/mstx_mgr.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index 0c7ff91d3e..671908257b 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -218,7 +218,7 @@ bool MstxMgr::isMsleaksEnableImpl() if (envVal == nullptr) { return ret; } - static const std::string soName = "libascend_hal_hook.so"; + static const std::string soName = "libascend_kernel_hook.so"; std::stringstream ss(envVal); std::string path; while (std::getline(ss, path, ':')) { -- Gitee From e98d16d06ed0485e100313711fbb80e3ef9c3cb4 Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Wed, 5 Mar 2025 12:51:08 +0000 Subject: [PATCH 088/358] !18461 add profiler warmup on branch v2.6.0 Merge pull request !18461 from Mrtutu/warmup_v2.6.0 --- .../interface/MsProfilerInterface.cpp | 14 ++ .../framework/interface/MsProfilerInterface.h | 1 + torch_npu/csrc/profiler/init.cpp | 1 + torch_npu/csrc/profiler/npu_profiler.cpp | 11 + torch_npu/csrc/profiler/npu_profiler.h | 2 + torch_npu/csrc/profiler/profiler_mgr.cpp | 226 ++++++++++++------ torch_npu/csrc/profiler/profiler_mgr.h | 5 + 
.../profiler/_profiler_action_controller.py | 4 +- torch_npu/profiler/profiler_interface.py | 7 + 9 files changed, 201 insertions(+), 70 deletions(-) diff --git a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp index 353e7207e1..a36fbb8d50 100644 --- a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp +++ b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp @@ -15,10 +15,24 @@ namespace native { REGISTER_LIBRARY(libmsprofiler) +LOAD_FUNCTION(aclprofWarmup) LOAD_FUNCTION(aclprofSetConfig) LOAD_FUNCTION(aclprofGetSupportedFeatures) LOAD_FUNCTION(aclprofMarkEx) +aclError AclProfilingWarmup(const aclprofConfig *profilerConfig) +{ + typedef aclError (*AclProfWarmupFunc)(const aclprofConfig *); + static AclProfWarmupFunc func = nullptr; + if (func == nullptr) { + func = (AclProfWarmupFunc)GET_FUNC(aclprofWarmup); + if (func == nullptr) { + return ACL_ERROR_PROF_MODULES_UNSUPPORTED; + } + } + TORCH_CHECK(func, "Failed to find function ", "aclprofWarmup", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(profilerConfig); +} aclError AclprofSetConfig(aclprofConfigType configType, const char* config, size_t configLength) { typedef aclError(*AclprofSetConfigFunc)(aclprofConfigType, const char *, size_t); diff --git a/torch_npu/csrc/framework/interface/MsProfilerInterface.h b/torch_npu/csrc/framework/interface/MsProfilerInterface.h index 721ce5c9cd..b06ca001e6 100644 --- a/torch_npu/csrc/framework/interface/MsProfilerInterface.h +++ b/torch_npu/csrc/framework/interface/MsProfilerInterface.h @@ -7,6 +7,7 @@ namespace at_npu { namespace native { +aclError AclProfilingWarmup(const aclprofConfig *profilerConfig); aclError AclprofSetConfig(aclprofConfigType configType, const char* config, size_t configLength); diff --git a/torch_npu/csrc/profiler/init.cpp b/torch_npu/csrc/profiler/init.cpp index 07e2964405..a367c250e3 100644 --- a/torch_npu/csrc/profiler/init.cpp +++ b/torch_npu/csrc/profiler/init.cpp @@ -72,6 +72,7 @@ PyObject* profiler_initExtension(PyObject* _unused, PyObject *unused) { return activities; }); m.def("_init_profiler", initNpuProfiler); + m.def("_warmup_profiler", &warmupNpuProfiler, py::arg("config"), py::arg("activities")); m.def("_start_profiler", &startNpuProfiler, py::arg("config"), diff --git a/torch_npu/csrc/profiler/npu_profiler.cpp b/torch_npu/csrc/profiler/npu_profiler.cpp index e5f03ef8ba..2d726162e5 100644 --- a/torch_npu/csrc/profiler/npu_profiler.cpp +++ b/torch_npu/csrc/profiler/npu_profiler.cpp @@ -302,6 +302,17 @@ static void registerCallback(const std::unordered_set &scopes) registeration_state_ptr->setCallbackHandle(handle); } +void warmupNpuProfiler(const NpuProfilerConfig &config, + const std::set &activities) +{ + bool cpu_trace = activities.count(NpuActivityType::CPU); + ExperimentalConfig experimental_config = config.experimental_config; + NpuTraceConfig npu_config = {experimental_config.trace_level, experimental_config.metrics, + config.profile_memory, experimental_config.l2_cache, experimental_config.record_op_args, + experimental_config.msprof_tx, experimental_config.op_attr}; + ProfilerMgr::GetInstance()->Warmup(npu_config, cpu_trace); +} + void startNpuProfiler(const NpuProfilerConfig &config, const std::set &activities, const std::unordered_set &scopes) diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 074a41440e..2a6f44a318 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ 
-102,6 +102,8 @@ std::atomic& profDataReportEnable(); void initNpuProfiler(const std::string &path, const std::set &activities); +void warmupNpuProfiler(const NpuProfilerConfig &config, const std::set &activities); + void startNpuProfiler(const NpuProfilerConfig &config, const std::set &activities, const std::unordered_set &scops = {}); void stopNpuProfiler(); diff --git a/torch_npu/csrc/profiler/profiler_mgr.cpp b/torch_npu/csrc/profiler/profiler_mgr.cpp index f729d1c3dd..b773e2cf4f 100644 --- a/torch_npu/csrc/profiler/profiler_mgr.cpp +++ b/torch_npu/csrc/profiler/profiler_mgr.cpp @@ -55,6 +55,7 @@ ProfilerMgr::ProfilerMgr() record_op_args_(false), profile_memory_(false), msprof_tx_(false), + enable_warmup_(false), profConfig_(nullptr) {} ProfilerMgr *ProfilerMgr::GetInstance() @@ -63,59 +64,128 @@ ProfilerMgr *ProfilerMgr::GetInstance() return &instance; } -void ProfilerMgr::Init(const std::string &path, bool npu_trace) { - if (npu_trace == true) { - at_npu::native::AclProfilingInit(path.c_str(), path.size()); - npu_trace_.store(true); - FeatureMgr::GetInstance()->Init(); - } - path_ = path; +void ProfilerMgr::Init(const std::string &path, bool npu_trace) +{ + if (npu_trace) { + at_npu::native::AclProfilingInit(path.c_str(), path.size()); + npu_trace_.store(true); + FeatureMgr::GetInstance()->Init(); + } + path_ = path; +} + +void ProfilerMgr::WarmupMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig) +{ + profConfig_ = at_npu::native::AclProfilingCreateConfig(deviceIdList, deviceNum, aicMetrics, nullptr, dataTypeConfig); + if (profConfig_ == nullptr) { + ASCEND_LOGE("Create Prof Config failed."); + return; + } + auto ret = at_npu::native::AclProfilingWarmup(profConfig_); + if (ret == ACL_ERROR_PROF_MODULES_UNSUPPORTED) { + ASCEND_LOGW("Not support aclprofWarmup."); + } else if (ret != ACL_ERROR_NONE) { + ASCEND_LOGE("Profiling warmup failed."); + } } void ProfilerMgr::EnableMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig) { - profConfig_ = at_npu::native::AclProfilingCreateConfig(deviceIdList, deviceNum, aicMetrics, nullptr, dataTypeConfig); - if (profConfig_ == nullptr) { - ASCEND_LOGE("Create Prof Config failed."); - return; - } - auto ret = at_npu::native::AclProfilingStart(profConfig_); - if (ret != ACL_ERROR_NONE) { - ASCEND_LOGE("Profiling start failed."); - return; - } + // Avoid duplicate config creation in scenarios where warmup is turned on + if (profConfig_ == nullptr) { + profConfig_ = at_npu::native::AclProfilingCreateConfig(deviceIdList, deviceNum, aicMetrics, nullptr, dataTypeConfig); + } + + if (profConfig_ == nullptr) { + ASCEND_LOGE("Create Prof Config failed."); + return; + } + + auto ret = at_npu::native::AclProfilingStart(profConfig_); + if (ret != ACL_ERROR_NONE) { + ASCEND_LOGE("Profiling start failed."); + return; + } } -void ProfilerMgr::Start(const NpuTraceConfig &npu_config, bool cpu_trace) +uint64_t ProfilerMgr::PrepareProfilerConfig(const NpuTraceConfig &npu_config) { - if (npu_trace_.load() == true) { - aclprofAicoreMetrics aic_metrics = ACL_AICORE_NONE; - int8_t level_int = trace_level_to_int_.find(npu_config.trace_level) != trace_level_to_int_.end() ? - trace_level_to_int_[npu_config.trace_level] : -1; - auto level_iter = trace_level_map_.find(npu_config.trace_level); - uint64_t datatype_config = (level_iter == trace_level_map_.end()) ? 
Level0 : trace_level_map_[npu_config.trace_level]; - auto metrics_iter = npu_metrics_map_.find(npu_config.metrics); - if (metrics_iter != npu_metrics_map_.end() && npu_config.metrics.compare("ACL_AICORE_NONE") != 0) { - datatype_config |= ACL_PROF_AICORE_METRICS; - aic_metrics = CheckAicMetricsFeature(npu_metrics_map_[npu_config.metrics], level_int); - } - if (npu_config.l2_cache) { - datatype_config |= ACL_PROF_L2CACHE; - } - if (npu_config.msprof_tx) { - datatype_config |= ACL_PROF_MSPROFTX; + auto level_iter = trace_level_map_.find(npu_config.trace_level); + uint64_t datatype_config = (level_iter == trace_level_map_.end()) ? Level0 : trace_level_map_[npu_config.trace_level]; + auto metrics_iter = npu_metrics_map_.find(npu_config.metrics); + if (metrics_iter != npu_metrics_map_.end() && npu_config.metrics.compare("ACL_AICORE_NONE") != 0) { + datatype_config |= ACL_PROF_AICORE_METRICS; + } + if (npu_config.l2_cache) { + datatype_config |= ACL_PROF_L2CACHE; + } + if (npu_config.msprof_tx) { + datatype_config |= ACL_PROF_MSPROFTX; + } + if (npu_config.npu_memory) { + datatype_config |= ACL_PROF_TASK_MEMORY; + const std::string freq = "50"; + auto prof_ret = at_npu::native::AclprofSetConfig(ACL_PROF_SYS_HARDWARE_MEM_FREQ, freq.c_str(), freq.size()); + if (prof_ret == ACL_ERROR_PROF_MODULES_UNSUPPORTED) { + ASCEND_LOGW("not support to set config for sys-hardware-mem."); } - if (npu_config.npu_memory) { - datatype_config |= ACL_PROF_TASK_MEMORY; - const std::string freq = "50"; - auto prof_ret = at_npu::native::AclprofSetConfig(ACL_PROF_SYS_HARDWARE_MEM_FREQ, freq.c_str(), freq.size()); - if (prof_ret == ACL_ERROR_PROF_MODULES_UNSUPPORTED) { - ASCEND_LOGW("not support to set config for sys-hardware-mem."); - } + } + if (npu_config.op_attr) { + datatype_config |= ACL_PROF_OP_ATTR; + } + datatype_config = CheckFeatureConfig(datatype_config); + return datatype_config; +} + + +aclprofAicoreMetrics ProfilerMgr::PrepareProfilerAicMetrics(const NpuTraceConfig &npu_config) +{ + aclprofAicoreMetrics aic_metrics = ACL_AICORE_NONE; + int8_t level_int = trace_level_to_int_.find(npu_config.trace_level) != trace_level_to_int_.end() ? trace_level_to_int_[npu_config.trace_level] : -1; + auto metrics_iter = npu_metrics_map_.find(npu_config.metrics); + if (metrics_iter != npu_metrics_map_.end() && npu_config.metrics.compare("ACL_AICORE_NONE") != 0) { + aic_metrics = CheckAicMetricsFeature(npu_metrics_map_[npu_config.metrics], level_int); + } + return aic_metrics; +} + +void ProfilerMgr::Warmup(const NpuTraceConfig &npu_config, bool cpu_trace) +{ + if (npu_trace_.load()) { + auto datatype_config = PrepareProfilerConfig(npu_config); + auto aic_metrics = PrepareProfilerAicMetrics(npu_config); + int8_t level_int = trace_level_to_int_.find(npu_config.trace_level) != trace_level_to_int_.end() ? 
trace_level_to_int_[npu_config.trace_level] : -1; + int32_t deviceId = 0; + auto ret = c10_npu::GetDevice(&deviceId); + if (ret != ACL_ERROR_NONE) { + ASCEND_LOGE("Get Device ID failed."); + return; } - if (npu_config.op_attr) { - datatype_config |= ACL_PROF_OP_ATTR; + const uint32_t deviceNum = 1; + uint32_t deviceIdList[deviceNum] = {deviceId}; + WarmupMsProfiler(deviceIdList, deviceNum, aic_metrics, datatype_config); + trace_level_.store(level_int); + } + + if (cpu_trace) { + std::string fwk_path = path_ + "/FRAMEWORK"; + if (Utils::CreateDir(fwk_path)) { + StartDataReceiver(fwk_path); + report_enable_.store(true); + profile_memory_.store(npu_config.npu_memory); + } else { + ASCEND_LOGE("Profiler create FRAMEWORK directory failed: %s", fwk_path.c_str()); } - datatype_config = CheckFeatureConfig(datatype_config); + } + + enable_warmup_.store(true); +} + +void ProfilerMgr::Start(const NpuTraceConfig &npu_config, bool cpu_trace) +{ + if (npu_trace_.load()) { + auto datatype_config = PrepareProfilerConfig(npu_config); + auto aic_metrics = PrepareProfilerAicMetrics(npu_config); + int8_t level_int = trace_level_to_int_.find(npu_config.trace_level) != trace_level_to_int_.end() ? trace_level_to_int_[npu_config.trace_level] : -1; int32_t deviceId = 0; auto ret = c10_npu::GetDevice(&deviceId); if (ret != ACL_ERROR_NONE) { @@ -128,7 +198,7 @@ void ProfilerMgr::Start(const NpuTraceConfig &npu_config, bool cpu_trace) trace_level_.store(level_int); } - if (cpu_trace == true) { + if (cpu_trace && enable_warmup_.load() == false) { std::string fwk_path = path_ + "/FRAMEWORK"; if (Utils::CreateDir(fwk_path)) { StartDataReceiver(fwk_path); @@ -138,6 +208,7 @@ void ProfilerMgr::Start(const NpuTraceConfig &npu_config, bool cpu_trace) ASCEND_LOGE("Profiler create FRAMEWORK directory failed: %s", fwk_path.c_str()); } } + enable_warmup_.store(false); msprof_tx_.store(npu_config.msprof_tx); if (npu_config.record_op_args) { record_op_args_.store(true); @@ -147,33 +218,37 @@ void ProfilerMgr::Start(const NpuTraceConfig &npu_config, bool cpu_trace) } } -void ProfilerMgr::Stop() { - c10_npu::npuSynchronizeDevice(); - if (report_enable_.load() == true) { - StopDataReceiver(); - profile_memory_.store(false); - } - report_enable_.store(false); - if (npu_trace_.load() == true) { - at_npu::native::AclProfilingStop(profConfig_); - auto ret = at_npu::native::AclProfilingDestroyConfig(profConfig_); - if (ret != ACL_SUCCESS) { - ASCEND_LOGE("AclProfDestoryConfig fail, error code: %d", ret); +void ProfilerMgr::Stop() +{ + c10_npu::npuSynchronizeDevice(); + if (report_enable_.load()) { + StopDataReceiver(); + profile_memory_.store(false); + } + + if (npu_trace_.load()) { + at_npu::native::AclProfilingStop(profConfig_); + auto ret = at_npu::native::AclProfilingDestroyConfig(profConfig_); + if (ret != ACL_SUCCESS) { + ASCEND_LOGE("AclProfDestoryConfig fail, error code: %d", ret); + } + profConfig_ = nullptr; } - profConfig_ = nullptr; - } msprof_tx_.store(false); - if (record_op_args_.load() == true) { - at_npu::native::AclopStopDumpArgs(ACL_OP_DUMP_OP_AICORE_ARGS); - record_op_args_.store(false); - } + report_enable_.store(false); + enable_warmup_.store(false); + if (record_op_args_.load()) { + at_npu::native::AclopStopDumpArgs(ACL_OP_DUMP_OP_AICORE_ARGS); + record_op_args_.store(false); + } } -void ProfilerMgr::Finalize() { - if (npu_trace_.load() == true) { - at_npu::native::AclProfilingFinalize(); - } - npu_trace_.store(false); +void ProfilerMgr::Finalize() +{ + if (npu_trace_.load()) { + 
at_npu::native::AclProfilingFinalize(); + } + npu_trace_.store(false); } void ProfilerMgr::StartDataReceiver(const std::string &fwk_path) @@ -198,27 +273,42 @@ void ProfilerMgr::StopDataReceiver() void ProfilerMgr::Upload(std::unique_ptr data) { + if (enable_warmup_.load()) { + return; + } dataReceiver_.Report(std::move(data)); } void ProfilerMgr::UploadWithLock(std::unique_ptr data) { + if (enable_warmup_.load()) { + return; + } std::lock_guard lock(reportDataMutex_); dataReceiverWithLock_.Report(std::move(data)); } void ProfilerMgr::UploadTraceEventData(std::unique_ptr data) { + if (enable_warmup_.load()) { + return; + } traceDataReceiver_.Report(std::move(data)); } void ProfilerMgr::UploadTraceHashData(std::unique_ptr data) { + if (enable_warmup_.load()) { + return; + } traceDataReceiver_.ReportHash(std::move(data)); } void ProfilerMgr::UploadParamData(std::unique_ptr data) { + if (enable_warmup_.load()) { + return; + } traceDataReceiver_.ReportParam(std::move(data)); } diff --git a/torch_npu/csrc/profiler/profiler_mgr.h b/torch_npu/csrc/profiler/profiler_mgr.h index 6b1accb3c5..cd4400555c 100644 --- a/torch_npu/csrc/profiler/profiler_mgr.h +++ b/torch_npu/csrc/profiler/profiler_mgr.h @@ -31,6 +31,7 @@ C10_NPU_API int8_t GetTraceLevel(); class ProfilerMgr { public: void Init(const std::string &path, bool npu_trace); + void Warmup(const NpuTraceConfig &npu_config, bool cpu_trace); void Start(const NpuTraceConfig &npu_config, bool cpu_trace); void Stop(); void Finalize(); @@ -68,6 +69,9 @@ private: explicit ProfilerMgr(ProfilerMgr &&obj) = delete; ProfilerMgr& operator=(ProfilerMgr &&obj) = delete; void EnableMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig); + void WarmupMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig); + uint64_t PrepareProfilerConfig(const NpuTraceConfig &npu_config); + aclprofAicoreMetrics PrepareProfilerAicMetrics(const NpuTraceConfig &npu_config); uint64_t CheckFeatureConfig(uint64_t datatype_config); void StartDataReceiver(const std::string &fwk_path); void StopDataReceiver(); @@ -80,6 +84,7 @@ private: std::atomic record_op_args_; std::atomic profile_memory_; std::atomic msprof_tx_; + std::atomic enable_warmup_; std::atomic trace_level_; std::string path_; aclprofConfig *profConfig_; diff --git a/torch_npu/profiler/_profiler_action_controller.py b/torch_npu/profiler/_profiler_action_controller.py index 9c17bd3c3d..27b4897cc4 100644 --- a/torch_npu/profiler/_profiler_action_controller.py +++ b/torch_npu/profiler/_profiler_action_controller.py @@ -33,7 +33,7 @@ class ProfActionController: def _init_action_map(self): action_map = { (ProfilerAction.NONE, ProfilerAction.NONE): [], - (ProfilerAction.NONE, ProfilerAction.WARMUP): [self.prof_inst.init_trace], + (ProfilerAction.NONE, ProfilerAction.WARMUP): [self.prof_inst.init_trace, self.prof_inst.warmup_trace], (ProfilerAction.NONE, ProfilerAction.RECORD): [ self.prof_inst.init_trace, self.prof_inst.start_trace], (ProfilerAction.NONE, ProfilerAction.RECORD_AND_SAVE): [ @@ -62,7 +62,7 @@ class ProfActionController: self.prof_inst.finalize_trace, self._trace_ready], (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.WARMUP): [ self.prof_inst.stop_trace, self.prof_inst.finalize_trace, - self._trace_ready, self.prof_inst.init_trace], + self._trace_ready, self.prof_inst.init_trace, self.prof_inst.warmup_trace], (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.RECORD): [ self.prof_inst.stop_trace, 
self.prof_inst.finalize_trace, self._trace_ready, self.prof_inst.init_trace, self.prof_inst.start_trace], diff --git a/torch_npu/profiler/profiler_interface.py b/torch_npu/profiler/profiler_interface.py index 4232bbb2a7..9acafb6072 100644 --- a/torch_npu/profiler/profiler_interface.py +++ b/torch_npu/profiler/profiler_interface.py @@ -10,6 +10,7 @@ from torch_npu._C._profiler import ( NpuProfilerConfig, _supported_npu_activities, _init_profiler, + _warmup_profiler, _start_profiler, _stop_profiler, _finalize_profiler, @@ -105,6 +106,12 @@ class _ProfInterface: self.prof_path = ProfPathCreator().get_prof_dir() _init_profiler(self.prof_path, self.activities) + def warmup_trace(self): + prof_config = [self.prof_path, self.record_shapes, self.profile_memory, + self.with_stack, self.with_flops, self.with_modules, self.experimental_config()] + npu_prof_config = NpuProfilerConfig(*tuple(prof_config)) + _warmup_profiler(npu_prof_config, self.activities) + def start_trace(self): prof_config = [self.prof_path, self.record_shapes, self.profile_memory, self.with_stack, self.with_flops, self.with_modules, self.experimental_config()] -- Gitee From 8147e28f167896df5c6c13272137519dbc7a7a0b Mon Sep 17 00:00:00 2001 From: Mrtutu Date: Wed, 5 Mar 2025 13:14:37 +0000 Subject: [PATCH 089/358] !18551 dynamic profile add warmup on branch v2.6.0 Merge pull request !18551 from Mrtutu/dynamic_profile_warmup_v2.6.0 --- .../_dynamic_profiler_config_context.py | 20 +++++++++++++++++++ .../_dynamic_profiler_monitor_shm.py | 1 + torch_npu/profiler/dynamic_profile.py | 6 +++--- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index 8570db61dc..6a94a308f8 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -7,6 +7,7 @@ from ._dynamic_profiler_utils import DynamicProfilerUtils class ConfigContext: DEFAULT_ACTIVE_NUM = 1 DEFAULT_START_STEP = 0 + DEFAULT_WARMUP = 0 DEADLINE_PROF_DIR = "./" BOOL_MAP = {'true': True, 'false': False} @@ -24,6 +25,7 @@ class ConfigContext: self.experimental_config = None self._active = 1 self._start_step = 0 + self._warmup = 0 self.is_valid = False self._meta_data = {} self._async_mode = False @@ -44,6 +46,7 @@ class ConfigContext: self._parse_with_stack(json_data) self._parse_with_modules(json_data) self._parse_active(json_data) + self._parse_warmup(json_data) self._parse_start_step(json_data) self._parse_exp_cfg(json_data) self._parse_ranks(json_data) @@ -81,6 +84,16 @@ class ConfigContext: except ValueError: self._active = self.DEFAULT_ACTIVE_NUM + def _parse_warmup(self, json_data: dict): + if not self._is_dyno: + self._warmup = json_data.get("warmup", self.DEFAULT_WARMUP) + else: + warmup = json_data.get("WARMUP_ITERATIONS", self.DEFAULT_WARMUP) + try: + self._warmup = int(warmup) + except ValueError: + self._warmup = self.DEFAULT_WARMUP + def _parse_with_stack(self, json_data: dict): if not self._is_dyno: self.with_stack = json_data.get('with_stack', False) @@ -291,6 +304,13 @@ class ConfigContext: return self.DEFAULT_ACTIVE_NUM return self._active + def warmup(self) -> int: + if not isinstance(self._warmup, int) or self._warmup <= 0: + DynamicProfilerUtils.out_log("Invalid parameter warmup, reset it to 0.", + DynamicProfilerUtils.LoggerLevelEnum.WARNING) + return self.DEFAULT_WARMUP + return self._warmup + def 
start_step(self) -> int: return self._start_step diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py index c763daf653..1f5444e8c3 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor_shm.py @@ -25,6 +25,7 @@ class DynamicProfilerShareMemory: "with_flops": False, "with_modules": False, "active": 1, + "warmup": 0, "start_step": 0, "is_rank": False, "rank_list": [], diff --git a/torch_npu/profiler/dynamic_profile.py b/torch_npu/profiler/dynamic_profile.py index 2e248b24cd..511a6f4cf8 100644 --- a/torch_npu/profiler/dynamic_profile.py +++ b/torch_npu/profiler/dynamic_profile.py @@ -78,7 +78,7 @@ class _DynamicProfile: DynamicProfilerUtils.out_log("Stop Dynamic Profiler at {} step.".format( self.cur_step), DynamicProfilerUtils.LoggerLevelEnum.INFO) elif self.prof is None and self.cfg_ctx is not None and self.cur_step == self.cfg_ctx.start_step(): - self.step_num = self.cfg_ctx.active() + self.step_num = self.cfg_ctx.active() + self.cfg_ctx.warmup() self.enable_prof() self.cfg_ctx = None @@ -113,14 +113,14 @@ class _DynamicProfile: enable_config_path), DynamicProfilerUtils.LoggerLevelEnum.ERROR) return self.cfg_ctx = ConfigContext(json_data) - self.step_num = self.cfg_ctx.active() + self.step_num = self.cfg_ctx.active() + self.cfg_ctx.warmup() self.enable_prof() self.cfg_ctx = None def enable_prof(self): self.prof = profile( activities=self.cfg_ctx.activities(), - schedule=schedule(wait=0, warmup=0, active=self.cfg_ctx.active(), repeat=1, skip_first=0), + schedule=schedule(wait=0, warmup=self.cfg_ctx.warmup(), active=self.cfg_ctx.active(), repeat=1, skip_first=0), on_trace_ready=tensorboard_trace_handler(self.cfg_ctx.prof_path, analyse_flag=self.cfg_ctx.analyse(), async_mode=self.cfg_ctx.async_mode()), record_shapes=self.cfg_ctx.record_shapes, -- Gitee From c2f19a555890aa8db080025a03c2c9d4ff5faa0b Mon Sep 17 00:00:00 2001 From: Gallium Date: Wed, 5 Mar 2025 13:55:36 +0000 Subject: [PATCH 090/358] !18534 adjust mstx msg format Merge pull request !18534 from Gallium/mstx_v2.6.0 --- .../csrc/distributed/ProcessGroupHCCL.cpp | 59 +++++++++++-------- .../csrc/distributed/ProcessGroupHCCL.hpp | 4 +- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 9b24ccca55..eb9c1f53ac 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2127,7 +2127,7 @@ std::string mapToJson(const std::unordered_map& map) if (!first) { ss << ","; } - ss << "\"" << pair.first << "\"" << ": " << "\"" << pair.second << "\""; + ss << "\\\"" << pair.first << "\\\"" << ": " << "\\\"" << pair.second << "\\\""; first = false; } ss << "}"; @@ -2135,7 +2135,8 @@ std::string mapToJson(const std::unordered_map& map) } std::string ProcessGroupHCCL::getMstxHcclMsg( - const std::string &opName, uint64_t dataCnt, HcclDataType dataType, HcclComm comm, int64_t streamId) + const std::string &opName, uint64_t dataCnt, HcclDataType dataType, HcclComm comm, int64_t streamId, + int srcRank, int dstRank) { const static std::map dataTypes = { {HCCL_DATA_TYPE_INT8, "int8"}, @@ -2172,6 +2173,12 @@ std::string ProcessGroupHCCL::getMstxHcclMsg( if (iter != dataTypes.end()) { data_type_str = iter->second; } + if (srcRank != -1) { + msgDict["srcRank"] = std::to_string(srcRank); + } 
+ if (dstRank != -1) { + msgDict["destRank"] = std::to_string(dstRank); + } msgDict["dataType"] = data_type_str; msgDict["count"] = std::to_string(dataCnt); msgDict["streamId"] = std::to_string(streamId); @@ -2818,7 +2825,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce( auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllReduce( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -2895,7 +2902,7 @@ c10::intrusive_ptr ProcessGroupHCCL::batch_isend_irecv( }; } torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclBatchSendRecv", sendRecvInfo[0].count, sendRecvInfo[0].dataType, comm, streamId), + getMstxHcclMsg("HcclBatchSendRecv", sendRecvInfo[0].count, sendRecvInfo[0].dataType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclBatchIsendIrecv(sendRecvInfo, itemNum, comm, stream.stream(false)); *is_dispatched = true; @@ -2943,7 +2950,7 @@ c10::intrusive_ptr ProcessGroupHCCL::broadcast( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, numel, hcclType, root, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclBroadcast(inputDataPtr, numel, hcclType, root, comm, stream.stream(false)); *is_dispatched = true; @@ -2992,7 +2999,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allreduce_coalesced( auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclAllreduce", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllReduce( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -3060,7 +3067,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce( auto reduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduce", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclReduce", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduce( inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream.stream(false)); @@ -3120,7 +3127,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( auto reduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream, is_dispatched, streamId]() -> int { 
torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduce", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclReduce", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduce( inputDataPtr, outputDataPtr, numel, hcclType, reduceOp, rank, comm, stream.stream(false)); @@ -3243,7 +3250,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base_uneven( is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm, streamId), + getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduceScatterV( inputDataPtr, @@ -3329,7 +3336,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base_uneven( is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm, streamId), + getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAllGatherV( inputDataPtr, @@ -3400,7 +3407,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; @@ -3479,7 +3486,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm, streamId), + getMstxHcclMsg("HcclAllGatherV", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAllGatherV( inputDataPtr, @@ -3555,7 +3562,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather( auto numel = getNumelForHCCL(input); auto hcclType = getHcclDataType(input.scalar_type()); torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclBroadcast", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclBroadcast(inputDataPtr, numel, hcclType, root, comm, stream.stream()); *is_dispatched = true; @@ -3595,7 +3602,7 @@ c10::intrusive_ptr ProcessGroupHCCL::allgather_into_tensor_coalesced auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; @@ -3641,7 +3648,7 @@ c10::intrusive_ptr 
ProcessGroupHCCL::allgather_togather( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; @@ -3692,7 +3699,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclAllGather", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclAllGather(inputDataPtr, outputDataPtr, numel, hcclType, comm, stream.stream(false)); *is_dispatched = true; @@ -3739,7 +3746,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclReduceScatter( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -3822,7 +3829,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm, streamId), + getMstxHcclMsg("HcclReduceScatterV", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclReduceScatterV( inputDataPtr, @@ -3929,7 +3936,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base( auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclReduceScatter( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -3975,7 +3982,7 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter_tensor_coalesced auto hcclReduceOp = getHcclReduceOp(opts.reduceOp, input); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclReduceScatter", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), 
torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclReduceScatter( inputDataPtr, outputDataPtr, numel, hcclType, hcclReduceOp, comm, stream.stream(false)); @@ -4111,7 +4118,7 @@ c10::intrusive_ptr ProcessGroupHCCL::scatter( auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, outputDataPtr, numel, hcclType, root, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclScatter", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclScatter", numel, hcclType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclScatter(inputDataPtr, outputDataPtr, numel, hcclType, root, comm, stream.stream(false)); *is_dispatched = true; @@ -4163,7 +4170,7 @@ c10::intrusive_ptr ProcessGroupHCCL::send(std::vector& t auto hcclType = getHcclDataType(input.scalar_type()); auto hccl_call = [inputDataPtr, numel, hcclType, dst_rank, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclSend", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclSend", numel, hcclType, comm, streamId, -1, dst_rank), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclSend(inputDataPtr, numel, hcclType, dst_rank, comm, stream.stream(false)); *is_dispatched = true; @@ -4198,7 +4205,7 @@ c10::intrusive_ptr ProcessGroupHCCL::recv(std::vector& t auto hcclType = getHcclDataType(output.scalar_type()); auto hccl_call = [outputDataPtr, numel, hcclType, src_rank, comm, stream, is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclRecv", numel, hcclType, comm, streamId), stream.stream(false), + getMstxHcclMsg("HcclRecv", numel, hcclType, comm, streamId, src_rank, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = HcclRecv(outputDataPtr, numel, hcclType, src_rank, comm, stream.stream(false)); *is_dispatched = true; @@ -4296,7 +4303,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( is_dispatched, streamId]() -> int { torch_npu::profiler::MstxRange range( - getMstxHcclMsg("HcclAlltoAll", input_counts, inputhcclDataType, comm, streamId), + getMstxHcclMsg("HcclAlltoAll", input_counts, inputhcclDataType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAlltoAll( inputDataPtr, @@ -4396,7 +4403,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( streamId]() -> int { torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclAlltoAllV", static_cast(inputCounts.size()), - inputhcclDataType, comm, streamId), + inputhcclDataType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAlltoAllV( inputDataPtr, @@ -4523,7 +4530,7 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall( streamId]() -> int { torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclAlltoAllV", static_cast(input_counts.size()), - inputhcclDataType, comm, streamId), + inputhcclDataType, comm, streamId, -1, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); auto hccl_result = hcclAlltoAllV( inputDataPtr, diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 8c1414687c..7345739279 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp 
@@ -784,7 +784,9 @@ private: uint64_t dataCnt, HcclDataType hcclType, HcclComm comm, - int64_t streamId); + int64_t streamId, + int srcRank, + int dstRank); std::unordered_map> silenceCheckCache_; -- Gitee From 47d09f2270b9d997e5c01f01b9e12b36f6d3b684 Mon Sep 17 00:00:00 2001 From: Gallium Date: Wed, 5 Mar 2025 13:56:34 +0000 Subject: [PATCH 091/358] !18310 dynamic_profiler adapt dynolog params Merge pull request !18310 from Gallium/dynolog_v2.6.0 --- .../_dynamic_profiler_config_context.py | 17 +++++++++++++---- .../_dynamic_profiler_monitor.py | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index 6a94a308f8..27060c4958 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -30,11 +30,15 @@ class ConfigContext: self._meta_data = {} self._async_mode = False self._is_dyno = DynamicProfilerUtils.is_dyno_model() + self._is_dyno_monitor = False self._rank_id = DynamicProfilerUtils.get_rank_id() self.parse(json_data) def parse(self, json_data: dict): self.is_valid = json_data.get("is_valid", False) + self._is_dyno_monitor = "NPU_MONITOR_START" in json_data + if self._is_dyno_monitor: + return self._parse_activity(json_data) self._parse_prof_dir(json_data) self._meta_data = json_data.get('metadata', {}) @@ -108,7 +112,7 @@ class ConfigContext: if not self._is_dyno: self.record_shapes = json_data.get('record_shapes', False) else: - record_shapes = json_data.get("PROFILE_REPORT_INPUT_SHAPES") + record_shapes = json_data.get("PROFILE_RECORD_SHAPES") if isinstance(record_shapes, str): self.record_shapes = self.BOOL_MAP.get(record_shapes.lower(), False) else: @@ -193,12 +197,14 @@ class ConfigContext: op_attr = json_data.get('PROFILE_OP_ATTR', 'false') op_attr = self.BOOL_MAP.get(op_attr.lower(), False) gc_detect_threshold = json_data.get('PROFILE_GC_DETECT_THRESHOLD', None) + if gc_detect_threshold is not None: + gc_detect_threshold = float(gc_detect_threshold) data_simplification = json_data.get('PROFILE_DATA_SIMPLIFICATION', 'true') data_simplification = self.BOOL_MAP.get(data_simplification.lower(), True) - record_op_args = json_data.get('PROFILE_RECORD_SHAPES', 'false') - record_op_args = self.BOOL_MAP.get(record_op_args.lower(), False) + record_op_args = False export_type = json_data.get('PROFILE_EXPORT_TYPE', 'text').lower() - msprof_tx = False + msprof_tx = json_data.get('PROFILE_MSPROF_TX', 'false') + msprof_tx = self.BOOL_MAP.get(msprof_tx.lower(), False) self.experimental_config = _ExperimentalConfig( profiler_level=profiler_level, @@ -317,6 +323,9 @@ class ConfigContext: def experimental_config(self) -> _ExperimentalConfig: return self.experimental_config + def is_dyno_monitor(self) -> bool: + return self._is_dyno_monitor + @staticmethod def profiler_cfg_json_to_bytes(json_dict: dict) -> bytes: cfg_json_str = json.dumps(json_dict) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py index 0a27e3cb70..eb95cce837 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py @@ -58,6 +58,9 @@ class DynamicProfilerMonitor: self.prof_cfg_context = ConfigContext(json_data) if not self.prof_cfg_context.valid(): return None + 
if self.prof_cfg_context.is_dyno_monitor(): + self._call_dyno_monitor(json_data) + return None return self.prof_cfg_context def clean_resource(self): @@ -104,6 +107,17 @@ class DynamicProfilerMonitor: DynamicProfilerUtils.out_log("Rank {} no need to create process.".format( self._rank_id), DynamicProfilerUtils.LoggerLevelEnum.INFO) + def _call_dyno_monitor(self, json_data: dict): + json_data = {key: str(value) for key, value in json_data.items()} + try: + from IPCMonitor import PyDynamicMonitorProxy + except Exception as e: + dynamic_profiler_utils.stdout_log(f"Import IPCMonitro module failed :{e}!", + dynamic_profiler_utils.LoggerLevelEnum.WARNING) + return + py = PyDynamicMonitorProxy() + py.enable_dyno_npu_monitor(json_data) + def worker_func(params_dict): """ Json monitor process worker function python version >= 3.8""" -- Gitee From fbb6e41f0d87e416cc9485ec04df173971df1492 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Wed, 5 Mar 2025 14:37:23 +0000 Subject: [PATCH 092/358] =?UTF-8?q?!18571=20remove=20lccl=20register=20for?= =?UTF-8?q?=20processgroup=20Merge=20pull=20request=20!18571=20from=20?= =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 5872eddcc0..b8c0457fc3 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -238,10 +238,6 @@ def _new_process_group_lccl_helper(dist_backend_opts, pg_options): return torch_npu._C._distributed_c10d.ProcessGroupLCCL(store, group_rank, group_size) -torch.distributed.Backend.register_backend("lccl", lambda dist_backend_opts, pg_options: - _new_process_group_lccl_helper(dist_backend_opts, pg_options), extended_api=True, devices=["npu"]) - - # set default device type for gradient checkpointing DefaultDeviceType.set_device_type("npu") del DefaultDeviceType -- Gitee From 8fdd8b2b2b57ecf037598668e92b084b96c3fb72 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 5 Mar 2025 15:15:23 +0000 Subject: [PATCH 093/358] !18580 Update op_plugin commit id Merge pull request !18580 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index afa134ef2f..91e5f9a624 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit afa134ef2ff35c049320b0b4a129bdc3553dda81 +Subproject commit 91e5f9a6244a3e8faa3a0b1fb4a40fb8a97c204d -- Gitee From 3c97ede99cd4b2d971f90b773b2f383ed6d901c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Thu, 6 Mar 2025 01:16:43 +0000 Subject: [PATCH 094/358] =?UTF-8?q?!18555=20Support=20qos=20and=20aiv=20in?= =?UTF-8?q?=20options.hccl=5Fconfig=20Merge=20pull=20request=20!18555=20fr?= =?UTF-8?q?om=20=E5=8F=B6=E5=AD=90=E5=87=A1/v2.6.0=5Fqos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/distributed/test_options.py | 52 +++++++++++++++++-- third_party/hccl/inc/hccl/hccl.h | 3 ++ third_party/hccl/inc/hccl/hccl_types.h | 9 +++- .../csrc/distributed/ProcessGroupHCCL.cpp | 24 +++++++++ 4 files changed, 82 insertions(+), 6 deletions(-) diff --git a/test/distributed/test_options.py b/test/distributed/test_options.py index 12aeffbb64..7f31b3ed4a 100644 --- a/test/distributed/test_options.py +++ b/test/distributed/test_options.py @@ -42,15 +42,31 @@ class OptionsTest(TestCase): 
dist.all_reduce(input1, group=pg) @classmethod - def _test_options_wrong_type(cls, rank, ranks, world_size, input1): + def _test_options_wrong_type(cls, rank, hccl_config, error_expect, world_size, input1): options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options() - options.hccl_config = {"group_name": 123} + options.hccl_config = hccl_config input1 = input1.npu() test_case = TestCase() - with test_case.assertRaisesRegex(RuntimeError, "Value type of group_name should be string"): + with test_case.assertRaisesRegex(RuntimeError, error_expect): OptionsTest._init_dist_hccl(rank, options, world_size) dist.all_reduce(input1) + @classmethod + def _test_options_group_name_wrong_types(cls, rank, ranks, world_size, input1): + cls._test_options_wrong_type(rank, {"group_name": 123}, "Value type of group_name should be string", world_size, input1) + + @classmethod + def _test_options_qos_traffic_class_wrong_types(cls, rank, ranks, world_size, input1): + cls._test_options_wrong_type(rank, {"qos_traffic_class": "123"}, "Value type of qos_traffic_class should be int.", world_size, input1) + + @classmethod + def _test_options_qos_service_level_wrong_types(cls, rank, ranks, world_size, input1): + cls._test_options_wrong_type(rank, {"qos_service_level": "123"}, "Value type of qos_service_level should be int.", world_size, input1) + + @classmethod + def _test_options_hccl_op_expansion_mode_wrong_types(cls, rank, ranks, world_size, input1): + cls._test_options_wrong_type(rank, {"hccl_op_expansion_mode": "123"}, "Value type of hccl_op_expansion_mode should be int.", world_size, input1) + def _test_multiprocess(self, f, input1, world_size): ctx = mp.get_context('spawn') @@ -79,14 +95,40 @@ class OptionsTest(TestCase): input1, world_size) @skipIfUnsupportMultiNPU(2) - def test_options_wrong_type(self): + def test_options_group_name_wrong_type(self): + ranks = [2] + shape = [np.int32, 0, [2, 3, 16]] + for world_size in ranks: + exp_input, input1 = create_common_tensor(shape, -10, 10) + self._test_multiprocess(OptionsTest._test_options_group_name_wrong_types, + input1, world_size) + + @skipIfUnsupportMultiNPU(2) + def test_options_qos_traffic_class_wrong_type(self): ranks = [2] shape = [np.int32, 0, [2, 3, 16]] for world_size in ranks: exp_input, input1 = create_common_tensor(shape, -10, 10) - self._test_multiprocess(OptionsTest._test_options_wrong_type, + self._test_multiprocess(OptionsTest._test_options_qos_traffic_class_wrong_types, input1, world_size) + @skipIfUnsupportMultiNPU(2) + def test_options_qos_service_level_wrong_type(self): + ranks = [2] + shape = [np.int32, 0, [2, 3, 16]] + for world_size in ranks: + exp_input, input1 = create_common_tensor(shape, -10, 10) + self._test_multiprocess(OptionsTest._test_options_qos_service_level_wrong_types, + input1, world_size) + + @skipIfUnsupportMultiNPU(2) + def test_options_hccl_op_expansion_mode_wrong_type(self): + ranks = [2] + shape = [np.int32, 0, [2, 3, 16]] + for world_size in ranks: + exp_input, input1 = create_common_tensor(shape, -10, 10) + self._test_multiprocess(OptionsTest._test_options_hccl_op_expansion_mode_wrong_types, + input1, world_size) if __name__ == '__main__': run_tests() diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index da30721fad..ab48964d8c 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -207,6 +207,9 @@ inline void HcclCommConfigInit(HcclCommConfig *config) config->hcclDeterministic = HCCL_COMM_DEFAULT_DETERMINISTIC; config->hcclCommName[0] = '\0'; 
config->hcclUdi[0] = '\0'; + config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; + config->hcclRdmaServiceLevel = HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET; + config->hcclOpExpansionMode = HCCL_COMM_DEFAULT_OP_EXPANSION_MODE; } /** diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index aa653995ea..40631676c1 100644 --- a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -15,11 +15,14 @@ extern "C" { const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; -const uint32_t HCCL_COMM_CONFIG_VERSION = 3; +const uint32_t HCCL_COMM_CONFIG_VERSION = 5; const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations const uint32_t COMM_NAME_MAX_LENGTH = 128; const uint32_t UDI_MAX_LENGTH = 128; +const uint32_t HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET = 0xffffffff; +const uint32_t HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET = 0xffffffff; +const uint32_t HCCL_COMM_DEFAULT_OP_EXPANSION_MODE = 0; /** * @brief HCCL functions return value definition @@ -126,12 +129,16 @@ typedef struct HcclCommConfigDef { uint32_t hcclDeterministic; char hcclCommName[COMM_NAME_MAX_LENGTH]; char hcclUdi[UDI_MAX_LENGTH]; + uint32_t hcclOpExpansionMode; + uint32_t hcclRdmaTrafficClass; + uint32_t hcclRdmaServiceLevel; } HcclCommConfig; typedef enum { HCCL_COMM_CONFIG_BUFFER_SIZE = 0, HCCL_COMM_CONFIG_DETERMINISTIC = 1, HCCL_COMM_CONFIG_COMM_NAME = 2, + HCCL_COMM_CONFIG_OP_EXPANSION_MODE = 3, HCCL_COMM_CONFIG_RESERVED, } HcclCommConfigCapability; #ifdef __cplusplus diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index eb9c1f53ac..7224e084d1 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2262,6 +2262,30 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() } } + if (options_->hccl_config.find("qos_traffic_class") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["qos_traffic_class"])) { + config.hcclRdmaTrafficClass = std::get(options_->hccl_config["qos_traffic_class"]); + } else { + TORCH_CHECK(false, "Value type of qos_traffic_class should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + + if (options_->hccl_config.find("qos_service_level") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["qos_service_level"])) { + config.hcclRdmaServiceLevel = std::get(options_->hccl_config["qos_service_level"]); + } else { + TORCH_CHECK(false, "Value type of qos_service_level should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + + if (options_->hccl_config.find("hccl_op_expansion_mode") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_op_expansion_mode"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_op_expansion_mode"]); + } else { + TORCH_CHECK(false, "Value type of hccl_op_expansion_mode should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + return config; } -- Gitee From 7ecaa45aba97486d217352ca09535447dc0d2fc4 Mon Sep 17 00:00:00 2001 From: will-devil Date: Thu, 6 Mar 2025 01:17:45 +0000 Subject: [PATCH 095/358] !18463 [Feature] fsdp2 testcase 0/N. 
Merge pull request !18463 from will-devil/fsdp2-26-0 --- .../.pytorch-disabled-tests.json | 5 ++ .../distributed/fsdp/_fsdp_collectives.py | 14 +---- torch_npu/testing/_internal/common_fsdp.py | 62 +++++++++++++++++++ 3 files changed, 69 insertions(+), 12 deletions(-) create mode 100644 torch_npu/testing/_internal/common_fsdp.py diff --git a/test/unsupported_test_cases/.pytorch-disabled-tests.json b/test/unsupported_test_cases/.pytorch-disabled-tests.json index 8e513fc559..e5d41870b6 100644 --- a/test/unsupported_test_cases/.pytorch-disabled-tests.json +++ b/test/unsupported_test_cases/.pytorch-disabled-tests.json @@ -14,6 +14,11 @@ "test_split_to_graph_and_name_node_map (fx.test_matcher_utils.TestMatcher)": ["", [""]], "test_typed_storage_internal_no_warning (__main__.TestTorch)": ["", [""]], "test_nn_module_tests (__main__.TestComplexity)": ["", [""]], + "test_not_import_sympy (main.TestImports)": ["", [""]], + "test_no_warning_on_import (main.TestImports)": ["", [""]], + "test_no_mutate_global_logging_on_import_path_torch (main.TestImports)": ["", [""]], + "test_no_mutate_global_logging_on_import_path_functorch (main.TestImports)": ["", [""]], + "test_lazy_imports_are_lazy (main.TestImports)": ["", [""]], "test_deprecation_transforms_transform_functionalize_npu (__main__.TestComposabilityPRIVATEUSE1)": ["", [""]], "test_deprecation_transforms_transform_grad_and_value_npu (__main__.TestComposabilityPRIVATEUSE1)": ["", [""]], "test_basic_sum_npu (__main__.TestHigherOrderOperatorInteractionPRIVATEUSE1)": ["", [""]], diff --git a/torch_npu/distributed/fsdp/_fsdp_collectives.py b/torch_npu/distributed/fsdp/_fsdp_collectives.py index 37b049a8a2..a1c203ffe0 100644 --- a/torch_npu/distributed/fsdp/_fsdp_collectives.py +++ b/torch_npu/distributed/fsdp/_fsdp_collectives.py @@ -13,6 +13,8 @@ def chunk_cat( num_chunks: int, out: torch.Tensor, ) -> None: + tensors = [tensor.contiguous() for tensor in tensors] + out = out.contiguous() torch._chunk_cat(tensors, dim, num_chunks, out=out) @@ -36,15 +38,3 @@ def all_gather_copy_in_npu( with torch.no_grad(): torch._foreach_copy_(foreach_copy_dsts, all_gather_inputs) return all_gather_input, all_gather_output - - -@torch.library.impl(lib, "split_with_sizes_copy", "PrivateUse1") -def split_with_sizes_copy( - all_gather_output: torch.Tensor, - all_gather_input_split_sizes: List[int], - dim: int, - out: List[torch.Tensor], -) -> None: - torch.split_with_sizes_copy( - all_gather_output, all_gather_input_split_sizes, dim=dim, out=out - ) diff --git a/torch_npu/testing/_internal/common_fsdp.py b/torch_npu/testing/_internal/common_fsdp.py new file mode 100644 index 0000000000..973dfa6e2d --- /dev/null +++ b/torch_npu/testing/_internal/common_fsdp.py @@ -0,0 +1,62 @@ +import torch +import torch.distributed as dist +from torch.testing._internal.common_fsdp import FSDPTest + +import torch_npu + +torch.testing._internal.common_fsdp.DEVICE_TYPE = "npu" +torch.testing._internal.common_fsdp.DISTRIBUTED_BACKEND = "hccl" +torch.testing._internal.common_fsdp.DEVICE_COUNT = torch.npu.device_count() + + +class FSDPNPUTest(FSDPTest): + @classmethod + def _run(cls, rank, test_name, file_name, pipe, **kwargs): + self = cls(test_name) + self.rank = rank + self.file_name = file_name + fake_pg = kwargs.get("fake_pg", False) + + print(f"dist init r={self.rank}, world={self.world_size}") + + # Specify gloo backend to make 'init_process_group()' succeed, + # Actual tests will be skipped if there is no enough GPUs. 
+ try: + if fake_pg: + store = torch.testing._internal.distributed.fake_pg.FakeStore() + dist.init_process_group( + backend="fake", + world_size=self.world_size, + rank=rank, + store=store, + ) + else: + dist.init_process_group( + init_method=self.init_method, + backend="hccl", + world_size=int(self.world_size), + rank=self.rank, + ) + except RuntimeError as e: + if "recompile" in e.args[0]: + sys.exit(TEST_SKIPS["backend_unavailable"].exit_code) + + raise + + device_ids = None + device_id = self.rank % torch.npu.device_count() + torch.npu.set_device(device_id) + device_ids = [device_id] + + # Execute barrier prior to running test to ensure that every process + # has finished initialization and that the following test + # immediately exiting due to a skip doesn't cause flakiness. + dist.barrier(device_ids=device_ids) + + torch._dynamo.reset() + self.run_test(test_name, pipe) + torch._dynamo.reset() + + dist.barrier(device_ids=device_ids) + + dist.destroy_process_group() -- Gitee From d50ceaaa9754a56ada4e58c38b313695530c5db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Thu, 6 Mar 2025 03:13:48 +0000 Subject: [PATCH 096/358] =?UTF-8?q?!18532=20Cleancode=20Merge=20pull=20req?= =?UTF-8?q?uest=20!18532=20from=20=E5=A7=9C=E6=80=A1=E6=96=87/v2.6.1=5Fcle?= =?UTF-8?q?an?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 +- torch_npu/csrc/framework/LazyInitAclops.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 6b7a065882..5ec37f44fc 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -164,7 +164,7 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) c10_npu::acl::AclrtSetDeviceSatMode(aclrtFloatOverflowMode::ACL_RT_OVERFLOW_MODE_SATURATION); } - int acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); + auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); if (acl_op_init_mode == 0) { at_npu::aclops::InitAclops(); } else { diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp index 25e58d2e78..03b0814659 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -176,7 +176,7 @@ void LazyInitAclopsCore() void LazyInitAclops() { - static int acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); + static auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); if (acl_op_init_mode == 0) { return; } -- Gitee From 8647e97f118188c63d063c65da295be7667fc7ec Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 6 Mar 2025 03:28:46 +0000 Subject: [PATCH 097/358] !18425 Update torchair commit id Merge pull request !18425 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 018466a50b..890aefe634 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 018466a50b704ffe0df5a38dd77e2d50a2d27afe +Subproject commit 890aefe63407b37372abe053fb549c8433bb6ae1 -- Gitee From 2064d241a930493b3ef6462d033efbcb8f29a502 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 6 Mar 2025 04:45:23 +0000 Subject: [PATCH 
098/358] !18599 Update op_plugin commit id Merge pull request !18599 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 91e5f9a624..8670a345f2 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 91e5f9a6244a3e8faa3a0b1fb4a40fb8a97c204d +Subproject commit 8670a345f235d4a8ee590668d1a26b259e5eed5e -- Gitee From b01ba2d9ec193e6e96675ef01f5fa1d742da6f03 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Thu, 6 Mar 2025 06:12:49 +0000 Subject: [PATCH 099/358] !18566 Update compatibility info Merge pull request !18566 from yuhaiyan/v2.6.0-dev1 --- test/npu/test_npu.py | 4 ---- test/torch_npu_schema.json | 4 ++++ torch_npu/csrc/core/npu/register/OptionsManager.cpp | 8 ++++++++ torch_npu/csrc/core/npu/register/OptionsManager.h | 12 ++++++++++++ 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/test/npu/test_npu.py b/test/npu/test_npu.py index ab8651b922..17aed71f04 100644 --- a/test/npu/test_npu.py +++ b/test/npu/test_npu.py @@ -337,10 +337,6 @@ class TestNpu(TestCase): # Pushes an 0.1 second spin to stream so if the copy is non blocking, # stream will almost surely be active when we query(). b = a.to(device=dst, non_blocking=non_blocking) - c = torch.mean(a) - d = torch.mean(b) - e = a.to(device=dst, non_blocking=non_blocking) - self.assertEqual(stream.query(), not non_blocking) stream.synchronize() self.assertEqual(a, b) self.assertTrue(b.is_pinned() == (non_blocking and dst == "cpu")) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index e6320eb7cc..f664647d21 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -3214,5 +3214,9 @@ "torch_c_func: c10_npu::c10_npu_get_error_message": { "signature": "() -> char *", "file": "torch_npu/csrc/core/npu/NPUException.h" + }, + "torch_c_func: npu_dropout_gen_mask": { + "signature": "(const at::Tensor &self, at::IntArrayRef size, double p, int64_t seed, int64_t offset, c10::optional parallel, c10::optional sync) -> at::Tensor", + "file": "third_party/op-plugin/op_plugin/include/ops.h" } } \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 7ec743d178..9160857a47 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -24,6 +24,10 @@ bool OptionsManager::IsHcclZeroCopyEnable() { const static bool isHcclZeroCopyEnable = []() -> bool { int32_t enable = OptionsManager::GetBoolTypeOption("TORCH_HCCL_ZERO_COPY", 0); + std::unordered_map hcclZeroCopyMode = getHcclZeroCopyMode(); + if (hcclZeroCopyMode.find(enable) == hcclZeroCopyMode.end()) { + TORCH_CHECK(false, "TORCH_HCCL_ZERO_COPY should be 0 or 1.", PTA_ERROR(ErrCode::VALUE)); + } return enable != 0; }(); return isHcclZeroCopyEnable; @@ -577,6 +581,10 @@ void OptionsManager::IsOomSnapshotEnable() #ifndef BUILD_LIBTORCH char* env_val = std::getenv("OOM_SNAPSHOT_ENABLE"); int64_t envFlag = (env_val != nullptr) ? 
strtol(env_val, nullptr, 10) : 0; + std::unordered_map OOMSnapshotEnableMode = getOOMSnapshotEnableMode(); + if (OOMSnapshotEnableMode.find(envFlag) == OOMSnapshotEnableMode.end()) { + TORCH_CHECK(false, "OOM_SNAPSHOT_ENABLE should be 0, 1 or 2", PTA_ERROR(ErrCode::VALUE)); + } switch (envFlag) { case 0: break; diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index be7445b626..56ffcc0e63 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -23,6 +23,12 @@ enum SilenceCheckMode { PRINT_ALL_LOG = 3, }; +static std::unordered_map getHcclZeroCopyMode() +{ + std::unordered_map hcclZeroCopyMode = {{0, "close"}, {1, "open"}}; + return hcclZeroCopyMode; +} + static std::unordered_map getInfNanMode() { std::unordered_map infNanMode = {{0, "max"}, {1, "inf_nan"}}; @@ -83,6 +89,12 @@ static std::unordered_map getAclOpInitMode() return aclOpInitMode; } +static std::unordered_map getOOMSnapshotEnableMode() +{ + std::unordered_map OOMSnapshotEnableMode = {{0, "close"}, {1, "all"}, {2, "state"}}; + return OOMSnapshotEnableMode; +} + class OptionsManager { public: static bool IsHcclZeroCopyEnable(); -- Gitee From 9c1007542379348f6cb7fc65b6efb19e97589e89 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 6 Mar 2025 11:00:23 +0000 Subject: [PATCH 100/358] !18626 Update op_plugin commit id Merge pull request !18626 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 8670a345f2..cb54c183a5 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 8670a345f235d4a8ee590668d1a26b259e5eed5e +Subproject commit cb54c183a59b356c46192fcca638d6f7d7c56c88 -- Gitee From 5fe1f4f6d618db115dfe690f659a9b85a1eb300a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 6 Mar 2025 11:18:27 +0000 Subject: [PATCH 101/358] =?UTF-8?q?!18611=20SilentCheck:=20Change=20the=20?= =?UTF-8?q?logging=20level=20of=20the=20debug=20logs=20to=20debug.=20Merge?= =?UTF-8?q?=20pull=20request=20!18611=20from=20=E7=8E=8B=E8=B6=85/v2.6.0?= =?UTF-8?q?=5Fsilentperf1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/_step.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index 9935214e9d..9ebf364054 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -51,7 +51,7 @@ loggerSilent = logging.getLogger("torch_npu.silent_check") def input_hook(idx, asd_flag): def hook(grad): global IS_IN_BACKWARD - loggerSilent.info(f"input_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to False. idx is {idx}, flag is {asd_flag}") + loggerSilent.debug(f"input_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to False. 
idx is {idx}, flag is {asd_flag}") IS_IN_BACKWARD = False torch_npu._C._npu_set_call_state("forward") _silent_fault_detector_v2.silent_fault_check(idx, asd_flag, grad) @@ -61,7 +61,7 @@ def input_hook(idx, asd_flag): def output_hook(grad): global IS_IN_BACKWARD - loggerSilent.info(f"output_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to True.") + loggerSilent.debug(f"output_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to True.") IS_IN_BACKWARD = True torch_npu._C._npu_set_call_state("backward") return grad @@ -132,7 +132,7 @@ class SilentCheckState: if self.last_weight is not None and self.first_weight is not None: # Otherwise, there is only one weight in the outer module if self.first_weight_id != self.last_weight_id: - loggerSilent.info(f"init_all_hook: module init, first_module_id is {self.first_module_id}.") + loggerSilent.debug(f"init_all_hook: module init, first_module_id is {self.first_module_id}.") if self.last_weight_hook_handles.get(self.first_module_id, None) is None: last_weight_handle = self.last_weight.register_hook(output_hook) self.last_weight_hook_handles[self.first_module_id] = last_weight_handle @@ -140,7 +140,7 @@ class SilentCheckState: first_weight_handle = self.first_weight.register_hook(input_hook(self.first_module_id, asd_flag)) self.weight_hook_handles[self.first_module_id] = first_weight_handle else: - loggerSilent.info(f"init_all_hook: module only have one weight, first_module_id is {self.first_module_id}.") + loggerSilent.debug(f"init_all_hook: module only have one weight, first_module_id is {self.first_module_id}.") self.init_marks[self.first_module_id] = True @@ -333,7 +333,7 @@ def add_perf_dump_patch(): elif torch_npu._C._get_silent_check_version() == 2: warnings.warn(f"Warning: CANN version lower than 8.0.0 and currently does not support silent check 3.0 version. It will switch to 2.0 version. The asd_detect is {asd_enable}") else: - loggerSilent.info(f"Silent check 3.0 version will be enabled. The asd_detect is {asd_enable}") + loggerSilent.debug(f"Silent check 3.0 version will be enabled. 
The asd_detect is {asd_enable}") if perf_dump_enable or asd_enable: Module.__call__ = _custom_call -- Gitee From 8a2be48d04e739e83a2cc2f9cadf6e2239895a4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Thu, 6 Mar 2025 12:17:25 +0000 Subject: [PATCH 102/358] =?UTF-8?q?!18540=20Add=20debug=20info=20in=20NPUW?= =?UTF-8?q?orkspaceAllocator=20empty=5Fcache=20Merge=20pull=20request=20!1?= =?UTF-8?q?8540=20from=20=E5=A7=9C=E6=80=A1=E6=96=87/v2.6.0=5Fec?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 9ab1e5c032..753089c393 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -121,7 +121,10 @@ public: void empty_cache(bool need_empty_queue, bool check_error) { if (need_empty_queue) { + ASCEND_LOGI("NPUWorkspaceAllocator empty_cache in main_thread."); c10_npu::emptyAllNPUStream(check_error); + } else { + ASCEND_LOGI("NPUWorkspaceAllocator empty_cache in acl_thread."); } auto acl_ret = c10_npu::acl::AclrtSynchronizeDeviceWithTimeout(); -- Gitee From bf7d92fdb39982f26852c792f7ad8c923d17e8fe Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Thu, 6 Mar 2025 12:21:28 +0000 Subject: [PATCH 103/358] !18618 Change the Time of Fine-Grained Core Binding Merge pull request !18618 from SCh-zx/v2.6.0 --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 5ec37f44fc..42d80b0792 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -189,10 +189,10 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) lazy_fn_.clear(); - SetThreadAffinity(device_id_); - GetAffinityInfo(); + SetThreadAffinity(device_id_); + init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); -- Gitee From faa9477484fe1aaaa5fcd4a122d9298790fab194 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 6 Mar 2025 14:00:24 +0000 Subject: [PATCH 104/358] !18634 Update op_plugin commit id Merge pull request !18634 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index cb54c183a5..9efaf481f6 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit cb54c183a59b356c46192fcca638d6f7d7c56c88 +Subproject commit 9efaf481f6a12a395cc392fc768718e680763522 -- Gitee From 8e5557881546b88728386cbe400e7f0dba4ede5c Mon Sep 17 00:00:00 2001 From: shaoyf Date: Thu, 6 Mar 2025 16:37:26 +0000 Subject: [PATCH 105/358] =?UTF-8?q?!18645=20=E5=9B=9E=E9=80=80=20'Pull=20R?= =?UTF-8?q?equest=20!18555=20:=20Support=20qos=20and=20aiv=20in=20options.?= =?UTF-8?q?hccl=5Fconfig'=20Merge=20pull=20request=20!18645=20from=20shaoy?= =?UTF-8?q?f/revert-merge-18555-v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/distributed/test_options.py | 52 ++----------------- third_party/hccl/inc/hccl/hccl.h | 3 -- third_party/hccl/inc/hccl/hccl_types.h | 9 +--- .../csrc/distributed/ProcessGroupHCCL.cpp | 24 --------- 4 files changed, 6 insertions(+), 82 deletions(-) 
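(Editor's illustration; not part of any patch in this series.) For context on the revert below: the "qos_traffic_class", "qos_service_level" and "hccl_op_expansion_mode" keys it removes were only reachable through ProcessGroupHCCL.Options().hccl_config, as exercised by the test_options.py cases added in !18555, while "group_name" handling is left intact. A minimal Python sketch of that call path follows; the numeric values are placeholders and the use of pg_options is an assumption, since the test's _init_dist_hccl helper is not reproduced in this excerpt.

    import os
    import torch.distributed as dist
    import torch_npu

    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))

    options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options()
    options.hccl_config = {
        "group_name": "demo_group",    # still consumed after the revert
        "qos_traffic_class": 132,      # placeholder; mapped to hcclRdmaTrafficClass by !18555
        "qos_service_level": 4,        # placeholder; mapped to hcclRdmaServiceLevel by !18555
        "hccl_op_expansion_mode": 1,   # placeholder; mapped to hcclOpExpansionMode by !18555
    }

    # One standard way to hand the options to the HCCL backend when the default
    # group is created (assumes MASTER_ADDR/MASTER_PORT are set in the environment):
    dist.init_process_group(backend="hccl", rank=rank, world_size=world_size,
                            pg_options=options)

After the revert, only "group_name" is still consumed when createHcclCommConfigWithOptions builds the HcclCommConfig; the three QoS/expansion keys above no longer reach HCCL, which is the behaviour the deleted tests covered.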
diff --git a/test/distributed/test_options.py b/test/distributed/test_options.py index 7f31b3ed4a..12aeffbb64 100644 --- a/test/distributed/test_options.py +++ b/test/distributed/test_options.py @@ -42,31 +42,15 @@ class OptionsTest(TestCase): dist.all_reduce(input1, group=pg) @classmethod - def _test_options_wrong_type(cls, rank, hccl_config, error_expect, world_size, input1): + def _test_options_wrong_type(cls, rank, ranks, world_size, input1): options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options() - options.hccl_config = hccl_config + options.hccl_config = {"group_name": 123} input1 = input1.npu() test_case = TestCase() - with test_case.assertRaisesRegex(RuntimeError, error_expect): + with test_case.assertRaisesRegex(RuntimeError, "Value type of group_name should be string"): OptionsTest._init_dist_hccl(rank, options, world_size) dist.all_reduce(input1) - @classmethod - def _test_options_group_name_wrong_types(cls, rank, ranks, world_size, input1): - cls._test_options_wrong_type(rank, {"group_name": 123}, "Value type of group_name should be string", world_size, input1) - - @classmethod - def _test_options_qos_traffic_class_wrong_types(cls, rank, ranks, world_size, input1): - cls._test_options_wrong_type(rank, {"qos_traffic_class": "123"}, "Value type of qos_traffic_class should be int.", world_size, input1) - - @classmethod - def _test_options_qos_service_level_wrong_types(cls, rank, ranks, world_size, input1): - cls._test_options_wrong_type(rank, {"qos_service_level": "123"}, "Value type of qos_service_level should be int.", world_size, input1) - - @classmethod - def _test_options_hccl_op_expansion_mode_wrong_types(cls, rank, ranks, world_size, input1): - cls._test_options_wrong_type(rank, {"hccl_op_expansion_mode": "123"}, "Value type of hccl_op_expansion_mode should be int.", world_size, input1) - def _test_multiprocess(self, f, input1, world_size): ctx = mp.get_context('spawn') @@ -95,40 +79,14 @@ class OptionsTest(TestCase): input1, world_size) @skipIfUnsupportMultiNPU(2) - def test_options_group_name_wrong_type(self): - ranks = [2] - shape = [np.int32, 0, [2, 3, 16]] - for world_size in ranks: - exp_input, input1 = create_common_tensor(shape, -10, 10) - self._test_multiprocess(OptionsTest._test_options_group_name_wrong_types, - input1, world_size) - - @skipIfUnsupportMultiNPU(2) - def test_options_qos_traffic_class_wrong_type(self): + def test_options_wrong_type(self): ranks = [2] shape = [np.int32, 0, [2, 3, 16]] for world_size in ranks: exp_input, input1 = create_common_tensor(shape, -10, 10) - self._test_multiprocess(OptionsTest._test_options_qos_traffic_class_wrong_types, + self._test_multiprocess(OptionsTest._test_options_wrong_type, input1, world_size) - @skipIfUnsupportMultiNPU(2) - def test_options_qos_service_level_wrong_type(self): - ranks = [2] - shape = [np.int32, 0, [2, 3, 16]] - for world_size in ranks: - exp_input, input1 = create_common_tensor(shape, -10, 10) - self._test_multiprocess(OptionsTest._test_options_qos_service_level_wrong_types, - input1, world_size) - - @skipIfUnsupportMultiNPU(2) - def test_options_hccl_op_expansion_mode_wrong_type(self): - ranks = [2] - shape = [np.int32, 0, [2, 3, 16]] - for world_size in ranks: - exp_input, input1 = create_common_tensor(shape, -10, 10) - self._test_multiprocess(OptionsTest._test_options_hccl_op_expansion_mode_wrong_types, - input1, world_size) if __name__ == '__main__': run_tests() diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index ab48964d8c..da30721fad 100644 --- 
a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -207,9 +207,6 @@ inline void HcclCommConfigInit(HcclCommConfig *config) config->hcclDeterministic = HCCL_COMM_DEFAULT_DETERMINISTIC; config->hcclCommName[0] = '\0'; config->hcclUdi[0] = '\0'; - config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; - config->hcclRdmaServiceLevel = HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET; - config->hcclOpExpansionMode = HCCL_COMM_DEFAULT_OP_EXPANSION_MODE; } /** diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index 40631676c1..aa653995ea 100644 --- a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -15,14 +15,11 @@ extern "C" { const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; -const uint32_t HCCL_COMM_CONFIG_VERSION = 5; +const uint32_t HCCL_COMM_CONFIG_VERSION = 3; const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations const uint32_t COMM_NAME_MAX_LENGTH = 128; const uint32_t UDI_MAX_LENGTH = 128; -const uint32_t HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET = 0xffffffff; -const uint32_t HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET = 0xffffffff; -const uint32_t HCCL_COMM_DEFAULT_OP_EXPANSION_MODE = 0; /** * @brief HCCL functions return value definition @@ -129,16 +126,12 @@ typedef struct HcclCommConfigDef { uint32_t hcclDeterministic; char hcclCommName[COMM_NAME_MAX_LENGTH]; char hcclUdi[UDI_MAX_LENGTH]; - uint32_t hcclOpExpansionMode; - uint32_t hcclRdmaTrafficClass; - uint32_t hcclRdmaServiceLevel; } HcclCommConfig; typedef enum { HCCL_COMM_CONFIG_BUFFER_SIZE = 0, HCCL_COMM_CONFIG_DETERMINISTIC = 1, HCCL_COMM_CONFIG_COMM_NAME = 2, - HCCL_COMM_CONFIG_OP_EXPANSION_MODE = 3, HCCL_COMM_CONFIG_RESERVED, } HcclCommConfigCapability; #ifdef __cplusplus diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 7224e084d1..eb9c1f53ac 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2262,30 +2262,6 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() } } - if (options_->hccl_config.find("qos_traffic_class") != options_->hccl_config.end()) { - if (std::holds_alternative(options_->hccl_config["qos_traffic_class"])) { - config.hcclRdmaTrafficClass = std::get(options_->hccl_config["qos_traffic_class"]); - } else { - TORCH_CHECK(false, "Value type of qos_traffic_class should be int.", DIST_ERROR(ErrCode::TYPE)); - } - } - - if (options_->hccl_config.find("qos_service_level") != options_->hccl_config.end()) { - if (std::holds_alternative(options_->hccl_config["qos_service_level"])) { - config.hcclRdmaServiceLevel = std::get(options_->hccl_config["qos_service_level"]); - } else { - TORCH_CHECK(false, "Value type of qos_service_level should be int.", DIST_ERROR(ErrCode::TYPE)); - } - } - - if (options_->hccl_config.find("hccl_op_expansion_mode") != options_->hccl_config.end()) { - if (std::holds_alternative(options_->hccl_config["hccl_op_expansion_mode"])) { - config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_op_expansion_mode"]); - } else { - TORCH_CHECK(false, "Value type of hccl_op_expansion_mode should be int.", DIST_ERROR(ErrCode::TYPE)); - } - } - return config; } -- Gitee From e5bfe83f255ad5dbb721502a89e3785f57d936a3 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Fri, 7 Mar 2025 
01:25:28 +0000 Subject: [PATCH 106/358] !18648 Update torchair commit id Merge pull request !18648 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 890aefe634..65c3601a0e 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 890aefe63407b37372abe053fb549c8433bb6ae1 +Subproject commit 65c3601a0e890b7913638f166668580673d0bc9a -- Gitee From edb04d1d6d91bcf37c5f4cf540d1b4a7eedea87f Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Fri, 7 Mar 2025 03:11:49 +0000 Subject: [PATCH 107/358] !18393 cleancode Merge pull request !18393 from jiangpengfei/v2.6.0 --- .../npu/interface/AsyncTaskQueueInterface.cpp | 195 +++++++++--------- .../npu/interface/AsyncTaskQueueInterface.h | 48 ++--- .../csrc/core/npu/interface/HcclInterface.cpp | 4 +- .../csrc/core/npu/register/FunctionLoader.cpp | 106 +++++----- .../csrc/core/npu/register/FunctionLoader.h | 108 +++++----- .../csrc/core/npu/register/OptionRegister.cpp | 66 +++--- .../csrc/core/npu/register/OptionRegister.h | 156 +++++++------- 7 files changed, 341 insertions(+), 342 deletions(-) diff --git a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp index 7199eb895d..e4f3051141 100644 --- a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp @@ -13,10 +13,10 @@ namespace c10_npu { namespace queue { std::atomic QueueParas::g_correlation_id{0}; std::map CopyParas::COPY_PARAS_MAP{ - {ACL_MEMCPY_HOST_TO_HOST, "acl_memcpy_host_to_host"}, - {ACL_MEMCPY_HOST_TO_DEVICE, "acl_memcpy_host_to_device"}, - {ACL_MEMCPY_DEVICE_TO_HOST, "acl_memcpy_device_to_host"}, - {ACL_MEMCPY_DEVICE_TO_DEVICE, "acl_memcpy_device_to_device"}, + {ACL_MEMCPY_HOST_TO_HOST, "acl_memcpy_host_to_host"}, + {ACL_MEMCPY_HOST_TO_DEVICE, "acl_memcpy_host_to_device"}, + {ACL_MEMCPY_DEVICE_TO_HOST, "acl_memcpy_device_to_host"}, + {ACL_MEMCPY_DEVICE_TO_DEVICE, "acl_memcpy_device_to_device"}, }; std::map EventParas::EVENT_PARAS_MAP{ {RECORD_EVENT, "record_event"}, @@ -24,47 +24,47 @@ std::map EventParas::EVENT_PARAS_MAP{ {LAZY_DESTROY_EVENT, "destroy_event"}, }; void CopyParas::Copy(CopyParas& other) { - this->dst = other.dst; - this->dstLen = other.dstLen; - this->src = other.src; - this->srcLen = other.srcLen; - this->kind = other.kind; + this->dst = other.dst; + this->dstLen = other.dstLen; + this->src = other.src; + this->srcLen = other.srcLen; + this->kind = other.kind; } void EventParas::Copy(EventParas& other) { - this->event = other.event; - this->eventAllocatorType = other.eventAllocatorType; + this->event = other.event; + this->eventAllocatorType = other.eventAllocatorType; } class AsyncCopyTask { public: - AsyncCopyTask( - void* dst, - size_t dstLen, - void* src, - size_t srcLen, - aclrtMemcpyKind kind); - ~AsyncCopyTask() = default; - void LaunchCopyTask(); + AsyncCopyTask( + void* dst, + size_t dstLen, + void* src, + size_t srcLen, + aclrtMemcpyKind kind); + ~AsyncCopyTask() = default; + void LaunchCopyTask(); private: - CopyParas copyParam_; + CopyParas copyParam_; }; class EventTask { public: - explicit EventTask( - aclrtEvent event, - EventAllocatorType allocatorType = RESERVED) - : eventParam_(event, allocatorType){}; - ~EventTask() = default; - void LaunchRecordTask( - c10_npu::NPUStream npuStream); - void 
LaunchWaitTask(c10_npu::NPUStream npuStream); - void LaunchLazyDestroyTask(c10::DeviceIndex device_index); + explicit EventTask( + aclrtEvent event, + EventAllocatorType allocatorType = RESERVED) + : eventParam_(event, allocatorType){}; + ~EventTask() = default; + void LaunchRecordTask( + c10_npu::NPUStream npuStream); + void LaunchWaitTask(c10_npu::NPUStream npuStream); + void LaunchLazyDestroyTask(c10::DeviceIndex device_index); private: - EventParas eventParam_; + EventParas eventParam_; }; AsyncCopyTask::AsyncCopyTask( @@ -73,35 +73,35 @@ AsyncCopyTask::AsyncCopyTask( void* src, size_t srcLen, aclrtMemcpyKind kind) { - copyParam_.dst = dst; - copyParam_.dstLen = dstLen; - copyParam_.src = src; - copyParam_.srcLen = srcLen; - copyParam_.kind = kind; + copyParam_.dst = dst; + copyParam_.dstLen = dstLen; + copyParam_.src = src; + copyParam_.srcLen = srcLen; + copyParam_.kind = kind; } void AsyncCopyTask::LaunchCopyTask() { - RECORD_FUNCTION(CopyParas::COPY_PARAS_MAP[copyParam_.kind], std::vector({})); + RECORD_FUNCTION(CopyParas::COPY_PARAS_MAP[copyParam_.kind], std::vector({})); auto cur_stream = c10_npu::getCurrentNPUStream(); if (!cur_stream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, CopyParas::COPY_PARAS_MAP[copyParam_.kind]); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, CopyParas::COPY_PARAS_MAP[copyParam_.kind]); #endif - QueueParas params(ASYNC_MEMCPY, sizeof(CopyParas), ©Param_); - c10_npu::enCurrentNPUStream(¶ms); + QueueParas params(ASYNC_MEMCPY, sizeof(CopyParas), ©Param_); + c10_npu::enCurrentNPUStream(¶ms); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, CopyParas::COPY_PARAS_MAP[copyParam_.kind], params.correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, CopyParas::COPY_PARAS_MAP[copyParam_.kind], params.correlation_id); #endif - } else { - c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream(); - NPU_CHECK_ERROR(aclrtMemcpyAsync( - copyParam_.dst, - copyParam_.dstLen, - copyParam_.src, - copyParam_.srcLen, - copyParam_.kind, - stream)); - } + } else { + c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream(); + NPU_CHECK_ERROR(aclrtMemcpyAsync( + copyParam_.dst, + copyParam_.dstLen, + copyParam_.src, + copyParam_.srcLen, + copyParam_.kind, + stream)); + } } aclError LaunchAsyncCopyTask( @@ -110,16 +110,16 @@ aclError LaunchAsyncCopyTask( void* src, size_t srcLen, aclrtMemcpyKind kind) { - AsyncCopyTask copyTask(dst, dstLen, src, srcLen, kind); - copyTask.LaunchCopyTask(); - return ACL_ERROR_NONE; + AsyncCopyTask copyTask(dst, dstLen, src, srcLen, kind); + copyTask.LaunchCopyTask(); + return ACL_ERROR_NONE; } void EventTask::LaunchRecordTask(c10_npu::NPUStream npuStream) { RECORD_FUNCTION(EventParas::EVENT_PARAS_MAP[RECORD_EVENT], std::vector({})); if (!npuStream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[RECORD_EVENT]); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[RECORD_EVENT]); #endif uint64_t prof_correlation_id = 0; { @@ -133,32 +133,32 @@ void EventTask::LaunchRecordTask(c10_npu::NPUStream npuStream) { #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[RECORD_EVENT], prof_correlation_id); #endif - } else 
{ - NPU_CHECK_ERROR(aclrtRecordEvent(eventParam_.event, npuStream)); - ASCEND_LOGI("Event: aclrtRecordEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), eventParam_.event); - } + } else { + NPU_CHECK_ERROR(aclrtRecordEvent(eventParam_.event, npuStream)); + ASCEND_LOGI("Event: aclrtRecordEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), eventParam_.event); + } } aclError LaunchRecordEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) { - EventTask recordTask(event); - recordTask.LaunchRecordTask(npuStream); + EventTask recordTask(event); + recordTask.LaunchRecordTask(npuStream); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventRecord( - reinterpret_cast(event), - reinterpret_cast(npuStream.stream(false)) - ); - } + const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuEventRecord( + reinterpret_cast(event), + reinterpret_cast(npuStream.stream(false)) + ); + } #endif - return ACL_ERROR_NONE; + return ACL_ERROR_NONE; } void EventTask::LaunchWaitTask(c10_npu::NPUStream npuStream) { RECORD_FUNCTION(EventParas::EVENT_PARAS_MAP[WAIT_EVENT], std::vector({})); if (!npuStream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[WAIT_EVENT]); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[WAIT_EVENT]); #endif uint64_t prof_correlation_id = 0; { @@ -171,24 +171,24 @@ void EventTask::LaunchWaitTask(c10_npu::NPUStream npuStream) { #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[WAIT_EVENT], prof_correlation_id); #endif - } else { - NPU_CHECK_ERROR(aclrtStreamWaitEvent(npuStream, eventParam_.event)); - ASCEND_LOGI("Event: aclrtStreamWaitEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), eventParam_.event); - } + } else { + NPU_CHECK_ERROR(aclrtStreamWaitEvent(npuStream, eventParam_.event)); + ASCEND_LOGI("Event: aclrtStreamWaitEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), eventParam_.event); + } } aclError LaunchWaitEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) { - EventTask waitTask(event); - waitTask.LaunchWaitTask(npuStream); + EventTask waitTask(event); + waitTask.LaunchWaitTask(npuStream); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventWait( - reinterpret_cast(event), - reinterpret_cast(npuStream.stream(false))); - } + const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuEventWait( + reinterpret_cast(event), + reinterpret_cast(npuStream.stream(false))); + } #endif - return ACL_ERROR_NONE; + return ACL_ERROR_NONE; } void EventTask::LaunchLazyDestroyTask(c10::DeviceIndex device_index) { @@ -196,30 +196,29 @@ void EventTask::LaunchLazyDestroyTask(c10::DeviceIndex device_index) { auto cur_stream = c10_npu::getCurrentNPUStream(); if (!cur_stream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, 
EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT]); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT]); #endif - QueueParas params(LAZY_DESTROY_EVENT, sizeof(EventParas), &eventParam_); - c10_npu::enCurrentNPUStream(¶ms, device_index); - ASCEND_LOGI("Event: LaunchLazyDestroyTask is successfully executed, event=%p", eventParam_.event); + QueueParas params(LAZY_DESTROY_EVENT, sizeof(EventParas), &eventParam_); + c10_npu::enCurrentNPUStream(¶ms, device_index); + ASCEND_LOGI("Event: LaunchLazyDestroyTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT], params.correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT], params.correlation_id); #endif - } else { - NPU_CHECK_ERROR(c10_npu::NPUEventManager::GetInstance().LazyDestroy(eventParam_.event), - "aclrtDestroyEvent"); - } + } else { + NPU_CHECK_ERROR(c10_npu::NPUEventManager::GetInstance().LazyDestroy(eventParam_.event), "aclrtDestroyEvent"); + } } aclError LaunchLazyDestroyEventTask(aclrtEvent event, c10::DeviceIndex device_index) { - EventTask lazyDestroyTask(event); + EventTask lazyDestroyTask(event); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventDeletion(reinterpret_cast(event)); - } + const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuEventDeletion(reinterpret_cast(event)); + } #endif - lazyDestroyTask.LaunchLazyDestroyTask(device_index); - return ACL_ERROR_NONE; + lazyDestroyTask.LaunchLazyDestroyTask(device_index); + return ACL_ERROR_NONE; } } // namespace queue } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h index 02530729f1..860c5e63f5 100644 --- a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h +++ b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h @@ -7,29 +7,29 @@ namespace c10_npu { namespace queue { struct CopyParas { - void *dst = nullptr; - size_t dstLen = 0; - void *src = nullptr; - size_t srcLen = 0; - aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_HOST; - void Copy(CopyParas& other); - static std::map COPY_PARAS_MAP; + void *dst = nullptr; + size_t dstLen = 0; + void *src = nullptr; + size_t srcLen = 0; + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_HOST; + void Copy(CopyParas& other); + static std::map COPY_PARAS_MAP; }; enum EventAllocatorType { - HOST_ALLOCATOR_EVENT = 1, - NPU_ALLOCATOR_EVENT = 2, - RESERVED = -1, + HOST_ALLOCATOR_EVENT = 1, + NPU_ALLOCATOR_EVENT = 2, + RESERVED = -1, }; struct EventParas { - explicit EventParas(aclrtEvent aclEvent, EventAllocatorType allocatorType) - : event(aclEvent), eventAllocatorType(allocatorType) {} - EventParas() = default; - aclrtEvent event = nullptr; - void Copy(EventParas& other); - EventAllocatorType eventAllocatorType = RESERVED; - static std::map EVENT_PARAS_MAP; + explicit EventParas(aclrtEvent aclEvent, EventAllocatorType allocatorType) + : event(aclEvent), eventAllocatorType(allocatorType) {} + EventParas() = default; + aclrtEvent event = nullptr; + void Copy(EventParas& other); + EventAllocatorType eventAllocatorType = RESERVED; + static std::map EVENT_PARAS_MAP; }; enum 
QueueParamType { @@ -43,13 +43,13 @@ enum QueueParamType { }; struct QueueParas { - QueueParas(QueueParamType type, size_t len, void *val) : paramType(type), paramLen(len), paramVal(val) {} - aclrtStream paramStream = nullptr; - QueueParamType paramType = COMPILE_AND_EXECUTE; - size_t paramLen = 0; - void* paramVal = nullptr; - static std::atomic g_correlation_id; - uint64_t correlation_id = 0; + QueueParas(QueueParamType type, size_t len, void *val) : paramType(type), paramLen(len), paramVal(val) {} + aclrtStream paramStream = nullptr; + QueueParamType paramType = COMPILE_AND_EXECUTE; + size_t paramLen = 0; + void* paramVal = nullptr; + static std::atomic g_correlation_id; + uint64_t correlation_id = 0; }; aclError LaunchAsyncCopyTask(void* dst, size_t dstLen, void* src, size_t srcLen, aclrtMemcpyKind kind); diff --git a/torch_npu/csrc/core/npu/interface/HcclInterface.cpp b/torch_npu/csrc/core/npu/interface/HcclInterface.cpp index 445c39d75f..58860896b7 100644 --- a/torch_npu/csrc/core/npu/interface/HcclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/HcclInterface.cpp @@ -6,10 +6,10 @@ namespace at_npu { namespace hccl { #undef LOAD_FUNCTION #define LOAD_FUNCTION(funcName) \ - REGISTER_FUNCTION(libhccl, funcName) + REGISTER_FUNCTION(libhccl, funcName) #undef GET_FUNC #define GET_FUNC(funcName) \ - GET_FUNCTION(libhccl, funcName) + GET_FUNCTION(libhccl, funcName) REGISTER_LIBRARY(libhccl) LOAD_FUNCTION(HcclGetCommName) diff --git a/torch_npu/csrc/core/npu/register/FunctionLoader.cpp b/torch_npu/csrc/core/npu/register/FunctionLoader.cpp index 7966af15e8..17930667c8 100644 --- a/torch_npu/csrc/core/npu/register/FunctionLoader.cpp +++ b/torch_npu/csrc/core/npu/register/FunctionLoader.cpp @@ -7,80 +7,80 @@ namespace c10_npu { namespace option { FunctionLoader::FunctionLoader(const std::string& name) { - this->fileName = name + ".so"; + this->fileName = name + ".so"; } FunctionLoader::~FunctionLoader() { - if (this->handle != nullptr) { - dlclose(this->handle); - } + if (this->handle != nullptr) { + dlclose(this->handle); + } } void FunctionLoader::Set(const std::string& name) { - this->registry[name] = nullptr; + this->registry[name] = nullptr; } void* FunctionLoader::Get(const std::string& name) { - if (this->handle == nullptr) { - auto handle = dlopen(this->fileName.c_str(), RTLD_LAZY | RTLD_GLOBAL); - if (handle == nullptr) { - AT_ERROR(dlerror()); - return nullptr; + if (this->handle == nullptr) { + auto handle = dlopen(this->fileName.c_str(), RTLD_LAZY | RTLD_GLOBAL); + if (handle == nullptr) { + AT_ERROR(dlerror()); + return nullptr; + } + this->handle = handle; } - this->handle = handle; - } - auto itr = registry.find(name); - if (itr == registry.end()) { - AT_ERROR("function(", name, ") is not registered."); - return nullptr; - } + auto itr = registry.find(name); + if (itr == registry.end()) { + AT_ERROR("function(", name, ") is not registered."); + return nullptr; + } - if (itr->second != nullptr) { - return itr->second; - } + if (itr->second != nullptr) { + return itr->second; + } - auto func = dlsym(this->handle, name.c_str()); - if (func == nullptr) { - return nullptr; - } - this->registry[name] = func; - return func; + auto func = dlsym(this->handle, name.c_str()); + if (func == nullptr) { + return nullptr; + } + this->registry[name] = func; + return func; } namespace register_function { - FunctionRegister* FunctionRegister::GetInstance() { - static FunctionRegister instance; - return &instance; - } - void FunctionRegister::Register(const std::string& name, 
::std::unique_ptr& ptr) { - std::lock_guard lock(mu_); - registry.emplace(name, std::move(ptr)); - } + FunctionRegister* FunctionRegister::GetInstance() { + static FunctionRegister instance; + return &instance; + } + void FunctionRegister::Register(const std::string& name, ::std::unique_ptr& ptr) { + std::lock_guard lock(mu_); + registry.emplace(name, std::move(ptr)); + } - void FunctionRegister::Register(const std::string& name, const std::string& funcName) { - auto itr = registry.find(name); - if (itr == registry.end()) { - AT_ERROR(name, " library should register first."); - return; + void FunctionRegister::Register(const std::string& name, const std::string& funcName) { + auto itr = registry.find(name); + if (itr == registry.end()) { + AT_ERROR(name, " library should register first."); + return; + } + itr->second->Set(funcName); } - itr->second->Set(funcName); - } - void* FunctionRegister::Get(const std::string& soName, const std::string& funcName) { - auto itr = registry.find(soName); - if (itr != registry.end()) { - return itr->second->Get(funcName); + void* FunctionRegister::Get(const std::string& soName, const std::string& funcName) { + auto itr = registry.find(soName); + if (itr != registry.end()) { + return itr->second->Get(funcName); + } + return nullptr; } - return nullptr; - } - FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr) { - FunctionRegister::GetInstance()->Register(name, ptr); - } - FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string& soName, const std::string& funcName) { - FunctionRegister::GetInstance()->Register(soName, funcName); - } + FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr) { + FunctionRegister::GetInstance()->Register(name, ptr); + } + FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string& soName, const std::string& funcName) { + FunctionRegister::GetInstance()->Register(soName, funcName); + } } // namespace register_function diff --git a/torch_npu/csrc/core/npu/register/FunctionLoader.h b/torch_npu/csrc/core/npu/register/FunctionLoader.h index a3aff2c07b..489243b1b1 100644 --- a/torch_npu/csrc/core/npu/register/FunctionLoader.h +++ b/torch_npu/csrc/core/npu/register/FunctionLoader.h @@ -11,27 +11,27 @@ namespace option { */ class FunctionLoader { public: - /** - ctr - */ - explicit FunctionLoader(const std::string& filename); - /** - dectr - */ - ~FunctionLoader(); - /** - set function name - */ - void Set(const std::string& name); - /** - get function address by function name. - */ - void* Get(const std::string& name); + /** + ctr + */ + explicit FunctionLoader(const std::string& filename); + /** + dectr + */ + ~FunctionLoader(); + /** + set function name + */ + void Set(const std::string& name); + /** + get function address by function name. + */ + void* Get(const std::string& name); private: - mutable std::mutex mu_; - std::string fileName; - void* handle = nullptr; - mutable std::unordered_map registry; + mutable std::mutex mu_; + std::string fileName; + void* handle = nullptr; + mutable std::unordered_map registry; }; // class FunctionLoader @@ -41,27 +41,27 @@ namespace register_function { */ class FunctionRegister { public: - /** - Singleton - */ - static FunctionRegister* GetInstance(); - /** - this API is used to store FunctionLoader class - */ - void Register(const std::string& name, ::std::unique_ptr& ptr); - /** - this API is used to associate library name and function name. 
- */ - void Register(const std::string& name, const std::string& funcName); - /** - this API is used to get the function address by library and function name. - */ - void* Get(const std::string& soName, const std::string& funcName); + /** + Singleton + */ + static FunctionRegister* GetInstance(); + /** + this API is used to store FunctionLoader class + */ + void Register(const std::string& name, ::std::unique_ptr& ptr); + /** + this API is used to associate library name and function name. + */ + void Register(const std::string& name, const std::string& funcName); + /** + this API is used to get the function address by library and function name. + */ + void* Get(const std::string& soName, const std::string& funcName); private: - FunctionRegister() = default; - mutable std::mutex mu_; - mutable std::unordered_map> registry; + FunctionRegister() = default; + mutable std::mutex mu_; + mutable std::unordered_map> registry; }; // class FunctionRegister /** @@ -69,30 +69,30 @@ private: */ class FunctionRegisterBuilder { public: - /** - ctr - */ - FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr); - /** - ctr - */ - FunctionRegisterBuilder(const std::string& soName, const std::string& funcName); + /** + ctr + */ + FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr); + /** + ctr + */ + FunctionRegisterBuilder(const std::string& soName, const std::string& funcName); }; // class FunctionRegisterBuilder } // namespace register_function #define REGISTER_LIBRARY(soName) \ - auto library_##soName = \ - ::std::unique_ptr(new c10_npu::option::FunctionLoader(#soName)); \ - static c10_npu::option::register_function::FunctionRegisterBuilder \ - register_library_##soName(#soName, library_##soName); + auto library_##soName = \ + ::std::unique_ptr(new c10_npu::option::FunctionLoader(#soName)); \ + static c10_npu::option::register_function::FunctionRegisterBuilder \ + register_library_##soName(#soName, library_##soName); #define REGISTER_FUNCTION(soName, funcName) \ - static c10_npu::option::register_function::FunctionRegisterBuilder \ - register_function_##funcName(#soName, #funcName); + static c10_npu::option::register_function::FunctionRegisterBuilder \ + register_function_##funcName(#soName, #funcName); #define GET_FUNCTION(soName, funcName) \ - c10_npu::option::register_function::FunctionRegister::GetInstance()->Get(#soName, #funcName); + c10_npu::option::register_function::FunctionRegister::GetInstance()->Get(#soName, #funcName); } // namespace option } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/OptionRegister.cpp b/torch_npu/csrc/core/npu/register/OptionRegister.cpp index e3a345af1c..e37543bf83 100644 --- a/torch_npu/csrc/core/npu/register/OptionRegister.cpp +++ b/torch_npu/csrc/core/npu/register/OptionRegister.cpp @@ -9,7 +9,7 @@ namespace c10_npu { namespace option { OptionInterface::OptionInterface(OptionCallBack callback) { - this->callback = callback; + this->callback = callback; } void OptionInterface::Set(const std::string& in) { @@ -26,70 +26,70 @@ void OptionInterface::Set(const std::string& in) { } std::string OptionInterface::Get() { - return val; + return val; } namespace register_options { OptionRegister* OptionRegister::GetInstance() { - static OptionRegister instance; - return &instance; + static OptionRegister instance; + return &instance; } void OptionRegister::Register(const std::string& name, ::std::unique_ptr& ptr) { - std::lock_guard lock(mu_); - registry.emplace(name, std::move(ptr)); + std::lock_guard lock(mu_); + 
registry.emplace(name, std::move(ptr)); } void OptionRegister::Set(const std::string& name, const std::string& val) { - auto itr = registry.find(name); - if (itr != registry.end()) { - itr->second->Set(val); - } else { - AT_ERROR("invalid npu option name:", name); - } + auto itr = registry.find(name); + if (itr != registry.end()) { + itr->second->Set(val); + } else { + AT_ERROR("invalid npu option name:", name); + } } c10::optional OptionRegister::Get(const std::string& name) { - auto itr = registry.find(name); - if (itr != registry.end()) { - return itr->second->Get(); - } - return c10::nullopt; // default value + auto itr = registry.find(name); + if (itr != registry.end()) { + return itr->second->Get(); + } + return c10::nullopt; // default value } OptionInterfaceBuilder::OptionInterfaceBuilder( const std::string& name, ::std::unique_ptr& ptr, const std::string& type) { - OptionRegister::GetInstance()->Register(name, ptr); - - // init the value if env variable. - if (type == "env") { - std::string env_name = name; - std::transform(env_name.begin(), env_name.end(), env_name.begin(), ::toupper); - char* env_val = std::getenv(env_name.c_str()); - if (env_val != nullptr) { - std::string val(env_val); - OptionRegister::GetInstance()->Set(name, val); + OptionRegister::GetInstance()->Register(name, ptr); + + // init the value if env variable. + if (type == "env") { + std::string env_name = name; + std::transform(env_name.begin(), env_name.end(), env_name.begin(), ::toupper); + char* env_val = std::getenv(env_name.c_str()); + if (env_val != nullptr) { + std::string val(env_val); + OptionRegister::GetInstance()->Set(name, val); + } } - } } } // namespace register_options void SetOption(const std::string& key, const std::string& val) { - register_options::OptionRegister::GetInstance()->Set(key, val); + register_options::OptionRegister::GetInstance()->Set(key, val); } void SetOption(const std::map& options) { - for (auto item : options) { - SetOption(item.first, item.second); - } + for (auto item : options) { + SetOption(item.first, item.second); + } } c10::optional GetOption(const std::string& key) { - return register_options::OptionRegister::GetInstance()->Get(key); + return register_options::OptionRegister::GetInstance()->Get(key); } } // namespace option diff --git a/torch_npu/csrc/core/npu/register/OptionRegister.h b/torch_npu/csrc/core/npu/register/OptionRegister.h index e6f68447b0..8e17606da6 100644 --- a/torch_npu/csrc/core/npu/register/OptionRegister.h +++ b/torch_npu/csrc/core/npu/register/OptionRegister.h @@ -17,24 +17,24 @@ typedef void(*OptionCallBack) (const std::string&); */ class OptionInterface { public: - /** - dctr - */ + /** + dctr + */ OptionInterface(OptionCallBack callback = nullptr); - /** - This API is used to store value. - */ - void Set(const std::string& in); - /** - This API is used to load value. - */ - std::string Get(); + /** + This API is used to store value. + */ + void Set(const std::string& in); + /** + This API is used to load value. + */ + std::string Get(); private: -/** - Its used to store hook. - */ - OptionCallBack callback = nullptr; - std::string val; + /** + Its used to store hook. 
+ */ + OptionCallBack callback = nullptr; + std::string val; }; namespace register_options { @@ -44,30 +44,30 @@ namespace register_options { */ class OptionRegister { public: - /** - dctr - */ - ~OptionRegister() = default; - /** - singleton - */ - static OptionRegister* GetInstance(); - /** - register - */ - void Register(const std::string& name, ::std::unique_ptr& ptr); - /** - This API is used to store value to special key. - */ - void Set(const std::string& name, const std::string& val); - /** - This API is used to load value from special key. - */ - c10::optional Get(const std::string& name); + /** + dctr + */ + ~OptionRegister() = default; + /** + singleton + */ + static OptionRegister* GetInstance(); + /** + register + */ + void Register(const std::string& name, ::std::unique_ptr& ptr); + /** + This API is used to store value to special key. + */ + void Set(const std::string& name, const std::string& val); + /** + This API is used to load value from special key. + */ + c10::optional Get(const std::string& name); private: - OptionRegister() {} - mutable std::mutex mu_; - mutable std::unordered_map> registry; + OptionRegister() {} + mutable std::mutex mu_; + mutable std::unordered_map> registry; }; /** @@ -75,7 +75,7 @@ private: */ class OptionInterfaceBuilder { public: - OptionInterfaceBuilder(const std::string& name, ::std::unique_ptr& ptr, const std::string& type = "cli"); + OptionInterfaceBuilder(const std::string& name, ::std::unique_ptr& ptr, const std::string& type = "cli"); }; } // namespace register_options @@ -94,58 +94,58 @@ void SetOption(const std::string& key, const std::string& val); c10::optional GetOption(const std::string& key); #define REGISTER_OPTION(name) \ - REGISTER_OPTION_UNIQ(name, name, cli) + REGISTER_OPTION_UNIQ(name, name, cli) #define REGISTER_OPTION_INIT_BY_ENV(name) \ - REGISTER_OPTION_UNIQ(name, name, env) + REGISTER_OPTION_UNIQ(name, name, env) #define REGISTER_OPTION_UNIQ(id, name, type) \ - auto options_interface_##id = \ - ::std::unique_ptr(new c10_npu::option::OptionInterface()); \ - static c10_npu::option::register_options::OptionInterfaceBuilder \ - register_options_interface_##id(#name, options_interface_##id, #type); + auto options_interface_##id = \ + ::std::unique_ptr(new c10_npu::option::OptionInterface()); \ + static c10_npu::option::register_options::OptionInterfaceBuilder \ + register_options_interface_##id(#name, options_interface_##id, #type); #define REGISTER_OPTION_HOOK(name, ...) \ - REGISTER_OPTION_HOOK_UNIQ(name, name, __VA_ARGS__) + REGISTER_OPTION_HOOK_UNIQ(name, name, __VA_ARGS__) #define REGISTER_OPTION_HOOK_UNIQ(id, name, ...) 
\ - auto options_interface_##id = \ - ::std::unique_ptr( \ - new c10_npu::option::OptionInterface(c10_npu::option::OptionCallBack(__VA_ARGS__))); \ - static c10_npu::option::register_options::OptionInterfaceBuilder \ - register_options_interface_##id(#name, options_interface_##id); + auto options_interface_##id = \ + ::std::unique_ptr( \ + new c10_npu::option::OptionInterface(c10_npu::option::OptionCallBack(__VA_ARGS__))); \ + static c10_npu::option::register_options::OptionInterfaceBuilder \ + register_options_interface_##id(#name, options_interface_##id); #define REGISTER_OPTION_BOOL_FUNCTION(func, key, defaultVal, trueVal) \ - bool func() { \ - auto val = c10_npu::option::GetOption(#key); \ - if (val.value_or(defaultVal) == (trueVal)) { \ - return true; \ - } \ - return false; \ - } + bool func() { \ + auto val = c10_npu::option::GetOption(#key); \ + if (val.value_or(defaultVal) == (trueVal)) { \ + return true; \ + } \ + return false; \ + } #define REGISTER_OPTION_BOOL_FUNCTION_UNIQ(func, key, defaultVal, trueVal) \ - bool func() { \ - static auto val = c10_npu::option::GetOption(#key); \ - if (val.value_or(defaultVal) == (trueVal)) { \ - return true; \ - } \ - return false; \ - } + bool func() { \ + static auto val = c10_npu::option::GetOption(#key); \ + if (val.value_or(defaultVal) == (trueVal)) { \ + return true; \ + } \ + return false; \ + } #define REGISTER_OPTION_BOOL_FUNCTION_ALL_CASE(func, key, defaultVal, falseVal, trueVal) \ - bool func() { \ - auto val = c10_npu::option::GetOption(#key); \ - if (val.has_value()) { \ - if (val.value() == (trueVal)) { \ - return true; \ - } \ - if (val.value() == (falseVal)) { \ - return false; \ - } \ - } \ - return (defaultVal) == (trueVal); \ - } + bool func() { \ + auto val = c10_npu::option::GetOption(#key); \ + if (val.has_value()) { \ + if (val.value() == (trueVal)) { \ + return true; \ + } \ + if (val.value() == (falseVal)) { \ + return false; \ + } \ + } \ + return (defaultVal) == (trueVal); \ + } #define REGISTER_OPTION_CACHE(type, valueName, ...) 
\ static thread_local type valueName##Value; \ -- Gitee From 6f36da38d9d6d9d18ddad02de6edbef82428c732 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 7 Mar 2025 04:45:25 +0000 Subject: [PATCH 108/358] !18661 Update op_plugin commit id Merge pull request !18661 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 9efaf481f6..380db030ee 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 9efaf481f6a12a395cc392fc768718e680763522 +Subproject commit 380db030ee409b7faa8881baf9f395529cfc0610 -- Gitee From e5e6c059c7bd5e1af600a207b04bf23303c06954 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 7 Mar 2025 09:00:34 +0000 Subject: [PATCH 109/358] !18677 Update op_plugin commit id Merge pull request !18677 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 380db030ee..62d14e353a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 380db030ee409b7faa8881baf9f395529cfc0610 +Subproject commit 62d14e353a62f923422f20abd2abfc72e464b553 -- Gitee From 7ae67d3b464f64433d31c693c255ef88735c38ea Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Fri, 7 Mar 2025 11:06:46 +0000 Subject: [PATCH 110/358] !18669 modify README.md Merge pull request !18669 from jiangpengfei/v2.6.0 --- README.md | 6 +++--- README.zh.md | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index bdf44f7562..4a6f737bea 100644 --- a/README.md +++ b/README.md @@ -241,10 +241,10 @@ The version branches of AscendPyTorch have the following maintenance phases: |-------------|--------------------------|-------------|-----------------|---------------------------------------------------------------------|--------------| | 2.6.0 | Regular Release | Development | 2025/02/20 | Expected to enter maintenance status from July 20, 2025 | | | 2.5.1 | Regular Release | Development | 2024/11/08 | Expected to enter maintenance status from April 8, 2025 | | -| 2.4.0 | Regular Release | Development | 2024/10/15 | Expected to enter maintenance status from March 15, 2025 | | -| 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from December 6, 2024 | | +| 2.4.0 | Regular Release | Development | 2024/10/15 | Expected to enter maintenance status from June 15, 2025 | | +| 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from June 7, 2025 | | | 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10th, 2025 | | -| 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from March 30, 2025 | | +| 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from September 15, 2025 | | | 2.0.1 | Regular Release | EOL | 2023/7/19 | | 2024/3/14 | | 1.11.0 | Long Term Support | Maintained | 2023/4/19 | Expected to enter maintenance free status from September 10th, 2025 | | | 1.8.1 | Long Term Support | EOL | 2022/4/10 | | 2023/4/10 | diff --git a/README.zh.md b/README.zh.md index d47c0500bd..29ec7d6d46 100644 --- a/README.zh.md +++ b/README.zh.md @@ -243,12 +243,12 @@ AscendPyTorch版本分支的维护阶段如下: |---------------|----------|----------|------------|---------------------|-----------| | 2.6.0 | 常规分支 | 开发 | 2025/02/20 | 
预计2025/07/20起进入维护状态 | - | | 2.5.1 | 常规分支 | 开发 | 2024/11/08 | 预计2025/04/08起进入维护状态 | - | -| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/03/15起进入维护状态 | - | -| 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2024/12/06起进入维护状态 | | -| 2.2.0 | 常规分支 | 维护 | 2024/04/01 | 预计2025/9/10起进入无维护状态 | | -| 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/03/30起进入维护状态 | | +| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/06/15起进入维护状态 | - | +| 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2025/06/07起进入维护状态 | | +| 2.2.0 | 常规分支 | 维护 | 2024/04/01 | 预计2025/09/10起进入无维护状态 | | +| 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/09/15起进入维护状态 | | | 2.0.1 | 常规分支 | EOL | 2023/7/19 | | 2024/3/14 | -| 1.11.0 | 长期支持 | 维护 | 2023/4/19 | 预计2025/9/10起进入无维护状态 | | +| 1.11.0 | 长期支持 | 维护 | 2023/4/19 | 预计2025/09/10起进入无维护状态 | | | 1.8.1 | 长期支持 | EOL | 2022/4/10 | | 2023/4/10 | | 1.5.0 | 长期支持 | EOL | 2021/7/29 | | 2022/7/29 | -- Gitee From 2cc27939545fc9ebb0e534f1161d358e45bf7ca8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Fri, 7 Mar 2025 11:40:50 +0000 Subject: [PATCH 111/358] =?UTF-8?q?!18656=20Support=20qos=20and=20aiv=20in?= =?UTF-8?q?=20options.hccl=5Fconfig=20Merge=20pull=20request=20!18656=20fr?= =?UTF-8?q?om=20=E5=8F=B6=E5=AD=90=E5=87=A1/v2.6.0=5Fqos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/distributed/test_options.py | 52 +++++++++++++++++-- third_party/hccl/inc/hccl/hccl.h | 5 +- third_party/hccl/inc/hccl/hccl_types.h | 9 +++- .../csrc/distributed/ProcessGroupHCCL.cpp | 24 +++++++++ 4 files changed, 83 insertions(+), 7 deletions(-) diff --git a/test/distributed/test_options.py b/test/distributed/test_options.py index 12aeffbb64..7f31b3ed4a 100644 --- a/test/distributed/test_options.py +++ b/test/distributed/test_options.py @@ -42,15 +42,31 @@ class OptionsTest(TestCase): dist.all_reduce(input1, group=pg) @classmethod - def _test_options_wrong_type(cls, rank, ranks, world_size, input1): + def _test_options_wrong_type(cls, rank, hccl_config, error_expect, world_size, input1): options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options() - options.hccl_config = {"group_name": 123} + options.hccl_config = hccl_config input1 = input1.npu() test_case = TestCase() - with test_case.assertRaisesRegex(RuntimeError, "Value type of group_name should be string"): + with test_case.assertRaisesRegex(RuntimeError, error_expect): OptionsTest._init_dist_hccl(rank, options, world_size) dist.all_reduce(input1) + @classmethod + def _test_options_group_name_wrong_types(cls, rank, ranks, world_size, input1): + cls._test_options_wrong_type(rank, {"group_name": 123}, "Value type of group_name should be string", world_size, input1) + + @classmethod + def _test_options_qos_traffic_class_wrong_types(cls, rank, ranks, world_size, input1): + cls._test_options_wrong_type(rank, {"qos_traffic_class": "123"}, "Value type of qos_traffic_class should be int.", world_size, input1) + + @classmethod + def _test_options_qos_service_level_wrong_types(cls, rank, ranks, world_size, input1): + cls._test_options_wrong_type(rank, {"qos_service_level": "123"}, "Value type of qos_service_level should be int.", world_size, input1) + + @classmethod + def _test_options_hccl_op_expansion_mode_wrong_types(cls, rank, ranks, world_size, input1): + cls._test_options_wrong_type(rank, {"hccl_op_expansion_mode": "123"}, "Value type of hccl_op_expansion_mode should be int.", world_size, input1) + def _test_multiprocess(self, f, input1, world_size): ctx = mp.get_context('spawn') @@ -79,14 +95,40 @@ class OptionsTest(TestCase): input1, 
world_size) @skipIfUnsupportMultiNPU(2) - def test_options_wrong_type(self): + def test_options_group_name_wrong_type(self): + ranks = [2] + shape = [np.int32, 0, [2, 3, 16]] + for world_size in ranks: + exp_input, input1 = create_common_tensor(shape, -10, 10) + self._test_multiprocess(OptionsTest._test_options_group_name_wrong_types, + input1, world_size) + + @skipIfUnsupportMultiNPU(2) + def test_options_qos_traffic_class_wrong_type(self): ranks = [2] shape = [np.int32, 0, [2, 3, 16]] for world_size in ranks: exp_input, input1 = create_common_tensor(shape, -10, 10) - self._test_multiprocess(OptionsTest._test_options_wrong_type, + self._test_multiprocess(OptionsTest._test_options_qos_traffic_class_wrong_types, input1, world_size) + @skipIfUnsupportMultiNPU(2) + def test_options_qos_service_level_wrong_type(self): + ranks = [2] + shape = [np.int32, 0, [2, 3, 16]] + for world_size in ranks: + exp_input, input1 = create_common_tensor(shape, -10, 10) + self._test_multiprocess(OptionsTest._test_options_qos_service_level_wrong_types, + input1, world_size) + + @skipIfUnsupportMultiNPU(2) + def test_options_hccl_op_expansion_mode_wrong_type(self): + ranks = [2] + shape = [np.int32, 0, [2, 3, 16]] + for world_size in ranks: + exp_input, input1 = create_common_tensor(shape, -10, 10) + self._test_multiprocess(OptionsTest._test_options_hccl_op_expansion_mode_wrong_types, + input1, world_size) if __name__ == '__main__': run_tests() diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index da30721fad..b060c6857f 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -6,7 +6,7 @@ #ifndef HCCL_H_ #define HCCL_H_ -#include +#include "hccl_types.h" #include #ifdef __cplusplus @@ -207,6 +207,9 @@ inline void HcclCommConfigInit(HcclCommConfig *config) config->hcclDeterministic = HCCL_COMM_DEFAULT_DETERMINISTIC; config->hcclCommName[0] = '\0'; config->hcclUdi[0] = '\0'; + config->hcclRdmaTrafficClass = HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET; + config->hcclRdmaServiceLevel = HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET; + config->hcclOpExpansionMode = HCCL_COMM_DEFAULT_OP_EXPANSION_MODE; } /** diff --git a/third_party/hccl/inc/hccl/hccl_types.h b/third_party/hccl/inc/hccl/hccl_types.h index aa653995ea..40631676c1 100644 --- a/third_party/hccl/inc/hccl/hccl_types.h +++ b/third_party/hccl/inc/hccl/hccl_types.h @@ -15,11 +15,14 @@ extern "C" { const uint32_t HCCL_COMM_CONFIG_INFO_BYTES = 24; const uint32_t HCCL_COMM_CONFIG_MAGIC_WORD = 0xf0f0f0f0; -const uint32_t HCCL_COMM_CONFIG_VERSION = 3; +const uint32_t HCCL_COMM_CONFIG_VERSION = 5; const uint32_t HCCL_COMM_DEFAULT_BUFFSIZE = 200; // 200MB buffer size const uint32_t HCCL_COMM_DEFAULT_DETERMINISTIC = 0; // Disable deterministic calculations const uint32_t COMM_NAME_MAX_LENGTH = 128; const uint32_t UDI_MAX_LENGTH = 128; +const uint32_t HCCL_COMM_TRAFFIC_CLASS_CONFIG_NOT_SET = 0xffffffff; +const uint32_t HCCL_COMM_SERVICE_LEVEL_CONFIG_NOT_SET = 0xffffffff; +const uint32_t HCCL_COMM_DEFAULT_OP_EXPANSION_MODE = 0; /** * @brief HCCL functions return value definition @@ -126,12 +129,16 @@ typedef struct HcclCommConfigDef { uint32_t hcclDeterministic; char hcclCommName[COMM_NAME_MAX_LENGTH]; char hcclUdi[UDI_MAX_LENGTH]; + uint32_t hcclOpExpansionMode; + uint32_t hcclRdmaTrafficClass; + uint32_t hcclRdmaServiceLevel; } HcclCommConfig; typedef enum { HCCL_COMM_CONFIG_BUFFER_SIZE = 0, HCCL_COMM_CONFIG_DETERMINISTIC = 1, HCCL_COMM_CONFIG_COMM_NAME = 2, + HCCL_COMM_CONFIG_OP_EXPANSION_MODE = 3, 
HCCL_COMM_CONFIG_RESERVED, } HcclCommConfigCapability; #ifdef __cplusplus diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index eb9c1f53ac..7224e084d1 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2262,6 +2262,30 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() } } + if (options_->hccl_config.find("qos_traffic_class") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["qos_traffic_class"])) { + config.hcclRdmaTrafficClass = std::get(options_->hccl_config["qos_traffic_class"]); + } else { + TORCH_CHECK(false, "Value type of qos_traffic_class should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + + if (options_->hccl_config.find("qos_service_level") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["qos_service_level"])) { + config.hcclRdmaServiceLevel = std::get(options_->hccl_config["qos_service_level"]); + } else { + TORCH_CHECK(false, "Value type of qos_service_level should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + + if (options_->hccl_config.find("hccl_op_expansion_mode") != options_->hccl_config.end()) { + if (std::holds_alternative(options_->hccl_config["hccl_op_expansion_mode"])) { + config.hcclOpExpansionMode = std::get(options_->hccl_config["hccl_op_expansion_mode"]); + } else { + TORCH_CHECK(false, "Value type of hccl_op_expansion_mode should be int.", DIST_ERROR(ErrCode::TYPE)); + } + } + return config; } -- Gitee From 1a160bee7cb14ef73d0da21ad9d61cccb88dbdf0 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 7 Mar 2025 15:45:29 +0000 Subject: [PATCH 112/358] !18699 Update op_plugin commit id Merge pull request !18699 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 62d14e353a..a8714b9824 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 62d14e353a62f923422f20abd2abfc72e464b553 +Subproject commit a8714b9824ea6151d5036cafb96325d0e5c28f4a -- Gitee From 73e05f51dc0592faeba45c557be6a3a94c4f245b Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Sat, 8 Mar 2025 09:05:37 +0000 Subject: [PATCH 113/358] !18714 change soc version Merge pull request !18714 from jiangpengfei/v2.6.0 --- torch_npu/csrc/core/npu/NpuVariables.cpp | 2 +- torch_npu/csrc/core/npu/NpuVariables.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp index acc8f2f9b1..40efb7fc25 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.cpp +++ b/torch_npu/csrc/core/npu/NpuVariables.cpp @@ -35,7 +35,7 @@ static std::map socVersionMap = { {"Ascend910_9381", SocVersion::Ascend910_9381}, {"Ascend910_9382", SocVersion::Ascend910_9382}, {"Ascend910_9372", SocVersion::Ascend910_9372}, - {"Ascend910_9361", SocVersion::Ascend910_9361}}; + {"Ascend910_9362", SocVersion::Ascend910_9362}}; void SetSocVersion(const char* const socVersion) { if (socVersion == nullptr || diff --git a/torch_npu/csrc/core/npu/NpuVariables.h b/torch_npu/csrc/core/npu/NpuVariables.h index fe15fd1448..f2575ee8cf 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.h +++ b/torch_npu/csrc/core/npu/NpuVariables.h @@ -28,7 +28,7 @@ enum class SocVersion { Ascend910_9381, Ascend910_9382, Ascend910_9372, - Ascend910_9361 + Ascend910_9362 }; void SetSocVersion(const char* const 
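For reference, the three new keys read from options.hccl_config in the ProcessGroupHCCL.cpp hunk above all expect int values, while group_name expects a string. A minimal positive-path sketch of setting them from Python follows; only the key names and value types come from this patch, while the concrete numeric values, the environment setup, and the pg_options hand-off are illustrative assumptions.

    # Sketch: valid-type usage of the new hccl_config keys. The numeric values
    # below are placeholders; consult the HCCL documentation for valid ranges.
    import os
    import torch.distributed as dist
    import torch_npu

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")   # assumed single-node setup
    os.environ.setdefault("MASTER_PORT", "29500")

    options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options()
    options.hccl_config = {
        "group_name": "example_group",   # string, as checked above
        "qos_traffic_class": 132,        # int (placeholder value)
        "qos_service_level": 4,          # int (placeholder value)
        "hccl_op_expansion_mode": 1,     # int (placeholder value)
    }

    # Assumed hand-off: pass the options when creating the process group.
    dist.init_process_group(
        backend="hccl", rank=0, world_size=1, pg_options=options
    )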
socVersion); -- Gitee From afeca43b421688ed095227f08f184064f55db286 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 8 Mar 2025 10:45:26 +0000 Subject: [PATCH 114/358] !18725 Update op_plugin commit id Merge pull request !18725 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index a8714b9824..5debaaaf2a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit a8714b9824ea6151d5036cafb96325d0e5c28f4a +Subproject commit 5debaaaf2a93ebeb690f157f9a28c2e2fda1119b -- Gitee From 48fe7d2a783734bf17237b609513cd91d7fecfc9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 8 Mar 2025 10:45:26 +0000 Subject: [PATCH 115/358] !18725 Update op_plugin commit id Merge pull request !18725 from pta-robot/v2.6.0 -- Gitee From b0dcd8bf848e48dca82f673a15f15925373d1fc6 Mon Sep 17 00:00:00 2001 From: will-devil Date: Mon, 10 Mar 2025 01:52:48 +0000 Subject: [PATCH 116/358] !18667 [Feature] fsdp2 testcase 1/N. Merge pull request !18667 from will-devil/fsdp2-26-1 --- .../fsdp2/test_fully_shard_clip_grad_norm_.py | 153 ++++++ .../fsdp2/test_fully_shard_extensions.py | 470 ++++++++++++++++++ .../fsdp2/test_fully_shard_state.py | 81 +++ .../fsdp2/test_fully_shard_state_dict.py | 386 ++++++++++++++ 4 files changed, 1090 insertions(+) create mode 100644 test/distributed/fsdp2/test_fully_shard_clip_grad_norm_.py create mode 100644 test/distributed/fsdp2/test_fully_shard_extensions.py create mode 100644 test/distributed/fsdp2/test_fully_shard_state.py create mode 100644 test/distributed/fsdp2/test_fully_shard_state_dict.py diff --git a/test/distributed/fsdp2/test_fully_shard_clip_grad_norm_.py b/test/distributed/fsdp2/test_fully_shard_clip_grad_norm_.py new file mode 100644 index 0000000000..e3d09aa455 --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_clip_grad_norm_.py @@ -0,0 +1,153 @@ +import copy +import functools +from typing import Optional, Union + +import torch +import torch.nn as nn +from torch.distributed._composable import replicate +from torch.distributed.device_mesh import DeviceMesh, init_device_mesh +from torch.distributed.fsdp import fully_shard +from torch.distributed.tensor.debug import CommDebugMode +from torch.testing._internal.common_fsdp import MLPStack +from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.distributed._tensor.common_dtensor import ( + ModelArgs, + Transformer, + TransformerBlock, +) +import torch_npu +from torch_npu.testing.common_distributed import skipIfUnsupportMultiNPU +from torch_npu.testing.common_utils import SupportedDevices +from torch_npu.testing._internal.common_fsdp import FSDPNPUTest + + +class _TestClipGradNormBase(FSDPNPUTest): + def _test_clip_grad_norm( + self, + max_norm: Union[float, int], + norm_type: Union[float, int], + ref_model: nn.Module, + ref_optim: torch.optim.Optimizer, + model: nn.Module, + optim: torch.optim.Optimizer, + inp: torch.Tensor, + dp_mesh: Optional[DeviceMesh] = None, + ): + vector_norm_fn = functools.partial(torch.linalg.vector_norm, ord=norm_type) + dp_mesh = dp_mesh or init_device_mesh("npu", (self.world_size,)) + torch.manual_seed(42 + dp_mesh.get_local_rank() + 1) + for iter_idx in range(10): + ref_optim.zero_grad() + ref_model(inp).sum().backward() + optim.zero_grad() + model(inp).sum().backward() + + ref_grads = [p.grad.detach().clone() for p in ref_model.parameters()] + local_grads = [ + p.grad.to_local().detach().clone() for p in 
model.parameters() + ] + for ref_grad, param in zip(ref_grads, model.parameters()): + self.assertEqual(ref_grad, param.grad.full_tensor()) + + # Check that at least one gradient has norm greater than the max + # norm before clipping to ensure the clipping is not vacuous + self.assertTrue(any(vector_norm_fn(g).item() > max_norm for g in ref_grads)) + self.assertTrue( + any(vector_norm_fn(g).item() > max_norm for g in local_grads) + ) + + # Check gradient norm clipping via total norm and individual + # gradient norms post-clipping + ref_total_norm = torch.nn.utils.clip_grad_norm_( + ref_model.parameters(), max_norm=max_norm, norm_type=norm_type + ) + comm_mode = CommDebugMode() + with comm_mode: + # foreach is default to turn on so we don't need to specify it. + total_norm = torch.nn.utils.clip_grad_norm_( + model.parameters(), + max_norm=max_norm, + norm_type=norm_type, + ) + self.assertEqual(ref_total_norm, total_norm.full_tensor()) + # Expect one all-reduce per mesh dim for partial -> replicate + expected_all_reduces = len(total_norm.placements) + self.assertEqual( + comm_mode.get_comm_counts()[torch.ops.c10d_functional.all_reduce], + expected_all_reduces, + ) + # For zero gradients, clipping has no effect + for param, grad in zip(ref_model.parameters(), ref_grads): + self.assertTrue(vector_norm_fn(param.grad).item() <= max_norm) + if torch.count_nonzero(grad): + self.assertFalse(torch.equal(param.grad, grad)) + for param, grad in zip(model.parameters(), local_grads): + self.assertTrue( + vector_norm_fn(param.grad.to_local()).item() <= max_norm + ) + if torch.count_nonzero(grad): + self.assertFalse(torch.equal(param.grad.to_local(), grad)) + + +class TestClipGradNormWorldSize2(_TestClipGradNormBase): + @property + def world_size(self) -> int: + return min(torch.npu.device_count(), 2) + + @SupportedDevices(['Ascend910B']) + def test_clip_grad_norm_1d(self): + for norm_type in (2, 1, float("inf")): + torch.manual_seed(42) + model_args = ModelArgs(dropout_p=0.0) + model = Transformer(model_args) + ref_model = replicate(copy.deepcopy(model).npu()) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + for module in model.modules(): + if isinstance(module, TransformerBlock): + fully_shard(module) + fully_shard(model) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + inp = torch.randint(0, model.model_args.vocab_size, (3, 16), device="npu") + self._test_clip_grad_norm( + 1, norm_type, ref_model, ref_optim, model, optim, inp + ) + + +class TestClipGradNormWorldSize4(_TestClipGradNormBase): + @property + def world_size(self) -> int: + return min(torch.npu.device_count(), 4) + + @SupportedDevices(['Ascend910B']) + def test_clip_grad_norm_2d(self): + for norm_type in (2, 1, 3, float("inf")): + dp_size = 2 + global_mesh = init_device_mesh( + "npu", + (dp_size, self.world_size // dp_size), + mesh_dim_names=("dp", "tp"), + ) + dp_mesh, tp_mesh = global_mesh["dp"], global_mesh["tp"] + torch.manual_seed(42) + # Test using an MLP stack, not a transformer, since the transformer + # has some more significant numeric differences from the TP + model = MLPStack(16, with_seq_parallel=True) + ref_model = replicate( + copy.deepcopy(model).npu(), process_group=dp_mesh.get_group() + ) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + model.parallelize( + tp_mesh, + dp_mesh, + use_activation_checkpointing=False, + reshard_after_forward=True, + ) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + inp = torch.randn(2, 16, device="npu") + self._test_clip_grad_norm( + 0.5, 
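The clip-grad-norm tests above rely on the fact that, once parameters are managed by fully_shard, torch.nn.utils.clip_grad_norm_ returns a DTensor total norm that must be materialized with full_tensor() for logging or comparison. A condensed user-level sketch of that pattern; it assumes torch.distributed is already initialized with the hccl backend (e.g. via torchrun) on an Ascend 910B-class device.

    # Condensed sketch of the pattern exercised above; assumes dist init is done.
    import torch
    import torch.nn as nn
    from torch.distributed.fsdp import fully_shard
    import torch_npu

    model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 16)).npu()
    for layer in model:
        fully_shard(layer)        # shard each submodule...
    fully_shard(model)            # ...then the root

    model(torch.randn(4, 16, device="npu")).sum().backward()
    total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    # total_norm is a DTensor (one all-reduce per mesh dim); materialize to read it.
    print(total_norm.full_tensor().item())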
norm_type, ref_model, ref_optim, model, optim, inp, dp_mesh + ) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp2/test_fully_shard_extensions.py b/test/distributed/fsdp2/test_fully_shard_extensions.py new file mode 100644 index 0000000000..2f071369c1 --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_extensions.py @@ -0,0 +1,470 @@ +import contextlib +import copy +import functools +import math +import threading +from typing import Any, List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.utils._pytree as pytree +from torch.autograd.grad_mode import _unsafe_preserve_version_counter +from torch.distributed.device_mesh import DeviceMesh, init_device_mesh +from torch.distributed.fsdp import fully_shard, MixedPrecisionPolicy +from torch.testing._internal.common_fsdp import ( + check_sharded_parity, + FSDPTest, + FSDPTestMultiThread, + MLP, +) +from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.two_tensor import TwoTensor + +import torch_npu +from torch_npu.testing.common_utils import SupportedDevices +from torch_npu.testing._internal.common_fsdp import FSDPNPUTest + + +def two_tensor_fsdp_pre_all_gather_v1( + self, mesh: DeviceMesh +) -> Tuple[Tuple[torch.Tensor, ...], Any]: + all_gather_inputs = (self.a, self.b) + metadata = None + return all_gather_inputs, metadata + + +def two_tensor_fsdp_pre_all_gather_v2( + self, + mesh: DeviceMesh, + outer_size: torch.Size, + outer_stride: Tuple[int, ...], + module: nn.Module, + mp_policy: MixedPrecisionPolicy, +) -> Tuple[Tuple[torch.Tensor, ...], Any]: + all_gather_inputs = (self.a, self.b) + metadata = None + return all_gather_inputs, metadata + + +def two_tensor_fsdp_post_all_gather( + self, + all_gather_outputs: Tuple[torch.Tensor, ...], + metadata: Any, + param_dtype: torch.dtype, + *, + out: Optional[torch.Tensor] = None, +) -> Union[Tuple[torch.Tensor, Tuple[torch.Tensor, ...]], None]: + assert metadata is None, f"{metadata}" + a, b = all_gather_outputs + if out is not None: + assert isinstance(out, TwoTensor), f"{type(out)}" + if a.dtype == param_dtype: + assert a.untyped_storage().data_ptr() == out.a.untyped_storage().data_ptr() + assert b.untyped_storage().data_ptr() == out.b.untyped_storage().data_ptr() + else: + assert out.a.dtype == param_dtype, f"{out.a.dtype} {param_dtype}" + assert out.b.dtype == param_dtype, f"{out.b.dtype} {param_dtype}" + out.a.copy_(a) + out.b.copy_(b) + return + tensors_to_free = (a, b) + # If the cast is real, then the all-gather outputs will not alias the + # returned `TwoTensor`'s `a` and `b` + two_tensor = TwoTensor(a, b).to(param_dtype) + return two_tensor, tensors_to_free + + +class BFloat16AllGatherTensor(torch.Tensor): + @staticmethod + def __new__(cls, data: torch.Tensor, pad_in_pre_all_gather: bool = True): + return torch.Tensor._make_wrapper_subclass( + cls, + data.shape, + data.stride(), + data.storage_offset(), + dtype=data.dtype, + device=data.device, + ) + + def __init__(self, data: torch.Tensor, pad_in_pre_all_gather: bool = True): + self._data = data + self._pad_in_pre_all_gather = pad_in_pre_all_gather + + def fsdp_pre_all_gather( + self, + mesh: DeviceMesh, + outer_size: torch.Size, + outer_stride: Tuple[int, ...], + module: nn.Module, + mp_policy: MixedPrecisionPolicy, + ) -> Tuple[Tuple[torch.Tensor, ...], Any]: + assert mesh.ndim == 1, f"{mesh.ndim}" + mesh_size = mesh.size() + requires_padding = outer_size[0] % mesh_size != 0 + if requires_padding and 
self._pad_in_pre_all_gather: + sharded_padded_size = list(outer_size) + sharded_padded_size[0] = math.ceil(outer_size[0] / mesh_size) + padded_out = torch.empty( + sharded_padded_size, dtype=torch.bfloat16, device=self.device + ) + padded_out[: self._data.size(0)].copy_(self._data) + return (padded_out,), None + else: + return self._data.to(torch.bfloat16), None + + def fsdp_post_all_gather( + self, + all_gather_outputs: Tuple[torch.Tensor, ...], + metadata: Any, + param_dtype: torch.dtype, + *, + out: Optional[torch.Tensor] = None, + ) -> Union[Tuple[torch.Tensor, Tuple[torch.Tensor, ...]], None]: + assert metadata is None, f"{metadata}" + (tensor,) = all_gather_outputs + assert tensor.dtype == torch.bfloat16, f"{tensor.dtype}" + if out is not None: + with _unsafe_preserve_version_counter(out): + out.copy_(tensor) + return + upcast_tensor = tensor.to(param_dtype) + return upcast_tensor, (tensor, upcast_tensor) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs): + pad_in_pre_all_gather = None + + def unwrap(x: cls): + nonlocal pad_in_pre_all_gather + if pad_in_pre_all_gather is None: + pad_in_pre_all_gather = x._pad_in_pre_all_gather + else: + assert pad_in_pre_all_gather == x._pad_in_pre_all_gather + return x._data + + out = func( + *pytree.tree_map_only(cls, unwrap, args), + **pytree.tree_map_only(cls, unwrap, kwargs), + ) + return pytree.tree_map_only( + torch.Tensor, lambda x: cls(x, pad_in_pre_all_gather), out + ) + + def __tensor_flatten__(self): + return ["_data"], None + + @staticmethod + def __tensor_unflatten__( + inner_tensors, outer_size: torch.Size, outer_stride: Tuple[int, ...] + ): + return inner_tensors["_data"] + + def __repr__(self): + return f"{self.__class__.__name__}({self._data})" + + +class TestFullyShardAllGatherExtensionsCommon: + @property + def world_size(self) -> int: + return 2 + + @contextlib.contextmanager + def _patch_two_tensor_fsdp_all_gather(self, pre_all_gather_version: int): + lock = threading.Lock() + if pre_all_gather_version == 1: + TwoTensor.fsdp_pre_all_gather = two_tensor_fsdp_pre_all_gather_v1 + elif pre_all_gather_version == 2: + TwoTensor.fsdp_pre_all_gather = two_tensor_fsdp_pre_all_gather_v2 + TwoTensor.fsdp_post_all_gather = two_tensor_fsdp_post_all_gather + dist.barrier() + try: + yield + finally: + dist.barrier() + with lock: # only one thread needs to delete + if hasattr(TwoTensor, "fsdp_pre_all_gather"): + delattr(TwoTensor, "fsdp_pre_all_gather") + if hasattr(TwoTensor, "fsdp_post_all_gather"): + delattr(TwoTensor, "fsdp_post_all_gather") + + def _init_two_tensor_mlp(self) -> nn.Module: + # Disable bias because the reference model will end up with a bias + # gradient that is a `TwoTensor`, whereas the FSDP model does not + model = nn.Sequential(*[MLP(8, bias=False) for _ in range(3)]) + for mlp in model: + mlp.in_proj.weight = nn.Parameter( + TwoTensor(mlp.in_proj.weight, mlp.in_proj.weight.clone()) + ) + mlp.out_proj.weight = nn.Parameter( + TwoTensor(mlp.out_proj.weight, mlp.out_proj.weight.clone()) + ) + return model + + +class TestFullyShardAllGatherExtensionsMultiProcess( + TestFullyShardAllGatherExtensionsCommon, FSDPNPUTest +): + @SupportedDevices(['Ascend910B']) + def test_all_gather_extensions_train_parity(self): + with self._patch_two_tensor_fsdp_all_gather(pre_all_gather_version=1): + self.run_subtests( + {"reshard_after_forward": [True, False]}, + self._test_all_gather_extensions_train_parity, + ) + with self._patch_two_tensor_fsdp_all_gather(pre_all_gather_version=2): + self.run_subtests( + 
{"reshard_after_forward": [True, False]}, + self._test_all_gather_extensions_train_parity, + ) + + def _test_all_gather_extensions_train_parity(self, reshard_after_forward: bool): + torch.manual_seed(42) + model = self._init_two_tensor_mlp() + ref_model = copy.deepcopy(model).npu() + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=True) + fully_shard_fn = functools.partial( + fully_shard, reshard_after_forward=reshard_after_forward + ) + for mlp in model: + fully_shard_fn(mlp) + fully_shard_fn(model) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True) + check_sharded_parity(self, ref_model, model) + + torch.manual_seed(42 + self.rank + 1) + inp = torch.randn((2, 8), device="npu") + for iter_idx in range(10): + losses: List[torch.Tensor] = [] + for _model in (ref_model, model): + losses.append(_model(inp).sum()) + losses[-1].backward() + if _model is ref_model: + for param_name, param in _model.named_parameters(): + dist.all_reduce(param.grad) + param.grad.detach().div_(self.world_size) + self.assertEqual(losses[0], losses[1]) + check_sharded_parity(self, ref_model, model) + for _optim in (ref_optim, optim): + _optim.step() + _optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + check_sharded_parity(self, ref_model, model) + + +class TestFullyShardAllGatherExtensionsMultiThread( + TestFullyShardAllGatherExtensionsCommon, FSDPTestMultiThread +): + @property + def world_size(self) -> int: + return 8 + + @property + def device(self) -> torch.device: + return torch.device("npu:0") + + def perThreadSetUp(self): + super().perThreadSetUp() + torch.npu.set_device(0) + + @SupportedDevices(['Ascend910B']) + def test_all_gather_extensions_end_to_end(self): + with self._patch_two_tensor_fsdp_all_gather(pre_all_gather_version=1): + self.run_subtests( + {"reshard_after_forward": [True, False]}, + self._test_all_gather_extensions_end_to_end, + ) + with self._patch_two_tensor_fsdp_all_gather(pre_all_gather_version=2): + self.run_subtests( + {"reshard_after_forward": [True, False]}, + self._test_all_gather_extensions_end_to_end, + ) + + def _test_all_gather_extensions_end_to_end(self, reshard_after_forward: bool): + # Check that we can run the meta-device initialization flow + with torch.device("meta"): + model = self._init_two_tensor_mlp() + for param in model.parameters(): + self.assertEqual(param.device, torch.device("meta")) + fully_shard_fn = functools.partial( + fully_shard, + reshard_after_forward=reshard_after_forward, + mp_policy=MixedPrecisionPolicy(param_dtype=torch.bfloat16), + ) + for mlp in model: + fully_shard_fn(mlp) + fully_shard_fn(model) + model.to_empty(device=self.device) + for param in model.parameters(): + nn.init.trunc_normal_(param) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True) + + # Run a few iterations to check for errors + torch.manual_seed(42 + self.rank + 1) + inp = torch.randn((2, 8), device="npu") + for _ in range(3): + model(inp).sum().backward() + optim.step() + optim.zero_grad() + + @SupportedDevices(['Ascend910B']) + def test_all_gather_extensions_monkey_patch(self): + tls = threading.local() + tls.ran_pre_all_gather = False + + # Define a pre/post-all-gather pair that quantizes to bf16 for the + # all-gather and de-quantizes back to the parameter dtype + def fsdp_pre_all_gather( + self, + mesh: DeviceMesh, + outer_size: torch.Size, + outer_stride: Tuple[int, ...], + module: nn.Module, + mp_policy: MixedPrecisionPolicy, + ) -> Tuple[Tuple[torch.Tensor, ...], Any]: + nonlocal tls + tls.ran_pre_all_gather = True 
+ return (self.to(torch.bfloat16),), None + + @torch.no_grad() + def fsdp_post_all_gather( + self, + all_gather_outputs: Tuple[torch.Tensor, ...], + metadata: Any, + param_dtype: torch.dtype, + *, + out: Optional[torch.Tensor] = None, + ) -> Union[Tuple[torch.Tensor, Tuple[torch.Tensor, ...]], None]: + (tensor,) = all_gather_outputs + assert metadata is None, f"{metadata}" + assert tensor.dtype == torch.bfloat16, f"{tensor.dtype}" + if out is not None: + with _unsafe_preserve_version_counter(out): + out.copy_(tensor) + return + upcast_tensor = tensor.to(param_dtype) + return upcast_tensor, (tensor, upcast_tensor) + + with torch.device("meta"): + model = self._init_two_tensor_mlp() + for mlp in model: + fully_shard(mlp) + fully_shard(model) + model.to_empty(device=self.device) + for param in model.parameters(): + nn.init.trunc_normal_(param) + # Monkey patch the pre/post-all-gather functions *after* `to_empty()` + # since the local tensor objects change from materialization + self.assertGreater(sum("weight" in n for n, _ in model.named_parameters()), 0) + for param_name, param in model.named_parameters(): + if "weight" in param_name: + # Need to use `_local_tensor` to patch the tensor object + local_param = param._local_tensor + # Monkey patch on the `torch.Tensor` as instance methods to + # show that the extension can work even without a subclass + local_param.fsdp_pre_all_gather = fsdp_pre_all_gather.__get__( + local_param + ) + local_param.fsdp_post_all_gather = fsdp_post_all_gather.__get__( + local_param + ) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True) + + # Run a few iterations to check for errors + torch.manual_seed(42 + self.rank + 1) + inp = torch.randn((2, 8), device="npu") + for _ in range(3): + model(inp).sum().backward() + optim.step() + optim.zero_grad() + assert tls.ran_pre_all_gather + + @SupportedDevices(['Ascend910B']) + def test_all_gather_extension_outer_size_stride(self): + """ + NOTE: We cannot easily test the incorrect case where the user-defined + ``fsdp_pre_all_gather`` does not correctly pad the local tensor because + only some ranks may require padding, in which case only those ranks + will error out and the all-gather will timeout. 
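The monkey-patch test above is the clearest statement of the all-gather extension contract: fsdp_pre_all_gather decides what is actually all-gathered, and fsdp_post_all_gather rebuilds the unsharded parameter (or writes into a preallocated out). Below is a stripped-down restatement using the v2 pre-all-gather signature from the test; the version-counter handling from the test is noted in a comment but omitted for brevity.

    # Minimal restatement of the extension pair defined in the test above.
    import torch

    def fsdp_pre_all_gather(self, mesh, outer_size, outer_stride, module, mp_policy):
        # Return (tensors to all-gather, opaque metadata); here: a bf16 cast.
        return (self.to(torch.bfloat16),), None

    @torch.no_grad()
    def fsdp_post_all_gather(self, all_gather_outputs, metadata, param_dtype, *, out=None):
        (tensor,) = all_gather_outputs
        if out is not None:
            # Steady state: copy into the preallocated unsharded parameter.
            # (The test additionally preserves out's autograd version counter.)
            out.copy_(tensor)
            return
        upcast = tensor.to(param_dtype)
        # Return (unsharded parameter data, tensors FSDP is allowed to free).
        return upcast, (tensor, upcast)

    # Patched onto each sharded parameter's local tensor, as in the test:
    # local = param._local_tensor
    # local.fsdp_pre_all_gather = fsdp_pre_all_gather.__get__(local)
    # local.fsdp_post_all_gather = fsdp_post_all_gather.__get__(local)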
+ """ + assert ( + self.world_size >= 2 + ), f"Assumes world size of at least 2 but got {self.world_size=}" + model = MLP(dim=3, dim_multiplier=3) + for module in model.modules(): + for param_name, param in module.named_parameters(recurse=False): + if "weight" in param_name: + param = nn.Parameter(BFloat16AllGatherTensor(param)) + setattr(module, param_name, param) + fully_shard(model) + optim = torch.optim.AdamW(model.parameters(), lr=1e-2, fused=True) + torch.manual_seed(42 + self.rank + 1) + inp = torch.randn((2, 3), device="npu") + loss = model(inp).sum() + loss.backward() + optim.step() + optim.zero_grad() + + @SupportedDevices(['Ascend910B']) + def test_all_gather_extension_hsdp_mesh(self): + tls = threading.local() + replicate_size = 2 + shard_size = self.world_size // replicate_size + mesh = init_device_mesh( + "npu", + (replicate_size, shard_size), + mesh_dim_names=("dp_replicate", "dp_shard"), + ) + + def fsdp_pre_all_gather( + self, + mesh: DeviceMesh, + outer_size: torch.Size, + outer_stride: Tuple[int, ...], + module: nn.Module, + mp_policy: MixedPrecisionPolicy, + ) -> Tuple[Tuple[torch.Tensor, ...], Any]: + nonlocal tls + tls.mesh = mesh + return (self,), None + + @torch.no_grad() + def fsdp_post_all_gather( + self, + all_gather_outputs: Tuple[torch.Tensor, ...], + metadata: Any, + param_dtype: torch.dtype, + *, + out: Optional[torch.Tensor] = None, + ) -> Union[Tuple[torch.Tensor, Tuple[torch.Tensor, ...]], None]: + (tensor,) = all_gather_outputs + if out is not None: + return + return tensor, (tensor,) + + model = self._init_two_tensor_mlp() + for mlp in model: + fully_shard(mlp, mesh=mesh) + fully_shard(model, mesh=mesh) + self.assertGreater(sum("weight" in n for n, _ in model.named_parameters()), 0) + for param_name, param in model.named_parameters(): + if "weight" in param_name: + # Need to use `_local_tensor` to patch the tensor object + local_param = param._local_tensor + # Monkey patch on the `torch.Tensor` as instance methods to + # show that the extension can work even without a subclass + local_param.fsdp_pre_all_gather = fsdp_pre_all_gather.__get__( + local_param + ) + local_param.fsdp_post_all_gather = fsdp_post_all_gather.__get__( + local_param + ) + + inp = torch.randn((2, 8), device="npu") + model(inp) + # Check that FSDP passes only the shard mesh to the pre-all-gather + self.assertEqual(tls.mesh.ndim, 1) + self.assertEqual(tls.mesh.size(), shard_size) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp2/test_fully_shard_state.py b/test/distributed/fsdp2/test_fully_shard_state.py new file mode 100644 index 0000000000..bd639ebbcd --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_state.py @@ -0,0 +1,81 @@ +import copy + +import torch.nn as nn +from torch.distributed.fsdp import FSDPModule, fully_shard +from torch.testing._internal.common_fsdp import FSDPTestMultiThread, MLP +from torch.testing._internal.common_utils import run_tests + +import torch_npu + + +class TestFullyShardState(FSDPTestMultiThread): + @property + def world_size(self) -> int: + return 1 + + def test_fully_shard_state(self): + """ + Tests the ability to get the state object from a fully sharded module. 
+ """ + num_mlps = 3 + model = nn.Sequential(*[MLP(8) for _ in range(num_mlps)]) + for mlp in model: + fully_shard(mlp) + fully_shard(model) + root_state = fully_shard.state(model) + self.assertTrue(root_state is not None) + all_states = [root_state] + [fully_shard.state(mlp) for mlp in model] + # Check that each `fully_shard` call constructs a distinct state object + self.assertEqual(len(set(all_states)), num_mlps + 1) + + def test_fully_shard_reapply(self): + model = MLP(8) + fully_shard(model) + with self.assertRaisesRegex( + AssertionError, + "Each distinct composable distributed API can only be applied to a module once.", + ): + fully_shard(model) + + def test_fully_shard_cls(self): + # Check that we only swap class for the module passed to `fully_shard` + model = MLP(8) + fully_shard(model) + self.assertTrue(isinstance(model, MLP)) + self.assertTrue(isinstance(model, FSDPModule)) + self.assertEqual(model.__class__.__name__, "FSDPMLP") + for module in model.modules(): + if module is model: + continue + self.assertFalse(isinstance(module, FSDPModule)) + + # Check that slicing into a `Sequential` does not preserve FSDP + model = nn.Sequential(*[MLP(8) for _ in range(3)]) + fully_shard(model) + self.assertTrue(isinstance(model, nn.Sequential)) + self.assertTrue(isinstance(model, FSDPModule)) + self.assertEqual(model.__class__.__name__, "FSDPSequential") + sliced_model = model[:2] + self.assertTrue(isinstance(sliced_model, nn.Sequential)) + self.assertFalse(isinstance(sliced_model, FSDPModule)) + + def test_fully_shard_unsupported_module_cls(self): + regex = ( + r"fully\_shard does not support containers that do not implement forward" + ) + model = nn.ModuleList([MLP(8) for _ in range(3)]) + with self.assertRaisesRegex(ValueError, regex): + fully_shard(model) + model = nn.ModuleDict({"1": MLP(8), "2": MLP(8)}) + with self.assertRaisesRegex(ValueError, regex): + fully_shard(model) + + def test_fully_shard_deepcopy(self): + model = MLP(8) + fully_shard(model) + with self.assertRaisesRegex(AssertionError, "FSDP does not support deepcopy"): + copy.deepcopy(model) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp2/test_fully_shard_state_dict.py b/test/distributed/fsdp2/test_fully_shard_state_dict.py new file mode 100644 index 0000000000..44d9962b68 --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_state_dict.py @@ -0,0 +1,386 @@ +import copy +import functools +from contextlib import nullcontext +from typing import Dict, Optional + +import torch +import torch.nn as nn +from torch.distributed.device_mesh import DeviceMesh, init_device_mesh +from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard +from torch.distributed.tensor import distribute_tensor, DTensor, Shard +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + parallelize_module, + RowwiseParallel, +) +from torch.testing._internal.common_fsdp import FSDPTest, FSDPTestMultiThread, MLP +from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.distributed._tensor.common_dtensor import ( + ModelArgs, + Transformer, + TransformerBlock, +) + +import torch_npu +from torch_npu.testing.common_distributed import skipIfUnsupportMultiNPU +from torch_npu.testing._internal.common_fsdp import FSDPNPUTest + + +class TestFullyShardStateDictMultiProcess(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(8, torch.npu.device_count()) + + def test_dp_state_dict_save_load(self): + fsdp_mesh = init_device_mesh("npu", (self.world_size,)) + 
self.run_subtests( + {"mlp_dim": [2, 3, 4, 5], "mesh": [fsdp_mesh]}, + self._test_dp_state_dict_save_load, + ) + self.run_subtests( + {"mlp_dim": [16], "mesh": [fsdp_mesh], "use_shard_placement_fn": [True]}, + self._test_dp_state_dict_save_load, + ) + if self.world_size % 2 != 0: + return + hsdp_mesh = init_device_mesh( + "npu", + (self.world_size // 2, 2), + mesh_dim_names=("dp_replicate", "dp_shard"), + ) + self.run_subtests( + {"mlp_dim": [2, 3, 4, 5], "mesh": [hsdp_mesh]}, + self._test_dp_state_dict_save_load, + ) + self.run_subtests( + {"mlp_dim": [16], "mesh": [hsdp_mesh], "use_shard_placement_fn": [True]}, + self._test_dp_state_dict_save_load, + ) + + def _test_dp_state_dict_save_load( + self, mlp_dim: int, mesh: DeviceMesh, use_shard_placement_fn: bool = False + ): + torch.manual_seed(42) + base_model = nn.Sequential( + MLP(mlp_dim), + nn.Sequential(MLP(mlp_dim), nn.Linear(mlp_dim, mlp_dim)), + MLP(mlp_dim), + ) + + def _shard_placement_fn(param: nn.Parameter) -> Optional[Shard]: + largest_dim = largest_dim_size = -1 + for dim, dim_size in enumerate(param.shape): + if dim_size > largest_dim_size: + largest_dim = dim + largest_dim_size = dim_size + return Shard(largest_dim) + + shard_placement_fn = _shard_placement_fn if use_shard_placement_fn else None + fully_shard_fn = functools.partial( + fully_shard, mesh=mesh, shard_placement_fn=shard_placement_fn + ) + + # Check basic `reshard_after_forward=True` + model1 = copy.deepcopy(base_model) + for module in model1: + fully_shard_fn(module) + fully_shard_fn(model1) + self._test_state_dict_save_load(model1) + + # Check `reshard_after_forward=False` before and after a forward + model2 = copy.deepcopy(base_model) + for module in model2: + fully_shard_fn(module, reshard_after_forward=False) + fully_shard_fn(model2, reshard_after_forward=False) + self._test_state_dict_save_load(model2) + ref_sharded_sd = model2.state_dict() + inp = torch.randn((2, mlp_dim), device="npu") + model2(inp) # parameters are not resharded after this forward + # Check that state dict hooks reshard + sharded_sd = model2.state_dict() + self.assertEqual(set(ref_sharded_sd.keys()), set(sharded_sd.keys())) + for key, value in ref_sharded_sd.items(): + self.assertEqual(value, sharded_sd[key]) + + def test_dp_state_dict_cpu_offload(self): + self.run_subtests( + { + "offload_policy": [ + CPUOffloadPolicy(pin_memory=True), + CPUOffloadPolicy(pin_memory=False), + ], + "cpu_state_dict": [True, False], + }, + self._test_dp_state_dict_cpu_offload, + ) + + def _test_dp_state_dict_cpu_offload( + self, offload_policy: CPUOffloadPolicy, cpu_state_dict: bool + ): + mlp_dim = 4 + torch.manual_seed(42) + with torch.device("meta"): + model = nn.Sequential( + nn.Linear(mlp_dim, mlp_dim, bias=False), + nn.Linear(mlp_dim, mlp_dim, bias=False), + ) + for module in model: + fully_shard(module, offload_policy=offload_policy) + fully_shard(model, offload_policy=offload_policy) + + # split full sd into multiple pieces + # to test loading with `strict=False` + state_dicts = [] + for name, dtensor in model.named_parameters(): + full_tensor = torch.randn(dtensor.size()) + sharded_tensor = distribute_tensor( + full_tensor, dtensor.device_mesh, dtensor.placements + ) + if cpu_state_dict: + sharded_tensor = sharded_tensor.cpu() + state_dicts.append({name: sharded_tensor}) + + # check that we can load with some parameters still on meta device + for sd in state_dicts: + model.load_state_dict(sd, assign=True, strict=False) + + # lazy init without error + inp = torch.rand((mlp_dim, mlp_dim), 
device="npu") + + context = ( + self.assertRaisesRegex( + RuntimeError, + r"Found following parameters on non-CPU device: \[\('0.weight', device\(type='npu'", + ) + if not cpu_state_dict + else nullcontext() + ) + with context: + model(inp).sum() + state_dict = model.state_dict() + for name, dtensor in state_dict.items(): + self.assertEqual(dtensor.device.type, "cpu") + + def test_2d_state_dict_correctness(self): + dp_size = 2 + global_mesh = init_device_mesh( + "npu", (dp_size, self.world_size // dp_size), mesh_dim_names=("dp", "tp") + ) + dp_mesh, tp_mesh = global_mesh["dp"], global_mesh["tp"] + torch.manual_seed(42) + mlp_dim = 4 + + # model init + model = nn.Sequential(*[MLP(mlp_dim) for _ in range(3)]) + model_2d = copy.deepcopy(model) + + # FSDP + TP + model_2d = parallelize_module( + model_2d, + device_mesh=tp_mesh, + parallelize_plan={ + "0.in_proj": ColwiseParallel(), + "0.out_proj": RowwiseParallel(), + "1.in_proj": ColwiseParallel(), + "1.out_proj": RowwiseParallel(), + "2.in_proj": ColwiseParallel(), + "2.out_proj": RowwiseParallel(), + }, + ) + for mlp in model_2d: + fully_shard(mlp, mesh=dp_mesh) + fully_shard(model_2d, mesh=dp_mesh) + + # state_dict parity check + model_state_dict = model.state_dict() + model_2d_state_dict = model_2d.state_dict() + for tensor, dtensor in zip( + model_state_dict.values(), model_2d_state_dict.values() + ): + self.assertTrue(isinstance(dtensor, DTensor)) + self.assertEqual(tensor, dtensor.full_tensor()) + + def test_dp_tp_state_dict_save_load(self): + dp_size = 2 + global_mesh = init_device_mesh( + "npu", (dp_size, self.world_size // dp_size), mesh_dim_names=("dp", "tp") + ) + self.run_subtests( + {"mlp_dim": [4, 6, 8, 10]}, + functools.partial(self._test_dp_tp_state_dict_save_load, global_mesh), + ) + + def _test_dp_tp_state_dict_save_load(self, global_mesh: DeviceMesh, mlp_dim: int): + dp_mesh, tp_mesh = global_mesh["dp"], global_mesh["tp"] + torch.manual_seed(42) + model = nn.Sequential(*[MLP(mlp_dim) for _ in range(3)]) + model = parallelize_module( + model, + device_mesh=tp_mesh, + parallelize_plan={ + "0.in_proj": ColwiseParallel(), + "0.out_proj": RowwiseParallel(), + "1.in_proj": ColwiseParallel(), + "1.out_proj": RowwiseParallel(), + "2.in_proj": ColwiseParallel(), + "2.out_proj": RowwiseParallel(), + }, + ) + for mlp in model: + fully_shard(mlp, mesh=dp_mesh) + fully_shard(model, mesh=dp_mesh) + self._test_state_dict_save_load(model) + + @skipIfUnsupportMultiNPU(4) + def test_hsdp_tp_state_dict_save_load(self): + global_mesh = init_device_mesh( + "npu", + (2, 2, self.world_size // 4), + mesh_dim_names=("dp_replicate", "dp_shard", "tp"), + ) + self.run_subtests( + {"mlp_dim": [4, 6, 8, 10]}, + functools.partial(self._test_hsdp_tp_state_dict_save_load, global_mesh), + ) + + def _test_hsdp_tp_state_dict_save_load(self, global_mesh: DeviceMesh, mlp_dim: int): + dp_mesh, tp_mesh = global_mesh["dp_replicate", "dp_shard"], global_mesh["tp"] + torch.manual_seed(42) + model = nn.Sequential(*[MLP(mlp_dim) for _ in range(3)]) + model = parallelize_module( + model, + device_mesh=tp_mesh, + parallelize_plan={ + "0.in_proj": ColwiseParallel(), + "0.out_proj": RowwiseParallel(), + "1.in_proj": ColwiseParallel(), + "1.out_proj": RowwiseParallel(), + "2.in_proj": ColwiseParallel(), + "2.out_proj": RowwiseParallel(), + }, + ) + for mlp in model: + fully_shard(mlp, mesh=dp_mesh) + fully_shard(model, mesh=dp_mesh) + self._test_state_dict_save_load(model) + + def _test_state_dict_save_load(self, model: nn.Module): + for param_name, param in 
model.named_parameters(): + self.assertIsInstance( + param, + DTensor, + f"Expects parameters to be sharded as DTensors but got {param_name} " + f"as {type(param)}: {param}", + ) + old_fill_value = 1 + new_fill_value = 42 + self.rank + with torch.no_grad(): + for param in model.parameters(): + param.fill_(old_fill_value) + # Use that the parameters are currently sharded, meaning that their + # data pointers correspond to the sharded parameter data + param_name_to_data_ptr = { + n: p.to_local().data_ptr() for n, p in model.named_parameters() + } + ref_sharded_sizes = [p.size() for p in model.parameters()] + state_dict = model.state_dict() + for param, ref_sharded_size in zip(model.parameters(), ref_sharded_sizes): + self.assertEqual(param.size(), ref_sharded_size) + self.assertTrue(isinstance(param, nn.Parameter)) + + # Verify that keys match, values are DTensors, and values share the + # same storage as the existing sharded parameter data + self.assertEqual(set(state_dict.keys()), set(param_name_to_data_ptr.keys())) + for param_name, tensor in state_dict.items(): + self.assertTrue(isinstance(tensor, DTensor)) + if param_name_to_data_ptr[param_name] == 0: + # Check that this is padding (added by DTensor) + self.assertGreater(self.rank, 0) + self.assertEqual(torch.count_nonzero(tensor.to_local()).item(), 0) + else: + self.assertEqual( + tensor.to_local().data_ptr(), param_name_to_data_ptr[param_name] + ) + + # Verify that we can load a new state dict that contains DTensors with + # storages different from the current model parameters + new_state_dict: Dict[str, DTensor] = {} + for param_name, dtensor in state_dict.items(): + # Construct new DTensors to exercise load state dict writeback + new_state_dict[param_name] = dtensor.detach().clone().fill_(new_fill_value) + for param in model.parameters(): + self.assertEqual( + param.to_local(), + torch.ones_like(param.to_local()) * old_fill_value, + ) + model.load_state_dict(new_state_dict) + for param_name, param in model.named_parameters(): + self.assertEqual( + param.to_local(), + torch.ones_like(param.to_local()) * new_fill_value, + ) + local_param = param.to_local() + # Only guarantee that the local tensor's data pointer does not + # change if the sharding was even (i.e. 
no padding); otherwise, + # FSDP may re-pad the local tensor, changing its data pointer + if local_param.size(0) * param.device_mesh.size() == param.size(0): + self.assertEqual( + local_param.data_ptr(), param_name_to_data_ptr[param_name] + ) + + +class TestFullyShardStateDictMultiThread(FSDPTestMultiThread): + @property + def world_size(self): + return 2 + + def test_rank0_offload_full_state_dict(self): + # Construct a reference unsharded model on all ranks + model_args = ModelArgs(dropout_p=0.0) + torch.manual_seed(42) + ref_model = Transformer(model_args).npu() + for param in ref_model.parameters(): + torch.distributed.broadcast(param.detach(), src=0) + + # Construct a sharded model and sharded state dict on all ranks + model = copy.deepcopy(ref_model) + for module in model.modules(): + if isinstance(module, TransformerBlock): + fully_shard(module) + fully_shard(model) + sharded_sd = model.state_dict() + + # Save a reference CPU full state dict on rank 0 and delete the + # reference model otherwise + if self.rank != 0: + del ref_model + else: + ref_gpu_full_sd = ref_model.state_dict() + ref_full_sd = {k: v.cpu() for k, v in ref_gpu_full_sd.items()} + del ref_gpu_full_sd + + # Reshard the GPU sharded state dict to a CPU full state dict on rank 0 + full_sd = {} + for param_name, sharded_param in sharded_sd.items(): + full_param = sharded_param.full_tensor() + if self.rank == 0: + full_sd[param_name] = full_param.cpu() + else: + del full_param + + # Check that we have a CPU full state dict only on rank 0 + if self.rank == 0: + self.assertEqual(len(full_sd), len(ref_full_sd)) + self.assertEqual(list(full_sd.keys()), list(ref_full_sd.keys())) + for (param_name, param), ref_param in zip( + full_sd.items(), ref_full_sd.values() + ): + self.assertEqual(param.device, torch.device("cpu")) + self.assertEqual(param.device, ref_param.device) + self.assertEqual(param, ref_param) + else: + self.assertEqual(len(full_sd), 0) + + +if __name__ == "__main__": + run_tests() -- Gitee From 0d2dc1c27cf5d80bfccbc01cdaa5928b4c0b80ee Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Mon, 10 Mar 2025 02:30:42 +0000 Subject: [PATCH 117/358] !18491 [1/N] cleancode (torch_npu/csrc/aten) Merge pull request !18491 from dilililiwhy/cleancode_aten_260_part1 --- torch_npu/csrc/aten/common/ChangeDataPtr.cpp | 95 ++--- torch_npu/csrc/aten/common/EmptyTensor.cpp | 128 +++--- .../csrc/aten/common/FormatCastHelper.cpp | 89 ++-- torch_npu/csrc/aten/common/FormatCastHelper.h | 21 +- .../csrc/aten/common/FormatCastKernelNpu.cpp | 182 +++++---- .../aten/common/MatmulByBmmV2KernelNpu.cpp | 63 +-- torch_npu/csrc/aten/common/NpuFastReshape.cpp | 48 +-- .../aten/common/PinnedMemoryAllocator.cpp | 32 +- torch_npu/csrc/aten/common/SetNpu.cpp | 200 ++++----- torch_npu/csrc/aten/common/TensorCompare.cpp | 28 +- .../csrc/aten/common/TensorFactories.cpp | 386 +++++++++--------- 11 files changed, 674 insertions(+), 598 deletions(-) diff --git a/torch_npu/csrc/aten/common/ChangeDataPtr.cpp b/torch_npu/csrc/aten/common/ChangeDataPtr.cpp index 8caae202c3..0f27a471cb 100644 --- a/torch_npu/csrc/aten/common/ChangeDataPtr.cpp +++ b/torch_npu/csrc/aten/common/ChangeDataPtr.cpp @@ -5,53 +5,54 @@ namespace at_npu { namespace native { -int64_t NPUNativeFunctions::npu_change_data_ptr(const at::Tensor& dst, const at::Tensor& src, int64_t offset) { - TORCH_CHECK( - offset >= 0, - "Expect offset equal or greater than zero, got: ", offset); - - const auto& src_scalar_type = src.scalar_type(); - const auto& dst_scalar_type = dst.scalar_type(); - - 
TORCH_CHECK(
-      src_scalar_type == dst_scalar_type,
-      "Expect src and dst tensors having the same dtype, got: ",
-      "src with dtype ", src_scalar_type,
-      ", dst with dtype ", dst_scalar_type, PTA_ERROR(ErrCode::TYPE));
-  TORCH_CHECK(
-      (src_scalar_type == at::ScalarType::Half) ||
-      (src_scalar_type == at::ScalarType::Float) ||
-      (src_scalar_type == at::ScalarType::BFloat16),
-      "Only supports src and dst tensors with dtype float32, float16 or bfloat16, got: ", src_scalar_type,
-      PTA_ERROR(ErrCode::TYPE));
-
-  auto dst_sizes = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_.storage_sizes_;
-  auto src_sizes = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_.storage_sizes_;
-  int64_t dst_storage_size = c10::multiply_integers(dst_sizes);
-  int64_t src_storage_size = c10::multiply_integers(src_sizes);
-
-  TORCH_CHECK(
-      offset + dst_storage_size * dst.element_size() <=
-      src_storage_size * src.element_size(),
-      "Offsets overflow, got: ",
-      "offset ", offset,
-      ", dst storage size ", dst_storage_size,
-      ", src storage size ", src_storage_size, PTA_ERROR(ErrCode::PARAM));
-
-  at::DataPtr aim_data_ptr;
-  if (src_scalar_type == at::ScalarType::Float) {
-    float* data_ptr = static_cast<float*>(src.storage().data_ptr().get()) + offset;
-    aim_data_ptr = at::DataPtr(data_ptr, dst.storage().device());
-  } else if (src_scalar_type == at::ScalarType::BFloat16) {
-    at::BFloat16* data_ptr = static_cast<at::BFloat16*>(src.storage().data_ptr().get()) + offset;
-    aim_data_ptr = at::DataPtr(data_ptr, dst.storage().device());
-  } else {
-    at::Half* data_ptr = static_cast<at::Half*>(src.storage().data_ptr().get()) + offset;
-    aim_data_ptr = at::DataPtr(data_ptr, dst.storage().device());
-  }
-  dst.storage().set_data_ptr(std::move(aim_data_ptr));
-
-  return 0;
+int64_t NPUNativeFunctions::npu_change_data_ptr(const at::Tensor& dst, const at::Tensor& src, int64_t offset)
+{
+    TORCH_CHECK(
+        offset >= 0,
+        "Expect offset equal or greater than zero, got: ", offset, PTA_ERROR(ErrCode::VALUE));
+
+    const auto& src_scalar_type = src.scalar_type();
+    const auto& dst_scalar_type = dst.scalar_type();
+
+    TORCH_CHECK(
+        src_scalar_type == dst_scalar_type,
+        "Expect src and dst tensors having the same dtype, got: ",
+        "src with dtype ", src_scalar_type,
+        ", dst with dtype ", dst_scalar_type, PTA_ERROR(ErrCode::TYPE));
+    TORCH_CHECK(
+        (src_scalar_type == at::ScalarType::Half) ||
+        (src_scalar_type == at::ScalarType::Float) ||
+        (src_scalar_type == at::ScalarType::BFloat16),
+        "Only supports src and dst tensors with dtype float32, float16 or bfloat16, got: ", src_scalar_type,
+        PTA_ERROR(ErrCode::TYPE));
+
+    auto dst_sizes = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_.storage_sizes_;
+    auto src_sizes = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_.storage_sizes_;
+    int64_t dst_storage_size = c10::multiply_integers(dst_sizes);
+    int64_t src_storage_size = c10::multiply_integers(src_sizes);
+
+    TORCH_CHECK(
+        offset + dst_storage_size * dst.element_size() <=
+        src_storage_size * src.element_size(),
+        "Offsets overflow, got: ",
+        "offset ", offset,
+        ", dst storage size ", dst_storage_size,
+        ", src storage size ", src_storage_size, PTA_ERROR(ErrCode::PARAM));
+
+    at::DataPtr aim_data_ptr;
+    if (src_scalar_type == at::ScalarType::Float) {
+        float* data_ptr = static_cast<float*>(src.storage().data_ptr().get()) + offset;
+        aim_data_ptr = 
at::DataPtr(data_ptr, dst.storage().device()); + } else { + at::Half* data_ptr = static_cast(src.storage().data_ptr().get()) + offset; + aim_data_ptr = at::DataPtr(data_ptr, dst.storage().device()); + } + dst.storage().set_data_ptr(std::move(aim_data_ptr)); + + return 0; } } // namespace native diff --git a/torch_npu/csrc/aten/common/EmptyTensor.cpp b/torch_npu/csrc/aten/common/EmptyTensor.cpp index 00c44c1239..3e456e9aca 100644 --- a/torch_npu/csrc/aten/common/EmptyTensor.cpp +++ b/torch_npu/csrc/aten/common/EmptyTensor.cpp @@ -10,26 +10,35 @@ namespace at_npu { namespace native { -static c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { - if (pin_memory) { - return getPinnedMemoryAllocator(); - } - return c10::GetCPUAllocator(); +static c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) +{ + if (pin_memory) { + return getPinnedMemoryAllocator(); + } + return c10::GetCPUAllocator(); } -at::TensorBase empty_cpu(c10::IntArrayRef size, at::ScalarType dtype, bool pin_memory, - c10::optional memory_format_opt) { - auto allocator = GetCPUAllocatorMaybePinned(pin_memory); - constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); - return at::detail::empty_generic(size, allocator, cpu_ks, dtype, memory_format_opt); +at::TensorBase empty_cpu( + c10::IntArrayRef size, + at::ScalarType dtype, + bool pin_memory, + c10::optional memory_format_opt) +{ + auto allocator = GetCPUAllocatorMaybePinned(pin_memory); + constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); + return at::detail::empty_generic(size, allocator, cpu_ks, dtype, memory_format_opt); } -at::TensorBase empty_strided_cpu(c10::IntArrayRef size, c10::IntArrayRef stride, - at::ScalarType dtype, bool pin_memory) { - auto allocator = GetCPUAllocatorMaybePinned(pin_memory); - constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); - return at::detail::empty_strided_generic( - size, stride, allocator, cpu_ks, dtype); +at::TensorBase empty_strided_cpu( + c10::IntArrayRef size, + c10::IntArrayRef stride, + at::ScalarType dtype, + bool pin_memory) +{ + auto allocator = GetCPUAllocatorMaybePinned(pin_memory); + constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU); + return at::detail::empty_strided_generic( + size, stride, allocator, cpu_ks, dtype); } at::TensorBase empty_cpu( @@ -38,25 +47,26 @@ at::TensorBase empty_cpu( c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, - c10::optional memory_format_opt) { - auto device = device_or_default(device_opt); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == at::DeviceType::CPU, OPS_ERROR(ErrCode::PARAM)); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == at::Layout::Strided, OPS_ERROR(ErrCode::PARAM)); - - auto pin_memory = c10::pinned_memory_or_default(pin_memory_opt); - auto dtype = dtype_or_default(dtype_opt); - return empty_cpu(size, dtype, pin_memory, memory_format_opt); + c10::optional memory_format_opt) +{ + auto device = device_or_default(device_opt); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == at::DeviceType::CPU, OPS_ERROR(ErrCode::PARAM)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == at::Layout::Strided, OPS_ERROR(ErrCode::PARAM)); + + auto pin_memory = c10::pinned_memory_or_default(pin_memory_opt); + auto dtype = dtype_or_default(dtype_opt); + return empty_cpu(size, dtype, pin_memory, memory_format_opt); } -at::TensorBase empty_cpu( - c10::IntArrayRef size, const at::TensorOptions &options) { - return empty_cpu( - size, - 
c10::optTypeMetaToScalarType(options.dtype_opt()), - options.layout_opt(), - options.device_opt(), - options.pinned_memory_opt(), - options.memory_format_opt()); +at::TensorBase empty_cpu(c10::IntArrayRef size, const at::TensorOptions &options) +{ + return empty_cpu( + size, + c10::optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt(), + options.memory_format_opt()); } at::TensorBase empty_strided_cpu( @@ -65,31 +75,38 @@ at::TensorBase empty_strided_cpu( c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { - auto device = device_or_default(device_opt); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == at::DeviceType::CPU, OPS_ERROR(ErrCode::PARAM)); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == at::Layout::Strided, OPS_ERROR(ErrCode::PARAM)); - - auto pin_memory = c10::pinned_memory_or_default(pin_memory_opt); - auto dtype = dtype_or_default(dtype_opt); - return empty_strided_cpu(size, stride, dtype, pin_memory); + c10::optional pin_memory_opt) +{ + auto device = device_or_default(device_opt); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device.type() == at::DeviceType::CPU, OPS_ERROR(ErrCode::PARAM)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == at::Layout::Strided, OPS_ERROR(ErrCode::PARAM)); + + auto pin_memory = c10::pinned_memory_or_default(pin_memory_opt); + auto dtype = dtype_or_default(dtype_opt); + return empty_strided_cpu(size, stride, dtype, pin_memory); } at::TensorBase empty_strided_cpu( c10::IntArrayRef size, c10::IntArrayRef stride, - const at::TensorOptions &options) { - return empty_strided_cpu( - size, - stride, - c10::optTypeMetaToScalarType(options.dtype_opt()), - options.layout_opt(), - options.device_opt(), - options.pinned_memory_opt()); + const at::TensorOptions &options) +{ + return empty_strided_cpu( + size, + stride, + c10::optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), + options.device_opt(), + options.pinned_memory_opt()); } -at::Tensor empty_memory_format(c10::IntArrayRef size, c10::optional dtype_opt, c10::optional layout_opt, - c10::optional device_opt, c10::optional pin_memory_opt, c10::optional memory_format_opt) +at::Tensor empty_memory_format( + c10::IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) { at::Tensor result = empty_cpu(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt); if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { @@ -98,8 +115,13 @@ at::Tensor empty_memory_format(c10::IntArrayRef size, c10::optional dtype_opt, - c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt) +at::Tensor empty_strided( + c10::IntArrayRef size, + c10::IntArrayRef stride, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { at::Tensor result = empty_strided_cpu(size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt); if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms() && at::globalContext().deterministicFillUninitializedMemory())) { diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp index 23505f6d4a..6dd82d8ef5 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp +++ 
b/torch_npu/csrc/aten/common/FormatCastHelper.cpp @@ -8,18 +8,21 @@ namespace at_npu { namespace native { -bool FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor& dst) { - auto src_format = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_.npu_format_; - auto dst_format = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_.npu_format_; - return FormatHelper::GetBaseFormat(src_format) == FormatHelper::GetBaseFormat(dst_format); +bool FormatCastHelper::IsSameGroupType(const at::Tensor& src, const at::Tensor& dst) +{ + auto src_format = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_.npu_format_; + auto dst_format = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_.npu_format_; + return FormatHelper::GetBaseFormat(src_format) == FormatHelper::GetBaseFormat(dst_format); } -void FormatCastHelper::base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src) { - dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); - NPUNativeFunctions::copy_memory_(dst, src, true); +void FormatCastHelper::base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src) +{ + dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); + NPUNativeFunctions::copy_memory_(dst, src, true); } -void FormatCastHelper::format_cast_as_base_format(const at::Tensor& src, aclFormat format) { +void FormatCastHelper::format_cast_as_base_format(const at::Tensor& src, aclFormat format) +{ AT_ASSERT(FormatHelper::IsBaseFormatType(format), "dst format must be base format", PTA_ERROR(ErrCode::PARAM)); AT_ASSERT(FormatHelper::IsBaseFormatType(src), "src format must be base format", PTA_ERROR(ErrCode::PARAM)); @@ -32,44 +35,50 @@ void FormatCastHelper::format_cast_as_base_format(const at::Tensor& src, aclForm return; } -bool FormatCastHelper::format_cast_between_group(at::Tensor& dst, const at::Tensor& src, FormatCastHelper::FormatCastFunc format_cast_inside_group) { - if (FormatHelper::IsBaseFormatType(src)) { - if (FormatHelper::IsBaseFormatType(dst)) { - // src base format (src format) -> dst base format - base_format_cast_nocheck(dst, src); // only need to copy memory - return true; +bool FormatCastHelper::format_cast_between_group( + at::Tensor& dst, + const at::Tensor& src, + FormatCastHelper::FormatCastFunc format_cast_inside_group) +{ + if (FormatHelper::IsBaseFormatType(src)) { + if (FormatHelper::IsBaseFormatType(dst)) { + // src base format (src format) -> dst base format + base_format_cast_nocheck(dst, src); // only need to copy memory + return true; + } else { + // src base format (src format) -> dst base format + // dst base format -> dst format + auto src_base_format = FormatHelper::GetBaseFormat(src); + format_cast_as_base_format(src, FormatHelper::GetBaseFormat(dst)); // prepare: covert src to dst base format + format_cast_inside_group(dst, src); // src base format (src format) -> dst base format + format_cast_as_base_format(src, src_base_format); // recover: dst base format -> dst format + return true; + } } else { - // src base format (src format) -> dst base format - // dst base format -> dst format - auto src_base_format = FormatHelper::GetBaseFormat(src); - format_cast_as_base_format(src, FormatHelper::GetBaseFormat(dst)); // prepare: covert src to dst base format - format_cast_inside_group(dst, src); // src base format (src format) -> dst base format - format_cast_as_base_format(src, src_base_format); // recover: dst base format -> dst format - return true; + if (FormatHelper::IsBaseFormatType(dst)) { + // src format -> src 
base format + // src base format -> dst base format (dst format) + auto dst_base_format = FormatHelper::GetBaseFormat(dst); + format_cast_as_base_format(dst, FormatHelper::GetBaseFormat(src)); // prepare: cover dst to src base format + format_cast_inside_group(dst, src); // src format -> src base format + format_cast_as_base_format(dst, dst_base_format); // recover: src base format -> dst format + return true; + } } - } else { - if (FormatHelper::IsBaseFormatType(dst)) { - // src format -> src base format - // src base format -> dst base format (dst format) - auto dst_base_format = FormatHelper::GetBaseFormat(dst); - format_cast_as_base_format(dst, FormatHelper::GetBaseFormat(src)); // prepare: cover dst to src base format - format_cast_inside_group(dst, src); // src format -> src base format - format_cast_as_base_format(dst, dst_base_format); // recover: src base format -> dst format - return true; - } - } - return false; + return false; } -at::Tensor FormatCastHelper::ApplyBaseFormatTensorBy(const at::Tensor& src) { - auto format = FormatHelper::GetBaseFormat(src); - return custom_ops::npu_format_cast(src, format); +at::Tensor FormatCastHelper::ApplyBaseFormatTensorBy(const at::Tensor& src) +{ + auto format = FormatHelper::GetBaseFormat(src); + return custom_ops::npu_format_cast(src, format); } -at::Tensor& FormatCastHelper::CovertSelfToBaseFormat(at::Tensor& src) { - auto format = FormatHelper::GetBaseFormat(src); - return NPUNativeFunctions::npu_format_cast_(src, format); +at::Tensor& FormatCastHelper::CovertSelfToBaseFormat(at::Tensor& src) +{ + auto format = FormatHelper::GetBaseFormat(src); + return NPUNativeFunctions::npu_format_cast_(src, format); } } // namespace native -} // namespace at_npu \ No newline at end of file +} // namespace at_npu diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.h b/torch_npu/csrc/aten/common/FormatCastHelper.h index 0e25b41abb..065306a860 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.h +++ b/torch_npu/csrc/aten/common/FormatCastHelper.h @@ -10,19 +10,20 @@ namespace native { class FormatCastHelper { public: - static bool IsSameGroupType(const at::Tensor& src, const at::Tensor& dst); - static void format_cast_as_base_format(const at::Tensor& src, aclFormat format); - using FormatCastFunc = std::function; - static bool format_cast_between_group(at::Tensor& dst, const at::Tensor& src, FormatCastFunc format_cast_inside_group); - // this interface is similar to CastBackToOriFormat, but CastBackToOriFormat may have overload problem. - static at::Tensor ApplyBaseFormatTensorBy(const at::Tensor& src); - static at::Tensor& CovertSelfToBaseFormat(at::Tensor& src); + static bool IsSameGroupType(const at::Tensor& src, const at::Tensor& dst); + static void format_cast_as_base_format(const at::Tensor& src, aclFormat format); + using FormatCastFunc = std::function; + static bool format_cast_between_group( + at::Tensor& dst, const at::Tensor& src, FormatCastFunc format_cast_inside_group); + // this interface is similar to CastBackToOriFormat, but CastBackToOriFormat may have overload problem. 
+ static at::Tensor ApplyBaseFormatTensorBy(const at::Tensor& src); + static at::Tensor& CovertSelfToBaseFormat(at::Tensor& src); private: - // help function of format_cast_between_group - static void base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src); + // help function of format_cast_between_group + static void base_format_cast_nocheck(at::Tensor& dst, const at::Tensor& src); }; // class FormatCastHelper } // namespace native } // namespace at_npu -#endif // __NATIVE_NPU_COMMON_FORMAT_CAST_HELPER__ \ No newline at end of file +#endif // __NATIVE_NPU_COMMON_FORMAT_CAST_HELPER__ diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp index 8760e8c9e0..b5838f5cbb 100644 --- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp @@ -12,129 +12,135 @@ namespace native { using tensor_list = std::vector; -at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) { - string srcFormat = FormatHelper::GetFormatName(src); - string dstFormat = FormatHelper::GetFormatName(dst); - - if (!FormatCastHelper::IsSameGroupType(src, dst)) { - bool res = FormatCastHelper::format_cast_between_group(dst, src, format_cast_impl_out_npu); - if (!res) { - AT_ERROR("unsupport cast from ", srcFormat, " to ", dstFormat); +at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) +{ + string srcFormat = FormatHelper::GetFormatName(src); + string dstFormat = FormatHelper::GetFormatName(dst); + + if (!FormatCastHelper::IsSameGroupType(src, dst)) { + bool res = FormatCastHelper::format_cast_between_group(dst, src, format_cast_impl_out_npu); + if (!res) { + AT_ERROR("unsupport cast from ", srcFormat, " to ", dstFormat); + } + return dst; } + + NpuStorageOffsetGuard guard_input(const_cast(src)); + NpuStorageOffsetGuard guard_output(dst); + OpCommand cmd; + cmd.Name("Identity") + .InputWithoutContiguous(src) + .Output(dst) + .Run(); return dst; - } - - NpuStorageOffsetGuard guard_input(const_cast(src)); - NpuStorageOffsetGuard guard_output(dst); - OpCommand cmd; - cmd.Name("Identity") - .InputWithoutContiguous(src) - .Output(dst) - .Run(); - return dst; } // convert src from src_format to dst_format, write the result into dst -at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& dst, const at::Tensor& src) { - torch_npu::utils::torch_check_npu(dst); - torch_npu::utils::torch_check_npu(src); - auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_; - if (src_desc.npu_format_ == dst_desc.npu_format_) { - dst.copy_(src); - return dst; - } +at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& dst, const at::Tensor& src) +{ + torch_npu::utils::torch_check_npu(dst); + torch_npu::utils::torch_check_npu(src); + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; + auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_; + if (src_desc.npu_format_ == dst_desc.npu_format_) { + dst.copy_(src); + return dst; + } - // calculate the output result of the NPU - format_cast_impl_out_npu(dst, src); + // calculate the output result of the NPU + format_cast_impl_out_npu(dst, src); - return dst; + return dst; } // conver self to acl_format, write the result into new result tensor at::Tensor npu_format_cast_impl( const at::Tensor& src, - int64_t acl_format) { - auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - if 
(src_desc.npu_format_ == acl_format) { - ASCEND_LOGD("no need to do format cast"); - return src; - } - if (FormatHelper::IsBaseFormatType(src) && - FormatHelper::IsBaseFormatType(static_cast(acl_format))) { - FormatCastHelper::format_cast_as_base_format(src, static_cast(acl_format)); - return src; - } + int64_t acl_format) +{ + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; + if (src_desc.npu_format_ == acl_format) { + ASCEND_LOGD("no need to do format cast"); + return src; + } + if (FormatHelper::IsBaseFormatType(src) && + FormatHelper::IsBaseFormatType(static_cast(acl_format))) { + FormatCastHelper::format_cast_as_base_format(src, static_cast(acl_format)); + return src; + } - at::Tensor dst = OpPreparation::ApplyTensorWithFormat( - src_desc.base_sizes_, src.options(), acl_format); + at::Tensor dst = OpPreparation::ApplyTensorWithFormat( + src_desc.base_sizes_, src.options(), acl_format); - // calculate the output result of the NPU - format_cast_impl_out_npu(dst, src); + // calculate the output result of the NPU + format_cast_impl_out_npu(dst, src); - // format cast only change physical layout of base tensor and view tensor's - // metadata remain unchanged - dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); - return dst; + // format cast only change physical layout of base tensor and view tensor's + // metadata remain unchanged + dst.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); + return dst; } // conver self to dst'format, write the result into new result tensor at::Tensor NPUNativeFunctions::npu_format_cast( const at::Tensor& src, - const at::Tensor& dst) { - torch_npu::utils::torch_check_npu(dst); - auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_; - int64_t dst_format = dst_desc.npu_format_; - return custom_ops::npu_format_cast(src, dst_format); + const at::Tensor& dst) +{ + torch_npu::utils::torch_check_npu(dst); + auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_; + int64_t dst_format = dst_desc.npu_format_; + return custom_ops::npu_format_cast(src, dst_format); } // conver self to acl_format, write the result into self at::Tensor& NPUNativeFunctions::npu_format_cast_( at::Tensor& src, - int64_t acl_format) { - torch_npu::utils::torch_check_npu(src); - auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - if (src_desc.npu_format_ == acl_format) { - return src; - } - if (FormatHelper::IsBaseFormatType(src) && - FormatHelper::IsBaseFormatType(static_cast(acl_format))) { - FormatCastHelper::format_cast_as_base_format(src, static_cast(acl_format)); - return src; - } + int64_t acl_format) +{ + torch_npu::utils::torch_check_npu(src); + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; + if (src_desc.npu_format_ == acl_format) { + return src; + } + if (FormatHelper::IsBaseFormatType(src) && + FormatHelper::IsBaseFormatType(static_cast(acl_format))) { + FormatCastHelper::format_cast_as_base_format(src, static_cast(acl_format)); + return src; + } - at::Tensor dst = OpPreparation::ApplyTensorWithFormat( - src_desc.base_sizes_, src.options(), acl_format); + at::Tensor dst = OpPreparation::ApplyTensorWithFormat( + src_desc.base_sizes_, src.options(), acl_format); - // calculate the output result of the NPU - format_cast_impl_out_npu(dst, src); + // calculate the output result of the NPU + format_cast_impl_out_npu(dst, src); - // format cast only change physical layout of base tensor and view tensor's - // metadata remain unchanged - 
src.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); + // format cast only change physical layout of base tensor and view tensor's + // metadata remain unchanged + src.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); - return src; + return src; } -int64_t NPUNativeFunctions::get_npu_format(const at::Tensor& src) { - torch_npu::utils::torch_check_npu(src); - auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - return src_desc.npu_format_; +int64_t NPUNativeFunctions::get_npu_format(const at::Tensor& src) +{ + torch_npu::utils::torch_check_npu(src); + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; + return src_desc.npu_format_; } -at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, - int64_t acl_format) { - return npu_format_cast_impl(self, acl_format); +at::Tensor NPUNativeFunctions::_npu_format_cast(const at::Tensor& self, int64_t acl_format) +{ + return npu_format_cast_impl(self, acl_format); } -at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, - int64_t acl_format) { - torch_npu::utils::torch_check_npu(self); - if (NPUNativeFunctions::get_npu_format(self) == acl_format) { - ASCEND_LOGD("no need to do format cast"); - return self; - } - return custom_ops::_npu_format_cast(self, acl_format); +at::Tensor NPUNativeFunctions::npu_format_cast(const at::Tensor& self, int64_t acl_format) +{ + torch_npu::utils::torch_check_npu(self); + if (NPUNativeFunctions::get_npu_format(self) == acl_format) { + ASCEND_LOGD("no need to do format cast"); + return self; + } + return custom_ops::_npu_format_cast(self, acl_format); } } // namespace native diff --git a/torch_npu/csrc/aten/common/MatmulByBmmV2KernelNpu.cpp b/torch_npu/csrc/aten/common/MatmulByBmmV2KernelNpu.cpp index 0015ee233a..ce37bb9c00 100644 --- a/torch_npu/csrc/aten/common/MatmulByBmmV2KernelNpu.cpp +++ b/torch_npu/csrc/aten/common/MatmulByBmmV2KernelNpu.cpp @@ -5,37 +5,38 @@ namespace at_npu { namespace native { -at::Tensor matmul_by_bmmV2(const at::Tensor& tensor1, const at::Tensor& tensor2) { - auto dim_tensor1 = tensor1.dim(); - auto dim_tensor2 = tensor2.dim(); - if (dim_tensor1 == 1 && dim_tensor2 == 1) { - return tensor1.dot(tensor2); - } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { - return tensor1.mm(tensor2.unsqueeze(-1)).squeeze_(-1); - } else if (dim_tensor1 == 1 && dim_tensor2 == 2) { - return tensor1.unsqueeze(0).mm(tensor2).squeeze_(0); - } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { - return tensor1.mm(tensor2); - } else if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) { - at::Tensor t2 = dim_tensor2 == 1 ? 
tensor2.unsqueeze(-1) : tensor2; - auto size1 = tensor1.sizes(); - auto size2 = t2.sizes(); - std::vector output_size; - output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); - if (dim_tensor2 > 1) { - output_size.push_back(size2[dim_tensor2 - 1]); +at::Tensor matmul_by_bmmV2(const at::Tensor& tensor1, const at::Tensor& tensor2) +{ + auto dim_tensor1 = tensor1.dim(); + auto dim_tensor2 = tensor2.dim(); + if (dim_tensor1 == 1 && dim_tensor2 == 1) { + return tensor1.dot(tensor2); + } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { + return tensor1.mm(tensor2.unsqueeze(-1)).squeeze_(-1); + } else if (dim_tensor1 == 1 && dim_tensor2 == 2) { + return tensor1.unsqueeze(0).mm(tensor2).squeeze_(0); + } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { + return tensor1.mm(tensor2); + } else if (dim_tensor1 >= 3 && (dim_tensor2 == 1 || dim_tensor2 == 2)) { + at::Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2; + auto size1 = tensor1.sizes(); + auto size2 = t2.sizes(); + std::vector output_size; + output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); + if (dim_tensor2 > 1) { + output_size.push_back(size2[dim_tensor2 - 1]); + } + // fold the batch into the first dimension + at::Tensor t1 = tensor1.reshape({-1, tensor1.size(-1)}); + at::Tensor output = at::_unsafe_view(t1.mm(t2), output_size); + return output; + } else if ((dim_tensor1 == 1 || dim_tensor1 == 2) && dim_tensor2 >= 3) { + return custom_ops::npu_bmmV2(tensor1, tensor2, {}); + } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) { + return custom_ops::npu_bmmV2(tensor1, tensor2, {}); } - // fold the batch into the first dimension - at::Tensor t1 = tensor1.reshape({-1, tensor1.size(-1)}); - at::Tensor output = at::_unsafe_view(t1.mm(t2), output_size); - return output; - } else if ((dim_tensor1 == 1 || dim_tensor1 == 2) && dim_tensor2 >= 3) { - return custom_ops::npu_bmmV2(tensor1, tensor2, {}); - } else if ((dim_tensor1 >= 1 && dim_tensor2 >= 1) && (dim_tensor1 >= 3 || dim_tensor2 >= 3)) { - return custom_ops::npu_bmmV2(tensor1, tensor2, {}); - } - AT_ERROR("both arguments to matmul need to be at least 1D, but they are ", - dim_tensor1, "D and ", dim_tensor2, "D"); + AT_ERROR("both arguments to matmul need to be at least 1D, but they are ", dim_tensor1, "D and ", dim_tensor2, "D"); +} + } } -} \ No newline at end of file diff --git a/torch_npu/csrc/aten/common/NpuFastReshape.cpp b/torch_npu/csrc/aten/common/NpuFastReshape.cpp index 4681f0efe8..c07136aa50 100644 --- a/torch_npu/csrc/aten/common/NpuFastReshape.cpp +++ b/torch_npu/csrc/aten/common/NpuFastReshape.cpp @@ -7,31 +7,33 @@ namespace at_npu { namespace native { -void npu_fast_reshape_(at::Tensor& tensor) { - /** - [NOTE] For some reshape cases such as view, unsqueeze, squeeze, flatten, - storages of them remain unchanged. So we can refresh reshape tensor's metadata - to obtain matched tensor. - */ +void npu_fast_reshape_(at::Tensor& tensor) +{ + /** + [NOTE] For some reshape cases such as view, unsqueeze, squeeze, flatten, + storages of them remain unchanged. So we can refresh reshape tensor's metadata + to obtain matched tensor. 
+ */ - // restriction 1 - if (!tensor.is_contiguous()) { - return; - } - // restriction 2 - if (!FormatHelper::IsBaseFormatType(tensor)) { - return; - } - // restriction 3: reshape case without any numels change - if ((tensor.numel() != StorageDescHelper::GetMemorySize(tensor)) || - StorageDescHelper::MetaDataAreMatch(&tensor)) { - return; - } + // restriction 1 + if (!tensor.is_contiguous()) { + return; + } + // restriction 2 + if (!FormatHelper::IsBaseFormatType(tensor)) { + return; + } + // restriction 3: reshape case without any numels change + if ((tensor.numel() != StorageDescHelper::GetMemorySize(tensor)) || + StorageDescHelper::MetaDataAreMatch(&tensor)) { + return; + } - // refresh matadata to input tensor - StorageDescHelper::ReflushDescBySelf(tensor); - auto base_format = InferFormat::GuessBaseFormat(tensor.sizes()); - NPUNativeFunctions::npu_format_cast_(tensor, base_format); + // refresh matadata to input tensor + StorageDescHelper::ReflushDescBySelf(tensor); + auto base_format = InferFormat::GuessBaseFormat(tensor.sizes()); + NPUNativeFunctions::npu_format_cast_(tensor, base_format); } + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/aten/common/PinnedMemoryAllocator.cpp b/torch_npu/csrc/aten/common/PinnedMemoryAllocator.cpp index 6686e4e2a4..ae1b53cf9e 100644 --- a/torch_npu/csrc/aten/common/PinnedMemoryAllocator.cpp +++ b/torch_npu/csrc/aten/common/PinnedMemoryAllocator.cpp @@ -14,7 +14,8 @@ namespace at_npu { namespace native { -bool NPUNativeFunctions::is_pinned(const at::Tensor& self, c10::optional device) { +bool NPUNativeFunctions::is_pinned(const at::Tensor& self, c10::optional device) +{ // Only CPU tensors can be pinned if (!self.is_cpu()) { return false; @@ -23,20 +24,21 @@ bool NPUNativeFunctions::is_pinned(const at::Tensor& self, c10::optional device) { - // TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_npu()); - auto allocator = getPinnedMemoryAllocator(); - auto storage = c10::Storage( - c10::Storage::use_byte_size_t(), - at::detail::computeStorageNbytes( - self.sizes(), - self.strides(), - self.dtype().itemsize()), - allocator, - false); - auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides()); - tensor.copy_(self); - return tensor; +at::Tensor NPUNativeFunctions::_pin_memory(const at::Tensor& self, c10::optional device) +{ + // TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_npu()); + auto allocator = getPinnedMemoryAllocator(); + auto storage = c10::Storage( + c10::Storage::use_byte_size_t(), + at::detail::computeStorageNbytes( + self.sizes(), + self.strides(), + self.dtype().itemsize()), + allocator, + false); + auto tensor = at::cpu::empty({0}, self.options()).set_(storage, 0, self.sizes(), self.strides()); + tensor.copy_(self); + return tensor; } } // namespace native diff --git a/torch_npu/csrc/aten/common/SetNpu.cpp b/torch_npu/csrc/aten/common/SetNpu.cpp index 2393151a54..9deb4d4344 100644 --- a/torch_npu/csrc/aten/common/SetNpu.cpp +++ b/torch_npu/csrc/aten/common/SetNpu.cpp @@ -3,111 +3,129 @@ namespace at_npu { namespace native { -void set_storage_nd_npu(at::Tensor& self, c10::Storage storage, int64_t storage_offset, int nDimension, c10::IntArrayRef size, c10::IntArrayRef stride) { - at::native::checkSetStorage(self, storage, storage_offset, size, stride); - self.unsafeGetTensorImpl()->set_storage_offset(storage_offset); - resize_nd_npu(self.unsafeGetTensorImpl(), nDimension, size.data(), stride.data()); +void set_storage_nd_npu( + at::Tensor& 
self, + c10::Storage storage, + int64_t storage_offset, + int nDimension, + c10::IntArrayRef size, + c10::IntArrayRef stride) +{ + at::native::checkSetStorage(self, storage, storage_offset, size, stride); + self.unsafeGetTensorImpl()->set_storage_offset(storage_offset); + resize_nd_npu(self.unsafeGetTensorImpl(), nDimension, size.data(), stride.data()); } -bool CheckStorageDesc(const at::Tensor& self, const c10::Storage src) { - if (self.unsafeGetTensorImpl()->storage_offset() != 0 || !self.is_contiguous()) { - return false; - } - int64_t new_size = static_cast(src.nbytes() / self.dtype().itemsize()); - int64_t nelements = c10::multiply_integers(self.unsafeGetTensorImpl()->sizes()); - if (new_size != nelements) { - return false; - } - return true; -} - -at::Tensor& NPUNativeFunctions::set_(at::Tensor& self, c10::Storage src, long storage_offset, c10::IntArrayRef size, c10::IntArrayRef stride) { - set_storage_nd_npu(self, src, storage_offset, size.size(), size, stride); - if (StorageDescHelper::CheckDescInit(src)) { - StorageDescHelper::CopyDesc(self, src); - return self; - } - // NPUStorageImpl create by constructor, NPUStorageDesc is not initialized by - // SetDesc. - if (CheckStorageDesc(self, src)) { - StorageDescHelper::SetDesc(self, size, stride); - } else { - // Check input tensor propertys. If conditions are not met, NPUStorageDesc - // base_sizes_ change to 1D. Conditions: - // 1. Tensor storage_offset == 0 - // 2. Tnput tensor is contiguous - // 3. Storage element size same to Tensor +bool CheckStorageDesc(const at::Tensor& self, const c10::Storage src) +{ + if (self.unsafeGetTensorImpl()->storage_offset() != 0 || !self.is_contiguous()) { + return false; + } int64_t new_size = static_cast(src.nbytes() / self.dtype().itemsize()); - StorageDescHelper::SetDesc(self, {new_size}, {1}); - } - return self; + int64_t nelements = c10::multiply_integers(self.unsafeGetTensorImpl()->sizes()); + if (new_size != nelements) { + return false; + } + return true; } -at::Tensor& NPUNativeFunctions::set_(at::Tensor& self) { - caffe2::TypeMeta dtype = self.dtype(); - c10::intrusive_ptr npu_storage_impl = torch_npu::make_npu_storage_impl( - c10::StorageImpl::use_byte_size_t(), - c10::SymInt(0), - c10_npu::NPUCachingAllocator::get()->allocate(0), - c10_npu::NPUCachingAllocator::get(), - true); - c10::Storage storage(npu_storage_impl); - set_storage_nd_npu(self, storage, 0, 1, {0}, {}); - StorageDescHelper::SetDesc(self); - TORCH_INTERNAL_ASSERT(dtype == self.dtype(), OPS_ERROR(ErrCode::TYPE)); - return self; +at::Tensor& NPUNativeFunctions::set_( + at::Tensor& self, + c10::Storage src, + long storage_offset, + c10::IntArrayRef size, + c10::IntArrayRef stride) +{ + set_storage_nd_npu(self, src, storage_offset, size.size(), size, stride); + if (StorageDescHelper::CheckDescInit(src)) { + StorageDescHelper::CopyDesc(self, src); + return self; + } + // NPUStorageImpl create by constructor, NPUStorageDesc is not initialized by + // SetDesc. + if (CheckStorageDesc(self, src)) { + StorageDescHelper::SetDesc(self, size, stride); + } else { + // Check input tensor propertys. If conditions are not met, NPUStorageDesc + // base_sizes_ change to 1D. Conditions: + // 1. Tensor storage_offset == 0 + // 2. Tnput tensor is contiguous + // 3. 
Storage element size same to Tensor + int64_t new_size = static_cast(src.nbytes() / self.dtype().itemsize()); + StorageDescHelper::SetDesc(self, {new_size}, {1}); + } + return self; } -at::Tensor& NPUNativeFunctions::set_(at::Tensor& self, const at::Tensor& src) { - at::TensorImpl* self_ = self.unsafeGetTensorImpl(); - at::TensorImpl* src_ = src.unsafeGetTensorImpl(); - if (self_ != src_) { - set_storage_nd_npu(self, src.storage(), src.storage_offset(), src.dim(), src.sizes(), src.strides()); - } - StorageDescHelper::CopyDesc(self, src); - return self; +at::Tensor& NPUNativeFunctions::set_(at::Tensor& self) +{ + caffe2::TypeMeta dtype = self.dtype(); + c10::intrusive_ptr npu_storage_impl = torch_npu::make_npu_storage_impl( + c10::StorageImpl::use_byte_size_t(), + c10::SymInt(0), + c10_npu::NPUCachingAllocator::get()->allocate(0), + c10_npu::NPUCachingAllocator::get(), + true); + c10::Storage storage(npu_storage_impl); + set_storage_nd_npu(self, storage, 0, 1, {0}, {}); + StorageDescHelper::SetDesc(self); + TORCH_INTERNAL_ASSERT(dtype == self.dtype(), OPS_ERROR(ErrCode::TYPE)); + return self; } -at::Tensor& NPUNativeFunctions::set_(at::Tensor& self, c10::Storage src) { - int64_t new_size = static_cast(src.nbytes() / self.dtype().itemsize()); - set_storage_nd_npu(self, src, 0, 1, {new_size}, {}); - if (StorageDescHelper::CheckDescInit(src)) { +at::Tensor& NPUNativeFunctions::set_(at::Tensor& self, const at::Tensor& src) +{ + at::TensorImpl* self_ = self.unsafeGetTensorImpl(); + at::TensorImpl* src_ = src.unsafeGetTensorImpl(); + if (self_ != src_) { + set_storage_nd_npu(self, src.storage(), src.storage_offset(), src.dim(), src.sizes(), src.strides()); + } StorageDescHelper::CopyDesc(self, src); return self; - } - // NPUStorageImpl create by constructor, NPUStorageDesc is not initialized by - // SetDesc. - StorageDescHelper::SetDesc( - self, - self.unsafeGetTensorImpl()->sizes(), - self.unsafeGetTensorImpl()->strides()); - return self; } -at::Tensor set_tensor_with_storage_format(c10::Storage src) { - if (StorageDescHelper::CheckDescInit(src)) { - // The storage object src has complete description information, - // and the tensor object self needs to be brushed to be the same - auto desc = torch_npu::NPUBridge::GetNpuStorageImpl(src.unsafeGetStorageImpl())->npu_desc_; - auto dist_tensor = NPUNativeFunctions::empty( - {0}, desc.data_type_.toScalarType(), c10::nullopt, - src.device(), false, c10::MemoryFormat::Contiguous); - set_storage_nd_npu(dist_tensor, src, 0, desc.base_sizes_.size(), desc.base_sizes_, desc.base_strides_); - return dist_tensor; - } else { - // The storage object src doesn't have complete description information, - // and the tensor object self needs to be brushed to be the 1 dimension - auto dist_tensor = NPUNativeFunctions::empty( - {0}, at::ScalarType::Char, c10::nullopt, - src.device(), false, c10::MemoryFormat::Contiguous); - int64_t new_size = static_cast(src.nbytes() / dist_tensor.dtype().itemsize()); - set_storage_nd_npu(dist_tensor, src, 0, 1, {new_size}, {}); +at::Tensor& NPUNativeFunctions::set_(at::Tensor& self, c10::Storage src) +{ + int64_t new_size = static_cast(src.nbytes() / self.dtype().itemsize()); + set_storage_nd_npu(self, src, 0, 1, {new_size}, {}); + if (StorageDescHelper::CheckDescInit(src)) { + StorageDescHelper::CopyDesc(self, src); + return self; + } + // NPUStorageImpl create by constructor, NPUStorageDesc is not initialized by + // SetDesc. 
StorageDescHelper::SetDesc( - dist_tensor, - dist_tensor.unsafeGetTensorImpl()->sizes(), - dist_tensor.unsafeGetTensorImpl()->strides()); - return dist_tensor; - } + self, + self.unsafeGetTensorImpl()->sizes(), + self.unsafeGetTensorImpl()->strides()); + return self; +} + +at::Tensor set_tensor_with_storage_format(c10::Storage src) +{ + if (StorageDescHelper::CheckDescInit(src)) { + // The storage object src has complete description information, + // and the tensor object self needs to be brushed to be the same + auto desc = torch_npu::NPUBridge::GetNpuStorageImpl(src.unsafeGetStorageImpl())->npu_desc_; + auto dist_tensor = NPUNativeFunctions::empty( + {0}, desc.data_type_.toScalarType(), c10::nullopt, + src.device(), false, c10::MemoryFormat::Contiguous); + set_storage_nd_npu(dist_tensor, src, 0, desc.base_sizes_.size(), desc.base_sizes_, desc.base_strides_); + return dist_tensor; + } else { + // The storage object src doesn't have complete description information, + // and the tensor object self needs to be brushed to be the 1 dimension + auto dist_tensor = NPUNativeFunctions::empty( + {0}, at::ScalarType::Char, c10::nullopt, + src.device(), false, c10::MemoryFormat::Contiguous); + int64_t new_size = static_cast(src.nbytes() / dist_tensor.dtype().itemsize()); + set_storage_nd_npu(dist_tensor, src, 0, 1, {new_size}, {}); + StorageDescHelper::SetDesc( + dist_tensor, + dist_tensor.unsafeGetTensorImpl()->sizes(), + dist_tensor.unsafeGetTensorImpl()->strides()); + return dist_tensor; + } } } // namespace native diff --git a/torch_npu/csrc/aten/common/TensorCompare.cpp b/torch_npu/csrc/aten/common/TensorCompare.cpp index 3f0673ef4a..716d3d7805 100644 --- a/torch_npu/csrc/aten/common/TensorCompare.cpp +++ b/torch_npu/csrc/aten/common/TensorCompare.cpp @@ -3,22 +3,24 @@ namespace at_npu { namespace native { -at::Tensor isnan_npu(const at::Tensor& self) { - return self != self; +at::Tensor isnan_npu(const at::Tensor& self) +{ + return self != self; } -bool is_nonzero_npu(const at::Tensor& self) { - c10::Scalar localScalar = self.item(); - if (localScalar.isFloatingPoint()) { - return localScalar.to() != 0; - } else if (localScalar.isIntegral(false)) { - return localScalar.to() != 0; - } else if (localScalar.isBoolean()) { - return localScalar.to(); - } +bool is_nonzero_npu(const at::Tensor& self) +{ + c10::Scalar localScalar = self.item(); + if (localScalar.isFloatingPoint()) { + return localScalar.to() != 0; + } else if (localScalar.isIntegral(false)) { + return localScalar.to() != 0; + } else if (localScalar.isBoolean()) { + return localScalar.to(); + } - return false; + return false; } } // namespace native -} // namespace at_npu \ No newline at end of file +} // namespace at_npu diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index 08e4e65e63..e0e2235b07 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -40,23 +40,24 @@ namespace { void window_function_checks( const char *function_name, const c10::TensorOptions &options, - int64_t window_length) { - TORCH_CHECK( - options.layout() != at::kSparse, - function_name, - " is not implemented for sparse types, got: ", - options, OPS_ERROR(ErrCode::NOT_SUPPORT)); - TORCH_CHECK( - at::isFloatingType(c10::typeMetaToScalarType(options.dtype())) || - at::isComplexType(c10::typeMetaToScalarType(options.dtype())), - function_name, - " expects floating point dtypes, got: ", - options, OPS_ERROR(ErrCode::TYPE)); - TORCH_CHECK( - 
window_length >= 0, - function_name, - " requires non-negative window_length, got window_length=", - window_length, OPS_ERROR(ErrCode::VALUE)); + int64_t window_length) +{ + TORCH_CHECK( + options.layout() != at::kSparse, + function_name, + " is not implemented for sparse types, got: ", + options, OPS_ERROR(ErrCode::NOT_SUPPORT)); + TORCH_CHECK( + at::isFloatingType(c10::typeMetaToScalarType(options.dtype())) || + at::isComplexType(c10::typeMetaToScalarType(options.dtype())), + function_name, + " expects floating point dtypes, got: ", + options, OPS_ERROR(ErrCode::TYPE)); + TORCH_CHECK( + window_length >= 0, + function_name, + " requires non-negative window_length, got window_length=", + window_length, OPS_ERROR(ErrCode::VALUE)); } size_t computeStorageNbytes( @@ -78,8 +79,13 @@ size_t computeStorageNbytes( } // namespace -at::Tensor NPUNativeFunctions::scalar_tensor(const c10::Scalar& s, c10::optional dtype, c10::optional layout, - c10::optional device, c10::optional pin_memory) { +at::Tensor NPUNativeFunctions::scalar_tensor( + const c10::Scalar& s, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) +{ at::tracer::impl::NoTracerDispatchMode tracer_guard; at::AutoDispatchBelowAutograd mode; auto result = at::native::empty_cpu({}, dtype, layout, c10::make_optional(c10::Device(at::kCPU)), pin_memory); @@ -92,12 +98,14 @@ at::Tensor NPUNativeFunctions::scalar_tensor(const c10::Scalar& s, c10::optional } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -at::Tensor NPUNativeFunctions::empty(c10::IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - c10::optional memory_format_opt) { +at::Tensor NPUNativeFunctions::empty( + c10::IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + c10::optional memory_format_opt) +{ #ifndef BUILD_LIBTORCH torch_npu::profiler::NPURecordFunction profiler_guard; #endif @@ -123,16 +131,13 @@ at::Tensor NPUNativeFunctions::empty(c10::IntArrayRef size, allocator, true); - auto tensor = - at::detail::make_tensor(storage_impl, dtype); + auto tensor = at::detail::make_tensor(storage_impl, dtype); // Default at::TensorImpl has size [0] - if (size.size() != 1 || size[0] != 0) - { + if (size.size() != 1 || size[0] != 0) { tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size); } - auto memory_format = - memory_format_opt.value_or(c10::MemoryFormat::Contiguous); + auto memory_format = memory_format_opt.value_or(c10::MemoryFormat::Contiguous); TORCH_CHECK( memory_format == c10::MemoryFormat::Contiguous, "Only c10::MemoryFormat::Contiguous is supported for creating a npu tensor", OPS_ERROR(ErrCode::NOT_SUPPORT)); @@ -145,7 +150,8 @@ at::Tensor NPUNativeFunctions::empty(c10::IntArrayRef size, at::Tensor empty_like_npu( const at::Tensor &self, const c10::TensorOptions &options_, - c10::optional optional_memory_format) { + c10::optional optional_memory_format) +{ TORCH_CHECK( !(options_.has_memory_format() && optional_memory_format.has_value()), "Cannot set memory_format both in TensorOptions and explicit argument; please delete " @@ -164,8 +170,7 @@ at::Tensor empty_like_npu( return result; } - auto memory_format = - options.memory_format_opt().value_or(c10::MemoryFormat::Contiguous); + auto memory_format = options.memory_format_opt().value_or(c10::MemoryFormat::Contiguous); if (self.is_quantized()) { // To support all features of 
c10::MemoryFormat::Preserve we need to add @@ -173,8 +178,7 @@ at::Tensor empty_like_npu( // at::Tensor clone(const at::Tensor& src, c10::optional // optional_memory_format) if (self.is_non_overlapping_and_dense()) -> // _empty_affine_quantized_strided - if (memory_format == c10::MemoryFormat::Preserve) - { + if (memory_format == c10::MemoryFormat::Preserve) { memory_format = self.suggest_memory_format(); } @@ -208,8 +212,7 @@ at::Tensor empty_like_npu( self.q_zero_point(), // See Note [Explicit nullopt c10::MemoryFormat argument] c10::nullopt); - } - else if (qscheme == at::kPerChannelAffine) { + } else if (qscheme == at::kPerChannelAffine) { // Copy the tensors with channels to avoid accidental overrides return at::_empty_per_channel_affine_quantized( self.sizes(), @@ -219,35 +222,31 @@ at::Tensor empty_like_npu( options.memory_format(memory_format), // See Note [Explicit nullopt c10::MemoryFormat argument] c10::nullopt); - } - else { + } else { TORCH_CHECK(false, "Unsupported qscheme: ", toString(qscheme), OPS_ERROR(ErrCode::NOT_SUPPORT)); } } - at::Tensor result; + at::Tensor result; if (memory_format == c10::MemoryFormat::Preserve && !(torch_npu::utils::is_npu(options))) { if (self.is_non_overlapping_and_dense()) { result = at::empty_strided( self.sizes(), self.strides(), options.memory_format(c10::nullopt)); - } - else { + } else { // See Note [Explicit nullopt c10::MemoryFormat argument] result = at::empty( self.sizes(), options.memory_format(self.suggest_memory_format()), c10::nullopt); } - } - else { + } else { // See Note [Explicit nullopt c10::MemoryFormat argument] if (!(torch_npu::utils::is_npu(options))) { result = at::empty( self.sizes(), options.memory_format(memory_format), c10::nullopt); - } - else { + } else { auto npu_format = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_.npu_format_; result = OpPreparation::ApplyTensorWithFormat(self.sizes(), options, npu_format); @@ -267,21 +266,24 @@ at::Tensor NPUNativeFunctions::empty_like( c10::optional layout_opt, c10::optional device_opt, c10::optional pin_memory_opt, - c10::optional optional_memory_format) { - c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) - .device(device_opt) - .layout(layout_opt) - .pinned_memory(pin_memory_opt); + c10::optional optional_memory_format) +{ + c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) + .device(device_opt) + .layout(layout_opt) + .pinned_memory(pin_memory_opt); return at_npu::native::empty_like_npu(self, options, optional_memory_format); } -at::Tensor NPUNativeFunctions::empty_with_format(c10::IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - int64_t dst_format) { +at::Tensor NPUNativeFunctions::empty_with_format( + c10::IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + int64_t dst_format) +{ #ifndef BUILD_LIBTORCH torch_npu::profiler::NPURecordFunction profiler_guard; #endif @@ -307,12 +309,10 @@ at::Tensor NPUNativeFunctions::empty_with_format(c10::IntArrayRef size, allocator->allocate(size_bytes), allocator, true); - auto tensor = - at::detail::make_tensor(storage_impl, dtype); + auto tensor = at::detail::make_tensor(storage_impl, dtype); // Default NPUTensorImpl has size [0] - if (size.size() != 1 || size[0] != 0) - { + if (size.size() != 1 || size[0] != 0) { tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size); } 
tensor.unsafeGetTensorImpl()->empty_tensor_restride(c10::MemoryFormat::Contiguous); @@ -320,13 +320,15 @@ at::Tensor NPUNativeFunctions::empty_with_format(c10::IntArrayRef size, return tensor; } -at::Tensor NPUNativeFunctions::unsafe_empty_with_format(c10::IntArrayRef size, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - int64_t dst_format, - bool keep_format) { +at::Tensor NPUNativeFunctions::unsafe_empty_with_format( + c10::IntArrayRef size, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + int64_t dst_format, + bool keep_format) +{ // This is a special interface that can adjust the memory application results. Check before use. // Some ops cannot operate directly based on ND format, such as MatMul, BatchMatMul, MaxPoolWithArgmaxV1. @@ -339,13 +341,15 @@ at::Tensor NPUNativeFunctions::unsafe_empty_with_format(c10::IntArrayRef size, return NPUNativeFunctions::empty_with_format(size, dtype_opt, layout_opt, device_opt, pin_memory_opt, dst_format); } -at::Tensor NPUNativeFunctions::empty_with_format(c10::IntArrayRef size, - c10::optional names, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt, - int64_t dst_format) { +at::Tensor NPUNativeFunctions::empty_with_format( + c10::IntArrayRef size, + c10::optional names, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt, + int64_t dst_format) +{ torch_npu::utils::torch_check_npu(c10::device_or_default(device_opt)); caffe2::TypeMeta dtype = c10::scalarTypeToTypeMeta(dtype_or_default(dtype_opt)); c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) @@ -353,21 +357,21 @@ at::Tensor NPUNativeFunctions::empty_with_format(c10::IntArrayRef size, .layout(layout_opt) .pinned_memory(pin_memory_opt); at::Tensor result = OpPreparation::ApplyTensorWithFormat(size, options, dst_format); - if (names.has_value()) - { + if (names.has_value()) { internal_set_names_inplace(result, names); } return result; } -at::Tensor empty_with_format_name_npu(c10::IntArrayRef size, - c10::optional names, - const c10::TensorOptions &options, - int64_t dst_format) { +at::Tensor empty_with_format_name_npu( + c10::IntArrayRef size, + c10::optional names, + const c10::TensorOptions &options, + int64_t dst_format) +{ at::Tensor result = OpPreparation::ApplyTensorWithFormat(size, options, dst_format); - if (names.has_value()) - { + if (names.has_value()) { internal_set_names_inplace(result, names); } @@ -380,7 +384,8 @@ at::Tensor NPUNativeFunctions::empty_strided( c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { + c10::optional pin_memory_opt) +{ check_size_nonnegative(size); c10::optional optional_memory_format = c10::nullopt; auto t = NPUNativeFunctions::empty({0}, dtype_opt, layout_opt, device_opt, pin_memory_opt, optional_memory_format); @@ -397,14 +402,16 @@ at::Tensor NPUNativeFunctions::new_empty_strided_symint( c10::optional dtype, c10::optional layout, c10::optional device, - c10::optional pin_memory) { + c10::optional pin_memory) +{ return at::native::new_empty_strided_symint(self, size, stride, dtype, layout, device, pin_memory); } at::Tensor &empty_out_npu( at::Tensor &result, c10::IntArrayRef size, - c10::optional optional_memory_format) { + c10::optional optional_memory_format) +{ // Preferably, this argument would not be accepted by _out, 
but the code // generator requires the out and non-out overloads to match exactly TORCH_CHECK( @@ -413,24 +420,25 @@ at::Tensor &empty_out_npu( check_size_nonnegative(size); if (result.is_sparse()) { result.sparse_resize_and_clear_(size, size.size(), 0); - } - else { + } else { result.resize_(size); } return result; } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ blackman_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -at::Tensor NPUNativeFunctions::blackman_window(int64_t window_length, - bool periodic, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) { - c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) - .device(device_opt) - .layout(layout_opt) - .pinned_memory(pin_memory_opt); +at::Tensor NPUNativeFunctions::blackman_window( + int64_t window_length, + bool periodic, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) +{ + c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) + .device(device_opt) + .layout(layout_opt) + .pinned_memory(pin_memory_opt); window_function_checks("blackman_window", options, window_length); if (window_length == 0) { @@ -447,11 +455,12 @@ at::Tensor NPUNativeFunctions::blackman_window(int64_t window_length, return periodic ? window.narrow(0, 0, window_length - 1) : window; } -at::Tensor NPUNativeFunctions::blackman_window(int64_t window_length, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) +at::Tensor NPUNativeFunctions::blackman_window( + int64_t window_length, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { return blackman_window(window_length, true, dtype_opt, layout_opt, device_opt, pin_memory_opt); } @@ -463,23 +472,21 @@ at::Tensor NPUNativeFunctions::bartlett_window( c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { + c10::optional pin_memory_opt) +{ c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) .device(device_opt) .layout(layout_opt) .pinned_memory(pin_memory_opt); window_function_checks("bartlett_window", options, window_length); - if (window_length == 0) - { + if (window_length == 0) { return at::empty({0}, options); } - if (window_length == 1) - { + if (window_length == 1) { return at::ones({1}, options); } - if (periodic) - { + if (periodic) { window_length += 1; } auto window = at::arange(window_length, options).mul_(2. / static_cast(window_length - 1)); @@ -488,11 +495,12 @@ at::Tensor NPUNativeFunctions::bartlett_window( return periodic ? 
window.narrow(0, 0, window_length - 1) : window; } -at::Tensor NPUNativeFunctions::bartlett_window(int64_t window_length, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) +at::Tensor NPUNativeFunctions::bartlett_window( + int64_t window_length, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { return bartlett_window(window_length, true, dtype_opt, layout_opt, device_opt, pin_memory_opt); } @@ -515,11 +523,12 @@ at::Tensor NPUNativeFunctions::hann_window( return at::hamming_window(window_length, periodic, 0.5, 0.5, options); } -at::Tensor NPUNativeFunctions::hann_window(int64_t window_length, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) +at::Tensor NPUNativeFunctions::hann_window( + int64_t window_length, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { return hann_window(window_length, true, dtype_opt, layout_opt, device_opt, pin_memory_opt); } @@ -541,16 +550,13 @@ at::Tensor NPUNativeFunctions::hamming_window( .pinned_memory(pin_memory_opt); window_function_checks("hamming_window", options, window_length); - if (window_length == 0) - { + if (window_length == 0) { return at::empty({0}, options); } - if (window_length == 1) - { + if (window_length == 1) { return at::ones({1}, options); } - if (periodic) - { + if (periodic) { window_length += 1; } auto window = at::arange(window_length, options); @@ -584,11 +590,12 @@ at::Tensor NPUNativeFunctions::hamming_window( return hamming_window(window_length, periodic, 0.54, dtype_opt, layout_opt, device_opt, pin_memory_opt); } -at::Tensor NPUNativeFunctions::hamming_window(int64_t window_length, - c10::optional dtype_opt, - c10::optional layout_opt, - c10::optional device_opt, - c10::optional pin_memory_opt) +at::Tensor NPUNativeFunctions::hamming_window( + int64_t window_length, + c10::optional dtype_opt, + c10::optional layout_opt, + c10::optional device_opt, + c10::optional pin_memory_opt) { return hamming_window(window_length, true, dtype_opt, layout_opt, device_opt, pin_memory_opt); } @@ -615,21 +622,20 @@ at::Tensor tensor_backend_npu(c10::ArrayRef values, const c10::TensorOptions #define TENSOR(T, _1) \ at::Tensor tensor_npu(c10::ArrayRef values, const c10::TensorOptions &options) \ { \ - if (options.device().type() != c10::DeviceType::PrivateUse1) \ - { \ - return tensor_backend_npu(values, options); \ - } \ - else \ - { \ - return tensor_npu(values, options); \ - } \ -} - AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR) + if (options.device().type() != c10::DeviceType::PrivateUse1) { \ + return tensor_backend_npu(values, options); \ + } else { \ + return tensor_npu(values, options); \ + } \ +} +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR) #undef TENSOR // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ clone ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -at::Tensor NPUNativeFunctions::clone(const at::Tensor &src, - c10::optional format) { +at::Tensor NPUNativeFunctions::clone( + const at::Tensor &src, + c10::optional format) +{ c10_npu::NPUGuard guard(src.device()); OptimizationCases opt_cases{"reshape", "slice"}; if (TransContiguous::CanOptimize(src, opt_cases)) { @@ -655,11 +661,12 @@ at::Tensor NPUNativeFunctions::full( c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { - c10::TensorOptions options = 
c10::TensorOptions().dtype(dtype_opt) - .device(device_opt) - .layout(layout_opt) - .pinned_memory(pin_memory_opt); + c10::optional pin_memory_opt) +{ + c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) + .device(device_opt) + .layout(layout_opt) + .pinned_memory(pin_memory_opt); TORCH_CHECK( options.layout() != at::kSparse, "full(...) is not implemented for sparse layout", OPS_ERROR(ErrCode::TYPE)); @@ -685,7 +692,8 @@ at::Tensor NPUNativeFunctions::tril_indices( c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { + c10::optional pin_memory_opt) +{ c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) .device(device_opt) .layout(layout_opt) @@ -738,48 +746,52 @@ at::Tensor NPUNativeFunctions::triu_indices( c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { - c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) - .device(device_opt) - .layout(layout_opt) - .pinned_memory(pin_memory_opt); - check_args(row, col, options); - - auto triu_size = row * col - get_tril_size(row, col, offset - 1); - - // create an empty Tensor with correct size - auto result = at::empty({2 * triu_size}, options); - - // fill the Tensor with correct values - int64_t i = 0; - // not typing std::max with scalar_t as it could be an unsigned type - // NOTE: no need to check if the returned value of std::max overflows - // scalar_t, as i and triu_size act as a guard. - int64_t c = std::max(0, offset); - int64_t r = 0; - while (i < triu_size) { - result[i] = r; - result[triu_size + i++] = c; - - // move to the next column and check if (r, c) is still in bound - c += 1; - if (c >= col) { - r += 1; - // not typing std::max with scalar_t as it could be an unsigned type - // NOTE: not necessary to check if c is less than col or overflows here, - // because i and triu_size act as a guard. - c = std::max(0, r + offset); + c10::optional pin_memory_opt) +{ + c10::TensorOptions options = c10::TensorOptions().dtype(dtype_opt) + .device(device_opt) + .layout(layout_opt) + .pinned_memory(pin_memory_opt); + check_args(row, col, options); + + auto triu_size = row * col - get_tril_size(row, col, offset - 1); + + // create an empty Tensor with correct size + auto result = at::empty({2 * triu_size}, options); + + // fill the Tensor with correct values + int64_t i = 0; + // not typing std::max with scalar_t as it could be an unsigned type + // NOTE: no need to check if the returned value of std::max overflows + // scalar_t, as i and triu_size act as a guard. + int64_t c = std::max(0, offset); + int64_t r = 0; + while (i < triu_size) { + result[i] = r; + result[triu_size + i++] = c; + + // move to the next column and check if (r, c) is still in bound + c += 1; + if (c >= col) { + r += 1; + // not typing std::max with scalar_t as it could be an unsigned type + // NOTE: not necessary to check if c is less than col or overflows here, + // because i and triu_size act as a guard. 
+ c = std::max(0, r + offset); + } } - } - return result.reshape({2, triu_size}); + return result.reshape({2, triu_size}); } -at::Tensor NPUNativeFunctions::isnan(const at::Tensor& self) { - return at::native::isnan(self); +at::Tensor NPUNativeFunctions::isnan(const at::Tensor& self) +{ + return at::native::isnan(self); } -at::Tensor NPUNativeFunctions::unfold(const at::Tensor& self, int64_t dimension, int64_t size, int64_t step) { - return at::native::unfold(self, dimension, size, step); +at::Tensor NPUNativeFunctions::unfold(const at::Tensor& self, int64_t dimension, int64_t size, int64_t step) +{ + return at::native::unfold(self, dimension, size, step); } + } // namespace native } // namespace at_npu -- Gitee From dc2af64fcd98995fc686044b3436f8e2054c19c0 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Mon, 10 Mar 2025 02:59:52 +0000 Subject: [PATCH 118/358] !18625 Alarm for the deprecating API Merge pull request !18625 from yuhaiyan/v2.6.0-dev1 --- ci/access_control/constants.py | 1 + ...t_npu_fused_attention_layernorm_qkv_fwd.py | 73 ------ test/custom_ops/test_npu_scatter.py | 49 ---- test/test_autocast.py | 225 ++++++++++-------- .../.pytorch-disabled-tests.json | 5 +- torch_npu/contrib/function/fused_attention.py | 5 + .../csrc/core/npu/register/OptionsManager.cpp | 4 - .../csrc/core/npu/register/OptionsManager.h | 6 - 8 files changed, 131 insertions(+), 237 deletions(-) delete mode 100644 test/custom_ops/test_npu_fused_attention_layernorm_qkv_fwd.py delete mode 100644 test/custom_ops/test_npu_scatter.py diff --git a/ci/access_control/constants.py b/ci/access_control/constants.py index 6ada096932..d8a8de48f8 100644 --- a/ci/access_control/constants.py +++ b/ci/access_control/constants.py @@ -14,6 +14,7 @@ SLOW_TEST_BLOCKLIST = [ 'test_reductions', 'test_unary_ufuncs', 'test_ops_jit', + 'test_jit_fuser_te.py', "onnx/test_op_consistency", 'onnx/test_pytorch_onnx_onnxruntime' ] diff --git a/test/custom_ops/test_npu_fused_attention_layernorm_qkv_fwd.py b/test/custom_ops/test_npu_fused_attention_layernorm_qkv_fwd.py deleted file mode 100644 index 937d19094f..0000000000 --- a/test/custom_ops/test_npu_fused_attention_layernorm_qkv_fwd.py +++ /dev/null @@ -1,73 +0,0 @@ -# -import unittest - -import numpy as np -import torch - -import torch_npu -from torch_npu.testing.testcase import TestCase, run_tests -from torch_npu.testing.common_utils import create_common_tensor - - -class TestFusedAttentionQKV(TestCase): - - def confusion_transpose(self, x, new_shape): - return torch_npu.npu_format_cast(x.view(new_shape).permute(0, 2, 1, 3), 29) - - def supported_op_exec(self, ln_input, q_kernel, k_kernel, v_kernel, gamma, beta, q_bias, k_bias, v_bias): - q_kernel = torch_npu.npu_format_cast(q_kernel.t().contiguous(), 29) - k_kernel = torch_npu.npu_format_cast(k_kernel.t().contiguous(), 29) - v_kernel = torch_npu.npu_format_cast(v_kernel.t().contiguous(), 29) - - norm_shape = (1024,) - norm, mean, _ = torch.native_layer_norm(ln_input, norm_shape, gamma, beta, eps=1e-05) - variance = torch.var(ln_input, -1, keepdim=False, unbiased=False) - q_layer = self.confusion_transpose(torch.nn.functional.linear(norm, q_kernel, q_bias), (24, 512, 16, 64)) - k_layer = self.confusion_transpose(torch.nn.functional.linear(norm, k_kernel, k_bias), (24, 512, 16, 64)) - v_layer = self.confusion_transpose(torch.nn.functional.linear(norm, v_kernel, v_bias), (24, 512, 16, 64)) - return norm.cpu(), mean.cpu(), variance.cpu(), q_layer.cpu(), k_layer.cpu(), v_layer.cpu() - - def custom_op_exec(self, hidden_states, q_kernel, 
k_kernel, v_kernel, gamma, beta, q_bias, k_bias, v_bias): - hidden_states = torch_npu.npu_format_cast(hidden_states, 29) - q_kernel = torch_npu.npu_format_cast(q_kernel, 29) - k_kernel = torch_npu.npu_format_cast(k_kernel, 29) - v_kernel = torch_npu.npu_format_cast(v_kernel, 29) - gamma = torch_npu.npu_format_cast(gamma, 1) - beta = torch_npu.npu_format_cast(beta, 1) - - seq_len = 512 - num_heads = 16 - norm, q_layer, k_layer, v_layer, mean, variance = torch_npu.npu_fused_attention_layernorm_qkv_fwd( - hidden_states, q_kernel, k_kernel, v_kernel, gamma, beta, q_bias, k_bias, v_bias, seq_len, num_heads) - return norm.cpu(), mean.cpu(), variance.cpu(), q_layer.cpu(), k_layer.cpu(), v_layer.cpu() - - @unittest.skip("skipped this case") - def test_npu_fused_attention_layernorm_qkv_fwd(self, device="npu"): - ln_input = torch.rand(12288, 1024).uniform_(-6, 6).half().npu() - q_weight = torch.rand(1024, 1024).uniform_(-0.1, 0.1).half().npu() - k_weight = torch.rand(1024, 1024).uniform_(-0.1, 0.1).half().npu() - v_weight = torch.rand(1024, 1024).uniform_(-0.1, 0.1).half().npu() - gamma = torch.rand(1024).half().npu() - beta = torch.rand(1024).half().npu() - q_bias = torch.rand(1024).half().npu() - k_bias = torch.rand(1024).half().npu() - v_bias = torch.rand(1024).half().npu() - - supported_norm, supported_mean, supported_variance, \ - supported_q, supported_k, supported_v = self.supported_op_exec( - ln_input, q_weight, k_weight, v_weight, gamma, beta, q_bias, k_bias, v_bias) - - custom_norm, custom_mean, custom_variance, \ - custom_q, custom_k, custom_v = self.custom_op_exec( - ln_input, q_weight, k_weight, v_weight, gamma, beta, q_bias, k_bias, v_bias) - - self.assertRtolEqual(supported_norm, custom_norm) - self.assertRtolEqual(supported_mean, custom_mean) - self.assertRtolEqual(supported_variance, custom_variance) - self.assertRtolEqual(supported_q, custom_q, prec16=0.003) - self.assertRtolEqual(supported_k, custom_k, prec16=0.003) - self.assertRtolEqual(supported_v, custom_v, prec16=0.003) - - -if __name__ == '__main__': - run_tests() diff --git a/test/custom_ops/test_npu_scatter.py b/test/custom_ops/test_npu_scatter.py deleted file mode 100644 index 5f1ab02c36..0000000000 --- a/test/custom_ops/test_npu_scatter.py +++ /dev/null @@ -1,49 +0,0 @@ -import torch - -import torch_npu -from torch_npu.testing.testcase import TestCase, run_tests - - -class TestNpuScatter(TestCase): - def supported_op_exec(self, input1, indices, updates, dim): - tmp = input1.reshape(-1) - shape = input1.shape - dim_len = shape[dim] - - for i in range(indices.numel()): - tmp[i * dim_len + indices[i]] = updates[i] - - output = tmp.reshape(shape).to('cpu') - output = output.numpy() - return output - - def npu_op_exec(self, input1, indices, updates, dim): - output = torch_npu.npu_scatter(input1, indices, updates, dim) - output = output.to("cpu") - output = output.numpy() - return output - - def test_npu_scatter(self, device="npu"): - input1_list = [[[1.6279, 0.1226], [0.9041, 1.0980]]] - indices_list = [[0, 1]] - updates_list = [[-1.1993, -1.5247]] - dim_list = [0] - exoutput_list = [[[-1.1993, 0.1226], [0.9041, -1.5247]]] - - shape_format = [[i, j, k, h, f] for i in input1_list - for j in indices_list for k in updates_list for h in dim_list for f in exoutput_list] - - for item in shape_format: - input1_tensor = torch.tensor(item[0]).npu() - indices_tensor = torch.tensor(item[1]).npu().to(torch.int32) - updates_tensor = torch.tensor(item[2]).npu() - dim = item[3] - exoutput_tensor = torch.tensor(item[4]) - output1 = 
self.npu_op_exec(input1_tensor, indices_tensor, updates_tensor, dim) - output2 = self.supported_op_exec(input1_tensor, indices_tensor, updates_tensor, dim) - self.assertRtolEqual(exoutput_tensor.numpy(), output1) - self.assertRtolEqual(output1, output2) - - -if __name__ == "__main__": - run_tests() diff --git a/test/test_autocast.py b/test/test_autocast.py index cde04eb66e..f27a0be21b 100644 --- a/test/test_autocast.py +++ b/test/test_autocast.py @@ -4,14 +4,22 @@ import collections import unittest import torch +from torch.testing._internal.autocast_test_lists import ( + AutocastCPUTestLists, + TestAutocast, +) +from torch.testing._internal.common_utils import ( + IS_WINDOWS, + run_tests, + skipIfTorchDynamo, + TestCase, +) import torch_npu import torch_npu.testing -from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS -from torch.testing._internal.autocast_test_lists import AutocastCPUTestLists from torch.utils._python_dispatch import TorchDispatchMode -class TestAutocastCPU(TestCase): +class TestAutocastCPU(TestAutocast): def setUp(self): super().setUp() self.autocast_lists = AutocastCPUTestLists(torch.device('cpu')) @@ -20,145 +28,120 @@ class TestAutocastCPU(TestCase): del self.autocast_lists super().tearDown() - def _run_autocast_outofplace( - self, - op, - args, - run_as_type, - out_type=None, - module=torch, - add_kwargs=None, - amp_dtype=torch.bfloat16, - ): - # helper to cast args - def cast(val, to_type): - if isinstance(val, torch.Tensor): - return val.to(to_type) if val.is_floating_point() else val - elif isinstance(val, collections.abc.Iterable): - return type(val)(cast(v, to_type) for v in val) - else: - return val - - if add_kwargs is None: - add_kwargs = {} - - self.assertFalse(torch.is_autocast_cpu_enabled()) - with torch.cpu.amp.autocast(dtype=amp_dtype): - self.assertTrue(torch.is_autocast_cpu_enabled()) - out_type = out_type if out_type is not None else run_as_type - output = output_method = None - - # Try module.* variant, if requested: - if module is not None and hasattr(module, op): - output = getattr(module, op)(*args, **add_kwargs) - if isinstance(output, torch.Tensor): - self.assertTrue(out_type == output.dtype, - f"autocast for torch.{op} produced {output.dtype}, should produce {out_type}") - # Try Tensor.* variant: - if hasattr(torch.Tensor, op): - output_method = getattr(args[0], op)(*args[1:], **add_kwargs) - if isinstance(output_method, torch.Tensor): - self.assertTrue(out_type == output_method.dtype, - "autocast for torch.{} produced {}, should produce torch.{}" - .format(op, output_method.dtype, out_type)) - - self.assertTrue((output is not None) or (output_method is not None), - f"{op} not found as an attribute on either Tensor or the requested module {module}") - - # Accounts for ops that return Tensors, iterables, and other non-Tensors. - # For example, lstm_cell returns a tuple and equal returns bool. 
- def compare(first, second): - if isinstance(first, torch.Tensor): - return torch.equal(first, second) - elif isinstance(first, collections.abc.Iterable): - return all(compare(f, s) for f, s in zip(first, second)) - else: - return first == second - - # If both torch.* and Tensor.* variants were found, check outputs are identical - if (output is not None) and (output_method is not None): - self.assertTrue(type(output) == type(output_method)) - comparison = compare(output, output_method) - self.assertTrue(comparison, f"torch.{op} result did not match Tensor.{op} result") - - # Compare numerics to Python-side "autocasting" that (we expect) does the same thing - # as the C++-side autocasting, and should be bitwise accurate. - output_to_compare = output if output is not None else output_method - with torch.cpu.amp.autocast(enabled=False): - self.assertFalse(torch.is_autocast_cpu_enabled()) - - if module is not None and hasattr(module, op): - control = getattr(module, op)(*cast(args, run_as_type), **add_kwargs) - else: - control = getattr(args[0].to(run_as_type), op)(*cast(args[1:], run_as_type), **add_kwargs) - self.assertTrue(type(output_to_compare) == type(control)) - comparison = compare(output_to_compare, control) - self.assertTrue(comparison, f"torch.{op} result did not match control") - self.assertTrue(torch.is_autocast_cpu_enabled()) - self.assertFalse(torch.is_autocast_cpu_enabled()) - - def args_maybe_kwargs(self, op_with_args): - if len(op_with_args) == 2: - return op_with_args[0], op_with_args[1], {} - else: - return op_with_args[0], op_with_args[1], op_with_args[2] - + @skipIfTorchDynamo() def test_autocast_torch_expect_builtin_promote(self): for op, args1, args2, out_type in self.autocast_lists.torch_expect_builtin_promote: - self._run_autocast_outofplace(op, args1, torch.float32, out_type=out_type) - self._run_autocast_outofplace(op, args2, torch.float32, out_type=out_type, amp_dtype=torch.float16) + self._run_autocast_outofplace( + op, args1, torch.float32, device="cpu", out_type=out_type + ) + self._run_autocast_outofplace( + op, + args2, + torch.float32, + device="cpu", + out_type=out_type, + amp_dtype=torch.float16, + ) + @skipIfTorchDynamo() def test_autocast_methods_expect_builtin_promote(self): for op, args1, args2, out_type in self.autocast_lists.methods_expect_builtin_promote: - self._run_autocast_outofplace(op, args1, torch.float32, module=None, out_type=out_type) - self._run_autocast_outofplace(op, args2, torch.float32, module=None, out_type=out_type, amp_dtype=torch.float16) + self._run_autocast_outofplace( + op, args1, torch.float32, device="cpu", module=None, out_type=out_type + ) + self._run_autocast_outofplace( + op, + args2, + torch.float32, + device="cpu", + module=None, + out_type=out_type, + amp_dtype=torch.float16, + ) + @skipIfTorchDynamo() def test_autocast_torch_16(self): for op_with_args in self.autocast_lists.torch_16: op, args, maybe_kwargs = self.args_maybe_kwargs(op_with_args) - self._run_autocast_outofplace(op, args, torch.bfloat16, add_kwargs=maybe_kwargs) - self._run_autocast_outofplace(op, args, torch.float16, add_kwargs=maybe_kwargs, amp_dtype=torch.float16) + self._run_autocast_outofplace( + op, args, torch.bfloat16, device="cpu", add_kwargs=maybe_kwargs + ) + self._run_autocast_outofplace( + op, + args, + torch.float16, + device="cpu", + add_kwargs=maybe_kwargs, + amp_dtype=torch.float16, + ) + @skipIfTorchDynamo() def test_autocast_nn_16(self): for op_with_args in self.autocast_lists.nn_16: op, args, maybe_kwargs = 
self.args_maybe_kwargs(op_with_args) self._run_autocast_outofplace( - op, args, torch.bfloat16, module=torch._C._nn, add_kwargs=maybe_kwargs + op, + args, + torch.bfloat16, + device="cpu", + module=torch._C._nn, + add_kwargs=maybe_kwargs, ) self._run_autocast_outofplace( op, args, torch.float16, + device="cpu", module=torch._C._nn, add_kwargs=maybe_kwargs, amp_dtype=torch.float16, ) + @skipIfTorchDynamo() def test_autocast_torch_fp32(self): for op_with_args in self.autocast_lists.torch_fp32: op, args, maybe_kwargs = self.args_maybe_kwargs(op_with_args) - self._run_autocast_outofplace(op, args, torch.float32, add_kwargs=maybe_kwargs) - self._run_autocast_outofplace(op, args, torch.float32, add_kwargs=maybe_kwargs, amp_dtype=torch.float16) + self._run_autocast_outofplace( + op, args, torch.float32, device="cpu", add_kwargs=maybe_kwargs + ) + self._run_autocast_outofplace( + op, + args, + torch.float32, + device="cpu", + add_kwargs=maybe_kwargs, + amp_dtype=torch.float16, + ) + @skipIfTorchDynamo() def test_autocast_nn_fp32(self): for op_with_args in self.autocast_lists.nn_fp32: op, args, maybe_kwargs = self.args_maybe_kwargs(op_with_args) self._run_autocast_outofplace( - op, args, torch.float32, module=torch._C._nn, add_kwargs=maybe_kwargs + op, + args, + torch.float32, + device="cpu", + module=torch._C._nn, + add_kwargs=maybe_kwargs, ) self._run_autocast_outofplace( op, args, torch.float32, + device="cpu", module=torch._C._nn, add_kwargs=maybe_kwargs, amp_dtype=torch.float16, ) + @skipIfTorchDynamo() def test_autocast_torch_need_autocast_promote(self): for op, args1, args2 in self.autocast_lists.torch_need_autocast_promote: - self._run_autocast_outofplace(op, args1, torch.float32) - self._run_autocast_outofplace(op, args2, torch.float32, amp_dtype=torch.float16) + self._run_autocast_outofplace(op, args1, torch.float32, device="cpu") + self._run_autocast_outofplace( + op, args2, torch.float32, device="cpu", amp_dtype=torch.float16 + ) @unittest.skipIf(IS_WINDOWS, "Limit support for bf16 path") def test_autocast_rnn(self): @@ -174,13 +157,22 @@ class TestAutocastCPU(TestCase): m(x, (hx, cx)) # Should be able to run the below case with autocast - with torch.cpu.amp.autocast(): + with torch.amp.autocast(device_type="cpu"): m(x, (hx, cx)) def test_autocast_disabled_with_fp32_dtype(self): with torch.autocast(device_type='cpu', dtype=torch.float32, enabled=False): _ = torch.ones(10) + def test_generic_autocast(self): + for op_with_args in self.autocast_lists.torch_16: + op, args, maybe_kwargs = self.args_maybe_kwargs(op_with_args) + with torch.amp.autocast(device_type="cpu"): + generic_autocast_output = getattr(torch, op)(*args, **maybe_kwargs) + with torch.amp.autocast(device_type="cpu"): + cpu_autocast_output = getattr(torch, op)(*args, **maybe_kwargs) + self.assertEqual(generic_autocast_output, cpu_autocast_output) + class CustomLinear(torch.autograd.Function): @staticmethod @@ -265,6 +257,22 @@ class TestAutocastNPU(TestCase): finally: torch._C._set_cached_tensors_enabled(False) + def test_autocast_prioritize(self): + device = "npu" + dtype = torch.bfloat16 + + with torch.autocast(device_type=device, enabled=True, dtype=dtype): + t = torch.randn([3, 4, 5], dtype=dtype, device=device, requires_grad=True) + index = torch.randint( + low=0, high=3, size=[3, 4, 5], dtype=torch.int64, device=device + ) + val = torch.randn(1, dtype=dtype, device=device) + + res = torch.index_put(t, [index], val) + + loss = res.mean() + loss.backward() + def test_set_autocast_dtype(self): 
torch_npu.npu.set_autocast_dtype(torch.float16) self.assertTrue(torch_npu.npu.get_autocast_dtype(), torch.float16) @@ -282,17 +290,26 @@ class TestAutocastNPU(TestCase): class TestTorchAutocast(TestCase): def test_autocast_fast_dtype(self): - gpu_fast_dtype = torch.get_autocast_gpu_dtype() - cpu_fast_dtype = torch.get_autocast_cpu_dtype() - self.assertEqual(gpu_fast_dtype, torch.half) + npu_fast_dtype = torch.get_autocast_dtype(device_type="privateuseone") + cpu_fast_dtype = torch.get_autocast_dtype(device_type="cpu") + self.assertEqual(npu_fast_dtype, torch.half) self.assertEqual(cpu_fast_dtype, torch.bfloat16) def test_invalid_device(self): - dev = 'not a real device' - msg = f'unsupported autocast device_type \'{dev}\'' + dev = "not a real device" + msg = f"Invalid device string: '{dev}'" with self.assertRaisesRegex(RuntimeError, msg): with torch.autocast(device_type=dev): _ = torch.tensor(1) + with self.assertRaisesRegex(RuntimeError, msg): + assert torch.amp.is_autocast_available(device_type=dev) + + def test_non_string_device(self): + """Test that `autocast` throws a ValueError when provided a `torch.device` object for `device_type` instead of a string""" + dev = torch.device("cpu") + msg = f"Expected `device_type` of type `str`, got: `{type(dev)}`" + with self.assertRaisesRegex(expected_exception=ValueError, expected_regex=msg): + torch.autocast(device_type=dev) if __name__ == '__main__': diff --git a/test/unsupported_test_cases/.pytorch-disabled-tests.json b/test/unsupported_test_cases/.pytorch-disabled-tests.json index e5d41870b6..ba602892c4 100644 --- a/test/unsupported_test_cases/.pytorch-disabled-tests.json +++ b/test/unsupported_test_cases/.pytorch-disabled-tests.json @@ -31216,5 +31216,8 @@ "test_unary_op_out_casting_npu_float64_complex128 (__main__.TestTypePromotionPRIVATEUSE1)": ["", ["910A"]], "test_unary_op_out_casting_npu_float64_complex64 (__main__.TestTypePromotionPRIVATEUSE1)": ["", ["910A"]], "test_unary_op_out_casting_npu_int64_complex128 (__main__.TestTypePromotionPRIVATEUSE1)": ["", ["910A"]], - "test_unary_op_out_casting_npu_int64_complex64 (__main__.TestTypePromotionPRIVATEUSE1)": ["", ["910A"]] + "test_unary_op_out_casting_npu_int64_complex64 (__main__.TestTypePromotionPRIVATEUSE1)": ["", ["910A"]], + "test_unsupported_dtypes (__main__.TestTEFuserDynamic)": ["", [""]], + "test_unsupported_dtypes (__main__.TestTEFuserStatic)": ["", [""]], + "test_autocast_fast_dtype (__main__.TestTorchAutocast)": ["", [""]] } diff --git a/torch_npu/contrib/function/fused_attention.py b/torch_npu/contrib/function/fused_attention.py index 32d14ef1a2..1e2bd2b65c 100644 --- a/torch_npu/contrib/function/fused_attention.py +++ b/torch_npu/contrib/function/fused_attention.py @@ -1,4 +1,6 @@ import functools +import warnings + import torch import torch_npu from torch_npu.utils._error_code import ErrCode, ops_error @@ -86,6 +88,9 @@ class _FusedAttentionWithLayerNorm(torch.autograd.Function): beta, scale=1, keep_prob=0): + warnings.warn("torch_npu.contrib.npu_fused_attention_with_layernorm is deprecated and " + "will be removed in future version. 
Use torch_npu.npu_fusion_attention and " + "torch.nn.LayerNorm instead.", FutureWarning) _check_compatibility_once(hidden_states, attention_mask, query_kernel, key_kernel, value_kernel, query_bias, key_bias, value_bias, gamma, beta) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 9160857a47..f1e3918ffc 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -581,10 +581,6 @@ void OptionsManager::IsOomSnapshotEnable() #ifndef BUILD_LIBTORCH char* env_val = std::getenv("OOM_SNAPSHOT_ENABLE"); int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 0; - std::unordered_map OOMSnapshotEnableMode = getOOMSnapshotEnableMode(); - if (OOMSnapshotEnableMode.find(envFlag) == OOMSnapshotEnableMode.end()) { - TORCH_CHECK(false, "OOM_SNAPSHOT_ENABLE should be 0, 1 or 2", PTA_ERROR(ErrCode::VALUE)); - } switch (envFlag) { case 0: break; diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 56ffcc0e63..3f1d33224d 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -89,12 +89,6 @@ static std::unordered_map getAclOpInitMode() return aclOpInitMode; } -static std::unordered_map getOOMSnapshotEnableMode() -{ - std::unordered_map OOMSnapshotEnableMode = {{0, "close"}, {1, "all"}, {2, "state"}}; - return OOMSnapshotEnableMode; -} - class OptionsManager { public: static bool IsHcclZeroCopyEnable(); -- Gitee From 837063618741978d9311f864e70471cba0c9f1fe Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Mon, 10 Mar 2025 03:05:01 +0000 Subject: [PATCH 119/358] !18478 [v2.6.0][Build] Allow users custom make parallelism Merge pull request !18478 from Yuanhao Ji/v260/build/parallelsim --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c1ece00691..4e3e451e47 100644 --- a/setup.py +++ b/setup.py @@ -374,7 +374,8 @@ class CPPLibBuild(build_clib, object): if os.getenv('_ABI_VERSION') is not None: cmake_args.append('-DABI_VERSION=' + os.getenv('_ABI_VERSION')) - build_args = ['-j', str(multiprocessing.cpu_count())] + max_jobs = os.getenv("MAX_JOBS", str(multiprocessing.cpu_count())) + build_args = ['-j', max_jobs] subprocess.check_call([self.cmake, BASE_DIR] + cmake_args, cwd=build_type_dir, env=os.environ) for base_dir, dirs, files in os.walk(build_type_dir): -- Gitee From fefc087f206359e84e2736f33c14a18dc8255549 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 10 Mar 2025 03:30:29 +0000 Subject: [PATCH 120/358] !18742 Update op_plugin commit id Merge pull request !18742 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5debaaaf2a..303f505ce5 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5debaaaf2a93ebeb690f157f9a28c2e2fda1119b +Subproject commit 303f505ce5b8094d314b5986f87f723a9210a86b -- Gitee From 6e7b49be1cf3c95a097b90e02baa5c8518da8d6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Mon, 10 Mar 2025 06:35:18 +0000 Subject: [PATCH 121/358] =?UTF-8?q?!18749=20Fixed=20Failure=20to=20save=20?= =?UTF-8?q?data=20on=20oom=20devices=20except=20device=200=20Merge=20pull?= =?UTF-8?q?=20request=20!18749=20from=20=E6=9D=9C=E9=87=91=E8=88=AA/v2.6.0?= =?UTF-8?q?-clean?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 355d9e8df9..bfab266b9e 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -930,7 +930,7 @@ class DeviceCachingAllocator { void attachOutOfMemoryObserver(OutOfMemoryObserver observer) { - oom_observers_.emplace_back(std::move(observer)); + oom_observers_.emplace_back(observer); } bool checkUceInMemPool() @@ -2545,7 +2545,7 @@ class NpuCachingAllocator : public NPUAllocator { void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override { for (auto& allocator : device_allocator) { - allocator->attachOutOfMemoryObserver(std::move(observer)); + allocator->attachOutOfMemoryObserver(observer); } } -- Gitee From ab85b4f9c82bd18e4d12b06bc60e70bde23d5817 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Mon, 10 Mar 2025 08:16:23 +0000 Subject: [PATCH 122/358] !18756 Fixed a value in the slow_test_blocklist. Merge pull request !18756 from yuhaiyan/v2.6.0-dev1 --- ci/access_control/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/access_control/constants.py b/ci/access_control/constants.py index d8a8de48f8..7f3b303dd8 100644 --- a/ci/access_control/constants.py +++ b/ci/access_control/constants.py @@ -14,7 +14,7 @@ SLOW_TEST_BLOCKLIST = [ 'test_reductions', 'test_unary_ufuncs', 'test_ops_jit', - 'test_jit_fuser_te.py', + 'test_jit_fuser_te', "onnx/test_op_consistency", 'onnx/test_pytorch_onnx_onnxruntime' ] -- Gitee From 4d18be8243a430f76a1702dd239d579e3e784dba Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 10 Mar 2025 11:37:26 +0000 Subject: [PATCH 123/358] !18686 add check for batch_isend_irecv when tensor is not npu Merge pull request !18686 from huangyunlong/2.6bsr --- torch_npu/distributed/distributed_c10d.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch_npu/distributed/distributed_c10d.py b/torch_npu/distributed/distributed_c10d.py index a2ce44841f..af2bce03fd 100644 --- a/torch_npu/distributed/distributed_c10d.py +++ b/torch_npu/distributed/distributed_c10d.py @@ -11,6 +11,8 @@ from torch.distributed.distributed_c10d import _get_default_group, get_group_ran get_backend, GatherOptions, _update_default_pg, _world, _unregister_all_process_groups, _pg_map, \ ProcessGroup, default_pg_timeout, ReduceScatterOptions, _unregister_process_group +from torch_npu.utils._error_code import ErrCode, dist_error + __all__ = ["is_hccl_available", "reinit_process_group"] @@ -32,6 +34,9 @@ def _batch_isend_irecv(p2p_op_list): tensors = [] remote_rank_list = [] for p2p_op in p2p_op_list: + if p2p_op.tensor.device.type != "npu": + deviceType = p2p_op.tensor.device.type + raise RuntimeError(f"No backend type associated with device type {deviceType}" + dist_error(ErrCode.PARAM)) op_type.append(p2p_op.op.__name__) tensors.append(p2p_op.tensor) rank_for_op = get_group_rank(group, p2p_op.peer) if is_multi_pg else p2p_op.peer -- Gitee From 92894e2f3ef9040620a9bcebc9e4b275b368d1f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=98=89=E8=AF=9A?= <12731429+wang-pierre-jiacheng@user.noreply.gitee.com> Date: Mon, 10 Mar 2025 12:50:39 +0000 Subject: [PATCH 124/358] =?UTF-8?q?!18589=20set=20hccl=20comm=20name=20Mer?= =?UTF-8?q?ge=20pull=20request=20!18589=20from=20=E7=8E=8B=E5=98=89?= 
=?UTF-8?q?=E8=AF=9A/v2.6.0=5Fhccl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/distributed/ProcessGroupHCCL.cpp | 80 ++++++++++++------- .../csrc/distributed/ProcessGroupHCCL.hpp | 13 +++ 2 files changed, 62 insertions(+), 31 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 7224e084d1..e68ff7d2a2 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -249,16 +249,37 @@ std::string getExceptionMsgFromExceptionPtr(const std::exception_ptr& exceptionP } } -void getHcclCommConfig(HcclCommConfig* config) +bool getDeterministicState() { - HcclCommConfigInit(config); - config->hcclBufferSize = c10_npu::option::OptionsManager::GetHcclBufferSize(); + static bool cachedDeterministicState = []() { + // The env variable has a higher priority. + const char* envValue = std::getenv("HCCL_DETERMINISTIC"); + if (envValue != nullptr) { + std::string valueStr(envValue); + std::transform(valueStr.begin(), valueStr.end(), valueStr.begin(), ::tolower); + if (valueStr == "true") { + return true; + } + } + + return at::globalContext().deterministicAlgorithms(); + }(); + + return cachedDeterministicState; } -void getP2PHcclCommCofig(HcclCommConfig* config) +void getHcclCommConfig(HcclCommConfig* config, bool isP2P = false) { HcclCommConfigInit(config); - config->hcclBufferSize = c10_npu::option::OptionsManager::GetP2PBufferSize(); + if (!isP2P) { + config->hcclBufferSize = c10_npu::option::OptionsManager::GetHcclBufferSize(); + } else { + config->hcclBufferSize = c10_npu::option::OptionsManager::GetP2PBufferSize(); + } + + // Temporarily adding this logic to set deterministic states to avoid a known issues within HCCL. + config->hcclDeterministic = getDeterministicState() ? 
1 : 0; + // Compatible with the size check of the old version of HCCL, forcibly convert // the config object to a size_t=32 object, and retain the N ± 2 version if (!isHcclFeatureSupported(HcclCommConfigCapability::HCCL_COMM_CONFIG_COMM_NAME)) { @@ -773,6 +794,8 @@ ProcessGroupHCCL::ProcessGroupHCCL( traceKeyEnd_("HCCL_" + std::to_string(rank) + "_trace_end"), terminateProcessGroup_(false) { + std::string groupName = "group_name_" + options->group_id; + this->setGroupName(groupName); int32_t hccl_event_timeout = c10_npu::option::OptionsManager::GetHCCLEventTimeout(); int32_t hccl_exec_timeout = c10_npu::option::OptionsManager::GetHCCLExecTimeout(); if (hccl_event_timeout > 0) { @@ -1500,18 +1523,14 @@ void ProcessGroupHCCL::createHCCLComm( checkHcclCommConfigValid(commConfig); hcclComms[i] = HCCLComm::create_config(numRanks, rank, hcclID, commConfig); } else { - if (!options_->hccl_config.empty()) { - config = createHcclCommConfigWithOptions(); - hcclComms[i] = HCCLComm::create_config(numRanks, rank, hcclID, &config); - } else { - hcclComms[i] = HCCLComm::create(numRanks, rank, hcclID); - } + config = createHcclCommConfigWithOptions(); + hcclComms[i] = HCCLComm::create_config(numRanks, rank, hcclID, &config); } break; case HcclCommType::P2P: // P2P not support set hcclCommName numRanks = 2; rank = p2pRank; - getP2PHcclCommCofig(&config); + getHcclCommConfig(&config, true); hcclComms[i] = HCCLComm::create_config(numRanks, rank, hcclID, &config); break; default: @@ -1677,6 +1696,8 @@ void ProcessGroupHCCL::createHCCLCommForZeroCopy( DIST_ERROR(ErrCode::INTERNAL)); } } + // This HCCL comm is only created for zero copy and will not be used for HCCL operators. + // So there is no need to pass HCCL comm config and specify HCCL comm name. hcclComms[0] = HCCLComm::create(std::stoi(envMap["local_world_size"]), std::stoi(envMap["local_rank"]), hcclID); return; } @@ -2085,20 +2106,19 @@ std::string ProcessGroupHCCL::getHcclCommName(int rankid, bool init_comm) return ""; } } + + HcclCommConfig config = createHcclCommConfigWithOptions(); std::string hcclCommName = ""; - std::vector > hcclComms; { std::lock_guard lock(mutex_); hcclCommName = devHCCLCommNameMap_[key]; } if (!hcclCommName.empty()) { - HcclCommConfig config = createHcclCommConfigWithOptions(); torch_npu::toolkit::profiler::Utils::safe_strcpy_s(config.hcclCommName, hcclCommName.c_str(), COMM_NAME_MAX_LENGTH); - hcclComms = getHCCLComm(key, devices, HcclCommType::DEFAULT, &config); - } else { - hcclComms = getHCCLComm(key, devices); } + std::vector > hcclComms = getHCCLComm(key, devices, HcclCommType::DEFAULT, &config); + TORCH_CHECK(hcclComms.size() == 1, "expect hcclComms.size() = 1, but hcclComms.size() = ", hcclComms.size(), DIST_ERROR(ErrCode::VALUE)); HcclComm hcom = hcclComms[0]->getHcclComm(); @@ -2239,6 +2259,14 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() HcclCommConfig config; getHcclCommConfig(&config); + // update group name in hccl comm config + std::string groupName = getGroupName(); + torch_npu::toolkit::profiler::Utils::safe_strcpy_s(config.hcclCommName, groupName.c_str(), COMM_NAME_MAX_LENGTH); + + if (options_->hccl_config.empty()) { + return config; + } + if (options_->hccl_config.find("hccl_buffer_size") != options_->hccl_config.end()) { if (std::holds_alternative(options_->hccl_config["hccl_buffer_size"])) { config.hcclBufferSize = std::get(options_->hccl_config["hccl_buffer_size"]); @@ -2304,13 +2332,8 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( const auto devices = 
getDeviceList(inputs); auto key = getKeyFromDevices(devices); - std::vector> hcclComms; - if (!options_->hccl_config.empty()) { - HcclCommConfig config = createHcclCommConfigWithOptions(); - hcclComms = getHCCLComm(key, devices, HcclCommType::DEFAULT, &config); - } else { - hcclComms = getHCCLComm(key, devices); - } + HcclCommConfig config = createHcclCommConfigWithOptions(); + std::vector> hcclComms = getHCCLComm(key, devices, HcclCommType::DEFAULT, &config); // Used many times below, so we stash the unordered_map lookup auto& hcclStreams = hcclStreams_[key]; @@ -2470,13 +2493,8 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( const auto devices = getDevice(inputs); auto key = getKeyFromDevice(devices); - std::vector> hcclComms; - if (!options_->hccl_config.empty()) { - HcclCommConfig config = createHcclCommConfigWithOptions(); - hcclComms = getHCCLComm(key, devices, HcclCommType::DEFAULT, &config); - } else { - hcclComms = getHCCLComm(key, devices); - } + HcclCommConfig config = createHcclCommConfigWithOptions(); + std::vector> hcclComms = getHCCLComm(key, devices, HcclCommType::DEFAULT, &config); // Used many times below, so we stash the unordered_map lookup auto& hcclStreams = hcclStreams_[key]; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 7345739279..8306bee956 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -497,6 +497,17 @@ protected: int rank, c10d::OpType opType); + // Do not call this directly, use ProcessGroup::setGroupName instead. + void setGroupName(const std::string& name) + { + pg_name_ = name; + } + + const std::string& getGroupName() const + { + return pg_name_; + } + static const int64_t kWatchdogThreadSleepMillis; // The store is used to broadcast the HCCL Master ID of rank 0. 
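
For context, a short usage sketch of what this change enables: the process-group name (set in the
constructor as "group_name_" + options->group_id) is now copied into HcclCommConfig::hcclCommName by
createHcclCommConfigWithOptions(), so communicators created through collective(), collectiveCoalesced()
or getHcclCommName() carry the group name even when options_->hccl_config is empty. The Python snippet
below is illustrative only — the _get_backend()/get_hccl_comm_name() accessors are assumed bindings and
are not part of this diff:

    import torch
    import torch.distributed as dist
    import torch_npu  # registers the "hccl" backend for NPU devices

    dist.init_process_group("hccl")   # assumes a torchrun-style env:// launch
    torch.npu.set_device(dist.get_rank() % torch.npu.device_count())

    group = dist.new_group(ranks=list(range(dist.get_world_size())))
    backend = group._get_backend(torch.device("npu"))
    # init_comm=False reads the cached communicator name; it returns "" if none has been created yet
    name = backend.get_hccl_comm_name(dist.get_rank(), init_comm=False)
    print(name or "<communicator not created yet>")
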
@@ -647,6 +658,8 @@ protected: // the ProcessGroup uint64_t op_id_{0}; + std::string pg_name_; + std::exception_ptr watchDogException_ = nullptr; struct StatusStruct { -- Gitee From 8c12c0156d06f26ac46ae8d422fce98b6d6ddc21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Tue, 11 Mar 2025 03:46:52 +0000 Subject: [PATCH 125/358] =?UTF-8?q?!18761=20[Fix]=20Change=20copy=5Fd2d=5F?= =?UTF-8?q?dtype=20to=20copy=5Fd2d=5Fbaseformat=5Fopapi.=20Merge=20pull=20?= =?UTF-8?q?request=20!18761=20from=20=E5=88=98=E5=98=89=E5=B7=8D/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/aten/ops/op_api/CopyKernelOpApi.cpp | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp index 8acab55f9a..fcffc1a62f 100644 --- a/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp +++ b/torch_npu/csrc/aten/ops/op_api/CopyKernelOpApi.cpp @@ -73,6 +73,21 @@ void copy_between_host_and_device_opapi(at::Tensor& dst, const at::Tensor& src, } } +// the format of dst and src is baseformat now, copy d2d +void copy_d2d_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_blocking) +{ + if (dst.device().index() != src.device().index()) { + return copy_d2d(dst, src, non_blocking); + } + + c10_npu::NPUGuard guard(src.device()); + c10::SmallVector inputs = {src}; + c10::SmallVector outputs = {dst}; + CalcuOpUtil::CheckMemoryOverLaps(inputs, outputs); + + EXEC_NPU_CMD(aclnnInplaceCopy, dst, src); +} + // the format of dst and src is base format now // the dtype of dst and src is same // and src and dst are contiguous @@ -122,7 +137,7 @@ void copy_h2d_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_ // if necessary, copy back into dst if (!dst_contig.is_same(dst)) { TORCH_INTERNAL_ASSERT(dst_contig.device() == dst.device(), OPS_ERROR(ErrCode::VALUE)); - copy_d2d_dtype(dst, dst_contig, non_blocking); + copy_d2d_baseformat_opapi(dst, dst_contig, non_blocking); } } @@ -150,22 +165,6 @@ void copy_d2h_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_ } } -// the format of dst and src is baseformat now, copy d2d -void copy_d2d_baseformat_opapi(at::Tensor& dst, const at::Tensor& src, bool non_blocking) -{ - if (dst.device().index() != src.device().index()) { - return copy_d2d(dst, src, non_blocking); - } - - c10_npu::NPUGuard guard(src.device()); - c10::SmallVector inputs = {src}; - c10::SmallVector outputs = {dst}; - CalcuOpUtil::CheckMemoryOverLaps(inputs, outputs); - - EXEC_NPU_CMD(aclnnInplaceCopy, dst, src); -} - - at::Tensor& NPUNativeOpApiFunctions::copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { DO_COMPATIBILITY(aclnnInplaceCopy, NPUNativeFunctions::copy_(self, src, non_blocking)); -- Gitee From 075661efa149885013cc0b7f7aa4323a1440f71d Mon Sep 17 00:00:00 2001 From: will-devil Date: Tue, 11 Mar 2025 07:13:47 +0000 Subject: [PATCH 126/358] !18766 [Feature] fsdp2 testcase 2/N. 
Merge pull request !18766 from will-devil/fsdp2-26-2 --- .../fsdp2/test_fully_shard_frozen.py | 259 ++++++++++++++++++ torch_npu/__init__.py | 2 + torch_npu/distributed/fsdp/_add_fsdp_patch.py | 27 ++ 3 files changed, 288 insertions(+) create mode 100644 test/distributed/fsdp2/test_fully_shard_frozen.py create mode 100644 torch_npu/distributed/fsdp/_add_fsdp_patch.py diff --git a/test/distributed/fsdp2/test_fully_shard_frozen.py b/test/distributed/fsdp2/test_fully_shard_frozen.py new file mode 100644 index 0000000000..1e9aa6e064 --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_frozen.py @@ -0,0 +1,259 @@ +import copy +import functools +import itertools +from typing import List, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from torch.distributed._composable import checkpoint, replicate +from torch.distributed.fsdp import fully_shard +from torch.distributed.fsdp._fully_shard._fsdp_param_group import ( + RegisterPostBackwardFunction, +) +from torch.testing._internal.common_fsdp import ( + check_sharded_parity, + MLP, + patch_reduce_scatter, + patch_register_post_backward_hook_backward, + reduce_scatter_with_assert, +) +from torch.testing._internal.common_utils import run_tests + +import torch_npu +from torch_npu.testing._internal.common_fsdp import FSDPNPUTest + + +class TestFullyShardFrozen(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(4, torch.npu.device_count()) + + def test_train_mixed_requires_grad_per_group(self): + """ + Tests training parity with DDP when mixing frozen and non-frozen + parameters in the same FSDP communication group. This checks that + the reduce-scatters reduce the expected numel and that they are called + via the custom autograd function backward (i.e. that they are not + delayed until the end of backward). + """ + self.run_subtests( + { + "reshard_after_forward": [False, True, 2], + "use_activation_checkpointing": [False, True], + "freeze_after_init": [False, True], + }, + self._test_train_mixed_requires_grad_per_group, + ) + + def _test_train_mixed_requires_grad_per_group( + self, + reshard_after_forward: Union[bool, int], + use_activation_checkpointing: bool, + freeze_after_init: bool, + ): + torch.manual_seed(42) + num_mlps, lin_dim = (3, 32) + model = nn.Sequential( + *[MLP(lin_dim, torch.device("cpu")) for _ in range(num_mlps)] + ) + # Train biases only (e.g. 
like BitFit) + if not freeze_after_init: + for param_name, param in model.named_parameters(): + if "bias" not in param_name: + param.requires_grad_(False) + ref_model = replicate( + copy.deepcopy(model).npu(), + device_ids=[self.rank], + find_unused_parameters=freeze_after_init, + ) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + for mlp in model: + if use_activation_checkpointing: + checkpoint(mlp) + fully_shard(mlp, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + orig_reduce_scatter = dist.reduce_scatter_tensor + if freeze_after_init: + for param_name, param in itertools.chain( + model.named_parameters(), ref_model.named_parameters() + ): + if "bias" not in param_name: + param.requires_grad_(False) + for mlp in model: + if not isinstance(mlp, MLP): + raise AssertionError("The reduce-scatter numel check assumes the model consists of " + f"only the same MLP class but got {type(mlp)}") + expected_numel = sum( + p._local_tensor.numel() + for n, p in model[0].named_parameters() + if "bias" in n + ) + + def assert_fn(output: torch.Tensor): + self.assertEqual(output.numel(), expected_numel) + + reduce_scatter = functools.partial( + reduce_scatter_with_assert, self, orig_reduce_scatter, assert_fn + ) + orig_backward = RegisterPostBackwardFunction.backward + backward_count = 0 + + def backward_with_count(*args, **kwargs): + nonlocal backward_count + backward_count += 1 + return orig_backward(*args, **kwargs) + + torch.manual_seed(42 + self.rank + 1) + device = torch.device("npu") + with patch_reduce_scatter( + reduce_scatter + ), patch_register_post_backward_hook_backward(backward_with_count): + for iter_idx in range(10): + inp = torch.randn((8, lin_dim), device=device) + losses: List[torch.Tensor] = [] + for _model, _optim in ((ref_model, ref_optim), (model, optim)): + _optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + losses.append(_model(inp).sum()) + losses[-1].backward() + _optim.step() + check_sharded_parity(self, ref_model, model) + self.assertEqual(losses[0], losses[1]) + # Check that the post-backward hooks ran through the autograd + # backward, not the final callback (except possibly that of the + # first MLP, which does not have an input that requires grad) + self.assertTrue(backward_count >= num_mlps - 1) + + def test_train_mixed_requires_grad_across_groups(self): + """ + Tests training parity with DDP when mixing frozen and non-frozen + parameters across different FSDP communication groups, including + possibly unfreezing parameters. 
+ """ + self.run_subtests( + { + "reshard_after_forward": [False, True, 2], + "unfreeze_params": [False, True], + }, + self._test_train_mixed_requires_grad_across_groups, + ) + + def _test_train_mixed_requires_grad_across_groups( + self, + reshard_after_forward: Union[bool, int], + unfreeze_params: bool, + ): + torch.manual_seed(42) + num_linears, lin_dim = (6, 32) + modules: List[nn.Module] = [] + for _ in range(num_linears): + modules += [nn.Linear(lin_dim, lin_dim), nn.ReLU()] + model = nn.Sequential(*modules) + ref_model = replicate( + copy.deepcopy(model).npu(), + device_ids=[self.rank], + find_unused_parameters=True, + ) + for module in model.modules(): + if isinstance(module, nn.Linear): + fully_shard(module, reshard_after_forward=reshard_after_forward) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + orig_backward = RegisterPostBackwardFunction.backward + backward_count = 0 + + def _set_requires_grad(seq: nn.Module, requires_grad: bool): + for i in range(num_linears): + # Interleave frozen -> non-frozen -> ... linears + if i % 2 == 0: + for param in seq[i % 2].parameters(): + param.requires_grad_(requires_grad) + + def backward_with_count(*args, **kwargs): + nonlocal backward_count + backward_count += 1 + return orig_backward(*args, **kwargs) + + _set_requires_grad(model, False) + _set_requires_grad(ref_model, False) + num_iters, no_grad_iter_idx = (3, 1) + torch.manual_seed(42 + self.rank) + inp = torch.randn((8, lin_dim), device="npu") + with patch_register_post_backward_hook_backward(backward_with_count): + for iter_idx in range(num_iters): + losses: List[torch.Tensor] = [] + for _model, _optim in ((ref_model, ref_optim), (model, optim)): + # Unfreeze the parameters on the last step to emulate some + # kinds of fine-tuning + if unfreeze_params and iter_idx == num_iters - 1: + _set_requires_grad(model, True) + if iter_idx == no_grad_iter_idx: + with torch.no_grad(): + losses.append(_model(inp).sum()) + else: + losses.append(_model(inp).sum()) + losses[-1].backward() + _optim.step() + _optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + self.assertEqual(losses[0], losses[1]) + # Check that the post-backward hooks ran through the autograd + # backward, not the final callback (except possibly that of the + # first linear, which does not have an input that requires grad) + self.assertTrue(backward_count >= num_linears - 1) + + def test_multi_forward_mixed_requires_grad(self): + """ + Tests training parity with DDP when having trainable and frozen modules + that participate multiple times in forward. 
+ """ + self.run_subtests( + {"reshard_after_forward": [True, False, 2]}, + self._test_multi_forward_mixed_requires_grad, + ) + + def _test_multi_forward_mixed_requires_grad( + self, + reshard_after_forward: Union[bool, int], + ): + class MultiForwardModule(nn.Module): + def __init__(self, device: torch.device): + super().__init__() + self.layer_0 = nn.Linear(5, 5, device=device) + self.layer_no_grad = nn.Linear(5, 5, device=device) + self.layer_with_grad = nn.Linear(5, 5, device=device) + self.layer_no_grad.requires_grad_(False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.layer_0(x) + for _ in range(3): + x = self.layer_no_grad(F.relu(self.layer_with_grad(x))) + # Make sure that calling the same layer multiple times + # works regardless whether gradient is enabled + with torch.no_grad(): + x += F.relu(self.layer_with_grad(x)) + return x + + torch.manual_seed(42) + model = MultiForwardModule(torch.device("cpu")) + ref_model = replicate(copy.deepcopy(model).npu(), device_ids=[self.rank]) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + for module in model.modules(): + if isinstance(module, nn.Linear): + fully_shard(module, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + for iter_idx in range(10): + inp = torch.randn((8, 5), device="npu") + losses: List[torch.Tensor] = [] + for _model, _optim in ((ref_model, ref_optim), (model, optim)): + _optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + losses.append(_model(inp).sum()) + losses[-1].backward() + _optim.step() + self.assertEqual(losses[0], losses[1]) + + +if __name__ == "__main__": + run_tests() diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index b8c0457fc3..76f0903b23 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -76,6 +76,7 @@ import torch_npu.utils.custom_ops import torch_npu.distributed.rpc import torch_npu.op_plugin from torch_npu.profiler._add_mstx_patch import _apply_mstx_patch +from torch_npu.distributed.fsdp._add_fsdp_patch import _apply_fsdp_patch from torch_npu.distributed.rpc.backend_registry import _rpc_backend_registry from torch_npu.utils import _cann_package_check, _add_intercept_methods from torch_npu.utils import _register_ops_under_dtensor_rules @@ -168,6 +169,7 @@ def _apply_class_patches(): add_perf_dump_patch() _apply_distributed_methods_patch() _apply_mstx_patch() + _apply_fsdp_patch() def _apply_distributed_methods_patch(): diff --git a/torch_npu/distributed/fsdp/_add_fsdp_patch.py b/torch_npu/distributed/fsdp/_add_fsdp_patch.py new file mode 100644 index 0000000000..cc3d5fcfb9 --- /dev/null +++ b/torch_npu/distributed/fsdp/_add_fsdp_patch.py @@ -0,0 +1,27 @@ +import torch +from torch import distributed as dist +from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup + +import torch_npu + + +def _patched_finalize_backward(self): + self._wait_for_post_backward() + for fsdp_param in self.fsdp_params: + if fsdp_param.grad_offload_event is not None: + fsdp_param.grad_offload_event.synchronize() + fsdp_param.grad_offload_event = None + if self._all_gather_result is not None: + # If there was a mistargeted unshard without a corresponding wait, + # then we wait here and clear the unshard + if (event := self._all_gather_result.all_gather_event) is not None: + torch.npu.current_stream().wait_event(event) + work = self._all_gather_result.all_gather_work + if isinstance(work, dist.distributed_c10d.Work): + 
work.wait() + self._all_gather_result = None + self._post_forward_indices.clear() + + +def _apply_fsdp_patch(): + FSDPParamGroup.finalize_backward = _patched_finalize_backward -- Gitee From 2fb2a4b0a2853fff902885fed723d6ebcc06b61b Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Tue, 11 Mar 2025 11:08:08 +0000 Subject: [PATCH 127/358] !18659 [2/2] Add tp cases Merge pull request !18659 from dilililiwhy/tp_cases_260_p2 --- .../tensor/parallel/test_tp_examples.py | 565 ++++++++++++++++++ 1 file changed, 565 insertions(+) create mode 100644 test/distributed/tensor/parallel/test_tp_examples.py diff --git a/test/distributed/tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py new file mode 100644 index 0000000000..17400d9911 --- /dev/null +++ b/test/distributed/tensor/parallel/test_tp_examples.py @@ -0,0 +1,565 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# Owner(s): ["oncall: distributed"] + +import itertools +from copy import deepcopy +from typing import Dict, NamedTuple, Optional + +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch.distributed._tensor import ( + DeviceMesh, + distribute_tensor, + DTensor, + Replicate, + Shard, +) +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( + checkpoint_wrapper, + CheckpointImpl, +) +from torch.distributed.tensor.debug import CommDebugMode +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + loss_parallel, + parallelize_module, + RowwiseParallel, +) +from torch.distributed.tensor.parallel.input_reshard import input_reshard +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, +) +from torch.testing._internal.distributed._tensor.common_dtensor import ( + DTensorTestBase, + MLPModule, + ModelArgs, + skip_unless_torch_gpu, + Transformer, +) + +import torch_npu +from torch_npu.testing.common_distributed import with_comms, skipIfUnsupportMultiNPU + + +c10d_functional = torch.ops.c10d_functional +reduce_scatter, all_gather, all_reduce = ( + c10d_functional.reduce_scatter_tensor, + c10d_functional.all_gather_into_tensor, + c10d_functional.all_reduce, +) + + +class ExpCommCounts(NamedTuple): + fwd: Optional[Dict] = None + bwd: Optional[Dict] = None + optim: Optional[Dict] = None + + +class DistTensorParallelExampleTest(DTensorTestBase): + @property + def world_size(self): + return 2 + + def _check_module(self, m1, m2, check_grad=False): + named_parameters = dict(m1.named_parameters()) + for name, param_m2 in m2.named_parameters(): + self.assertTrue(name in named_parameters) + param_m1 = named_parameters[name] + if check_grad: + param_m2 = param_m2.grad + param_m1 = param_m1.grad + if isinstance(param_m2, DTensor): + replicate = [Replicate()] + param_m2 = param_m2.redistribute( + device_mesh=param_m2.device_mesh, placements=replicate + ).to_local() + self.assertEqual(param_m2, param_m1) + + def _test_mlp_training_e2e(self, is_seq_parallel=False, recompute_activation=False): + inp_size = [8, 10] + # Ensure all tp ranks have same input. + rng_seed = self.rank if is_seq_parallel else 0 + torch.manual_seed(rng_seed) + inp = torch.rand(*inp_size, device=self.device_type) + model = MLPModule(self.device_type) + model_tp = deepcopy(model) + + # Ensure model are initialized the same way. + self._check_module(model, model_tp) + + # Shard module and initialize optimizer. 
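+        # Column-wise net1 followed by row-wise net2 keeps the hidden activation
+        # sharded in between; net2 then produces partial results that are combined
+        # with an all-reduce (plain TP) or a reduce-scatter (sequence parallel).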
+ LR = 0.25 + device_mesh = DeviceMesh( + self.device_type, + torch.arange(0, self.world_size), + ) + parallelize_plan = { + "net1": ( + ColwiseParallel(input_layouts=Shard(0)) + if is_seq_parallel + else ColwiseParallel() + ), + "net2": ( + RowwiseParallel(output_layouts=Shard(0)) + if is_seq_parallel + else RowwiseParallel() + ), + } + model_tp = parallelize_module(model_tp, device_mesh, parallelize_plan) + if recompute_activation: + model_tp = input_reshard( + checkpoint_wrapper( + model_tp, checkpoint_impl=CheckpointImpl.NO_REENTRANT + ), + device_mesh, + None if is_seq_parallel else 0, + ) + optim = torch.optim.SGD(model.parameters(), lr=LR) + optim_tp = torch.optim.SGD(model_tp.parameters(), lr=LR) + + output = model(inp) + output.sum().backward() + + comm_mode = CommDebugMode() + with comm_mode: + output_tp = model_tp(inp) + output_tp.sum().backward() + + self.assertEqual(output, output_tp) + if is_seq_parallel: + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.all_gather_into_tensor], 2 + ) + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.reduce_scatter_tensor], 1 + ) + else: + self.assertEqual(comm_mode.get_comm_counts()[c10d_functional.all_reduce], 1) + + if is_seq_parallel: + # Sum gradients from different ranks, since input + # are different across ranks for sequence parallel. + dist.all_reduce(model.net1.weight.grad) + dist.all_reduce(model.net1.bias.grad) + dist.all_reduce(model.net2.weight.grad) + dist.all_reduce(model.net2.bias.grad) + + # Ensure gradients are same. + self._check_module(model, model_tp, check_grad=True) + + optim.step() + optim_tp.step() + + # Ensure model weights are still same after update. + # Due to the trick we use for Partial aggregation, we only check the weight when local_rank = 0. + self._check_module(model, model_tp) + + inp = torch.rand(*inp_size, device=self.device_type) + output = model(inp) + output_tp = model_tp(inp) + self.assertEqual(output, output_tp) + + def _test_mlp_inference(self, device_mesh): + inp_size = [8, 10] + # Ensure all tp ranks have same input. + torch.manual_seed(0) + inp = torch.rand(*inp_size, device=self.device_type) + model = MLPModule(self.device_type) + model_tp = deepcopy(model) + + # Ensure model are initialized the same way. + self._check_module(model, model_tp) + + # Shard module and initialize optimizer. + parallelize_plan = { + "net1": ColwiseParallel(), + "net2": RowwiseParallel(), + } + model_tp = parallelize_module(model_tp, device_mesh, parallelize_plan) + + output = model(inp) + output_tp = model_tp(inp) + self.assertEqual(output, output_tp) + + @with_comms + @parametrize("is_seq_parallel", [True, False]) + @skipIfUnsupportMultiNPU(2) + # TODO: need to revisit input_reshard API about why it failed multi-gpu tests. 
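+    # recompute_activation stays pinned to False until the input_reshard issue is resolved.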
+ # @parametrize("recompute_activation", [True, False]) + @parametrize("recompute_activation", [False]) + def test_mlp_training(self, is_seq_parallel, recompute_activation): + self._test_mlp_training_e2e( + is_seq_parallel=is_seq_parallel, recompute_activation=recompute_activation + ) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_mlp_inference(self): + device_mesh = DeviceMesh( + self.device_type, + torch.arange(0, self.world_size), + ) + with torch.inference_mode(): + self._test_mlp_inference(device_mesh) + + def _setup_single_gpu_model(self, model_args, dtype): + return Transformer(model_args).to(device=self.device_type, dtype=dtype) + + def _setup_tp_model(self, model, is_seq_parallel, dtype): + model_tp = deepcopy(model) + self._check_module(model, model_tp) + device_mesh = DeviceMesh(self.device_type, torch.arange(0, self.world_size)) + local_output_for_attn = dtype is torch.float64 + return Transformer.parallelize( + model_tp, + device_mesh, + is_seq_parallel, + local_output_for_attn=local_output_for_attn, + ) + + def _setup_optimizer(self, model, model_tp): + # Step 3: Run test by comparing outputs from single-gpu and multi-gpu models. + LR = 0.25 + optim = torch.optim.Adam(model.parameters(), lr=LR) + optim_tp = torch.optim.Adam(model_tp.parameters(), lr=LR) + return optim, optim_tp + + def _validate_fwd( + self, model, model_tp, inp, expected_comms_dict=None, check_comms=True + ): + # Compare outputs on the same input. + output = model(inp) + with CommDebugMode() as comm_mode: + output_tp = model_tp(inp) + self.assertEqual(output, output_tp) + if check_comms: + self.assertDictEqual(comm_mode.get_comm_counts(), expected_comms_dict or {}) + return output, output_tp + + def _validate_bwd( + self, + model, + model_tp, + output, + output_tp, + expected_comms_dict=None, + check_comms=True, + ): + # Ensure gradients are equal. + output.sum().backward() + with CommDebugMode() as comm_mode: + output_tp.sum().backward() + self._check_module(model, model_tp, check_grad=True) + if check_comms: + self.assertDictEqual(comm_mode.get_comm_counts(), expected_comms_dict or {}) + + def _validate_optim_step( + self, + model, + model_tp, + optim, + optim_tp, + expected_comms_dict=None, + check_comms=True, + ): + optim.step() # Ensure model weights are still the same after update. + from torch.distributed._tensor.experimental import implicit_replication + + with implicit_replication(): + with CommDebugMode() as comm_mode: + optim_tp.step() + self._check_module(model, model_tp) + if check_comms: + self.assertDictEqual(comm_mode.get_comm_counts(), expected_comms_dict or {}) + + @staticmethod + def _thaw_params(thaw_params, model, model_tp): + if not thaw_params: + return + for target_model in [model, model_tp]: + for n, p in target_model.named_parameters(): + if n not in thaw_params: + p.requires_grad_(False) + + @with_comms + @skip_unless_torch_gpu + @parametrize("is_seq_parallel", [True, False]) + @parametrize("dtype", [torch.float64, torch.float32]) + def test_transformer_training(self, is_seq_parallel, dtype: torch.dtype): + EXP_BASE_CC = ExpCommCounts( + fwd={all_reduce: 6, all_gather: 1}, bwd={all_reduce: 9} + ) + EXP_SEQ_PARALLEL_CC = ExpCommCounts( + fwd={reduce_scatter: 6, all_gather: 6}, + bwd={reduce_scatter: 5, all_gather: 6}, + optim={all_reduce: 30}, + ) + + # Disable dropout in the test since we cannot reproduce the same random + # behaviors when comparing single-gpu models with multi-gpu models. 
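+        # The EXP_*_CC constants above list the collective counts that CommDebugMode
+        # is expected to report for the forward, backward and optimizer-step phases.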
+ model_args = ModelArgs(dropout_p=0.0) + model = self._setup_single_gpu_model( + model_args, dtype + ) # Step 1: Initialize single-gpu models. + model_tp = self._setup_tp_model( + model, is_seq_parallel, dtype + ) # Step 2: Setup tp model, place onto device mesh. + optim, optim_tp = self._setup_optimizer( + model, model_tp + ) # Step 3: Setup optimizers for both models + + # Initialize input and make sure all ranks have the same input. + inp_size = [8, 8] # [batch_size, seq_len] + if is_seq_parallel: + assert inp_size[1] % self.world_size == 0 + + torch.manual_seed(0) + steps = 10 if type(model) is torch.float64 else 1 + for iter in range(steps): + inp = torch.randint( + model_args.vocab_size, inp_size, device=self.device_type + ) + expected_fwd_comms = ( + EXP_SEQ_PARALLEL_CC.fwd if is_seq_parallel else EXP_BASE_CC.fwd + ) + output, output_tp = self._validate_fwd( + model, model_tp, inp, expected_fwd_comms + ) + expected_bwd_comms = ( + EXP_SEQ_PARALLEL_CC.bwd if is_seq_parallel else EXP_BASE_CC.bwd + ) + self._validate_bwd(model, model_tp, output, output_tp, expected_bwd_comms) + expected_optim_comms = ( + EXP_SEQ_PARALLEL_CC.optim if is_seq_parallel else EXP_BASE_CC.optim + ) + self._validate_optim_step( + model, model_tp, optim, optim_tp, expected_optim_comms + ) + + @with_comms + @skip_unless_torch_gpu + @parametrize( + "thaw_params, is_seq_parallel, dtype, exp_cnts", + [ + ( + None, # all require grad seq_parallel float32 baseline + True, + torch.float32, + ExpCommCounts( + bwd={reduce_scatter: 5, all_gather: 6}, optim={all_reduce: 30} + ), + ), + ( + None, # all require grad no seq_parallel float64 baseline + False, + torch.float64, + ExpCommCounts(bwd={all_reduce: 9}), + ), + # test a subset of LayerNorm bwd output_masks + ( + ("output.weight", "norm.weight", "norm.bias"), # [False, True, True] + True, + torch.float32, + ExpCommCounts(bwd={reduce_scatter: 1}, optim={all_reduce: 6}), + ), + ( + ("tok_embeddings.weight", "output.weight"), # [True, False, False] + True, + torch.float32, + ExpCommCounts(bwd={reduce_scatter: 5, all_gather: 5}), + ), + ( + ( + "tok_embeddings.weight", + "output.weight", + "norm.weight", + "norm.bias", + ), # [True, True, True] + True, + torch.float32, + ExpCommCounts( + bwd={reduce_scatter: 5, all_gather: 5}, optim={all_reduce: 6} + ), + ), + ( + ( + "tok_embeddings.weight", + "output.weight", + "norm.weight", + "norm.bias", + "layers.1.ffn_norm.weight", + "layers.1.ffn_norm.bias", + ), # a single transformerblock layernorm + True, + torch.float32, + ExpCommCounts( + bwd={reduce_scatter: 5, all_gather: 5}, optim={all_reduce: 12} + ), + ), + ( + ( + "tok_embeddings.weight", + "layers.0.attention.wv.weight", + "layers.0.feed_forward.w1.bias", + "layers.1.ffn_norm.bias", + "layers.1.feed_forward.w2.weight", + "output.weight", + ), # varied layer/param types + True, + torch.float32, + ExpCommCounts( + bwd={reduce_scatter: 5, all_gather: 5}, optim={all_reduce: 3} + ), + ), + ], + name_fn=lambda thaw, seq, dtype, *_: f"{'seq_parallel_' if seq else ''}" + + f"{str(dtype).split('.')[-1]}_" + + f"thaw_{'__'.join(sorted({n.rpartition('.')[0].replace('.', '_') for n in thaw})) if thaw else 'all'}", + ) + def test_transformer_req_grad(self, thaw_params, is_seq_parallel, dtype, exp_cnts): + # Sample a subset of `requires_grad` patterns + + # disabling dropout to facilitate single gpu to multi-device comparison + # disable weight-tying to enable more fine-tuning configurations + model_args = ModelArgs(dropout_p=0.0, weight_tying=False) + model = 
self._setup_single_gpu_model( + model_args, dtype + ) # Step 1: Initialize single-gpu models. + model_tp = self._setup_tp_model( + model, is_seq_parallel, dtype + ) # Step 2: Setup tp model, place onto device mesh. + optim, optim_tp = self._setup_optimizer( + model, model_tp + ) # Step 3: Setup optimizers for both models + DistTensorParallelExampleTest._thaw_params( + thaw_params, model, model_tp + ) # Step 4: set `requires_grad` patterns + + # Initialize input and make sure all ranks have the same input. + inp_size = [8, 8] # [batch_size, seq_len] + if is_seq_parallel: + assert inp_size[1] % self.world_size == 0 + + torch.manual_seed(0) + inp = torch.randint(model_args.vocab_size, inp_size, device=self.device_type) + output, output_tp = self._validate_fwd(model, model_tp, inp, check_comms=False) + self._validate_bwd( + model, model_tp, output, output_tp, exp_cnts.bwd, check_comms=True + ) + self._validate_optim_step( + model, model_tp, optim, optim_tp, exp_cnts.optim, check_comms=True + ) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_weight_tying(self): + class TestModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + # Initialize different weights for embedding and fc. + torch.manual_seed(1) + self.embedding = torch.nn.Embedding(16, 8) + torch.manual_seed(2) + self.fc = torch.nn.Linear(8, 16) + + def forward(self, x): + return self.fc(self.embedding(x)) + + model = TestModule().to(self.device_type) + parallelize_plan = { + "embedding": ColwiseParallel(), + "fc": RowwiseParallel(), + } + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + parallelize_module(model, device_mesh, parallelize_plan) + + input_size = [5] + torch.manual_seed(0) + inp = torch.randint(16, input_size, device=self.device_type) + + # Without weight tying. + self.assertNotEqual( + model.embedding.weight.to_local(), model.fc.weight.to_local() + ) + output = model(inp) + output.sum().backward() + self.assertNotEqual( + model.embedding.weight.grad.to_local(), model.fc.weight.grad.to_local() + ) + model.zero_grad() + + # With weight tying. 
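+        # Rebinding fc.weight to embedding.weight makes both modules share a single
+        # (sharded) parameter; the checks below verify that values, gradients and
+        # object identity now match.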
+ model.fc.weight = model.embedding.weight + + self.assertEqual(model.embedding.weight, model.fc.weight) + self.assertEqual(id(model.embedding.weight), id(model.fc.weight)) + output = model(inp) + output.sum().backward() + self.assertEqual(model.embedding.weight.grad, model.fc.weight.grad) + self.assertEqual(id(model.embedding.weight.grad), id(model.fc.weight.grad)) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_loss_parallel(self): + device_mesh = self.build_device_mesh() + comm_mode = CommDebugMode() + + channel_size, channel_dim = 16, 1 + test_setup = [ + (2, (8, channel_size), (8,)), # calling aten.nll_loss_forward + (3, (8, channel_size, 12), (8, 12)), # calling aten.nll_loss2d_forward + ] + weight = torch.rand(channel_size, device=self.device_type) + for input_ndim, input_size, target_size in test_setup: + x = torch.rand(*input_size, device=self.device_type, requires_grad=True) + target = torch.randint(channel_size, target_size, device=self.device_type) + + shard_dims = list(range(input_ndim)) + reductions = ["none", "mean", "sum"] + for shard_dim, reduction in itertools.product(shard_dims, reductions): + dist_x = distribute_tensor(x, device_mesh, [Shard(shard_dim)]) + y = F.cross_entropy(x, target, weight, reduction=reduction) + with loss_parallel(): + if shard_dim == channel_dim: + with comm_mode: + dist_y = F.cross_entropy( + dist_x, target, weight, reduction=reduction + ) + self.assertEqual(comm_mode.get_total_counts(), 3) + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.all_reduce], + 3, + ) + self.assertTrue(dist_y.placements[0].is_replicate()) + self.assertEqual(dist_y.to_local(), y) + + with comm_mode: + if reduction == "none": + y.sum().backward() + dist_y.sum().backward() + else: + y.backward() + dist_y.backward() + self.assertEqual(comm_mode.get_total_counts(), 0) + self.assertTrue( + dist_x.grad.placements[0].is_shard(shard_dim) + ) + self.assertEqual(dist_x.grad.full_tensor(), x.grad) + x.grad.zero_() + else: + with self.assertRaisesRegex( + ValueError, + "loss_parallel", + ): + dist_y = F.cross_entropy( + dist_x, target, reduction=reduction + ) + + +instantiate_parametrized_tests(DistTensorParallelExampleTest) + +if __name__ == "__main__": + run_tests() -- Gitee From 9e661086138472ee03e8fb88e9dffd0e55b96202 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Tue, 11 Mar 2025 11:08:19 +0000 Subject: [PATCH 128/358] !18658 [1/2] Add tp cases Merge pull request !18658 from dilililiwhy/tp_cases_260_p1 --- .../tensor/parallel/test_parallelize_api.py | 310 ++++++++++++ .../tensor/parallel/test_tp_style.py | 440 ++++++++++++++++++ 2 files changed, 750 insertions(+) create mode 100644 test/distributed/tensor/parallel/test_parallelize_api.py create mode 100644 test/distributed/tensor/parallel/test_tp_style.py diff --git a/test/distributed/tensor/parallel/test_parallelize_api.py b/test/distributed/tensor/parallel/test_parallelize_api.py new file mode 100644 index 0000000000..61cb7de4f0 --- /dev/null +++ b/test/distributed/tensor/parallel/test_parallelize_api.py @@ -0,0 +1,310 @@ +# Owner(s): ["oncall: distributed"] +from collections import OrderedDict +from copy import deepcopy + +import torch +from torch.distributed._tensor import DeviceMesh, DTensor, Replicate, Shard +from torch.distributed.tensor.parallel.api import parallelize_module +from torch.distributed.tensor.parallel.style import ( + ColwiseParallel, + PrepareModuleInput, + PrepareModuleOutput, + RowwiseParallel, +) +from torch.testing._internal.common_utils import run_tests +from 
torch.testing._internal.distributed._tensor.common_dtensor import ( + DTensorTestBase, + MLPModule, + MLPStacked, +) + +import torch_npu +from torch_npu.testing.common_distributed import with_comms, skipIfUnsupportMultiNPU + + +class DummyModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward(self, x): + return x + + +class TensorParallelAPITests(DTensorTestBase): + @property + def world_size(self): + return 2 + + def _compare_params( + self, + local_module, + dist_module, + rank0_only, + skip_rowwise_bias=False, + compare_grad=False, + ): + replicate = [Replicate()] + for name, param in local_module.named_parameters(): + dist_param = dist_module.get_parameter(name) + param = param.grad if compare_grad else param + dist_param = dist_param.grad if compare_grad else dist_param + if ( + (not rank0_only) + or (self.rank == 0) + or ( + name not in ["net2.bias"] + and not skip_rowwise_bias + or name not in ["bias", "net2.bias"] + ) + ): + self.assertEqual( + param, + dist_param.redistribute( + device_mesh=dist_param.device_mesh, placements=replicate + ).to_local(), + f"{name} not equal between dist and non-dist", + ) + + def _compare_module( + self, local_module, dist_module, inp_size, rank0_only=True, rowwise=False + ): + LR = 0.25 # the learning rate we use for testing + local_optim = torch.optim.SGD(local_module.parameters(), lr=LR) + dist_optim = torch.optim.SGD(dist_module.parameters(), lr=LR) + torch.manual_seed(0) + inp = torch.rand(*inp_size, device=self.device_type) + self._compare_params(local_module, dist_module, rank0_only) + + # check forward correctness + local_output = local_module(inp) + inp = inp.chunk(self.world_size, dim=-1)[self.rank] if rowwise else inp + dist_output = dist_module(inp) + dist_output = ( + dist_output.redistribute(dist_output.device_mesh, [Replicate()]).to_local() + if isinstance(dist_output, DTensor) + else dist_output + ) + self.assertEqual(local_output, dist_output) + + local_output.sum().backward() + dist_output.sum().backward() + + # check backward and ensure gradients are same + self._compare_params(local_module, dist_module, rank0_only, rowwise, True) + + local_optim.step() + dist_optim.step() + self._compare_params(local_module, dist_module, rank0_only, rowwise) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_parallelize_mlp_with_module_api(self): + inp_size = [12, 10] + model = MLPModule(self.device_type) + model_tp = deepcopy(model) + + # Parallelize module. + device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size)) + model_tp = parallelize_module( + model_tp, + device_mesh, + { + "net1": ColwiseParallel(output_layouts=Replicate()), + "net2": ColwiseParallel(output_layouts=Replicate()), + }, + ) + self._compare_module(model, model_tp, inp_size, rank0_only=False) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_parallelize_mlp_with_module_api_nested(self): + inp_size = [12, 10] + model = torch.nn.Sequential( + OrderedDict([("dummy_encoder", MLPModule(self.device_type))]) + ) + model_tp = deepcopy(model) + + # Parallelize module. 
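+        # Plan keys are fully qualified names relative to the root module, so the
+        # nested MLP is addressed as "dummy_encoder.net1" / "dummy_encoder.net2".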
+ device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size)) + model_tp = parallelize_module( + model_tp, + device_mesh, + { + "dummy_encoder.net1": ColwiseParallel(output_layouts=Replicate()), + "dummy_encoder.net2": ColwiseParallel(output_layouts=Replicate()), + }, + ) + self._compare_module(model, model_tp, inp_size, rank0_only=False) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_linear_row_wise_parallel(self): + # test RowwiseParallel + inp_size = [9, 16] + rowwise = RowwiseParallel() + + torch.manual_seed(5) + model = torch.nn.Linear(16, 10, device=self.device_type) + model_tp = deepcopy(model) + + # parallelize model_tp + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + model_tp = parallelize_module(model_tp, device_mesh, rowwise) + + # let each rank generate unique local input + torch.manual_seed(self.rank) + self._compare_module(model, model_tp, inp_size, rowwise=True) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_linear_col_wise_parallel(self): + # test ColwiseParallel + inp_size = [8, 10] + colwise = ColwiseParallel(output_layouts=Replicate()) + + torch.manual_seed(5) + model = torch.nn.Linear(10, 16, device=self.device_type) + model_tp = deepcopy(model) + + # parallelize model_tp + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + model_tp = parallelize_module(model_tp, device_mesh, colwise) + + self._compare_module(model, model_tp, inp_size) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_prepare_module_input(self): + module = DummyModule() + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + parallelize_module( + module, + device_mesh, + PrepareModuleInput( + input_layouts=Shard(0), desired_input_layouts=Replicate() + ), + ) + inp = torch.rand(5, 7, device=self.device_type) + output = module(inp).redistribute(device_mesh, [Shard(0)]).to_local() + self.assertEqual(inp, output) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_prepare_module_output(self): + module = DummyModule() + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + parallelize_module( + module, + device_mesh, + PrepareModuleOutput( + output_layouts=Replicate(), desired_output_layouts=Shard(0) + ), + ) + torch.manual_seed(15) + inp = torch.rand(16, 7, device=self.device_type) + dtensor = DTensor.from_local(inp, device_mesh, [Replicate()], run_check=False) + output = module(dtensor) + inp = dtensor.redistribute(device_mesh, [Shard(0)]).to_local() + self.assertEqual(inp, output) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_parallelize_module_with_star(self): + inp_size = [12, 10] + model = MLPModule(self.device_type) + device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size)) + + model_tp = deepcopy(model) + model_tp = parallelize_module( + model_tp, + device_mesh, + { + "net*": ColwiseParallel(output_layouts=Replicate()), + }, + ) + self._compare_module(model, model_tp, inp_size, rank0_only=False) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_parallelize_module_with_question(self): + inp_size = [12, 10] + model = MLPModule(self.device_type) + device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size)) + + model_tp = deepcopy(model) + model_tp = parallelize_module( + model_tp, + device_mesh, + { + "net?": ColwiseParallel(output_layouts=Replicate()), + }, + ) + self._compare_module(model, model_tp, inp_size, rank0_only=False) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def 
test_parallelize_module_with_digit(self): + inp_size = [12, 10] + model = MLPModule(self.device_type) + device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size)) + + model_tp = deepcopy(model) + model_tp = parallelize_module( + model_tp, + device_mesh, + { + "net[1-2]": ColwiseParallel(output_layouts=Replicate()), + }, + ) + self._compare_module(model, model_tp, inp_size, rank0_only=False) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_parallelize_module_multi_wildcard(self): + inp_size = [12, 10] + model = MLPStacked(self.device_type, n_layers=2) + device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size)) + + model_tp = deepcopy(model) + model_tp = parallelize_module( + model_tp, + device_mesh, + { + "layers.*.net[1]": ColwiseParallel(), + "layers.*.net[2]": RowwiseParallel(), + }, + ) + self._compare_module(model, model_tp, inp_size, rank0_only=False) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_under_devicemesh_context(self): + # test ColwiseParallel + inp_size = [8, 10] + colwise = ColwiseParallel(output_layouts=Replicate()) + + torch.manual_seed(5) + model = torch.nn.Linear(10, 16, device=self.device_type) + model_tp = deepcopy(model) + + # Call parallelize_module under DeviceMesh context. + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + with device_mesh: + model_tp = parallelize_module(model_tp, parallelize_plan=colwise) + + self._compare_module(model, model_tp, inp_size) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_empty_plan(self): + torch.manual_seed(5) + model = torch.nn.Linear(10, 16, device=self.device_type) + + # Call parallelize_module with empty plan. + # Goal is not to crash. + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + parallelize_module(model, device_mesh) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/tensor/parallel/test_tp_style.py b/test/distributed/tensor/parallel/test_tp_style.py new file mode 100644 index 0000000000..26a55ca63b --- /dev/null +++ b/test/distributed/tensor/parallel/test_tp_style.py @@ -0,0 +1,440 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +# Owner(s): ["oncall: distributed"] + +from copy import deepcopy + +import torch + +import torch.nn as nn +from torch.distributed._tensor import ( + distribute_tensor, + DTensor, + init_device_mesh, + Replicate, + Shard, +) +from torch.distributed.tensor.debug import CommDebugMode +from torch.distributed.tensor.parallel import parallelize_module +from torch.distributed.tensor.parallel.style import ( + ColwiseParallel, + PrepareModuleInput, + PrepareModuleOutput, + RowwiseParallel, + SequenceParallel, +) +from torch.distributed.tensor.placement_types import _Partial +from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.distributed._tensor.common_dtensor import ( + DTensorTestBase, + RMSNormPython, +) + +import torch_npu +from torch_npu.testing.common_utils import SupportedDevices +from torch_npu.testing.common_distributed import with_comms, skipIfUnsupportMultiNPU + + +c10d_functional = torch.ops.c10d_functional + + +class TensorParallelStyleTest(DTensorTestBase): + @property + def world_size(self): + return 2 + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_colwise_parallel_style(self): + mesh = init_device_mesh(self.device_type, (self.world_size,)) + + comm_mode = CommDebugMode() + tensor = torch.rand(8, 16, device=self.device_type, requires_grad=True) + model = nn.Linear(16, 16, device=self.device_type) + + default_col_parallel = ColwiseParallel() + colwise_mod = parallelize_module(deepcopy(model), mesh, default_col_parallel) + with comm_mode: + out = colwise_mod(tensor) + # ensure output shard on the last dim + self.assertEqual(out.shape, (8, 16 // self.world_size)) + # ensure no communication happened in fwd + self.assertEqual(comm_mode.get_total_counts(), 0) + + out.sum().backward() + # allreduce in bwd + self.assertEqual(comm_mode.get_comm_counts()[c10d_functional.all_reduce], 1) + self.assertEqual(comm_mode.get_total_counts(), 1) + + sharded_col_parallel = ColwiseParallel(input_layouts=Shard(0)) + colwise_mod = parallelize_module(deepcopy(model), mesh, sharded_col_parallel) + with comm_mode: + out = colwise_mod(tensor) + # ensure output shard on the last dim + self.assertEqual(out.shape, (8 * self.world_size, 16 // self.world_size)) + # allgather in fwd + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.all_gather_into_tensor], 1 + ) + self.assertEqual(comm_mode.get_total_counts(), 1) + + out.sum().backward() + # reduce_scatter in bwd + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.reduce_scatter_tensor], 1 + ) + self.assertEqual(comm_mode.get_total_counts(), 2) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_colwise_parallel_embedding(self): + mesh = init_device_mesh(self.device_type, (self.world_size,)) + + comm_mode = CommDebugMode() + tensor = torch.arange(8, device=self.device_type).reshape(4, 2) + model = nn.Embedding(16, 16, device=self.device_type) + + default_col_parallel = ColwiseParallel() + colwise_mod = parallelize_module(deepcopy(model), mesh, default_col_parallel) + with comm_mode: + out = colwise_mod(tensor) + # ensure output shard on the last dim + self.assertEqual(out.shape, (4, 2, 16 // self.world_size)) + # ensure no communication happened in fwd + self.assertEqual(comm_mode.get_total_counts(), 0) + + out.sum().backward() + # no comm in bwd + self.assertEqual(comm_mode.get_total_counts(), 0) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_rowwise_parallel_style(self): + mesh = init_device_mesh(self.device_type, (self.world_size,)) + + comm_mode = 
CommDebugMode() + tensor = torch.rand( + 8, 16 // self.world_size, device=self.device_type, requires_grad=True + ) + model = nn.Linear(16, 16, device=self.device_type) + + default_row_parallel = RowwiseParallel() + rowwise_mod = parallelize_module(deepcopy(model), mesh, default_row_parallel) + with comm_mode: + out = rowwise_mod(tensor) + # ensure output replicated + self.assertEqual(out.shape, (8, 16)) + # allreduce in fwd + self.assertEqual(comm_mode.get_comm_counts()[c10d_functional.all_reduce], 1) + self.assertEqual(comm_mode.get_total_counts(), 1) + + out.sum().backward() + # no op in bwd + self.assertEqual(comm_mode.get_total_counts(), 1) + + sharded_row_parallel = RowwiseParallel(output_layouts=Shard(0)) + rowwise_mod = parallelize_module(deepcopy(model), mesh, sharded_row_parallel) + with comm_mode: + out = rowwise_mod(tensor) + # ensure output replicated + self.assertEqual(out.shape, (8 // self.world_size, 16)) + # reduce_scatter in fwd + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.reduce_scatter_tensor], 1 + ) + self.assertEqual(comm_mode.get_total_counts(), 1) + + out.sum().backward() + # allgather in bwd + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.all_gather_into_tensor], 1 + ) + self.assertEqual(comm_mode.get_total_counts(), 2) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_rowwise_parallel_embedding(self): + mesh = init_device_mesh(self.device_type, (self.world_size,)) + + comm_mode = CommDebugMode() + tensor = torch.arange(8, device=self.device_type).reshape(4, 2) + model = nn.Embedding(16, 16, device=self.device_type) + + rowwise_mod = parallelize_module( + deepcopy(model), mesh, RowwiseParallel(input_layouts=Replicate()) + ) + with comm_mode: + out = rowwise_mod(tensor) + # ensure output shard on the last dim + self.assertEqual(out.shape, (4, 2, 16)) + # ensure allreduce communication happened in fwd + self.assertEqual(comm_mode.get_total_counts(), 1) + self.assertEqual(comm_mode.get_comm_counts()[c10d_functional.all_reduce], 1) + + out.sum().backward() + # no comm in bwd + self.assertEqual(comm_mode.get_total_counts(), 1) + + sharded_row_parallel = RowwiseParallel( + input_layouts=Replicate(), output_layouts=Shard(1) + ) + + rowwise_mod = parallelize_module(deepcopy(model), mesh, sharded_row_parallel) + + inp_indices = torch.arange(8, device=self.device_type) + with comm_mode: + out = rowwise_mod(inp_indices) + # ensure output shard on the last dim + self.assertEqual(out.shape, (8, 16 // self.world_size)) + # reduce scatter in fwd + self.assertEqual(comm_mode.get_total_counts(), 1) + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.reduce_scatter_tensor], 1 + ) + out.sum().backward() + # allgather comm in bwd + self.assertEqual(comm_mode.get_total_counts(), 2) + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.all_gather_into_tensor], 1 + ) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_prepare_module_input(self): + mesh = init_device_mesh(self.device_type, (self.world_size,)) + + tensor = torch.ones(2, 16, device=self.device_type) + expected_tensor = torch.ones(2 * self.world_size, 16, device=self.device_type) + prepare_inp_style = PrepareModuleInput( + input_layouts=Shard(0), desired_input_layouts=Replicate() + ) + + model = nn.Identity() + allgather_mod = parallelize_module(model, mesh, prepare_inp_style) + output = allgather_mod(tensor).full_tensor() + self.assertEqual(output, expected_tensor) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def 
test_prepare_module_input_multiple_inputs(self): + mesh = init_device_mesh(self.device_type, (self.world_size,)) + + class TestModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(8, 8) + + def forward(self, x, y): + return self.linear(x) + y + + # Raise assertion error if input_layouts and desired_input_layouts do not have same length. + test_mod = TestModule().to(self.device_type) + with self.assertRaisesRegex( + AssertionError, + "input_layouts and desired_input_layouts should have same length!", + ): + prepare_inps_dimension_mismatch = PrepareModuleInput( + input_layouts=Shard(0), desired_input_layouts=(Replicate(), None) + ) + # Raise assertion error if module inputs and input_layouts do not have same length. + prepare_inps_short_dimension = PrepareModuleInput( + input_layouts=Shard(0), desired_input_layouts=Replicate() + ) + parallelize_module(test_mod.linear, mesh, ColwiseParallel()) + parallelize_module(test_mod, mesh, prepare_inps_short_dimension) + with self.assertRaisesRegex( + ValueError, "module inputs and input_layouts should have same length!" + ): + output = test_mod( + torch.randn(2, 8, device=self.device_type), + torch.ones( + self.world_size * 2, 8 // self.world_size, device=self.device_type + ), + ) + + test_mod = TestModule().to(self.device_type) + prepare_inps = PrepareModuleInput( + input_layouts=(Shard(0), None), desired_input_layouts=(Replicate(), None) + ) + + parallelize_module(test_mod.linear, mesh, ColwiseParallel()) + parallelize_module(test_mod, mesh, prepare_inps) + output = test_mod( + torch.randn(2, 8, device=self.device_type), + torch.ones( + self.world_size * 2, 8 // self.world_size, device=self.device_type + ), + ) + self.assertEqual(output.shape, (self.world_size * 2, 8 // self.world_size)) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_prepare_module_kwargs_input(self): + mesh = init_device_mesh(self.device_type, (self.world_size,)) + + class TestKwargModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(8, 8) + + def forward(self, x, *, y, z=2): + return self.linear(x) + y + z + + test_mod = TestKwargModule().to(self.device_type) + prepare_inps_simple = PrepareModuleInput( + input_kwarg_layouts={"y": Shard(0)}, + desired_input_kwarg_layouts={"y": Replicate()}, + ) + parallelize_module( + test_mod.linear, mesh, ColwiseParallel(use_local_output=False) + ) + parallelize_module(test_mod, mesh, prepare_inps_simple) + + comm_mode = CommDebugMode() + with comm_mode: + output = test_mod( + torch.randn(1 * self.world_size, 8, device=self.device_type), + y=torch.ones(1, 8, device=self.device_type), + ) + + self.assertEqual(comm_mode.get_total_counts(), 1) + self.assertEqual(output.shape, (1 * self.world_size, 8)) + + class TestKwargOnlyModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(8, 8) + + def forward(self, *, x, y=2, z=None): + return self.linear(x) + y + z + + test_kwonly_mod = TestKwargOnlyModule().to(self.device_type) + prepare_inps_simple = PrepareModuleInput( + input_kwarg_layouts={"x": Shard(0), "z": Shard(0)}, + desired_input_kwarg_layouts={"x": Replicate(), "z": Replicate()}, + ) + parallelize_module( + test_kwonly_mod.linear, mesh, ColwiseParallel(use_local_output=False) + ) + parallelize_module(test_kwonly_mod, mesh, prepare_inps_simple) + + with comm_mode: + output = test_kwonly_mod( + x=torch.randn(1, 8, device=self.device_type), + z=torch.ones(1, 8, 
device=self.device_type), + ) + + self.assertEqual(comm_mode.get_total_counts(), 2) + self.assertEqual(output.shape, (1 * self.world_size, 8)) + + # test the case where x is a DTensor + x_dt = DTensor.from_local( + torch.randn(1, 8, device=self.device_type), mesh, [Shard(0)] + ) + with comm_mode: + output = test_kwonly_mod( + x=x_dt, z=torch.ones(1, 8, device=self.device_type) + ) + + self.assertEqual(comm_mode.get_total_counts(), 2) + self.assertEqual(output.shape, (1 * self.world_size, 8)) + + @with_comms + @skipIfUnsupportMultiNPU(2) + def test_prepare_module_output(self): + mesh = init_device_mesh(self.device_type, (self.world_size,)) + + tensor = torch.ones(8, 16, device=self.device_type) + expected_tensor = torch.ones(8 // self.world_size, 16, device=self.device_type) + prepare_out_style = PrepareModuleOutput( + output_layouts=Replicate(), desired_output_layouts=Shard(0) + ) + + model = nn.Identity() + chunk_mod = parallelize_module(model, mesh, prepare_out_style) + output = chunk_mod(tensor) + self.assertEqual(output, expected_tensor) + + @with_comms + @skipIfUnsupportMultiNPU(2) + @SupportedDevices(['Ascend910B']) + def test_sequence_parallel_style(self): + mesh = init_device_mesh(self.device_type, (self.world_size,)) + + comm_mode = CommDebugMode() + batch, N, embedding_dim = 20, 8, 12 + + global_input = torch.rand( + batch, + N * self.world_size, + embedding_dim, + device=self.device_type, + requires_grad=True, + ) + sharded_input = distribute_tensor(global_input, mesh, [Shard(1)]) + + # test LayerNorm + for elementwise_affine in [True, False]: + norm = nn.LayerNorm( + embedding_dim, + elementwise_affine=elementwise_affine, + device=self.device_type, + ) + sp_norm = parallelize_module(deepcopy(norm), mesh, SequenceParallel()) + + output = norm(global_input) + output.sum().backward() + + with comm_mode: + sharded_out = sp_norm(sharded_input) + grad_out = torch.ones_like(sharded_out) + sharded_out.backward(grad_out) + self.assertIsInstance(sharded_out, DTensor) + self.assertEqual(sharded_out.placements, (Shard(1),)) + self.assertEqual(comm_mode.get_total_counts(), 0) + self.assertEqual( + comm_mode.get_comm_counts()[c10d_functional.all_reduce], 0 + ) + if elementwise_affine: + self.assertEqual(sp_norm.weight.grad.placements, (_Partial(),)) + self.assertEqual(sp_norm.bias.grad.placements, (_Partial(),)) + + self.assertEqual(sharded_out.full_tensor(), output) + + # test RMSNorm + rmsnorm = RMSNormPython(embedding_dim).to(self.device_type) + sp_rmsnorm = parallelize_module(deepcopy(rmsnorm), mesh, SequenceParallel()) + + output = rmsnorm(global_input) + output.sum().backward() + + with comm_mode: + sharded_out = sp_rmsnorm(sharded_input) + grad_out = torch.ones_like(sharded_out) + sharded_out.backward(grad_out) + self.assertIsInstance(sharded_out, DTensor) + self.assertEqual(sharded_out.placements, (Shard(1),)) + self.assertEqual(sp_rmsnorm.weight.grad.placements, (_Partial(),)) + self.assertEqual(comm_mode.get_total_counts(), 0) + self.assertEqual(comm_mode.get_comm_counts()[c10d_functional.all_reduce], 0) + + self.assertEqual(sharded_out.full_tensor(), output) + + # test sharded on non-sequence dim input + sharded_batch_input = distribute_tensor(global_input, mesh, [Shard(0)]) + rmsnorm = RMSNormPython(embedding_dim).to(self.device_type) + sp_rmsnorm = parallelize_module(deepcopy(rmsnorm), mesh, SequenceParallel()) + + with comm_mode: + sharded_out = sp_rmsnorm(sharded_batch_input) + grad_out = torch.ones_like(sharded_out) + sharded_out.backward(grad_out) + 
self.assertIsInstance(sharded_out, DTensor) + # output still sharded on sequence dimension + self.assertEqual(sharded_out.placements, (Shard(1),)) + self.assertEqual(sp_rmsnorm.weight.grad.placements, (_Partial(),)) + # communication happens in both fwd/bwd to redistribute input + self.assertEqual(comm_mode.get_total_counts(), 2) + + +if __name__ == "__main__": + run_tests() -- Gitee From 14b1ba9093b58c97890a3b4c6871b117e41799ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Tue, 11 Mar 2025 11:24:17 +0000 Subject: [PATCH 129/358] =?UTF-8?q?!18800=20SilentCheck:=20Add=20false=20d?= =?UTF-8?q?etection=20test=20case=20for=20broadcast.=20Merge=20pull=20requ?= =?UTF-8?q?est=20!18800=20from=20=E7=8E=8B=E8=B6=85/v2.6.0=5Fsilentbroadca?= =?UTF-8?q?st?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/distributed/test_silent_check.py | 86 +++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 test/distributed/test_silent_check.py diff --git a/test/distributed/test_silent_check.py b/test/distributed/test_silent_check.py new file mode 100644 index 0000000000..60c4014955 --- /dev/null +++ b/test/distributed/test_silent_check.py @@ -0,0 +1,86 @@ +import unittest +from unittest.mock import patch +import os +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch_npu + +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_distributed import skipIfUnsupportMultiNPU + + +class HcclSilentCheckDistTest(TestCase): + world_size = 2 + + @classmethod + def _init_dist_hccl(cls, rank, world_size): + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '29500' + os.environ['HCCL_WHITELIST_DISABLE'] = '1' + # enable silent check + os.environ['NPU_ASD_ENABLE'] = '2' + torch_npu.npu.set_device(rank) + dist.init_process_group(backend='hccl', world_size=world_size, rank=rank) + return dist + + @classmethod + def _test_silent_check_broadcast_fp32_dist( + cls, rank, world_size, init_pg, c2p, p2c): + pg = init_pg(rank, world_size) + if rank == 1: + tensor = torch.ones((2, 2), dtype=torch.float).to(f"npu:{rank}") + else: + tensor = torch.full((2, 2), float('nan')).to(f"npu:{rank}") + torch_npu._C._npu_set_module_train_state("train") + torch_npu._C._npu_set_call_state("backward") + pg.broadcast(tensor, src=1) + c2p.put((rank, tensor.cpu())) + + def _test_multiprocess(self, f, init_pg, ws=0): + if not ws: + ws = self.world_size + # file store will delete the test file on destruction + ctx = mp.get_context('spawn') + c2p = ctx.Queue(2) + p2c = ctx.Queue(2) + ps = [] + expected = 0 + result = 1 + for i in range(ws): + p = ctx.Process( + target=f, + args=(i, ws, init_pg, c2p, p2c)) + p.start() + ps.append(p) + + for _ in range(2): + pid, output = c2p.get() + if pid == 0: + expected = output + else: + result = output + + self.assertEqual( + expected, + result, + ( + "Expect rank {} to receive tensor {} but got {}." 
+ ).format(pid, expected, result) + ) + + for _ in range(2): + p2c.put(0) + + for p in ps: + p.join(2) + + @skipIfUnsupportMultiNPU(2) + def test_silent_check_broadcast_fp32_dist(self): + self._test_multiprocess( + HcclSilentCheckDistTest._test_silent_check_broadcast_fp32_dist, + HcclSilentCheckDistTest._init_dist_hccl) + + +if __name__ == '__main__': + run_tests() -- Gitee From 1c21102999aa0366b543b0b482921b93bb5f327f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Tue, 11 Mar 2025 13:09:18 +0000 Subject: [PATCH 130/358] =?UTF-8?q?!18381=20Snapeshot=20data=20dump=20supp?= =?UTF-8?q?orts=20to=20deal=20with=20GE=20error=20Merge=20pull=20request?= =?UTF-8?q?=20!18381=20from=20=E6=9D=9C=E9=87=91=E8=88=AA/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/NPUException.cpp | 5 ++- torch_npu/csrc/core/npu/NPUQueue.cpp | 27 +++++++++++++ torch_npu/csrc/core/npu/NPUQueue.h | 5 +++ torch_npu/csrc/core/npu/NPUStream.cpp | 13 +++++++ torch_npu/csrc/core/npu/NPUStream.h | 2 + .../csrc/core/npu/register/OptionsManager.cpp | 38 +++++++++++-------- .../csrc/core/npu/register/OptionsManager.h | 4 +- 7 files changed, 76 insertions(+), 18 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUException.cpp b/torch_npu/csrc/core/npu/NPUException.cpp index 0b31d34906..97ce750b50 100644 --- a/torch_npu/csrc/core/npu/NPUException.cpp +++ b/torch_npu/csrc/core/npu/NPUException.cpp @@ -1,5 +1,6 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" @@ -100,7 +101,9 @@ void clear_mem_uce_info() const char *c10_npu_get_error_message() { - return c10_npu::acl::AclGetErrMsg(); + auto errmsg = c10_npu::acl::AclGetErrMsg(); + c10_npu::setRepoErrMsg(errmsg); + return errmsg; } void record_mem_hbm_ecc_error() diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index f3f681630c..83e33e3b84 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -279,6 +279,15 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) if (GetStatus() == RepoStatus::ERROR_EXIT) { // Avoid repeatedly throwing exceptions SetStatus(CAN_EXIT); + + if (c10_npu::option::OptionsManager::IsOomSnapshotEnable()) { + auto errmsg = GetQueueErrMsg(); + const char *memerror = "Failed to allocate memory"; + if (strstr(errmsg, memerror) != nullptr) { + c10_npu::option::oom_observer(); + } + } + #ifndef BUILD_LIBTORCH if (gilState) { PyEval_RestoreThread(gilState); @@ -445,6 +454,14 @@ void Repository::Enqueue(void* cur_paras) { // Avoid repeatedly throwing exceptions SetStatus(CAN_EXIT); + if (c10_npu::option::OptionsManager::IsOomSnapshotEnable()) { + auto errmsg = GetQueueErrMsg(); + const char *memerror = "Failed to allocate memory"; + if (strstr(errmsg, memerror) != nullptr) { + c10_npu::option::oom_observer(); + } + } + throw std::runtime_error("The Inner error is reported as above. " "The process exits for this inner error, and " + repo_error + ".\n" + "Since the operator is called asynchronously, the stacktrace may be inaccurate. 
" @@ -630,6 +647,16 @@ void Repository::ClearQueue() eventfd_write(efd_write, 1); } +void Repository::SetQueueErrMsg(const char *errmsg) +{ + error_msg = errmsg; +} + +const char* Repository::GetQueueErrMsg() +{ + return error_msg; +} + Repository::~Repository() { if (initialized) { if (consumer.joinable()) { diff --git a/torch_npu/csrc/core/npu/NPUQueue.h b/torch_npu/csrc/core/npu/NPUQueue.h index a7688c1375..f45f2811b6 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.h +++ b/torch_npu/csrc/core/npu/NPUQueue.h @@ -76,6 +76,8 @@ public: virtual bool CheckInit() const = 0; virtual std::string GetPara() = 0; virtual void ClearQueue() = 0; + virtual void SetQueueErrMsg(const char* errmsg) = 0; + virtual const char* GetQueueErrMsg() = 0; }; class NPUQueueFactoryBase { @@ -98,6 +100,8 @@ public: bool CheckInit() const override; std::string GetPara() override; void ClearQueue() override; + void SetQueueErrMsg(const char *errmsg) override; + const char* GetQueueErrMsg() override; private: void ReleaseResource(); @@ -117,6 +121,7 @@ private: int efd_write; int efd_empty; c10::DeviceIndex device_idx; + const char *error_msg; private: sring_idx read_idx; diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index a5ed2971a7..6504f5bccd 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -430,6 +430,19 @@ std::string getRepoInfo() return repo_info.str(); } +void setRepoErrMsg(const char* errmsg) +{ + for (auto i = decltype(num_npus){0}; i < num_npus; ++i) { + auto& default_streamsi = default_streams[i]; + if (default_streamsi.stream == nullptr) { + continue; + } + if (default_streamsi.stream != nullptr &&default_streamsi.repo->CheckInit()) { + default_streamsi.repo->SetQueueErrMsg(errmsg); + } + } +} + void setDefaultStreamsStatus(c10::DeviceIndex device_index, RepoStatus status) { if (status == c10_npu::RepoStatus::STOP_EXIT) { diff --git a/torch_npu/csrc/core/npu/NPUStream.h b/torch_npu/csrc/core/npu/NPUStream.h index cb8b9f7f10..294e7625cf 100644 --- a/torch_npu/csrc/core/npu/NPUStream.h +++ b/torch_npu/csrc/core/npu/NPUStream.h @@ -134,6 +134,8 @@ NPUStatus emptyAllNPUStream(bool check_error = true); std::string getRepoInfo(); +void setRepoErrMsg(const char* errmsg); + void setDefaultStreamsStatus(c10::DeviceIndex device_index, RepoStatus status); C10_NPU_API bool npuSynchronizeDevice(bool check_error = true); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index f1e3918ffc..22abad0d3d 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -550,9 +550,9 @@ bool OptionsManager::ShouldPrintWarning() return should_print; } -#ifndef BUILD_LIBTORCH void oom_observer(int64_t device, int64_t allocated, int64_t device_total, int64_t device_free) { +#ifndef BUILD_LIBTORCH auto dumppath = c10_npu::option::OptionsManager::GetOomSnapshotDumpPath(); std::stringstream filename; auto now = std::chrono::system_clock::now(); @@ -573,27 +573,33 @@ void oom_observer(int64_t device, int64_t allocated, int64_t device_total, int64 PyObject* p_args = PyTuple_New(1); PyTuple_SetItem(p_args, 0, PyUnicode_FromString(savefilepath.c_str())); PyObject* p_res = PyObject_CallObject(p_func, p_args); -} #endif +} -void OptionsManager::IsOomSnapshotEnable() +bool OptionsManager::IsOomSnapshotEnable() { -#ifndef BUILD_LIBTORCH - char* env_val = std::getenv("OOM_SNAPSHOT_ENABLE"); + static bool isFirstCall = true; 
+ char *env_val = std::getenv("OOM_SNAPSHOT_ENABLE"); int64_t envFlag = (env_val != nullptr) ? strtol(env_val, nullptr, 10) : 0; - switch (envFlag) { - case 0: - break; - case 2: - c10_npu::NPUCachingAllocator::attachOutOfMemoryObserver(std::move(oom_observer)); - torch_npu::_record_memory_history("state", "all", "python", UINT64_MAX); - break; - default: - c10_npu::NPUCachingAllocator::attachOutOfMemoryObserver(std::move(oom_observer)); - torch_npu::_record_memory_history("all", "all", "python", UINT64_MAX); - break; +#ifndef BUILD_LIBTORCH + if (isFirstCall) { + switch (envFlag) { + case 0: + break; + case 2: + c10_npu::NPUCachingAllocator::attachOutOfMemoryObserver(std::move(oom_observer)); + torch_npu::_record_memory_history("state", "all", "python", UINT64_MAX); + isFirstCall = false; + break; + default: + c10_npu::NPUCachingAllocator::attachOutOfMemoryObserver(std::move(oom_observer)); + torch_npu::_record_memory_history("all", "all", "python", UINT64_MAX); + isFirstCall = false; + break; + } } #endif + return (envFlag != 0); } } // namespace option diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 3f1d33224d..d5ff1562eb 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -129,7 +129,7 @@ public: static char* GetCpuAffinityConf(); static bool CheckForceUncached(); static std::string GetOomSnapshotDumpPath(); - static void IsOomSnapshotEnable(); + static bool IsOomSnapshotEnable(); static bool ShouldPrintWarning(); private: @@ -140,5 +140,7 @@ private: std::pair defaultThresh); }; +void oom_observer(int64_t device = 0, int64_t allocated = 0, int64_t device_total = 0, int64_t device_free = 0); + } // namespace option } // namespace c10_npu -- Gitee From f415784cd0e98d8ea2188db3529fa27af6630130 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 11 Mar 2025 13:45:30 +0000 Subject: [PATCH 131/358] !18812 Update op_plugin commit id Merge pull request !18812 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 303f505ce5..86b640511d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 303f505ce5b8094d314b5986f87f723a9210a86b +Subproject commit 86b640511dfc78798cff7690fef8deacc7cb048b -- Gitee From ef75fbff316c0a48d8aafd4be5b7c936ee6a3817 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Tue, 11 Mar 2025 14:13:24 +0000 Subject: [PATCH 132/358] !18793 Support npugraph. 
Merge pull request !18793 from jiangpengfei/v2.6.0 --- test/test_npu.py | 65 ++ third_party/acl/inc/acl/acl.h | 51 +- third_party/acl/inc/acl/acl_base.h | 3 +- third_party/acl/inc/acl/acl_mdl.h | 346 ++++++++- third_party/acl/inc/acl/acl_op.h | 15 +- third_party/acl/inc/acl/acl_op_compiler.h | 4 +- third_party/acl/inc/acl/acl_rt.h | 20 +- third_party/acl/inc/acl/acl_rt_allocator.h | 148 ++++ third_party/acl/inc/acl/acl_tdt.h | 41 +- third_party/acl/inc/acl/acl_tdt_queue.h | 145 +++- torch_npu/csrc/InitNpuBindings.cpp | 4 + torch_npu/csrc/aten/NPUGeneratorImpl.cpp | 86 ++- .../csrc/core/npu/NPUCachingAllocator.cpp | 662 ++++++++++++++---- torch_npu/csrc/core/npu/NPUCachingAllocator.h | 98 ++- torch_npu/csrc/core/npu/NPUFunctions.cpp | 16 + torch_npu/csrc/core/npu/NPUFunctions.h | 2 + torch_npu/csrc/core/npu/NPUGraph.cpp | 234 +++++++ torch_npu/csrc/core/npu/NPUGraph.h | 71 ++ torch_npu/csrc/core/npu/NPUGraphsUtils.h | 103 +++ .../csrc/core/npu/interface/AclInterface.cpp | 99 ++- .../csrc/core/npu/interface/AclInterface.h | 15 + .../csrc/distributed/ProcessGroupHCCL.cpp | 43 +- torch_npu/csrc/framework/OpCommand.cpp | 6 + torch_npu/csrc/framework/OpParamMaker.h | 5 + torch_npu/csrc/npu/Graph.cpp | 66 ++ torch_npu/csrc/npu/MemPool.cpp | 21 + torch_npu/csrc/npu/Module.cpp | 37 + torch_npu/csrc/npu/NPUPluggableAllocator.cpp | 32 + torch_npu/csrc/npu/NPUPluggableAllocator.h | 11 + torch_npu/npu/__init__.py | 18 +- torch_npu/npu/graphs.py | 468 +++++++++++++ torch_npu/npu/memory.py | 85 ++- 32 files changed, 2765 insertions(+), 255 deletions(-) mode change 100644 => 100755 third_party/acl/inc/acl/acl.h mode change 100644 => 100755 third_party/acl/inc/acl/acl_base.h mode change 100644 => 100755 third_party/acl/inc/acl/acl_mdl.h mode change 100644 => 100755 third_party/acl/inc/acl/acl_op.h mode change 100644 => 100755 third_party/acl/inc/acl/acl_op_compiler.h mode change 100644 => 100755 third_party/acl/inc/acl/acl_rt.h create mode 100644 third_party/acl/inc/acl/acl_rt_allocator.h mode change 100644 => 100755 third_party/acl/inc/acl/acl_tdt.h create mode 100644 torch_npu/csrc/core/npu/NPUGraph.cpp create mode 100644 torch_npu/csrc/core/npu/NPUGraph.h create mode 100644 torch_npu/csrc/core/npu/NPUGraphsUtils.h create mode 100644 torch_npu/csrc/npu/Graph.cpp create mode 100644 torch_npu/csrc/npu/MemPool.cpp create mode 100644 torch_npu/npu/graphs.py diff --git a/test/test_npu.py b/test/test_npu.py index bed7240515..e2e2391a5a 100644 --- a/test/test_npu.py +++ b/test/test_npu.py @@ -3,6 +3,7 @@ from itertools import product, chain import collections import contextlib +import ctypes from copy import deepcopy import gc import os @@ -4220,6 +4221,70 @@ class TestBlockStateAbsorption(TestCase): self.assertEqual(rc, "False", "Triton was imported when importing torch!") +@unittest.skipIf(not TEST_PRIVATEUSE1, "npu not available, skipping tests") +class TestMemPool(TestCase): + def test_mempool_id(self): + pool1 = torch_npu.npu.graph_pool_handle() + pool2 = torch_npu.npu.MemPool().id + + # first value of id in a user created pool is always zero + self.assertEqual(pool1[0] == 0, pool2[0] == 0) + + # each call to torch_npu.npu.graph_pool_handle() or torch_npu.npu.MemPool() + # increments the id + self.assertTrue(abs(pool2[1] - pool1[1]) > 0) + + def test_mempool_context(self): + active_pool = torch_npu.npu.MemPoolContext.active_pool() + + # there is no active pool if none was made active + self.assertEqual(active_pool, None) + + pool = torch_npu.npu.MemPool() + ctx = torch_npu.npu.MemPoolContext(pool) + 
active_pool = torch_npu.npu.MemPoolContext.active_pool() + + # pool was made active + self.assertEqual(active_pool, pool) + + del ctx + active_pool = torch_npu.npu.MemPoolContext.active_pool() + + # ctx was deleted, so active pool is the previous one + self.assertEqual(active_pool, None) + + def test_mempool_multithread(self): + pool_ids = [] + active_pool_ids = [] + + def create_mempool_and_make_active(): + pool = torch_npu.npu.MemPool() + pool_ids.extend([pool.id]) + + ctx = torch_npu.npu.MemPoolContext(pool) + active_pool = torch_npu.npu.MemPoolContext.active_pool() + active_pool_ids.extend([active_pool.id]) + del ctx + + num_threads = 4 + threads = [ + threading.Thread(target=create_mempool_and_make_active) + for t in range(num_threads) + ] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + # each thread should create a unique mempool, since + # mempool id creation is atomic + self.assertEqual(len(set(pool_ids)), 4) + + # each thread should have different active mempool, since + # the pointer to the mempool is thread local + self.assertEqual(len(set(active_pool_ids)), 4) + + instantiate_parametrized_tests(TestNpu) if __name__ == '__main__': diff --git a/third_party/acl/inc/acl/acl.h b/third_party/acl/inc/acl/acl.h old mode 100644 new mode 100755 index 01f7028dff..95abdb6368 --- a/third_party/acl/inc/acl/acl.h +++ b/third_party/acl/inc/acl/acl.h @@ -1,7 +1,7 @@ /** * @file acl.h * -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. All Rights Reserved. +* Copyright (c) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -19,10 +19,12 @@ extern "C" { #endif -// Current version is 1.2.0 -#define ACL_MAJOR_VERSION 1 -#define ACL_MINOR_VERSION 2 -#define ACL_PATCH_VERSION 0 +// Current version is 1.12.0 +#define ACL_MAJOR_VERSION 1 +#define ACL_MINOR_VERSION 12 +#define ACL_PATCH_VERSION 0 +#define ACL_PKG_VERSION_MAX_SIZE 128 +#define ACL_PKG_VERSION_PARTS_MAX_SIZE 64 /** * @ingroup AscendCL @@ -69,6 +71,45 @@ ACL_FUNC_VISIBILITY aclError aclrtGetVersion(int32_t *majorVersion, int32_t *min */ ACL_FUNC_VISIBILITY const char *aclGetRecentErrMsg(); +/** + * @ingroup AscendCL + * @brief enum for CANN package name + */ +typedef enum aclCANNPackageName_ { + ACL_PKG_NAME_CANN, + ACL_PKG_NAME_RUNTIME, + ACL_PKG_NAME_COMPILER, + ACL_PKG_NAME_HCCL, + ACL_PKG_NAME_TOOLKIT, + ACL_PKG_NAME_OPP, + ACL_PKG_NAME_OPP_KERNEL, + ACL_PKG_NAME_DRIVER +} aclCANNPackageName; + +/** + * @ingroup AscendCL + * @brief struct for storaging CANN package version + */ +typedef struct aclCANNPackageVersion_ { + char version[ACL_PKG_VERSION_MAX_SIZE]; + char majorVersion[ACL_PKG_VERSION_PARTS_MAX_SIZE]; + char minorVersion[ACL_PKG_VERSION_PARTS_MAX_SIZE]; + char releaseVersion[ACL_PKG_VERSION_PARTS_MAX_SIZE]; + char patchVersion[ACL_PKG_VERSION_PARTS_MAX_SIZE]; + char reserved[ACL_PKG_VERSION_MAX_SIZE]; +} aclCANNPackageVersion; + +/** + * @ingroup AscendCL + * @brief query CANN package version + * + * @param name[IN] CANN package name + * @param version[OUT] CANN package version information + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval ACL_ERROR_INVALID_FILE Failure + */ +ACL_FUNC_VISIBILITY aclError aclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *version); + #ifdef __cplusplus } #endif diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h old mode 100644 new mode 100755 index 9a9bc4a988..9780a01f8f --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -114,6 +114,7 @@ static const int ACL_ERROR_INVALID_OPP_PATH = 148049; static const int ACL_ERROR_OP_UNSUPPORTED_DYNAMIC = 148050; static const int ACL_ERROR_RELATIVE_RESOURCE_NOT_CLEARED = 148051; static const int ACL_ERROR_UNSUPPORTED_JPEG = 148052; +static const int ACL_ERROR_INVALID_BUNDLE_MODEL_ID = 148053; static const int ACL_ERROR_BAD_ALLOC = 200000; static const int ACL_ERROR_API_NOT_SUPPORT = 200001; @@ -715,7 +716,7 @@ ACL_FUNC_VISIBILITY aclError aclGetCannAttribute(aclCannAttr cannAttr, int32_t * * @brief Get capability value of the specified device * * @param deviceId [IN] device id - * @param infoType [IN] device capability to query + * @param deviceInfo [IN] device capability to query * @param value [OUT] returned device capability value * * @retval ACL_SUCCESS The function is successfully executed. diff --git a/third_party/acl/inc/acl/acl_mdl.h b/third_party/acl/inc/acl/acl_mdl.h old mode 100644 new mode 100755 index fb1112d5f4..247ccc9f57 --- a/third_party/acl/inc/acl/acl_mdl.h +++ b/third_party/acl/inc/acl/acl_mdl.h @@ -1,7 +1,7 @@ /** * @file acl_mdl.h * -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. All Rights Reserved. +* Copyright (c) Huawei Technologies Co., Ltd. 2019-2023. All rights reserved. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -21,6 +21,7 @@ extern "C" { #endif +#define ACL_DIM_ENDPOINTS 2 #define ACL_MAX_DIM_CNT 128 #define ACL_MAX_TENSOR_NAME_LEN 128 #define ACL_MAX_BATCH_NUM 128 @@ -39,28 +40,33 @@ extern "C" { #define ACL_DYNAMIC_AIPP_NAME "ascend_dynamic_aipp_data" #define ACL_ATTR_NAME_DATA_DUMP_ORIGIN_OP_NAMES "_datadump_original_op_names" +/* used for ACL_MDL_WORKSPACE_MEM_OPTIMIZE */ +#define ACL_WORKSPACE_MEM_OPTIMIZE_DEFAULT 0 +#define ACL_WORKSPACE_MEM_OPTIMIZE_INPUTOUTPUT 1 + typedef struct aclmdlDataset aclmdlDataset; typedef struct aclmdlDesc aclmdlDesc; typedef struct aclmdlAIPP aclmdlAIPP; typedef struct aclAippExtendInfo aclAippExtendInfo; typedef struct aclmdlConfigHandle aclmdlConfigHandle; +typedef struct aclmdlExecConfigHandle aclmdlExecConfigHandle; typedef enum { ACL_YUV420SP_U8 = 1, - ACL_XRGB8888_U8, - ACL_RGB888_U8, - ACL_YUV400_U8, - ACL_NC1HWC0DI_FP16, - ACL_NC1HWC0DI_S8, - ACL_ARGB8888_U8, - ACL_YUYV_U8, - ACL_YUV422SP_U8, - ACL_AYUV444_U8, - ACL_RAW10, - ACL_RAW12, - ACL_RAW16, - ACL_RAW24, - ACL_AIPP_RESERVED = 0xffff, + ACL_XRGB8888_U8 = 2, + ACL_RGB888_U8 = 3, + ACL_YUV400_U8 = 4, + ACL_NC1HWC0DI_FP16 = 5, + ACL_NC1HWC0DI_S8 = 6, + ACL_ARGB8888_U8 = 7, + ACL_YUYV_U8 = 8, + ACL_YUV422SP_U8 = 9, + ACL_AYUV444_U8 = 10, + ACL_RAW10 = 11, + ACL_RAW12 = 12, + ACL_RAW16 = 13, + ACL_RAW24 = 14, + ACL_AIPP_RESERVED = 0xFFFF, } aclAippInputFormat; typedef enum { @@ -76,9 +82,35 @@ typedef enum { ACL_MDL_INPUTQ_NUM_SIZET, ACL_MDL_INPUTQ_ADDR_PTR, /**< pointer to inputQ with shallow copy */ ACL_MDL_OUTPUTQ_NUM_SIZET, - ACL_MDL_OUTPUTQ_ADDR_PTR /**< pointer to outputQ with shallow copy */ + ACL_MDL_OUTPUTQ_ADDR_PTR, /**< pointer to outputQ with shallow copy */ + ACL_MDL_WORKSPACE_MEM_OPTIMIZE, + ACL_MDL_WEIGHT_PATH_PTR, 
/**< pointer to weight path with deep copy */ + ACL_MDL_MODEL_DESC_PTR, /**< pointer to model desc of model with shallow copy */ + ACL_MDL_MODEL_DESC_SIZET, + ACL_MDL_KERNEL_PTR, /**< pointer to kernel bin of model with shallow copy */ + ACL_MDL_KERNEL_SIZET, + ACL_MDL_KERNEL_ARGS_PTR, /**< pointer to kernel args of model with shallow copy */ + ACL_MDL_KERNEL_ARGS_SIZET, + ACL_MDL_STATIC_TASK_PTR, /**< pointer to static task desc of model with shallow copy */ + ACL_MDL_STATIC_TASK_SIZET, + ACL_MDL_DYNAMIC_TASK_PTR, /**< pointer to dynamic task desc of model with shallow copy */ + ACL_MDL_DYNAMIC_TASK_SIZET, + ACL_MDL_MEM_MALLOC_POLICY_SIZET, + ACL_MDL_FIFO_PTR, /**< pointer to fifo memory of model with shallow copy */ + ACL_MDL_FIFO_SIZET } aclmdlConfigAttr; +typedef enum { + ACL_MDL_STREAM_SYNC_TIMEOUT = 0, + ACL_MDL_EVENT_SYNC_TIMEOUT, + ACL_MDL_WORK_ADDR_PTR, /**< param */ + ACL_MDL_WORK_SIZET, /**< param */ + ACL_MDL_MPAIMID_SIZET, /**< param reserved */ + ACL_MDL_AICQOS_SIZET, /**< param reserved */ + ACL_MDL_AICOST_SIZET, /**< param reserved */ + ACL_MDL_MEC_TIMETHR_SIZET /**< param reserved */ +} aclmdlExecConfigAttr; + typedef enum { ACL_DATA_WITHOUT_AIPP = 0, ACL_DATA_WITH_STATIC_AIPP, @@ -88,10 +120,15 @@ typedef enum { typedef struct aclmdlIODims { char name[ACL_MAX_TENSOR_NAME_LEN]; /**< tensor name */ - size_t dimCount; /**< dim array count */ + size_t dimCount; /**< dim array count */ int64_t dims[ACL_MAX_DIM_CNT]; /**< dim data array */ } aclmdlIODims; +typedef struct aclmdlIODimsRange { + size_t rangeCount; /**< dim range array count */ + int64_t range[ACL_MAX_DIM_CNT][ACL_DIM_ENDPOINTS]; /**< range data array */ +} aclmdlIODimsRange; + typedef struct aclAippDims { aclmdlIODims srcDims; /**< input dims before model transform */ size_t srcSize; /**< input size before model transform */ @@ -165,6 +202,30 @@ typedef struct aclAippInfo { aclAippExtendInfo *aippExtend; /**< reserved parameters, current version needs to be null */ } aclAippInfo; +typedef struct aclmdlExeOMDesc { + size_t workSize; + size_t weightSize; + size_t modelDescSize; + size_t kernelSize; + size_t kernelArgsSize; + size_t staticTaskSize; + size_t dynamicTaskSize; + size_t fifoTaskSize; + size_t reserved[8]; +} aclmdlExeOMDesc; + +typedef enum { + ACL_MODEL_CAPTURE_MODE_GLOBAL = 0, + ACL_MODEL_CAPTURE_MODE_THREAD_LOCAL, + ACL_MODEL_CAPTURE_MODE_RELAXED, +} aclmdlCaptureMode; + +typedef enum { + ACL_MODEL_CAPTURE_STATUS_NONE = 0, + ACL_MODEL_CAPTURE_STATUS_ACTIVE, + ACL_MODEL_CAPTURE_STATUS_INVALIDATED, +} aclmdlCaptureStatus; + /** * @ingroup AscendCL * @brief Create data of type aclmdlDesc @@ -196,6 +257,31 @@ ACL_FUNC_VISIBILITY aclError aclmdlDestroyDesc(aclmdlDesc *modelDesc); */ ACL_FUNC_VISIBILITY aclError aclmdlGetDesc(aclmdlDesc *modelDesc, uint32_t modelId); +/** + * @ingroup AscendCL + * @brief Get aclmdlDesc data of the model according to the model path + * + * @param modelDesc [OUT] aclmdlDesc pointer + * @param modelPath [IN] model path + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlGetDescFromFile(aclmdlDesc *modelDesc, const char *modelPath); + +/** + * @ingroup AscendCL + * @brief Get aclmdlDesc data of the model according to the model and modelSize + * + * @param modelDesc [OUT] aclmdlDesc pointer + * @param model [IN] model pointer + * @param modelSize [IN] model size + * + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlGetDescFromMem(aclmdlDesc *modelDesc, const void *model, size_t modelSize); + /** * @ingroup AscendCL * @brief Get the number of the inputs of @@ -244,6 +330,25 @@ ACL_FUNC_VISIBILITY size_t aclmdlGetInputSizeByIndex(aclmdlDesc *modelDesc, size */ ACL_FUNC_VISIBILITY size_t aclmdlGetOutputSizeByIndex(aclmdlDesc *modelDesc, size_t index); +/** + * @ingroup AscendCL + * @brief Create config handle of execute + * + * @retval the aclmdlCreateExecConfigHandle pointer + */ +ACL_FUNC_VISIBILITY aclmdlExecConfigHandle *aclmdlCreateExecConfigHandle(); + +/** + * @ingroup AscendCL + * @brief Destroy config handle of model execute + * + * @param handle [IN] Pointer to aclmdlExecConfigHandle to be destroyed + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlDestroyExecConfigHandle(const aclmdlExecConfigHandle *handle); + /** * @ingroup AscendCL * @brief Create data of type aclmdlDataset @@ -343,6 +448,83 @@ ACL_FUNC_VISIBILITY aclDataBuffer *aclmdlGetDatasetBuffer(const aclmdlDataset *d */ ACL_FUNC_VISIBILITY aclError aclmdlLoadFromFile(const char *modelPath, uint32_t *modelId); +/** + * @ingroup AscendCL + * @brief Load offline bundle model data from file + * and manage memory internally by the system + * + * @par Function + * After the system finishes loading the bundle model, + * the bundle model ID returned is used as a mark to identify the bundle model + * during subsequent operations + * + * @param modelPath [IN] Storage path for offline bundle model file + * @param bundleId [OUT] Bundle model id generated after + * the system finishes loading the bundle model + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlBundleLoadFromFile(const char *modelPath, uint32_t *bundleId); + +/** + * @ingroup AscendCL + * @brief Load offline bundle model data from memory and manage the memory of + * model running internally by the system + * + * @par Function + * After the system finishes loading the bundle model, + * the bundle model ID returned is used as a mark to identify the bundle model + * during subsequent operations + * + * @param model [IN] Bundle model data stored in memory + * @param modelSize [IN] model data size + * @param bundleId [OUT] Bundle model id generated after + * the system finishes loading the model + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlBundleLoadFromMem(const void *model, size_t modelSize, uint32_t *bundleId); + +/** + * @ingroup AscendCL + * @brief unload bundle model with bundle model id + * + * @param bundleId [IN] bundle model id to be unloaded + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlBundleUnload(uint32_t bundleId); + +/** + * @ingroup AscendCL + * @brief get bundle model inner model nums + * + * @param bundleId [IN] bundle id acquired by aclmdlBundleLoadFromFile or aclmdlBundleLoadFromMem + * @param modelNum [OUT] the pointer to model num + * + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + * + */ +ACL_FUNC_VISIBILITY aclError aclmdlBundleGetModelNum(uint32_t bundleId, size_t *modelNum); + +/** + * @ingroup AscendCL + * @brief get inner model id by index + * + * @param bundleId [IN] bundle id acquired by aclmdlBundleLoadFromFile or aclmdlBundleLoadFromMem + * @param index [IN] index of bundle models + * @param modelId [OUT] the pointer to inner model id which to be executed + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + */ +ACL_FUNC_VISIBILITY aclError aclmdlBundleGetModelId(uint32_t bundleId, size_t index, uint32_t *modelId); + /** * @ingroup AscendCL * @brief Load offline model data from memory and manage the memory of @@ -361,8 +543,7 @@ ACL_FUNC_VISIBILITY aclError aclmdlLoadFromFile(const char *modelPath, uint32_t * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlLoadFromMem(const void *model, size_t modelSize, - uint32_t *modelId); +ACL_FUNC_VISIBILITY aclError aclmdlLoadFromMem(const void *model, size_t modelSize, uint32_t *modelId); /** * @ingroup AscendCL @@ -462,6 +643,37 @@ ACL_FUNC_VISIBILITY aclError aclmdlLoadFromMemWithQ(const void *model, size_t mo */ ACL_FUNC_VISIBILITY aclError aclmdlExecute(uint32_t modelId, const aclmdlDataset *input, aclmdlDataset *output); +/** + * @ingroup AscendCL + * @brief Execute model synchronous inference until the inference result is returned + * + * @param modelId [IN] ID of the model to perform inference + * @param input [IN] Input data for model inference + * @param output [OUT] Output data for model inference + * @param stream [IN] stream + * @param handle [IN] config of model execute + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlExecuteV2(uint32_t modelId, const aclmdlDataset *input, aclmdlDataset *output, + aclrtStream stream, const aclmdlExecConfigHandle *handle); + +/** + * @ingroup AscendCL + * @brief Execute model asynchronous inference until the inference result is returned + * + * @param modelId [IN] ID of the model to perform inference + * @param input [IN] Input data for model inference + * @param output [OUT] Output data for model inference + * @param stream [IN] stream + * @param handle [IN] config of model execute + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlExecuteAsyncV2(uint32_t modelId, const aclmdlDataset *input, aclmdlDataset *output, + aclrtStream stream, const aclmdlExecConfigHandle *handle); /** * @ingroup AscendCL * @brief Execute model asynchronous inference until the inference result is returned @@ -505,6 +717,19 @@ ACL_FUNC_VISIBILITY aclError aclmdlUnload(uint32_t modelId); */ ACL_FUNC_VISIBILITY aclError aclmdlQuerySize(const char *fileName, size_t *workSize, size_t *weightSize); +/** + * @ingroup AscendCL + * @brief Get the size of each partition and working memory size + * required for model execution according to the model file + * + * @param fileName [IN] Model path to get memory information + * @param aclmdlExeOMDesc [OUT] The size of each partition and working memory size + * + * @retval ACL_SUCCESS The function is successfully executed. 
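As a reading aid for the bundle-model APIs declared above, a hedged usage sketch follows; it is not part of the patch, error handling is reduced to early returns, and the bundle path plus the omitted execute step are placeholders.

    // Sketch: load a bundle .om file, enumerate its inner models, then unload.
    #include "acl/acl.h"
    #include "acl/acl_mdl.h"

    aclError run_bundle(const char *bundlePath)
    {
        uint32_t bundleId = 0;
        aclError ret = aclmdlBundleLoadFromFile(bundlePath, &bundleId);
        if (ret != ACL_SUCCESS) {
            return ret;
        }

        size_t modelNum = 0;
        ret = aclmdlBundleGetModelNum(bundleId, &modelNum);
        for (size_t i = 0; ret == ACL_SUCCESS && i < modelNum; ++i) {
            uint32_t modelId = 0;
            ret = aclmdlBundleGetModelId(bundleId, i, &modelId);
            // modelId can now be passed to the regular aclmdlExecute* APIs.
        }

        (void)aclmdlBundleUnload(bundleId);
        return ret;
    }
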
+ * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlQueryExeOMDesc(const char *fileName, aclmdlExeOMDesc *mdlPartitionSize); + /** * @ingroup AscendCL * @brief Obtain the weights required for @@ -612,6 +837,20 @@ ACL_FUNC_VISIBILITY aclError aclmdlGetInputDims(const aclmdlDesc *modelDesc, siz */ ACL_FUNC_VISIBILITY aclError aclmdlGetInputDimsV2(const aclmdlDesc *modelDesc, size_t index, aclmdlIODims *dims); +/** + * @ingroup AscendCL + * @brief get input dims range info + * + * @param modelDesc [IN] model description + * @param index [IN] input tensor index + * @param dimsRange [OUT] dims range info + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlGetInputDimsRange(const aclmdlDesc *modelDesc, size_t index, + aclmdlIODimsRange *dimsRange); + /** * @ingroup AscendCL * @brief get output dims info @@ -653,7 +892,7 @@ ACL_FUNC_VISIBILITY aclError aclmdlGetCurOutputDims(const aclmdlDesc *modelDesc, * @param modelDesc [IN] model description * @param opName [IN] op name * @param attr [IN] attr name - * + * * @retval the attr value */ ACL_FUNC_VISIBILITY const char *aclmdlGetOpAttr(aclmdlDesc *modelDesc, const char *opName, const char *attr); @@ -825,6 +1064,18 @@ ACL_FUNC_VISIBILITY aclmdlAIPP *aclmdlCreateAIPP(uint64_t batchSize); */ ACL_FUNC_VISIBILITY aclError aclmdlDestroyAIPP(const aclmdlAIPP *aippParmsSet); +/** + * @ingroup AscendCL + * @brief Get dynamic aipp data need size according to batchSize + * + * @param batchSize [IN] batchsizes of model + * @param size [OUT] Pointer of aipp data need size according to batchSize + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlGetAippDataSize(uint64_t batchSize, size_t *size); + /** * @ingroup AscendCL * @brief set InputFormat of type aclmdlAIPP @@ -1237,6 +1488,21 @@ ACL_FUNC_VISIBILITY aclError aclmdlDestroyConfigHandle(aclmdlConfigHandle *handl ACL_FUNC_VISIBILITY aclError aclmdlSetConfigOpt(aclmdlConfigHandle *handle, aclmdlConfigAttr attr, const void *attrValue, size_t valueSize); +/** + * @ingroup AscendCL + * @brief set config for model execute + * + * @param handle [OUT] pointer to model execute config handle + * @param attr [IN] config attr in model execute config handle to be set + * @param attrValue [IN] pointer to model execute config value + * @param valueSize [IN] memory size of attrValue + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlSetExecConfigOpt(aclmdlExecConfigHandle *handle, aclmdlExecConfigAttr attr, + const void *attrValue, size_t valueSize); + /** * @ingroup AscendCL * @brief get real tensor name from modelDesc @@ -1249,6 +1515,46 @@ ACL_FUNC_VISIBILITY aclError aclmdlSetConfigOpt(aclmdlConfigHandle *handle, aclm */ ACL_FUNC_VISIBILITY const char *aclmdlGetTensorRealName(const aclmdlDesc *modelDesc, const char *name); +/** + * @ingroup AscendCL + * @brief begin capture + * @param stream [IN] set the stream to be captured + * @param mode [IN] capture mode + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlBeginCapture(aclrtStream stream, aclmdlCaptureMode mode); + +/** + * @ingroup AscendCL + * @brief obtain the capture information of a stream + * @param stream [IN] stream to be queried + * @param status [OUT] return the stream status + * @param modelId [OUT] return the model id + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlGetCaptureInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); + +/** + * @ingroup AscendCL + * @brief end the stream capture and obtain the corresponding model + * @param stream [IN] stream to be ended + * @param modelId [OUT] return the model id + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlEndCapture(aclrtStream stream, uint32_t *modelId); + +/** + * @ingroup AscendCL + * @brief print model information + * @param modelId [IN] model information needs to be printed + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlDebugPrint(uint32_t modelId); + #ifdef __cplusplus } #endif diff --git a/third_party/acl/inc/acl/acl_op.h b/third_party/acl/inc/acl/acl_op.h old mode 100644 new mode 100755 index 6ccb6e39f4..f0894a35d2 --- a/third_party/acl/inc/acl/acl_op.h +++ b/third_party/acl/inc/acl/acl_op.h @@ -1,7 +1,7 @@ /** * @file acl_op.h * -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. All Rights Reserved. +* Copyright (c) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -538,6 +538,17 @@ ACL_FUNC_VISIBILITY aclError aclopUpdateParams(const char *opType, const aclTensorDesc *const outputDesc[], const aclopAttr *attr); +/** + * @ingroup AscendCL + * @brief set max op queue num + * + * @param maxOpNum [IN] number of max op queue + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclopSetMaxOpQueueNum(uint64_t maxOpNum); + /** * @ingroup AscendCL * @brief inferShape the specified operator synchronously @@ -569,7 +580,7 @@ ACL_FUNC_VISIBILITY aclError aclopInferShape(const char *opType, * @brief Enable the dump function of the corresponding dump type. * * @param dumpType [IN] type of dump - * @param path [IN] dump path + * @param path [IN] dump path * * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure diff --git a/third_party/acl/inc/acl/acl_op_compiler.h b/third_party/acl/inc/acl/acl_op_compiler.h old mode 100644 new mode 100755 index 7664fa0caf..0d87d98f24 --- a/third_party/acl/inc/acl/acl_op_compiler.h +++ b/third_party/acl/inc/acl/acl_op_compiler.h @@ -1,7 +1,7 @@ /** * @file acl_op_compiler.h * -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. All Rights Reserved. +* Copyright (c) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved. 
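The stream-capture entry points added to acl_mdl.h above (aclmdlBeginCapture, aclmdlGetCaptureInfo, aclmdlEndCapture, aclmdlDebugPrint) are the primitives the later NPUGraph changes build on. Below is a sketch of the expected call order, based only on the declarations and their doc comments; replaying the captured model is omitted.

    // Sketch: capture work submitted to a stream into a model, then query it.
    #include "acl/acl.h"
    #include "acl/acl_mdl.h"
    #include "acl/acl_rt.h"

    aclError capture_stream(aclrtStream stream)
    {
        aclError ret = aclmdlBeginCapture(stream, ACL_MODEL_CAPTURE_MODE_THREAD_LOCAL);
        if (ret != ACL_SUCCESS) {
            return ret;
        }

        // ... launch the operators to be captured on `stream` here ...

        aclmdlCaptureStatus status = ACL_MODEL_CAPTURE_STATUS_NONE;
        uint32_t modelId = 0;
        ret = aclmdlGetCaptureInfo(stream, &status, &modelId);
        if (ret != ACL_SUCCESS || status != ACL_MODEL_CAPTURE_STATUS_ACTIVE) {
            return ret;
        }

        ret = aclmdlEndCapture(stream, &modelId);
        if (ret == ACL_SUCCESS) {
            (void)aclmdlDebugPrint(modelId);  // optional: dump the captured model
        }
        return ret;
    }
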
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -25,7 +25,7 @@ typedef enum aclCompileType { typedef enum { ACL_PRECISION_MODE, ACL_AICORE_NUM, - ACL_AUTO_TUNE_MODE, + ACL_AUTO_TUNE_MODE, // The auto_tune_mode has been discarded ACL_OP_SELECT_IMPL_MODE, ACL_OPTYPELIST_FOR_IMPLMODE, ACL_OP_DEBUG_LEVEL, diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h old mode 100644 new mode 100755 index 01af27ca2d..30d796f464 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -638,16 +638,16 @@ ACL_FUNC_VISIBILITY aclError aclrtRecordEvent(aclrtEvent event, aclrtStream stre */ ACL_FUNC_VISIBILITY aclError aclrtResetEvent(aclrtEvent event, aclrtStream stream); - /** - * @ingroup AscendCL - * @brief Queries an event's status - * - * @param event [IN] event to query - * @param status [OUT] event status - * - * @retval ACL_SUCCESS The function is successfully executed. - * @retval OtherValues Failure - */ +/** +* @ingroup AscendCL +* @brief Queries an event's status +* +* @param event [IN] event to query +* @param status [OUT] event status +* +* @retval ACL_SUCCESS The function is successfully executed. +* @retval OtherValues Failure +*/ ACL_DEPRECATED_MESSAGE("aclrtQueryEvent is deprecated, use aclrtQueryEventStatus instead") ACL_FUNC_VISIBILITY aclError aclrtQueryEvent(aclrtEvent event, aclrtEventStatus *status); diff --git a/third_party/acl/inc/acl/acl_rt_allocator.h b/third_party/acl/inc/acl/acl_rt_allocator.h new file mode 100644 index 0000000000..ab4c0da3d0 --- /dev/null +++ b/third_party/acl/inc/acl/acl_rt_allocator.h @@ -0,0 +1,148 @@ +/** +* @file acl_rt_allocator.h +* +* Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#ifndef INC_EXTERNAL_ACL_ACL_RT_ALLOCATOR_H_ +#define INC_EXTERNAL_ACL_ACL_RT_ALLOCATOR_H_ + +#include "acl_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void *aclrtAllocatorDesc; +typedef void *aclrtAllocator; +typedef void *aclrtAllocatorBlock; +typedef void *aclrtAllocatorAddr; + +typedef void *(*aclrtAllocatorAllocFunc)(aclrtAllocator allocator, size_t size); +typedef void (*aclrtAllocatorFreeFunc)(aclrtAllocator allocator, aclrtAllocatorBlock block); +typedef void *(*aclrtAllocatorAllocAdviseFunc)(aclrtAllocator allocator, size_t size, aclrtAllocatorAddr addr); +typedef void *(*aclrtAllocatorGetAddrFromBlockFunc)(aclrtAllocatorBlock block); + +/** + * @ingroup AscendCL + * @brief Create allocator description + * + * @retval null for failed + * @retval OtherValues success + * + * @see aclrtAllocatorDestroyDesc + */ +ACL_FUNC_VISIBILITY aclrtAllocatorDesc aclrtAllocatorCreateDesc(); + +/** + * @ingroup AscendCL + * @brief Relese allocator description + * + * @param allocatorDesc [IN] allocator description + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see aclrtAllocatorCreateDesc + */ +ACL_FUNC_VISIBILITY aclError aclrtAllocatorDestroyDesc(aclrtAllocatorDesc allocatorDesc); + +/** + * @ingroup AscendCL + * @brief Register allocator object to allocator description + * + * @param allocatorDesc [IN] allocator description + * @param allocator [IN] allocator object handle + * + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtAllocatorSetObjToDesc(aclrtAllocatorDesc allocatorDesc, aclrtAllocator allocator); + +/** + * @ingroup AscendCL + * @brief Register the function pointer of alloc memory to the allocator description + * + * @param allocatorDesc [IN] allocator description + * @param func [IN] the function pointer of alloc memory + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtAllocatorSetAllocFuncToDesc(aclrtAllocatorDesc allocatorDesc, + aclrtAllocatorAllocFunc func); + +/** + * @ingroup AscendCL + * @brief Register the function pointer of free memory to the allocator description + * + * @param allocatorDesc [IN] allocator description + * @param func [IN] free memory function pointer + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtAllocatorSetFreeFuncToDesc(aclrtAllocatorDesc allocatorDesc, + aclrtAllocatorFreeFunc func); + +/** + * @ingroup AscendCL + * @brief Register the function pointer of alloc suggested memory to the allocator description + * + * @param allocatorDesc [IN] allocator description + * @param func [IN] the function pointer of alloc suggested memory + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtAllocatorSetAllocAdviseFuncToDesc(aclrtAllocatorDesc allocatorDesc, + aclrtAllocatorAllocAdviseFunc func); + +/** + * @ingroup AscendCL + * @brief Register the function pointer of get address from block to the allocator description + * + * @param allocatorDesc [IN] allocator description + * @param func [IN] the function pointer of get address from block + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclrtAllocatorSetGetAddrFromBlockFuncToDesc(aclrtAllocatorDesc allocatorDesc, + aclrtAllocatorGetAddrFromBlockFunc func); + +/** + * @ingroup AscendCL + * @brief Register allocator description to acl by stream + * + * @param stream [IN] stream handle + * @param allocatorDesc [IN] allocator description + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see aclrtAllocatorUnregister + */ +ACL_FUNC_VISIBILITY aclError aclrtAllocatorRegister(aclrtStream stream, aclrtAllocatorDesc allocatorDesc); + +/** + * @ingroup AscendCL + * @brief Unregister allocator description from acl by stream + * + * @param stream [IN] stream handle + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see aclrtAllocatorRegister + */ +ACL_FUNC_VISIBILITY aclError aclrtAllocatorUnregister(aclrtStream stream); + +#ifdef __cplusplus +} +#endif + +#endif // INC_EXTERNAL_ACL_ACL_RT_ALLOCATOR_H_ diff --git a/third_party/acl/inc/acl/acl_tdt.h b/third_party/acl/inc/acl/acl_tdt.h old mode 100644 new mode 100755 index 5ea3128f96..b2f2eb92ac --- a/third_party/acl/inc/acl/acl_tdt.h +++ b/third_party/acl/inc/acl/acl_tdt.h @@ -1,7 +1,7 @@ /** * @file acl_tdt.h * -* Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. All Rights Reserved. +* Copyright (c) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved. 
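The new acl_rt_allocator.h above exposes a callback-based way to plug an external allocator into a stream. The sketch below shows the registration sequence using only the declared functions; MyPool and the my_* hooks are hypothetical placeholders, and since the header does not specify the descriptor's lifetime after registration, cleanup is left to the caller.

    // Sketch: register a caller-owned allocator with a stream.
    #include "acl/acl.h"
    #include "acl/acl_rt_allocator.h"

    struct MyPool;  // hypothetical user allocator

    void *my_alloc(aclrtAllocator allocator, size_t size);
    void my_free(aclrtAllocator allocator, aclrtAllocatorBlock block);
    void *my_alloc_advise(aclrtAllocator allocator, size_t size, aclrtAllocatorAddr addr);
    void *my_addr_from_block(aclrtAllocatorBlock block);

    aclError attach_allocator(aclrtStream stream, MyPool *pool, aclrtAllocatorDesc *descOut)
    {
        aclrtAllocatorDesc desc = aclrtAllocatorCreateDesc();
        if (desc == nullptr) {
            return ACL_ERROR_BAD_ALLOC;
        }
        (void)aclrtAllocatorSetObjToDesc(desc, pool);
        (void)aclrtAllocatorSetAllocFuncToDesc(desc, my_alloc);
        (void)aclrtAllocatorSetFreeFuncToDesc(desc, my_free);
        (void)aclrtAllocatorSetAllocAdviseFuncToDesc(desc, my_alloc_advise);
        (void)aclrtAllocatorSetGetAddrFromBlockFuncToDesc(desc, my_addr_from_block);

        // Bind the description to the stream; the caller later calls
        // aclrtAllocatorUnregister(stream) and aclrtAllocatorDestroyDesc(desc).
        aclError ret = aclrtAllocatorRegister(stream, desc);
        *descOut = desc;
        return ret;
    }
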
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -21,7 +21,9 @@ enum acltdtTensorType { ACL_TENSOR_DATA_UNDEFINED = -1, ACL_TENSOR_DATA_TENSOR, ACL_TENSOR_DATA_END_OF_SEQUENCE, - ACL_TENSOR_DATA_ABNORMAL + ACL_TENSOR_DATA_ABNORMAL, + ACL_TENSOR_DATA_SLICE_TENSOR, + ACL_TENSOR_DATA_END_TENSOR }; typedef struct acltdtDataItem acltdtDataItem; @@ -83,12 +85,26 @@ ACL_FUNC_VISIBILITY size_t acltdtGetDataSizeFromItem(const acltdtDataItem *dataI */ ACL_FUNC_VISIBILITY size_t acltdtGetDimNumFromItem(const acltdtDataItem *dataItem); +/** + * @ingroup AscendCL + * @brief Get slice info from item + * + * @param dataItem [IN] pointer to data item + * @param sliceNum [OUT] pointer to the sliceNum of dataItem + * @param sliceId [OUT] pointer to the sliceId of dataItem + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure +*/ +ACL_FUNC_VISIBILITY aclError acltdtGetSliceInfoFromItem(const acltdtDataItem *dataItem, size_t *sliceNum, + size_t* sliceId); + /** * @ingroup AscendCL * @brief Get dims from item * * @param dataItem [IN] the struct of data item - * @param dims [IN|OUT] pointer to the dims of dataTtem + * @param dims [IN|OUT] pointer to the dims of dataItem * @param dimNum [IN] the size of the dims * * @retval ACL_SUCCESS The function is successfully executed. @@ -263,13 +279,25 @@ ACL_FUNC_VISIBILITY acltdtChannelHandle *acltdtCreateChannelWithCapacity(uint32_ */ ACL_FUNC_VISIBILITY aclError acltdtDestroyChannel(acltdtChannelHandle *handle); +/** + * @ingroup AscendCL + * @brief clean the channel + * + * @param handle [IN] pointer to the channel handle + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + */ +ACL_FUNC_VISIBILITY aclError acltdtCleanChannel(acltdtChannelHandle *handle); + /** * @ingroup AscendCL * @brief Send tensor to device * * @param handle [IN] pointer to the channel handle * @param dataset [IN] pointer to the dataset - * @param timeout [IN] to be reserved, now it must be -1 + * @param timeout [IN] timeout/ms * * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure @@ -286,7 +314,7 @@ ACL_FUNC_VISIBILITY aclError acltdtSendTensor(const acltdtChannelHandle *handle, * * @param handle [IN] pointer to the channel handle * @param dataset [OUT] pointer to the dataset - * @param timeout [IN] to be reserved, now it must be -1 + * @param timeout [IN] timeout/ms * * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure @@ -314,5 +342,4 @@ ACL_FUNC_VISIBILITY aclError acltdtQueryChannelSize(const acltdtChannelHandle *h } #endif -#endif //INC_EXTERNAL_ACL_ACL_TDT_H_ - +#endif // INC_EXTERNAL_ACL_ACL_TDT_H_ diff --git a/third_party/acl/inc/acl/acl_tdt_queue.h b/third_party/acl/inc/acl/acl_tdt_queue.h index 5c254cfd30..e52339da72 100644 --- a/third_party/acl/inc/acl/acl_tdt_queue.h +++ b/third_party/acl/inc/acl/acl_tdt_queue.h @@ -1,7 +1,7 @@ /** * @file acl_tdt_queue.h * -* Copyright (C) Huawei Technologies Co., Ltd. 2020-2021. All Rights Reserved. +* Copyright (c) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -21,6 +21,10 @@ extern "C" { #define ACL_TDT_QUEUE_PERMISSION_DEQUEUE 2 #define ACL_TDT_QUEUE_PERMISSION_ENQUEUE 4 +#define ACL_TDT_QUEUE_ROUTE_UNBIND 0 +#define ACL_TDT_QUEUE_ROUTE_BIND 1 +#define ACL_TDT_QUEUE_ROUTE_BIND_ABNORMAL 2 + typedef void *acltdtBuf; typedef struct tagMemQueueAttr acltdtQueueAttr; typedef struct acltdtQueueRouteList acltdtQueueRouteList; @@ -40,8 +44,9 @@ typedef enum { typedef enum { ACL_TDT_QUEUE_ROUTE_QUERY_SRC = 0, - ACL_TDT_QUEUE_ROUTE_QUERY_DST, - ACL_TDT_QUEUE_ROUTE_QUERY_SRC_AND_DST + ACL_TDT_QUEUE_ROUTE_QUERY_DST = 1, + ACL_TDT_QUEUE_ROUTE_QUERY_SRC_AND_DST = 2, + ACL_TDT_QUEUE_ROUTE_QUERY_ABNORMAL = 100 } acltdtQueueRouteQueryMode; typedef enum { @@ -50,6 +55,11 @@ typedef enum { ACL_TDT_QUEUE_ROUTE_QUERY_DST_ID_UINT32 } acltdtQueueRouteQueryInfoParamType; +typedef enum { + ACL_TDT_NORMAL_MEM = 0, + ACL_TDT_DVPP_MEM +} acltdtAllocBufType; + /** * @ingroup AscendCL * @brief create queue @@ -254,6 +264,123 @@ ACL_FUNC_VISIBILITY aclError acltdtFreeBuf(acltdtBuf buf); */ ACL_FUNC_VISIBILITY aclError acltdtGetBufData(const acltdtBuf buf, void **dataPtr, size_t *size); +/** + * @ingroup AscendCL + * @brief set data buf effective len + * + * @param buf [IN] acltdtBuf + * @param len [IN] set effective len to data buf which must be smaller than size acquired by acltdtGetBufData + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see acltdtGetBufData acltdtGetBufDataLen + */ +ACL_FUNC_VISIBILITY aclError acltdtSetBufDataLen(acltdtBuf buf, size_t len); + +/** + * @ingroup AscendCL + * @brief get data buf effective len + * + * @param buf [IN] acltdtBuf + * @param len [OUT] get effective len which is set by acltdtSetBufDataLen + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see acltdtSetBufDataLen + */ +ACL_FUNC_VISIBILITY aclError acltdtGetBufDataLen(acltdtBuf buf, size_t *len); + +/** + * @ingroup AscendCL + * @brief append acltdtBuf to acltdtBuf chain + * + * @param headBuf [IN] acltdtBuf chain head + * @param buf [IN] acltdtBuf will be appended + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + */ +ACL_FUNC_VISIBILITY aclError acltdtAppendBufChain(acltdtBuf headBuf, acltdtBuf buf); + +/** + * @ingroup AscendCL + * @brief get acltdtBuf chain total size + * + * @param headBuf [IN] acltdtBuf chain head + * @param num [OUT] acltdtBuf chain total size + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see acltdtAppendBufChain + */ +ACL_FUNC_VISIBILITY aclError acltdtGetBufChainNum(acltdtBuf headBuf, uint32_t *num); + +/** + * @ingroup AscendCL + * @brief get acltdtBuf from acltdtBuf chain by index + * + * @param headBuf [IN] acltdtBuf chain head + * @param index [IN] the index which is smaller than num acquired from acltdtGetBufChainNum + * @param buf [OUT] the acltdtBuf from acltdtBuf on index + * + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + * + * @see acltdtAppendBufChain acltdtGetBufChainNum + */ +ACL_FUNC_VISIBILITY aclError acltdtGetBufFromChain(acltdtBuf headBuf, uint32_t index, acltdtBuf *buf); + +/** + * @ingroup AscendCL + * @brief get private data buf address and size + * + * @param buf [IN] acltdtBuf + * @param dataPtr [IN/OUT] pointer to the user ptr + * @param size [IN] the current private data area size, less than or equal to 96B + * @param offset [IN] address offset, less than or equal to 96B + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see acltdtGetBufUserData + */ +ACL_FUNC_VISIBILITY aclError acltdtGetBufUserData(const acltdtBuf buf, void *dataPtr, size_t size, size_t offset); + +/** + * @ingroup AscendCL + * @brief set private data buf address and size + * + * @param buf [OUT] acltdtBuf + * @param dataPtr [IN] pointer to the user ptr + * @param size [IN] the current private data area size, less than or equal to 96B + * @param offset [IN] address offset, less than or equal to 96B + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see acltdtSetBufUserData + */ +ACL_FUNC_VISIBILITY aclError acltdtSetBufUserData(acltdtBuf buf, const void *dataPtr, size_t size, size_t offset); + +/** + * @ingroup AscendCL + * @brief copy buf ref + * + * @param buf [IN] acltdtBuf + * @param newBuf [OUT] Make a reference copy of the data area of buf and + * create a new buf header pointing to the same data area + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see acltdtCopyBufRef + */ +ACL_FUNC_VISIBILITY aclError acltdtCopyBufRef(const acltdtBuf buf, acltdtBuf *newBuf); + /** * @ingroup AscendCL * @brief Create the queue attr @@ -356,7 +483,7 @@ ACL_FUNC_VISIBILITY aclError acltdtDestroyQueueRoute(const acltdtQueueRoute *rou * @param param [OUT] pointer to parameter value * * @retval ACL_SUCCESS for success, other for failure - * + * * @see acltdtCreateQueueRoute */ ACL_FUNC_VISIBILITY aclError acltdtGetQueueRouteParam(const acltdtQueueRoute *route, @@ -371,7 +498,7 @@ ACL_FUNC_VISIBILITY aclError acltdtGetQueueRouteParam(const acltdtQueueRoute *ro * * @retval null for failed * @retval OtherValues success - * + * * @see acltdtDestroyQueueRouteList */ ACL_FUNC_VISIBILITY acltdtQueueRouteList* acltdtCreateQueueRouteList(); @@ -398,7 +525,7 @@ ACL_FUNC_VISIBILITY aclError acltdtDestroyQueueRouteList(const acltdtQueueRouteL * * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure - * + * * @see acltdtCreateQueueRouteList | acltdtCreateQueueRoute * */ @@ -452,7 +579,7 @@ ACL_FUNC_VISIBILITY acltdtQueueRouteQueryInfo* acltdtCreateQueueRouteQueryInfo( * * @retval ACL_SUCCESS The function is successfully executed. 
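A short sketch of the new acltdtBuf helpers above (private user data plus effective data length); it is not part of the patch, the FrameHeader type is a hypothetical example, and the 96-byte constraint is taken from the doc comments.

    // Sketch: stamp a small user header into a buffer's private data area and
    // mark how many bytes of the payload are valid.
    #include <cstdint>
    #include "acl/acl_tdt_queue.h"

    struct FrameHeader {        // hypothetical 16-byte user header
        uint64_t sequence;
        uint64_t timestamp_ns;
    };

    aclError tag_buffer(acltdtBuf buf, const FrameHeader &hdr, size_t validLen)
    {
        // Private data area: size and offset must stay within 96 bytes.
        aclError ret = acltdtSetBufUserData(buf, &hdr, sizeof(hdr), 0);
        if (ret != ACL_SUCCESS) {
            return ret;
        }
        // Effective payload length must not exceed the size reported by
        // acltdtGetBufData for this buffer.
        return acltdtSetBufDataLen(buf, validLen);
    }
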
* @retval OtherValues Failure - * + * * @see acltdtCreateQueueRouteQueryInfo * */ @@ -468,7 +595,7 @@ ACL_FUNC_VISIBILITY aclError acltdtDestroyQueueRouteQueryInfo(const acltdtQueueR * @param param [IN] pointer to parameter value * * @retval ACL_SUCCESS for success, other for failure - * + * * @see acltdtCreateQueueRouteQueryInfo */ ACL_FUNC_VISIBILITY aclError acltdtSetQueueRouteQueryInfo(acltdtQueueRouteQueryInfo *param, @@ -481,4 +608,4 @@ ACL_FUNC_VISIBILITY aclError acltdtSetQueueRouteQueryInfo(acltdtQueueRouteQueryI } #endif -#endif //INC_EXTERNAL_ACL_ACL_TDT_QUEUE_H_ \ No newline at end of file +#endif // INC_EXTERNAL_ACL_ACL_TDT_QUEUE_H_ diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 25a804ee3c..2f3960ee9b 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -138,6 +138,8 @@ static PyMethodDef TorchSanitizerMethods[] = { void THNPStream_init(PyObject *module); void THNPEvent_init(PyObject *module); +void THNPGraph_init(PyObject *module); +void THNPMemPool_init(PyObject* module); PyMethodDef* THNPModule_get_methods(); static std::vector methods; @@ -173,6 +175,8 @@ PyObject* initModule() { // C, so these lines have to execute first).. THNPStream_init(module); THNPEvent_init(module); + THNPGraph_init(module); + THNPMemPool_init(module); RegisterNPUDeviceProperties(module); BindGetDeviceProperties(module); diff --git a/torch_npu/csrc/aten/NPUGeneratorImpl.cpp b/torch_npu/csrc/aten/NPUGeneratorImpl.cpp index 29ccbac483..5695fb69e2 100644 --- a/torch_npu/csrc/aten/NPUGeneratorImpl.cpp +++ b/torch_npu/csrc/aten/NPUGeneratorImpl.cpp @@ -6,6 +6,7 @@ #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/aten/NPUGeneratorImpl.h" +#include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" namespace at_npu { namespace detail { @@ -28,7 +29,9 @@ static std::vector default_gens_npu; * Populates the global variables related to NPU generators * Warning: this function must only be called once! */ -static void initNPUGenVector() { +static void initNPUGenVector() +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); num_npus = c10_npu::device_count(); npu_gens_init_flag.resize(num_npus); default_gens_npu.resize(num_npus); @@ -44,7 +47,9 @@ static void initNPUGenVector() { * getDefaultNPUGenerator gets the default generator for a particular * NPU device. */ -const at::Generator& getDefaultNPUGenerator(c10::DeviceIndex device_index) { +const at::Generator& getDefaultNPUGenerator(c10::DeviceIndex device_index) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); std::call_once(num_npu_init_flag, initNPUGenVector); c10::DeviceIndex idx = device_index; if (idx == -1) { @@ -62,7 +67,9 @@ const at::Generator& getDefaultNPUGenerator(c10::DeviceIndex device_index) { /** * Utility to create a NPUGeneratorImpl. 
Returns a shared_ptr */ -at::Generator createNPUGenerator(c10::DeviceIndex device_index) { +at::Generator createNPUGenerator(c10::DeviceIndex device_index) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); std::call_once(num_npu_init_flag, initNPUGenVector); c10::DeviceIndex idx = device_index; if (idx == -1) { @@ -97,8 +104,9 @@ at::Generator createNPUGenerator(c10::DeviceIndex device_index) { */ NPUGeneratorImpl::NPUGeneratorImpl(c10::DeviceIndex device_index) : c10::GeneratorImpl{c10::Device(c10::DeviceType::PrivateUse1, device_index), - c10::DispatchKeySet(c10::DispatchKey::PrivateUse1)} { - // at::npu::assertNotCapturing("Cannot construct a new NPUGeneratorImpl"); + c10::DispatchKeySet(c10::DispatchKey::PrivateUse1)} +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); } /** @@ -107,7 +115,9 @@ NPUGeneratorImpl::NPUGeneratorImpl(c10::DeviceIndex device_index) * * See Note [Acquire lock when using random generators] */ -void NPUGeneratorImpl::set_current_seed(uint64_t seed) { +void NPUGeneratorImpl::set_current_seed(uint64_t seed) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); seed_ = seed; philox_offset_per_thread_ = 0; } @@ -117,14 +127,18 @@ void NPUGeneratorImpl::set_current_seed(uint64_t seed) { * * See Note [Acquire lock when using random generators] */ -void NPUGeneratorImpl::set_offset(uint64_t offset) { +void NPUGeneratorImpl::set_offset(uint64_t offset) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); philox_offset_per_thread_ = offset; } /** * Gets the current offset of NPUGeneratorImpl. */ -uint64_t NPUGeneratorImpl::get_offset() const { +uint64_t NPUGeneratorImpl::get_offset() const +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); // Debatable if get_offset() should be allowed in captured regions. // Conservatively disallow it for now. return philox_offset_per_thread_; @@ -139,7 +153,9 @@ uint64_t NPUGeneratorImpl::get_offset() const { /** * Gets the current seed of NPUGeneratorImpl. */ -uint64_t NPUGeneratorImpl::current_seed() const { +uint64_t NPUGeneratorImpl::current_seed() const +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); // Debatable if current_seed() should be allowed in captured regions. // Conservatively disallow it for now. return seed_; @@ -152,7 +168,9 @@ uint64_t NPUGeneratorImpl::current_seed() const { * You can move this function to Generator.cpp if the algorithm * in getNonDeterministicRandom is unified for both CPU and NPU */ -uint64_t NPUGeneratorImpl::seed() { +uint64_t NPUGeneratorImpl::seed() +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); auto random = c10::detail::getNonDeterministicRandom(true); this->set_current_seed(random); return random; @@ -162,12 +180,14 @@ uint64_t NPUGeneratorImpl::seed() { * Gets the current internal state of NpuGeneratorImpl. The internal * state is returned as a CPU byte tensor. */ -c10::intrusive_ptr NPUGeneratorImpl::get_state() const { +c10::intrusive_ptr NPUGeneratorImpl::get_state() const +{ // The RNG state comprises the seed, and an offset used for Philox. // The following line is just here for BC reason. sizeof curandStateMtgp32 is 4120. // It used to be static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); // MAX_NUM_BLOCKS was 200 and sizeof(curandStateMtgp32) is 4120. 
Hardcoding these numbers here // because this is just host side code and we don't want to worry about linking with npu + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); static const size_t seed_size = sizeof(uint64_t); static const size_t offset_size = sizeof(int64_t); static const size_t total_size = seed_size + offset_size; @@ -191,7 +211,9 @@ c10::intrusive_ptr NPUGeneratorImpl::get_state() const { * comments of NPUGeneratorImpl::state for information about the layout * and size of the internal state. */ -void NPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { +void NPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); static const size_t seed_size = sizeof(uint64_t); static const size_t offset_size = sizeof(int64_t); static const size_t total_size = seed_size + offset_size; @@ -222,7 +244,9 @@ void NPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { * * See Note [Acquire lock when using random generators] */ -void NPUGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { +void NPUGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); // see Note [Why enforce RNG offset % 4 == 0?] TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4", PTA_ERROR(ErrCode::VALUE)); philox_offset_per_thread_ = offset; @@ -231,7 +255,9 @@ void NPUGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { /** * Gets the current philox_offset_per_thread_ of NpuGeneratorImpl. */ -uint64_t NPUGeneratorImpl::philox_offset_per_thread() const { +uint64_t NPUGeneratorImpl::philox_offset_per_thread() const +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); return philox_offset_per_thread_; } @@ -240,7 +266,9 @@ uint64_t NPUGeneratorImpl::philox_offset_per_thread() const { * offset_extragraph is the initial offset at the start of the graphed region. * offset_intragraph tracks the offset in the graphed region. */ -void NPUGeneratorImpl::capture_prologue(int64_t* offset_extragraph) { +void NPUGeneratorImpl::capture_prologue(int64_t* offset_extragraph) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); offset_extragraph_ = offset_extragraph; offset_intragraph_ = 0; graph_expects_this_gen_ = true; @@ -249,7 +277,9 @@ void NPUGeneratorImpl::capture_prologue(int64_t* offset_extragraph) { /** * Called by NpuGraph to finalize a graph capture region for this instance. */ -uint64_t NPUGeneratorImpl::capture_epilogue() { +uint64_t NPUGeneratorImpl::capture_epilogue() +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); graph_expects_this_gen_ = false; return offset_intragraph_; } @@ -275,7 +305,9 @@ uint64_t NPUGeneratorImpl::capture_epilogue() { * * See Note [Acquire lock when using random generators] */ -PhiloxNpuState NPUGeneratorImpl::philox_npu_state(uint64_t increment) { +PhiloxNpuState NPUGeneratorImpl::philox_npu_state(uint64_t increment) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); // rounds increment up to the nearest multiple of 4 increment = ((increment + 3) / 4) * 4; /* @@ -310,7 +342,9 @@ PhiloxNpuState NPUGeneratorImpl::philox_npu_state(uint64_t increment) { * Temporarily accommodates call sites that use philox_engine_inputs. * Allows incremental refactor of call sites to use philox_npu_state. 
*/ -std::pair NPUGeneratorImpl::philox_engine_inputs(uint64_t increment) { +std::pair NPUGeneratorImpl::philox_engine_inputs(uint64_t increment) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); // rounds increment up to the nearest multiple of 4 increment = ((increment + 3) / 4) * 4; // see Note [Why enforce RNG offset % 4 == 0?] @@ -324,7 +358,9 @@ std::pair NPUGeneratorImpl::philox_engine_inputs(uint64_t in * Gets the DeviceType of NPUGeneratorImpl. * Used for type checking during run time. */ -c10::DeviceType NPUGeneratorImpl::device_type() { +c10::DeviceType NPUGeneratorImpl::device_type() +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); return c10::DeviceType::PrivateUse1; } @@ -333,7 +369,9 @@ c10::DeviceType NPUGeneratorImpl::device_type() { * * See Note [Acquire lock when using random generators] */ -std::shared_ptr NPUGeneratorImpl::clone() const { +std::shared_ptr NPUGeneratorImpl::clone() const +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); return std::shared_ptr(this->clone_impl()); } @@ -342,7 +380,9 @@ std::shared_ptr NPUGeneratorImpl::clone() const { * * See Note [Acquire lock when using random generators] */ -NPUGeneratorImpl* NPUGeneratorImpl::clone_impl() const { +NPUGeneratorImpl* NPUGeneratorImpl::clone_impl() const +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); auto gen = new NPUGeneratorImpl(this->device().index()); gen->set_current_seed(this->seed_); gen->set_philox_offset_per_thread(this->philox_offset_per_thread_); @@ -350,7 +390,9 @@ NPUGeneratorImpl* NPUGeneratorImpl::clone_impl() const { } // this is used to register generator -at::Generator make_npu_generator(c10::DeviceIndex device_index) { +at::Generator make_npu_generator(c10::DeviceIndex device_index) +{ + c10_npu::assertNotCapturing("Not support Generator while in capture mode"); return at::make_generator(device_index); } diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index bfab266b9e..e454da1561 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -137,19 +137,22 @@ void update_stat_array( } struct Block; +struct PrivatePool; using Comparison = bool (*)(const Block*, const Block*); static bool BlockComparatorSize(const Block* a, const Block* b); static bool BlockComparatorAddress(const Block* a, const Block* b); struct BlockPool{ - std::set blocks; - std::set unmapped; - const bool is_small; - - BlockPool(bool small) - : blocks(BlockComparatorSize), - unmapped(BlockComparatorAddress), - is_small(small) {} + std::set blocks; + std::set unmapped; + const bool is_small; + PrivatePool* owner_PrivatePool; + + BlockPool(bool small, PrivatePool* private_pool = nullptr) + : blocks(BlockComparatorSize), + unmapped(BlockComparatorAddress), + is_small(small), + owner_PrivatePool(private_pool) {} }; struct ExpandableSegment; @@ -619,6 +622,36 @@ private: std::vector pools_; }; +// NPU graphs helper +struct PrivatePool { + PrivatePool() + : large_blocks(false, this), + small_blocks(true, this) {} + PrivatePool(const PrivatePool&) = delete; + PrivatePool(PrivatePool&&) = delete; + PrivatePool& operator=(const PrivatePool&) = delete; + // Number of live graphs using this pool + int use_count{1}; + // Number of unfreed npuMallocs made for this pool. When use_count and + // npuMalloc_count drop to zero, we can delete this PrivatePool from + // graph_pools. 
+ int npuMalloc_count{0}; + // Instead of maintaining private BlockPools here, I could stuff all blocks + // (private or no) into the top-level large_blocks and small_blocks, and + // distinguish private blocks by adding a "pool id" check above the stream + // check in BlockComparator. BlockComparator is performance- critical though, + // I'd rather not add more logic to it. + BlockPool large_blocks; + BlockPool small_blocks; +}; + +struct MempoolIdHash { + std::size_t operator()(const MempoolId_t& mempool_id) const noexcept + { + return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; + } +}; + } // namespace class CachingAllocatorConfig { @@ -871,6 +904,16 @@ class DeviceCachingAllocator { // allocated or in use by a stream ska::flat_hash_set active_blocks; + // captures_underway tracks if we are diverting some + // allocations to a specific pool. + // Most of the time it's empty, in which case malloc can avoid calling + // aclrtStreamGetCaptureInfo in the hot path. + std::vector>> + captures_underway; + + // See free() for this thing's purpose + std::vector needs_events_deferred_until_no_capture; + // outstanding acl events ska::flat_hash_map< c10_npu::NPUStream, @@ -903,6 +946,20 @@ class DeviceCachingAllocator { std::vector oom_observers_; std::shared_ptr hcclComm_; + // Private pools for NPU graphs + ska::flat_hash_map, MempoolIdHash> + graph_pools; + + // Pools no longer referenced by any graph. Their BlockPools are eligible for + // free_blocks. Can't be a vector or deque because we might erase entries in + // any order. Could be an std::list, but we don't care much, access and + // insert/erase are rare. + ska::flat_hash_map + graph_pools_freeable; + + // mapping from block to a stream_set, containing streams on which the block + // was used while npugraph capturing + std::unordered_map block_to_npugraph_stream_uses; public: DeviceCachingAllocator() : @@ -1016,10 +1073,21 @@ class DeviceCachingAllocator { NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); } - // process outstanding npuEvents - process_events(context); + if (C10_LIKELY(captures_underway.empty())) { + // Processes end-of-life events for outstanding allocations used on + // multiple streams (checks if their NPU-side uses are complete and + // recycles their memory if so) + // + // Q. Why skip process_events if a capture might be underway? + // A. process_events involves npuEventQueries, illegal during NPU graph + // capture. + // Dumb simple solution: defer reclaiming these allocations until after + // capture. Cross-stream memory use is uncommon, so the deferral's + // effect on memory use during capture should be small. + process_events(context); + } auto size = round_size(orig_size); - auto& pool = get_pool(size); + auto& pool = get_pool(size, stream); const size_t alloc_size = get_allocation_size(size); AllocParams params(device, size, stream, &pool, alloc_size, stats); @@ -1046,9 +1114,10 @@ class DeviceCachingAllocator { alloc_block(params, false, context, lock)); } - if (!block_found) { - ASCEND_LOGE("Get a block from the existing pool failed. %s", - "Try to free cached blocks and reallocate. This error log can be ignored."); + if (!block_found && C10_LIKELY(captures_underway.empty())) { + ASCEND_LOGE( + "Get a block from the existing pool failed. Try to free cached blocks and reallocate. This error log " + "can be ignored."); // Free all non-split cached blocks and retry alloc. 
c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, true); block_found = (release_cached_blocks(true, context) && alloc_block(params, true, context, lock)); @@ -1312,7 +1381,15 @@ class DeviceCachingAllocator { update_stat(stats.oversize_allocations, -1); if (!block->stream_uses.empty() && c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { - insert_events(block); + if (C10_UNLIKELY(!captures_underway.empty())) { + // It's forbidden to npuEventQuery an event recorded during NPU graph + // capture. We conservatively defer recording end-of-life events until + // the next call to process_events() (which won't happen until no + // captures are underway) + needs_events_deferred_until_no_capture.push_back(block); + } else { + insert_events(block); + } } else { free_block(block, context, allocator_type); } @@ -1356,10 +1433,13 @@ class DeviceCachingAllocator { return basePtr; } - void recordStream(Block* block, c10_npu::NPUStream stream) { - std::lock_guard lock(mutex); - block->stream_uses.insert(stream); - } + void recordStream(Block* block, c10_npu::NPUStream stream) { + std::lock_guard lock(mutex); + block->stream_uses.insert(stream); + if (C10_UNLIKELY(!captures_underway.empty())) { + block_to_npugraph_stream_uses[block].insert(stream); + } + } void eraseStream(Block* block, c10_npu::NPUStream stream) { std::shared_ptr context = @@ -1432,6 +1512,10 @@ class DeviceCachingAllocator { std::lock_guard lock(mutex); cache_info_aux(large_blocks, total, largest); cache_info_aux(small_blocks, total, largest); + for (const auto& gp : graph_pools) { + cache_info_aux(gp.second->large_blocks, total, largest); + cache_info_aux(gp.second->small_blocks, total, largest); + } } /** Returns a copy of the memory allocator stats **/ @@ -1482,63 +1566,75 @@ class DeviceCachingAllocator { reset_peak_stat(stats.oversize_segments); } - /** Dump a complete snapshot of the memory held by the allocator. Potentially VERY expensive. **/ - std::vector snapshot() - { - std::lock_guard lock(mutex); + /** Dump a complete snapshot of the memory held by the allocator. Potentially VERY expensive. 
**/ + std::vector snapshot() + { + std::lock_guard lock(mutex); - size_t total_active = 0; - std::vector result; - const auto all_blocks = get_all_blocks(); + std::unordered_map pool_to_id; + pool_to_id.reserve(graph_pools.size() + graph_pools_freeable.size()); + for (const auto& pair : graph_pools) { + pool_to_id[pair.second.get()] = pair.first; + } + for (const auto& pair : graph_pools_freeable) { + pool_to_id[pair.second] = pair.first; + } - for (const Block* const head_block : all_blocks) { - // For expandable segments, we report one segment for each continguous - // mapped range of memory - if (head_block->prev && head_block->prev->mapped) { - continue; - } - result.emplace_back(); - SegmentInfo& segment_info = result.back(); - segment_info.device = head_block->device; - segment_info.address = reinterpret_cast(head_block->ptr); - segment_info.stream = head_block->stream; - segment_info.is_large = (!head_block->pool->is_small); - segment_info.is_expandable = head_block->expandable_segment_; - segment_info.context_when_allocated = - head_block->context_when_segment_allocated; - - const Block* block = head_block; - while (block != nullptr && block->mapped) { - segment_info.blocks.emplace_back(); - BlockInfo& block_info = segment_info.blocks.back(); - - block_info.size = block->size; - block_info.requested_size = block->requested_size; - block_info.allocated = block->allocated; - block_info.active = block->allocated || (block->event_count > 0); - - segment_info.total_size += block_info.size; - if (block_info.allocated) { - segment_info.allocated_size += block_info.size; - } - if (block_info.active) { - segment_info.active_size += block_info.size; - segment_info.requested_size += block_info.requested_size; - } - block_info.context_when_allocated = block->context_when_allocated; - block = block->next; - } - total_active += segment_info.active_size; - } + size_t total_active = 0; + std::vector result; + const auto all_blocks = get_all_blocks(); + + for (const Block* const head_block : all_blocks) { + // For expandable segments, we report one segment for each continguous + // mapped range of memory + if (head_block->prev && head_block->prev->mapped) { + continue; + } + result.emplace_back(); + SegmentInfo& segment_info = result.back(); + segment_info.device = head_block->device; + segment_info.address = reinterpret_cast(head_block->ptr); + segment_info.stream = head_block->stream; + segment_info.is_large = (!head_block->pool->is_small); + segment_info.is_expandable = head_block->expandable_segment_; + segment_info.context_when_allocated = + head_block->context_when_segment_allocated; + auto mempool_id = pool_to_id.find(head_block->pool->owner_PrivatePool); + if (mempool_id != pool_to_id.end()) { + segment_info.owner_private_pool_id = mempool_id->second; + } + const Block* block = head_block; + while (block != nullptr && block->mapped) { + segment_info.blocks.emplace_back(); + BlockInfo& block_info = segment_info.blocks.back(); + + block_info.size = block->size; + block_info.requested_size = block->requested_size; + block_info.allocated = block->allocated; + block_info.active = block->allocated || (block->event_count > 0); + + segment_info.total_size += block_info.size; + if (block_info.allocated) { + segment_info.allocated_size += block_info.size; + } + if (block_info.active) { + segment_info.active_size += block_info.size; + segment_info.requested_size += block_info.requested_size; + } + block_info.context_when_allocated = block->context_when_allocated; + block = block->next; + } + 
total_active += segment_info.active_size; + } - std::sort(result.begin(), result.end(), - [](const SegmentInfo& a, const SegmentInfo& b) { - return a.address < b.address; - }); + std::sort(result.begin(), result.end(), + [](const SegmentInfo& a, const SegmentInfo& b) { + return a.address < b.address; + }); - record_trace(TraceEntry::SNAPSHOT, 0, total_active, nullptr, 0, nullptr); - return result; - } + record_trace(TraceEntry::SNAPSHOT, 0, total_active, nullptr, 0, nullptr); + return result; + } std::vector trace() { @@ -1562,6 +1658,76 @@ class DeviceCachingAllocator { } } + // See Note [Interaction with NPU graph capture] + + // Called by NPUGraph::capture_begin + void beginAllocateToPool( + MempoolId_t mempool_id, + std::function filter) + { + std::lock_guard lock(mutex); + auto it = graph_pools.find(mempool_id); + if (it == graph_pools.end()) { + // mempool_id does not reference an existing pool. Make a new pool for + // this capture. + graph_pools.emplace(mempool_id, std::make_unique()); + } else { + // mempool_id references an existing pool, which the current capture will + // share. Check this pool is live (at least one other capture already + // references it). + TORCH_INTERNAL_ASSERT(it->second->use_count > 0); + it->second->use_count++; + } + for (auto it2 = captures_underway.begin(); it2 != captures_underway.end(); + ++it2) { + TORCH_CHECK( + it2->first != mempool_id, + "beginAllocateToPool: already recording to mempool_id"); + } + captures_underway.emplace_back(mempool_id, std::move(filter)); + } + + // Called by NPUGraph::capture_end + void endAllocateToPool(MempoolId_t mempool_id) + { + std::lock_guard lock(mutex); + for (auto it = captures_underway.begin(); it != captures_underway.end(); ++it) { + if (it->first == mempool_id) { + captures_underway.erase(it); + return; + } + } + TORCH_CHECK( + false, "endAllocatePool: not currently recording to mempool_id"); + } + + // Called by NPUGraph::reset + void releasePool(MempoolId_t mempool_id) + { + std::lock_guard lock(mutex); + // The instantiated npugraphExec_t has been destroyed. We can't blindly + // delete and npuFree the mempool its capture used, because + // 1. other graph(s) might share the same pool + // 2. the user might still hold references to output tensors allocated + // during capture. + // To handle 1 and 2, we track the number of graphs using this particular + // mempool. When the count reaches 0, we tell free_cached_blocks it may now + // npuFree blocks from this graph's pool when it discovers they're unused + // (unsplit). + auto it = graph_pools.find(mempool_id); + TORCH_INTERNAL_ASSERT(it != graph_pools.end()); + auto uc = --(it->second->use_count); + TORCH_INTERNAL_ASSERT(uc >= 0); + if (uc == 0) { + // Allows free_cached_blocks to begin npuFreeing this pool's memory, + // and makes sure this pool wasn't somehow made freeable already. + // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) + bool inserted = + graph_pools_freeable.insert({mempool_id, it->second.get()}).second; + TORCH_INTERNAL_ASSERT(inserted); + } + } + private: // All private methods do not acquire the allocator mutex. 
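Taken together, beginAllocateToPool / endAllocateToPool / releasePool give a capture its own allocation arena. A condensed sketch of the call sequence follows; `capture_dev` and `capture_stream` are placeholders, and the stream-equality filter shown is the one used by the `_npu_beginAllocateCurrentStreamToPool` binding near the end of this patch (NPUGraph::capture_begin installs a capture-status-based filter instead):

    c10_npu::MempoolId_t pool_id = c10_npu::graph_pool_handle();

    // 1) Divert allocations made on the capturing stream into the private pool.
    c10_npu::NPUCachingAllocator::beginAllocateToPool(
        capture_dev, pool_id,
        [capture_stream](aclrtStream s) { return s == capture_stream; });

    // 2) ... begin capture, run the work to record, end capture ...

    // 3) Stop diverting. Blocks allocated during capture stay in the private
    //    pool so replays keep seeing the same addresses.
    c10_npu::NPUCachingAllocator::endAllocateToPool(capture_dev, pool_id);

    // 4) When the graph is destroyed, drop the pool's use_count; once it and
    //    npuMalloc_count reach zero, release_cached_blocks() may free the pool.
    c10_npu::NPUCachingAllocator::releasePool(capture_dev, pool_id);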
@@ -1570,6 +1736,16 @@ class DeviceCachingAllocator { std::vector blocks; blocks.insert(blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end()); blocks.insert(blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end()); + for (const auto& gp : graph_pools) { + blocks.insert( + blocks.end(), + gp.second->small_blocks.blocks.begin(), + gp.second->small_blocks.blocks.end()); + blocks.insert( + blocks.end(), + gp.second->large_blocks.blocks.begin(), + gp.second->large_blocks.blocks.end()); + } blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); return blocks; } @@ -1837,13 +2013,31 @@ class DeviceCachingAllocator { return subsumed_size; } - BlockPool& get_pool(size_t size) { - if (size <= kSmallSize) { - return small_blocks; - } else { - return large_blocks; + BlockPool& get_pool(size_t size, aclrtStream stream) + { + // captures_underway is a conservative guess that the current stream may be + // capturing. It's only non-empty if some thread has begun and not yet ended + // a capture, so it's usually 0, and we can short-circuit + // npuStreamCaptureStatus (which does a TLS lookup). + if (C10_UNLIKELY(!captures_underway.empty())) { + for (auto& entry : captures_underway) { + if (entry.second(stream)) { + auto it1 = graph_pools.find(entry.first); + TORCH_INTERNAL_ASSERT(it1 != graph_pools.end()); + if (size <= kSmallSize) { + return it1->second->small_blocks; + } else { + return it1->second->large_blocks; + } + } + } + } + if (size <= kSmallSize) { + return small_blocks; + } else { + return large_blocks; + } } - } StatTypes get_stat_types_for_pool(const BlockPool& pool) { StatTypes stat_types = {false}; @@ -2009,68 +2203,83 @@ class DeviceCachingAllocator { } } - bool alloc_block( - AllocParams& p, - bool isRetry, - const std::shared_ptr& ctx, - std::unique_lock& lock) - { - size_t size = p.alloc_size; - void* ptr = nullptr; + bool alloc_block( + AllocParams& p, + bool isRetry, + const std::shared_ptr& ctx, + std::unique_lock& lock) + { + size_t size = p.alloc_size; + void* ptr = nullptr; - if (isRetry) { - stats.num_alloc_retries += 1; - } + if (isRetry) { + stats.num_alloc_retries += 1; + } - if (set_fraction && total_allocated_memory + size > allowed_memory_maximum) { - p.err = ACL_ERROR_RT_MEMORY_ALLOCATION; - } else if ( - CachingAllocatorConfig::expandable_segments()) { - p.block = try_allocate_expandable_block( - p.device(), p.stream(), p.pool, p.size(), ctx); - if (p.block) { - p.err = ACL_ERROR_NONE; - } else { - p.err = ACL_ERROR_RT_MEMORY_ALLOCATION; - } - return bool(p.block); - } else { - p.err = c10_npu::acl::AclrtMallocAlign32( - &ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST); - } + if (set_fraction && total_allocated_memory + size > allowed_memory_maximum) { + p.err = ACL_ERROR_RT_MEMORY_ALLOCATION; + return false; + } else if (CachingAllocatorConfig::expandable_segments()) { + p.block = try_allocate_expandable_block(p.device(), p.stream(), p.pool, p.size(), ctx); + if (p.block) { + p.err = ACL_ERROR_NONE; + if (p.pool->owner_PrivatePool) { + // The block is for a NPU graph's PrivatePool. + p.pool->owner_PrivatePool->npuMalloc_count++; + } + } else { + p.err = ACL_ERROR_RT_MEMORY_ALLOCATION; + } + return bool(p.block); + } else { + auto active_pool = MemPoolContext::getActiveMemPool(); + if (active_pool && active_pool->allocator() && p.pool->owner_PrivatePool) { + ptr = active_pool->allocator()->raw_alloc(size); + p.err = ptr ? 
ACL_ERROR_NONE : ACL_ERROR_RT_MEMORY_ALLOCATION; + } else { + p.err = c10_npu::acl::AclrtMallocAlign32( + &ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST); + } + if (p.err != ACL_ERROR_NONE) { + return false; + } + } - if (p.err != ACL_ERROR_NONE) { - return false; - } - ASCEND_LOGD("NPUCachingAllocator malloc by AclrtMallocAlign32: size=%zu", size); + ASCEND_LOGD("NPUCachingAllocator malloc by AclrtMallocAlign32: size=%zu", size); - total_allocated_memory += size; - p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); - for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) { - update_stat(stats.segment[stat_type], 1); - update_stat(stats.reserved_bytes[stat_type], size); - }); - if (size >= CachingAllocatorConfig::max_split_size()) - update_stat(stats.oversize_segments, 1); - ASCEND_LOGD("pta_memory acl_malloc: malloc = %zu, ret = %d", size, p.err); + if (p.pool->owner_PrivatePool) { + // The block is for a NPU graph's PrivatePool. + p.pool->owner_PrivatePool->npuMalloc_count++; + } + + total_allocated_memory += size; + p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); + for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) { + update_stat(stats.segment[stat_type], 1); + update_stat(stats.reserved_bytes[stat_type], size); + }); + if (size >= CachingAllocatorConfig::max_split_size()) { + update_stat(stats.oversize_segments, 1); + } + ASCEND_LOGD("pta_memory acl_malloc: malloc = %zu, ret = %d", size, p.err); - // p.block came from new, not cudaMalloc. It should not be nullptr here. - TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); + // p.block came from new, not npuMalloc. It should not be nullptr here. + TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{p.block->device, p.block->ptr, p.block->size}; - torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{p.block->device, p.block->ptr, p.block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); #endif - record_trace( - TraceEntry::SEGMENT_ALLOC, - int64_t(p.block->ptr), - p.block->size, - p.stream(), - p.device(), - ctx); - p.block->context_when_segment_allocated = ctx; - return true; - } + record_trace( + TraceEntry::SEGMENT_ALLOC, + int64_t(p.block->ptr), + p.block->size, + p.stream(), + p.device(), + ctx); + p.block->context_when_segment_allocated = ctx; + return true; + } /** Free one or more oversize blocks to the system allocator. But only enough to satisfy the target size **/ bool release_available_cached_blocks(const AllocParams& p, @@ -2129,6 +2338,21 @@ class DeviceCachingAllocator { release_blocks(large_blocks, context); release_blocks(small_blocks, context); + for (auto it = graph_pools_freeable.begin(); + it != graph_pools_freeable.end();) { + // See notifyCaptureDestroy for the strategy here. 
+ TORCH_INTERNAL_ASSERT(it->second->use_count == 0); + release_blocks(it->second->small_blocks, context); + release_blocks(it->second->large_blocks, context); + if (it->second->npuMalloc_count == 0) { + auto erase_count = graph_pools.erase(it->first); + TORCH_INTERNAL_ASSERT(erase_count == 1); + it = graph_pools_freeable.erase(it); + } else { + ++it; + } + } + return true; } @@ -2169,6 +2393,11 @@ class DeviceCachingAllocator { total_allocated_memory -= block->size; auto* pool = block->pool; + if (pool->owner_PrivatePool) { + // The npuFreed block belonged to a NPU graph's PrivatePool. + TORCH_INTERNAL_ASSERT(pool->owner_PrivatePool->npuMalloc_count > 0); + pool->owner_PrivatePool->npuMalloc_count--; + } StatTypes stat_types = get_stat_types_for_pool(*pool); for_each_selected_stat_type(stat_types, [&](size_t stat_type) { @@ -2239,6 +2468,14 @@ class DeviceCachingAllocator { for_each_selected_stat_type(stat_types, [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], -unmapped.size); }); + + if (block->pool->owner_PrivatePool) { + // The npuFreed block belonged to a NPU graph's PrivatePool. + TORCH_INTERNAL_ASSERT( + block->pool->owner_PrivatePool->npuMalloc_count > 0); + block->pool->owner_PrivatePool->npuMalloc_count--; + } + #ifndef BUILD_LIBTORCH mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); @@ -2287,6 +2524,11 @@ class DeviceCachingAllocator { void synchronize_and_free_events(bool check_error, const std::shared_ptr& context) { + // This function syncs, so capture should not be underway. Might as well + // make sure capture-deferred end of life events get processed too. + TORCH_INTERNAL_ASSERT(captures_underway.empty()); + insert_events_deferred_until_no_capture(context); + // Synchronize on outstanding events and then free associated blocks. 
for (auto& st : npu_events) { for (auto& e : st.second) { @@ -2315,30 +2557,71 @@ class DeviceCachingAllocator { npu_events.clear(); } - void insert_events(Block* block) { - aclrtContext compiler_ctx = aclrtContext(); - aclError ret_ctx = aclrtGetCurrentContext(&compiler_ctx); - NPU_CHECK_ERROR(aclrtSetCurrentContext(c10_npu::GetDeviceContext(block->device))); + void remove_npugraph_stream_uses(Block* block) + { + // remove stream uses added during npugraph capture + // (i.e., block->stream_uses - block->npugraph_stream_uses) + if (C10_UNLIKELY( + block_to_npugraph_stream_uses.find(block) != block_to_npugraph_stream_uses.end())) { + stream_set streams(std::move(block->stream_uses)); + AT_ASSERT(block->stream_uses.empty()); + for (auto& stream : streams) { + if (block_to_npugraph_stream_uses[block].find(stream) == + block_to_npugraph_stream_uses[block].end()) { + block->stream_uses.insert(stream); + } + } + block_to_npugraph_stream_uses.erase(block); + } + } + + void insert_events(Block* block) + { + aclrtContext compiler_ctx = aclrtContext(); + aclError ret_ctx = aclrtGetCurrentContext(&compiler_ctx); + NPU_CHECK_ERROR(aclrtSetCurrentContext(c10_npu::GetDeviceContext(block->device))); - stream_set streams(std::move(block->stream_uses)); - AT_ASSERT(block->stream_uses.empty(), PTA_ERROR(ErrCode::VALUE)); - for (auto& stream : streams) { - NPU_CHECK_ERROR(c10_npu::SetDevice(stream.device_index())); + stream_set streams(std::move(block->stream_uses)); + AT_ASSERT(block->stream_uses.empty(), PTA_ERROR(ErrCode::VALUE)); + for (auto& stream : streams) { + NPU_CHECK_ERROR(c10_npu::SetDevice(stream.device_index())); - EventPool::Event event = create_event_internal(stream.device_index()); - event->record(stream); - ASCEND_LOGI("Event: record DeviceAllocator is successfully executed, event=%p", event.get()); + EventPool::Event event = create_event_internal(stream.device_index()); + event->record(stream); + ASCEND_LOGI("Event: record DeviceAllocator is successfully executed, event=%p", event.get()); - block->event_count++; - npu_events[stream].emplace_back(std::move(event), block); + block->event_count++; + npu_events[stream].emplace_back(std::move(event), block); + } + if (ret_ctx == ACL_ERROR_NONE) { + NPU_CHECK_ERROR(aclrtSetCurrentContext(compiler_ctx)); + } } - if (ret_ctx == ACL_ERROR_NONE) { - NPU_CHECK_ERROR(aclrtSetCurrentContext(compiler_ctx)); + + void insert_events_deferred_until_no_capture( + const std::shared_ptr& context) + { + if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) { + for (auto* block : needs_events_deferred_until_no_capture) { + TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); + // only streams recorded before npugraph will be used to insert events + // since we know all streams recorded during npugraph must have + // completed (refer to Section 3.2.8.7.3.1 Cross-stream Dependencies and + // Events in CUDA Programming Guide). + remove_npugraph_stream_uses(block); + insert_events(block); + if (block->event_count == 0) { + free_block(block, context); + } + } + needs_events_deferred_until_no_capture.clear(); + } } - } void process_events(const std::shared_ptr& context) { + insert_events_deferred_until_no_capture(context); + // Process outstanding npuEvents. Events that are completed are removed // from the queue, and the 'event_count' for the corresponding allocation // is decremented. Stops at the first event which has not been completed. 
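Note that alloc_block() above only consults MemPoolContext::getActiveMemPool() for allocations that land in a graph's PrivatePool. A minimal sketch of the MemPool / MemPoolContext pair introduced below, assuming the caller already owns some NPUAllocator* (for example one built through NPUPluggableAllocator):

    #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h"  // MemPool, MemPoolContext (added below)

    // `user_allocator` is a placeholder for any NPUCachingAllocator::NPUAllocator*.
    void run_with_routed_allocations(c10_npu::NPUCachingAllocator::NPUAllocator* user_allocator)
    {
        c10_npu::MemPool pool(user_allocator);  // user-created pool, id() == {0, uid}
        c10_npu::MemPoolContext ctx(&pool);     // stashes the previously active pool
        // While `ctx` is alive, MemPoolContext::getActiveMemPool() returns &pool,
        // so alloc_block() can satisfy private-pool allocations via
        // pool.allocator()->raw_alloc() instead of AclrtMallocAlign32.
    }  // ~MemPoolContext() restores the previous active pool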
@@ -2678,6 +2961,28 @@ class NpuCachingAllocator : public NPUAllocator { return result; } + // CUDAGraph interactions + void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) override + { + assertValidDevice(device); + device_allocator[device]->beginAllocateToPool(std::move(mempool_id), std::move(filter)); + } + + void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) override + { + assertValidDevice(device); + device_allocator[device]->endAllocateToPool(mempool_id); + } + + void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override + { + assertValidDevice(device); + device_allocator[device]->releasePool(std::move(mempool_id)); + } + c10::DataPtr allocate(size_t size) override { constexpr size_t one_exa_bytes = 1152921504606846976ULL; @@ -2859,3 +3164,62 @@ std::mutex* getFreeMutex() { } // namespace NPUCachingAllocator } // namespace c10_npu + +namespace c10_npu { + +// uid_ is incremented when a user creates a MemPool, +// for example: using graph_pool_handle() or c10_npu::MemPool(). +// +// uuid_ is incremented when NPUGraph creates a MemPool +// as a result of a user not providing a pool. +// +// MempoolId_t of {0, 0} is used to denote when no MemPool has been +// passed to a function, either by user or NPUGraphs. For example, +// default value of MempoolId_t for capture_begin function is {0, 0}. +// That's why uid_ and uuid_ start at 1. +std::atomic MemPool::uid_{1}; +std::atomic MemPool::uuid_{1}; + + +MemPool::MemPool(NPUCachingAllocator::NPUAllocator* allocator, bool is_user_created) + : allocator_(allocator), is_user_created_(is_user_created) +{ + if (is_user_created_) { + id_ = {0, uid_++}; + } else { + id_ = {uuid_++, 0}; + } +} + +MempoolId_t MemPool::id() +{ + return id_; +} + +NPUCachingAllocator::NPUAllocator* MemPool::allocator() +{ + return allocator_; +} + +// Note that active_mempool_ is a global variable here +// and not inside MemPoolContext class, because in windows we +// can't use __declspec(dllexport) and __declspec(thread) +static thread_local MemPool* active_mempool_ = nullptr; + +MemPoolContext::MemPoolContext(MemPool* mempool) + : prev_mempool_(active_mempool_) +{ + active_mempool_ = mempool; +} + +MemPoolContext::~MemPoolContext() +{ + active_mempool_ = prev_mempool_; +} + +MemPool* MemPoolContext::getActiveMemPool() +{ + return active_mempool_; +} + +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index c722107c09..35eeb0d341 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -3,9 +3,10 @@ #include #include #include +#include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" #include "torch_npu/csrc/core/npu/NPUMacros.h" -#include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/core/npu/NPUStream.h" +#include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/distributed/HCCLUtils.hpp" #include @@ -110,17 +111,18 @@ struct BlockInfo { // Struct containing info of a memory segment (i.e. one contiguous cudaMalloc). 
struct SegmentInfo { - int64_t device = 0; - int64_t address = 0; - aclrtStream stream = 0; - int64_t total_size = 0; - int64_t requested_size = 0; - int64_t allocated_size = 0; - int64_t active_size = 0; - bool is_large = false; - bool is_expandable = false; - std::vector blocks; - std::shared_ptr context_when_allocated; + int64_t device = 0; + int64_t address = 0; + aclrtStream stream = 0; + int64_t total_size = 0; + int64_t requested_size = 0; + int64_t allocated_size = 0; + int64_t active_size = 0; + bool is_large = false; + bool is_expandable = false; + MempoolId_t owner_private_pool_id = {0, 0}; + std::vector blocks; + std::shared_ptr context_when_allocated; }; struct TraceEntry { @@ -190,6 +192,16 @@ public: virtual void resetAccumulatedStats(int device) = 0; virtual void resetPeakStats(int device) = 0; virtual SnapshotInfo snapshot() = 0; + + // CUDAGraph interactions + virtual void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) = 0; + virtual void endAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id) = 0; + virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0; virtual void FreeDeviceCachedMemory(int device) = 0; virtual std::string name() = 0; virtual bool isHistoryEnabled() @@ -297,6 +309,25 @@ inline SnapshotInfo snapshot() return get()->snapshot(); } +// CUDAGraph interactions +inline void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) +{ + get()->beginAllocateToPool(device, mempool_id, std::move(filter)); +} + +inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) +{ + get()->endAllocateToPool(device, mempool_id); +} + +inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) +{ + return get()->releasePool(device, mempool_id); +} + inline void FreeDeviceCachedMemory(int device) { return get()->FreeDeviceCachedMemory(device); @@ -358,3 +389,46 @@ bool checkConfigExpandableSegments(); } // namespace NPUCachingAllocator } // namespace c10_npu + +namespace c10_npu { + +// MemPool represents a pool of memory in a caching allocator. Currently, +// it's just the ID of the pool object maintained in the NPUCachingAllocator. +// +// An allocator pointer can be passed to the MemPool to define how the +// allocations should be done in the pool. For example: using a different +// system allocator such as ncclMemAlloc. +struct C10_NPU_API MemPool { + MemPool( + NPUCachingAllocator::NPUAllocator* allocator = nullptr, + bool is_user_created = true); + + MempoolId_t id(); + NPUCachingAllocator::NPUAllocator* allocator(); + +private: + static std::atomic uid_; + static std::atomic uuid_; + NPUCachingAllocator::NPUAllocator* allocator_; + bool is_user_created_; + MempoolId_t id_; +}; + +// MemPoolContext holds the currently active pool and stashes the previous +// pool. On deletion it makes the previous pool active. +struct C10_NPU_API MemPoolContext { + MemPoolContext(MemPool* mempool); + + ~MemPoolContext(); + + // getActiveMemPool() can be used to get the currently active pool. 
+ // For instance: in NPUCachingAllocator, we can route allocations + // to a user provided allocator, by doing: + + static MemPool* getActiveMemPool(); + +private: + MemPool* prev_mempool_; +}; + +} // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index 254ea3da8c..d9c1944978 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -185,6 +185,22 @@ int ExchangeDevice(int device) return device; } +bool IsContextInitialized() +{ + int32_t device = -1; + aclError err = aclrtGetDevice(&device); + if (err == ACL_ERROR_NONE) { + return true; + } else { + CHECK_AND_THROW_ERROR_WITH_SPECIFIC_MESSAGE(err); + if (err == ACL_ERROR_RT_CONTEXT_NULL) { + return false; + } + NPU_CHECK_ERROR_WITHOUT_UCE(err); + return false; + } +} + int GetLocalDevice() { return local_device; diff --git a/torch_npu/csrc/core/npu/NPUFunctions.h b/torch_npu/csrc/core/npu/NPUFunctions.h index 6731242f28..87d12cb323 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.h +++ b/torch_npu/csrc/core/npu/NPUFunctions.h @@ -148,4 +148,6 @@ C10_NPU_API inline ModelState& model_state() return model_state_; } +bool IsContextInitialized(); + } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp new file mode 100644 index 0000000000..b53b0340ee --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -0,0 +1,234 @@ +#ifndef BUILD_LIBTORCH +#include "torch_npu/csrc/core/npu/NPUGraph.h" +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/core/npu/NPUFunctions.h" + +#include +#include +#include +#include +#include + +#include +#include + +namespace c10_npu { + +static bool _npu_graphs_debug = false; +constexpr int kSynchronizeBusyWaitMillis = 10; + +MempoolId_t graph_pool_handle() +{ + // Sets just the second value, to distinguish it from MempoolId_ts created from + // aclmdlGetCaptureInfo id_s in capture_begin. + auto new_pool = c10_npu::MemPool(); + return new_pool.id(); +} + +/** + * Note [CUDA Graph Wrapper Class] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Q: Why do we need graph capture and launch bindings in Pytorch? + * Why can't they live in a user extension, for example? + * + * A1: Convenience. + * A2: To ensure valid numerics on replay, some native CUDA ops (like RNG ops with + * CPU statefulness) need cooperation from the capture and replay bindings + * (see Note [CUDA Graph-safe RNG states] in CUDAGeneratorImpl.h). + * + * We can't expect users to know about this cooperation. If users write capture + * bindings naively in an extension, they likely won't interact with the native + * ops properly. Their graphs would yield invalid numerics on replay. + */ + +/** + * Note [Interaction with CUDA graph capture] in CUDACachingAllocator.cpp + * describes memory management for captures. + */ + +std::atomic NPUGraph::pending_event_queries = 0; + +// Track any outstanding event queries that could happen e.g., in a NCCL watchdog so that they +// can be resolved before the capture begins. Note that event queries are not allowed during a +// graph capture in the default capture mode. 
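The counter below is the only synchronization used for this: producers claim a pending query before handing work to the watchdog and release the claim once the work is reaped or never enqueued, while capture_begin spins until the count drains. A fragment-level sketch of both sides (`should_enqueue` and `workEnqueue` are placeholders standing in for the ProcessGroupHCCL changes later in this patch):

    // Producer side (cf. ProcessGroupHCCL::collective):
    c10_npu::NPUGraph::inc_pending_event_queries();
    if (should_enqueue) {
        workEnqueue(work);  // watchdog will event-query the work, then dec on cleanup
    } else {
        c10_npu::NPUGraph::dec_pending_event_queries();  // nothing will ever query it
    }

    // Capture side (cf. NPUGraph::capture_begin below): event queries are not
    // allowed once capture starts, so wait for outstanding claims to drain first.
    while (c10_npu::NPUGraph::num_pending_event_queries()) {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }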
+void NPUGraph::inc_pending_event_queries() +{ + pending_event_queries++; +} + +void NPUGraph::dec_pending_event_queries() +{ + TORCH_INTERNAL_ASSERT(pending_event_queries > 0, + "Attempted to decrement the number of outstanding events to be queried, but it was <= 0."); + pending_event_queries--; +} + +int NPUGraph::num_pending_event_queries() +{ + return pending_event_queries; +} + +NPUGraph::NPUGraph() + // NPUStreams may not be default-constructed. + : capture_stream_(c10_npu::getCurrentNPUStream()) { +} + +void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) +{ + TORCH_CHECK(!has_graph_exec_, + "This NPUGraph instance already owns a captured graph. " + "To capture a new graph, create a new instance."); + + auto stream = c10_npu::getCurrentNPUStream(); + + TORCH_CHECK(stream != c10_npu::getDefaultNPUStream(), + "NPU graphs must be captured on a non-default stream. " + "(However, after capture, it's ok to replay them on the " + "default stream.)"); + + capture_stream_ = stream; + capture_dev_ = c10_npu::current_device(); + + if (pool.first != 0 || pool.second != 0) { + // Either value being nonzero means the user supplied a pool to share. + // But only one should be nonzero. + // If pool was created by another graph's capture_begin, first should be nonzero. + // If pool was created by graph_pool_handle, second should be nonzero. + TORCH_INTERNAL_ASSERT(!(pool.first && pool.second)); + mempool_id_ = pool; + } else { + // User did not ask us to share a mempool. Create graph pool handle using is_user_created=false. + // Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle(). + auto mempool = c10_npu::MemPool({}, false); + mempool_id_ = mempool.id(); + TORCH_INTERNAL_ASSERT(mempool_id_.first > 0); + } + + // Addendum: beginAllocateStreamToPool is now called before cudaStreamBeginCapture to prevent an + // autograd thread's free() call triggering an invalid cudaEventRecord in the caching allocator + // due to the capture status being updated _after_ a capture had already started. + c10_npu::NPUCachingAllocator::beginAllocateToPool(capture_dev_, mempool_id_, [this](aclrtStream stream) { + aclmdlCaptureStatus status; + uint32_t model_id; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlGetCaptureInfo(stream, &status, &model_id)); + return status == aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE && model_id == model_id_; + }); + + // At this point, any NCCL watchdogs should be aware that we are in capture mode + // and therefore should not enqueue any additional work that could be event-queried. + // We still must wait on any existing work that has not been cleaned up. + while (num_pending_event_queries()) { + TORCH_WARN_ONCE("Waiting for pending NCCL work to finish before starting graph capture."); + std::this_thread::sleep_for( + std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); + } + + // cudaStreamCaptureModeGlobal is the most conservative option to + // prevent potentially unsafe CUDA API calls during capture. 
+ NPU_CHECK_ERROR(c10_npu::acl::AclmdlBeginCapture(capture_stream_, capture_mode)); + + aclmdlCaptureStatus status; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlGetCaptureInfo(stream, &status, &model_id_)); + TORCH_INTERNAL_ASSERT(status == aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE); +} + +void NPUGraph::capture_end() +{ + auto stream = c10_npu::getCurrentNPUStream(); + + TORCH_CHECK(stream == capture_stream_, + "Capture must end on the same stream it began on."); + + uint32_t model_id; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlEndCapture(capture_stream_, &model_id)); + + c10_npu::NPUCachingAllocator::endAllocateToPool(capture_dev_, mempool_id_); + + TORCH_CHECK(model_id == model_id_, "Invalid end capture model id: ", model_id); + + // In typical graph usage some tensors (e.g. the tensors used for graph IO) are not freed + // between replays. + // If Pytorch compiles and runs with a CUDA 11.4+ toolkit, there's a chance the allocator backend + // is cudaMallocAsync. + // cudaMallocAsync is generally graph-safe, but if some tensors are not freed between replays, + // the graph's internal bookkeeping requires that we instantiate with + // cudaGraphInstantiateFlagAutoFreeOnLaunch. See + // cudaGraphLaunch + // cudaGraphInstantiateWithFlags + has_graph_exec_ = true; + + uint32_t num_graph_nodes = 0; +} + +void NPUGraph::replay() +{ + TORCH_CHECK(has_graph_exec_, + "Called NPUGraph::replay without a preceding successful capture."); + + c10::OptionalDeviceGuard device_guard{capture_stream_.device()}; + + // model_id_ may be replayed in any stream. + NPU_CHECK_ERROR(c10_npu::acl::AclmdlExecuteAsync(model_id_, c10_npu::getCurrentNPUStream())); +} + +void NPUGraph::enable_debug_mode() +{ + _npu_graphs_debug = true; +} + +void NPUGraph::debug_dump() +{ + if (_npu_graphs_debug) { + if (has_graph_exec_) { + TORCH_WARN("DEBUG: calling NPUGraph::debug_dump() for model id ", model_id_); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlDebugPrint(model_id_)); + } + } else { + TORCH_WARN("NPU Graphs debug not enabled, set with NPUGraph::enable_debug_mode()."); + } +} + +void NPUGraph::reset() +{ + // I'd prefer these checks throw exceptions, not print warnings, + // but the destructor calls reset(), and at least one CI build + // refuses to compile with a throwing destructor. + // + // Instead of calling reset() in the destructor to clean up, I could + // call reset() in the __del__ method of a thin Python wrapper, + // in which case reset would be allowed to throw exceptions. + // But Stackoverflow does not like user-defined __del__. + // __del__ prevents Graph instances from EVER being garbage collected + // if they participate in a reference cycle. + // And exceptions thrown in __del__ only print a warning anyway. + // + // Calling reset() in the C++ destructor, with warnings instead of exceptions + // if calls fail, is the compromise we chose. + // + // If capture_begin, the capture, or capture_end failed at some point, this NPUGraph, the generator, + // and the allocator could end up in all kinds of weird states depending where failure occurred. + // If the user catches the failure exception in a script, or is running in REPL or (god forbid) + // a Jupyter notebook, I don't see an easy way for reset() to gracefully fix all such possible error states. + if (has_graph_exec_) { + // notifyCaptureDestroy may throw. How should we handle this? 
+ c10_npu::NPUCachingAllocator::releasePool(capture_dev_, mempool_id_); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlUnload(model_id_)); + has_graph_exec_ = false; + } +} + +// Returns an id another graph's capture_begin can use to share the same memory pool as this graph. +MempoolId_t NPUGraph::pool() +{ + TORCH_CHECK(has_graph_exec_, + "Called NPUGraph::pool() without a preceding successful capture."); + return mempool_id_; +} + +NPUGraph::~NPUGraph() +{ + reset(); +} + +} // namespace c10_npu +#endif diff --git a/torch_npu/csrc/core/npu/NPUGraph.h b/torch_npu/csrc/core/npu/NPUGraph.h new file mode 100644 index 0000000000..b2833744c1 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUGraph.h @@ -0,0 +1,71 @@ +#pragma once + +#include +#include +#include + +#include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/core/npu/NPUStream.h" + +namespace c10_npu { + +// Standalone way to get a unique mempool id usable as a pool=... argument +// to CUDAGraph::capture_begin +TORCH_NPU_API MempoolId_t graph_pool_handle(); + +struct TORCH_NPU_API NPUGraph { + NPUGraph(); + ~NPUGraph(); + + static void inc_pending_event_queries(); + static void dec_pending_event_queries(); + static int num_pending_event_queries(); + + void capture_begin( + MempoolId_t pool = {0, 0}, + aclmdlCaptureMode capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_GLOBAL); + void capture_end(); + void replay(); + void reset(); + MempoolId_t pool(); + void enable_debug_mode(); + void debug_dump(); + +protected: + uint32_t model_id_ = -1; + + static std::atomic pending_event_queries; + + // Set to true in capture_end if NPU graph is captured succeeded + bool has_graph_exec_ = false; + + // the ID assigned by cuda during graph capture, + // used to identify when a stream is participating in capture + CaptureId_t capture_id_ = -1; + + // uuid used to request a particular private mempool from CUDACachingAllocator. + // By default, this will be set to {id_, 0}. + // + // If capture_begin is called with "pool=other_graph.pool()", this graph's mempool_id_ + // will be set to the other graph's mempool_id_, and therefore share a mempool with the + // other graph. + // + // If capture_begin is called with "pool=handle" where "handle" came from graph_pool_handle(), + // it will share a mempool with any other captures that used "pool=handle". + // + // Sharing a mempool across graphs saves memory, and it's safe if you + // know you'll replay those graphs in the same order you captured them. + MempoolId_t mempool_id_; + + // Stream on which capture began + NPUStream capture_stream_; + + // Device where capture occurred. Right now, for simplicity, we require all ops + // in a capture to run on the same device, but this is a limitation of CUDAGraph, + // not CUDA itself. We can straightforwardly modify CUDAGraph to support multi-device + // captures if needed. 
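The mempool_id_ sharing rules documented above are easiest to read as a usage sketch of the class declared here (error handling omitted; the stream helpers are assumed from torch_npu's existing NPUStream.h / NPUGuard.h utilities):

    c10_npu::NPUStream side_stream = c10_npu::getNPUStreamFromPool();
    c10_npu::NPUGraph g1, g2;
    {
        c10_npu::NPUStreamGuard guard(side_stream);  // capture must not run on the default stream
        g1.capture_begin();             // fresh private pool, mempool_id_ = {uuid, 0}
        /* ... enqueue NPU work to record ... */
        g1.capture_end();

        g2.capture_begin(g1.pool());    // share g1's pool; replay in capture order
        /* ... */
        g2.capture_end();
    }
    g1.replay();                        // replay may run on any stream afterwards
    g2.replay();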
+ int capture_dev_; +}; + +} // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/NPUGraphsUtils.h b/torch_npu/csrc/core/npu/NPUGraphsUtils.h new file mode 100644 index 0000000000..0d23b4b019 --- /dev/null +++ b/torch_npu/csrc/core/npu/NPUGraphsUtils.h @@ -0,0 +1,103 @@ +#pragma once + +#include +#include + +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/NPUMacros.h" +#include "torch_npu/csrc/core/npu/NPUStream.h" + +namespace c10_npu { + +using CaptureId_t = unsigned long long; + +// first is set if the instance is created by NPUGraph::capture_begin. +// second is set if the instance is created by at::cuda::graph_pool_handle. +using MempoolId_t = std::pair; + +// RAII guard for "aclmdlCaptureMode", a thread-local value +// that controls the error-checking strictness of a capture. +struct C10_NPU_API NPUStreamCaptureModeGuard{ + NPUStreamCaptureModeGuard(aclmdlCaptureMode desired) + : strictness_(desired) {} + ~NPUStreamCaptureModeGuard() {} + + private: + aclmdlCaptureMode strictness_; +}; + +// Protects against enum aclmdlCaptureStatus implementation changes. +// Some compilers seem not to like static_assert without the messages. +static_assert( + int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_NONE) == 0, + "unexpected int(ACL_MODEL_CAPTURE_STATUS_NONE) value"); +static_assert( + int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE) == 1, + "unexpected int(ACL_MODEL_CAPTURE_STATUS_ACTIVE) value"); +static_assert( + int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_INVALIDATED) == 2, + "unexpected int(ACL_MODEL_CAPTURE_STATUS_INVALIDATED) value"); + +enum class CaptureStatus : int { + None = int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_NONE), + Active = int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE), + Invalidated = int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_INVALIDATED) +}; + +inline std::ostream &operator<<(std::ostream &os, CaptureStatus status) +{ + switch (status) { + case CaptureStatus::None: + os << "npuStreamCaptureStatusNone"; + break; + case CaptureStatus::Active: + os << "npuStreamCaptureStatusActive"; + break; + case CaptureStatus::Invalidated: + os << "npuStreamCaptureStatusInvalidated"; + break; + default: + TORCH_INTERNAL_ASSERT( + false, "Unknown NPU graph CaptureStatus", int(status)); + } + return os; +} + +// Use this version where you're sure a CUDA context exists already. +inline CaptureStatus currentStreamCaptureStatusMayInitCtx() +{ + if (!c10_npu::acl::IsCaptureSupported()) { + return CaptureStatus::None; + } + + aclmdlCaptureStatus is_capturing{ACL_MODEL_CAPTURE_STATUS_NONE}; + uint32_t modelId; + NPU_CHECK_ERROR( + c10_npu::acl::AclmdlGetCaptureInfo(c10_npu::getCurrentNPUStream(), &is_capturing, &modelId)); + return CaptureStatus(is_capturing); +} + +// Use this version where you don't want to create a CUDA context if none exists. +inline CaptureStatus currentStreamCaptureStatus() +{ + // don't create a context if we don't have to + if (c10_npu::IsContextInitialized()) { + return currentStreamCaptureStatusMayInitCtx(); + } else { + return CaptureStatus::None; + } +} + +inline void assertNotCapturing(const std::string &attempt) +{ + auto status = currentStreamCaptureStatus(); + TORCH_CHECK(status == CaptureStatus::None, + attempt, + " during NPU graph capture. If you need this call to be captured, " + "please file an issue. 
" + "Current npuStreamCaptureStatus: ", + status); +} + +} // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 6fc9936cf8..5235d048bb 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -1,6 +1,5 @@ #include "AclInterface.h" #include -#include "third_party/acl/inc/acl/acl_rt.h" #include "third_party/op-plugin/op_plugin/utils/op_api_common.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" @@ -70,6 +69,12 @@ LOAD_FUNCTION(aclrtPeekAtLastError) LOAD_FUNCTION(aclrtSynchronizeDevice) LOAD_FUNCTION(aclrtSynchronizeDeviceWithTimeout) LOAD_FUNCTION(aclrtEventGetTimestamp) +LOAD_FUNCTION(aclmdlBeginCapture) +LOAD_FUNCTION(aclmdlGetCaptureInfo) +LOAD_FUNCTION(aclmdlEndCapture) +LOAD_FUNCTION(aclmdlDebugPrint) +LOAD_FUNCTION(aclmdlExecuteAsync) +LOAD_FUNCTION(aclmdlUnload) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -703,5 +708,97 @@ aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp) return func(event, timestamp); } +aclError AclmdlBeginCapture(aclrtStream stream, aclmdlCaptureMode mode) +{ + typedef aclError (*AclmdlBeginCapture)(aclrtStream, aclmdlCaptureMode); + static AclmdlBeginCapture func = nullptr; + if (func == nullptr) { + func = (AclmdlBeginCapture) GET_FUNC(aclmdlBeginCapture); + } + + TORCH_CHECK(func, "Failed to find function aclmdlBeginCapture", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, mode); +} + +aclError AclmdlGetCaptureInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId) +{ + typedef aclError (*AclmdlGetCaptureInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); + static AclmdlGetCaptureInfo func = nullptr; + if (func == nullptr) { + func = (AclmdlGetCaptureInfo) GET_FUNC(aclmdlGetCaptureInfo); + } + + TORCH_CHECK(func, "Failed to find function aclmdlGetCaptureInfo", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, status, modelId); +} + +aclError AclmdlEndCapture(aclrtStream stream, uint32_t *modelId) +{ + typedef aclError (*AclmdlEndCapture)(aclrtStream, uint32_t *); + static AclmdlEndCapture func = nullptr; + if (func == nullptr) { + func = (AclmdlEndCapture) GET_FUNC(aclmdlEndCapture); + } + + TORCH_CHECK(func, "Failed to find function aclmdlEndCapture", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, modelId); +} + +aclError AclmdlDebugPrint(uint32_t modelId) +{ + typedef aclError (*AclmdlDebugPrint)(uint32_t); + static AclmdlDebugPrint func = nullptr; + if (func == nullptr) { + func = (AclmdlDebugPrint) GET_FUNC(aclmdlDebugPrint); + } + + TORCH_CHECK(func, "Failed to find function aclmdlDebugPrint", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(modelId); +} + +aclError AclmdlExecuteAsync(uint32_t modelId, aclrtStream stream) +{ + typedef aclError (*AclmdlExecuteAsync)(uint32_t, const aclmdlDataset *, aclmdlDataset *, aclrtStream); + static AclmdlExecuteAsync func = nullptr; + if (func == nullptr) { + func = (AclmdlExecuteAsync) GET_FUNC(aclmdlExecuteAsync); + } + + TORCH_CHECK(func, "Failed to find function aclmdlExecuteAsync", PTA_ERROR(ErrCode::NOT_FOUND)); + + static aclmdlDataset *inputs = aclmdlCreateDataset(); + static aclmdlDataset *outputs = aclmdlCreateDataset(); + return func(modelId, inputs, outputs, stream); +} + +aclError AclmdlUnload(uint32_t modelId) +{ + typedef aclError (*AclmdlUnload)(uint32_t); + static AclmdlUnload func = 
nullptr; + if (func == nullptr) { + func = (AclmdlUnload) GET_FUNC(aclmdlUnload); + } + + TORCH_CHECK(func, "Failed to find function aclmdlUnload", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(modelId); +} + +bool IsCaptureSupported() +{ + static bool is_support = false; + static bool have_load_func = false; + static bool default_support_capture = ((GetSocVersion() >= SocVersion::Ascend910B1) && + (GetSocVersion() < SocVersion::Ascend310B1)) || + (GetSocVersion() >= SocVersion::Ascend910_9391); + if (default_support_capture && !have_load_func) { + have_load_func = true; + typedef aclError (*AclmdlGetCaptureInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); + static AclmdlGetCaptureInfo func = (AclmdlGetCaptureInfo) GET_FUNC(aclmdlGetCaptureInfo); + is_support = (func != nullptr); + } + + return is_support; +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 2a18f0fd5a..596281f649 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -4,6 +4,7 @@ #include "third_party/acl/inc/acl/acl_rt.h" #include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_mdl.h" #include "third_party/acl/inc/acl/acl_prof.h" #include "torch_npu/csrc/core/npu/interface/HcclInterface.h" @@ -181,5 +182,19 @@ aclError AclrtSynchronizeDeviceWithTimeout(void); aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp); +aclError AclmdlBeginCapture(aclrtStream stream, aclmdlCaptureMode mode); + +aclError AclmdlGetCaptureInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); + +aclError AclmdlEndCapture(aclrtStream stream, uint32_t *modelId); + +aclError AclmdlDebugPrint(uint32_t modelId); + +aclError AclmdlExecuteAsync(uint32_t modelId, aclrtStream stream); + +aclError AclmdlUnload(uint32_t modelId); + +bool IsCaptureSupported(); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index e68ff7d2a2..3ebc40365d 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -25,6 +25,8 @@ #include "torch_npu/csrc/core/NPUStorageImpl.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" +#include "torch_npu/csrc/core/npu/NPUGraph.h" +#include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" #include "torch_npu/csrc/core/npu/NPUAffinityController.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" @@ -1195,6 +1197,7 @@ void ProcessGroupHCCL::workCleanupLoop() refreshStatusInfo(work, "end"); // Update Statusinfo,but not write into the map } it = workMetaList_.erase(it); + c10_npu::NPUGraph::dec_pending_event_queries(); } else { if (status_save_enable && work.isStarted()) { refreshStatusInfo(work, "start"); // Update Statusinfo,but not write into the map @@ -2326,6 +2329,8 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( PostProcess post, c10d::OpType opType) { + c10_npu::CaptureStatus capture_status = c10_npu::currentStreamCaptureStatusMayInitCtx(); + // Bump collective counter seq_++; op_id_++; @@ -2471,8 +2476,11 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( work->blockingWait_ = blockingWait_; work->opTimeout_ = options_->timeout; work->store_ = store_; - if (asyncErrorHandling_ != NoHandling) { + 
c10_npu::NPUGraph::inc_pending_event_queries(); + if (asyncErrorHandling_ != NoHandling && capture_status == c10_npu::CaptureStatus::None) { workEnqueue(work); + } else { + c10_npu::NPUGraph::dec_pending_event_queries(); } return work; @@ -2487,7 +2495,8 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( PostProcess post, c10d::OpType opType) { - // Bump collective counter + c10_npu::CaptureStatus capture_status = c10_npu::currentStreamCaptureStatusMayInitCtx(); + // Bump collective counter seq_++; op_id_++; @@ -2628,8 +2637,11 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( work->blockingWait_ = blockingWait_; work->opTimeout_ = options_->timeout; work->store_ = store_; - if (asyncErrorHandling_ != NoHandling) { + c10_npu::NPUGraph::inc_pending_event_queries(); + if (asyncErrorHandling_ != NoHandling && capture_status == c10_npu::CaptureStatus::None) { workEnqueue(work); + } else { + c10_npu::NPUGraph::dec_pending_event_queries(); } return work; @@ -2644,6 +2656,7 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( PreProcess pre, PostProcess post) { + c10_npu::CaptureStatus capture_status = c10_npu::currentStreamCaptureStatusMayInitCtx(); const auto devices = getDeviceList(tensors); int p2pRank = 0, p2pTargetRank = 0; bool isSendRecvSelf = false; @@ -2780,15 +2793,6 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( } post(hcclStreams_[key], work); - // End event should only be recorded after the hcclGroupEnd() - for (const auto i : c10::irange(tensors.size())) { - c10_npu::NPUStream& hcclStream = hcclStreams_[key][i]; - (*(work->hcclEndEvents_))[i].record(hcclStream); - work->hcclComms_[i] = hcclComms[i]; - work->blockingWait_ = blockingWait_; - work->opTimeout_ = options_->timeout; - work->store_ = store_; - } // Future only needs to be created and marked completed with outputs for // recv(), but still create future for use cases such as profiling even for // send(). 
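The capture_status gating above relies on the helpers from NPUGraphsUtils.h added earlier in this patch. The two usage patterns, in sketch form (the aclop message matches the OpCommand::Run change below):

    // Branch on capture status; currentStreamCaptureStatus() avoids creating an
    // ACL context, while currentStreamCaptureStatusMayInitCtx() assumes one exists.
    if (c10_npu::currentStreamCaptureStatus() == c10_npu::CaptureStatus::None) {
        // safe to enqueue watchdog work / query events
    }

    // Paths that can never be captured simply assert and throw:
    c10_npu::assertNotCapturing("Cannot run aclop operators");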
@@ -2800,8 +2804,21 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( work->future_->markCompleted(at::IValue(*work->outputs_)); } - if (asyncErrorHandling_ != NoHandling) { + // End event should only be recorded after the hcclGroupEnd() + for (const auto i : c10::irange(tensors.size())) { + c10_npu::NPUStream& hcclStream = hcclStreams_[key][i]; + (*(work->hcclEndEvents_))[i].record(hcclStream); + work->hcclComms_[i] = hcclComms[i]; + work->blockingWait_ = blockingWait_; + work->opTimeout_ = options_->timeout; + work->store_ = store_; + } + + c10_npu::NPUGraph::inc_pending_event_queries(); + if (asyncErrorHandling_ != NoHandling && capture_status == c10_npu::CaptureStatus::None) { workEnqueue(work); + } else { + c10_npu::NPUGraph::dec_pending_event_queries(); } return work; diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 54f0b8b863..f8551f178b 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -12,6 +12,7 @@ #include "torch_npu/csrc/framework/LazyInitAclops.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" +#include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" #endif @@ -131,6 +132,11 @@ OpCommand& OpCommand::Output( } void OpCommand::Run() { + // Check for npu graph + if (aclCmd->CheckCustomHandlerNull()) { + c10_npu::assertNotCapturing("Cannot run aclop operators"); + } + aclCmd->SetEnginePriority(); const string &op_name = aclCmd->GetName(); at_npu::aclops::LazyInitAclops(); diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index a586c3f0ab..45921f36d0 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -200,6 +200,11 @@ public: execParam.customHandler = func; } + bool CheckCustomHandlerNull() + { + return execParam.customHandler == nullptr; + } + const string &GetName() const { return opName; } void AddInput( diff --git a/torch_npu/csrc/npu/Graph.cpp b/torch_npu/csrc/npu/Graph.cpp new file mode 100644 index 0000000000..b5d24efcfa --- /dev/null +++ b/torch_npu/csrc/npu/Graph.cpp @@ -0,0 +1,66 @@ +#include + +#include + +#include +#include + +#include "torch_npu/csrc/core/npu/NPUGraph.h" +#include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" + +template +using shared_ptr_class_ = py::class_>; + +void TORCH_NPU_API THNPGraph_init(PyObject* module) { + // Pybind11 patch notes say "py::module_" is more up-to-date syntax, + // but CI linter and some builds prefer "module". + auto torch_N_m = py::handle(module).cast(); + + torch_N_m.def("_graph_pool_handle", &c10_npu::graph_pool_handle); + + shared_ptr_class_(torch_N_m, "_NPUGraph") + .def(py::init<>()) + .def( + "capture_begin", + [](c10_npu::NPUGraph& self, + std::optional pool_opt, + std::string capture_error_mode) { + aclmdlCaptureMode capture_mode; + c10_npu::MempoolId_t pool = pool_opt.has_value() + ? pool_opt.value() :c10_npu::MempoolId_t{0, 0}; + if (capture_error_mode == "global") { + capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_GLOBAL; + } else if (capture_error_mode == "thread_local") { + capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_THREAD_LOCAL; + } else if (capture_error_mode == "relaxed") { + capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_RELAXED; + } else { + TORCH_CHECK( + false, + "Unknown capture error mode. 
Expected `global`, `thread_local`, or `relaxed`, got ", + capture_error_mode); + } + return self.capture_begin(pool, capture_mode); + }, + py::arg("pool"), + py::arg("capture_error_mode"), + py::call_guard()) + .def( + "capture_end", + torch::wrap_pybind_function_no_gil(&c10_npu::NPUGraph::capture_end)) + .def( + "replay", + torch::wrap_pybind_function_no_gil(&c10_npu::NPUGraph::replay)) + .def( + "reset", + torch::wrap_pybind_function_no_gil(&c10_npu::NPUGraph::reset)) + .def( + "pool", + torch::wrap_pybind_function_no_gil(&c10_npu::NPUGraph::pool)) + .def( + "debug_dump", + torch::wrap_pybind_function_no_gil(&c10_npu::NPUGraph::debug_dump)) + .def( + "enable_debug_mode", + torch::wrap_pybind_function_no_gil(&c10_npu::NPUGraph::enable_debug_mode)); +} diff --git a/torch_npu/csrc/npu/MemPool.cpp b/torch_npu/csrc/npu/MemPool.cpp new file mode 100644 index 0000000000..7a14933686 --- /dev/null +++ b/torch_npu/csrc/npu/MemPool.cpp @@ -0,0 +1,21 @@ +#include + +#include +#include + +#include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" + +template +using shared_ptr_class_ = py::class_>; + +void TORCH_NPU_API THNPMemPool_init(PyObject* module) { + auto torch_C_m = py::handle(module).cast(); + shared_ptr_class_<::c10_npu::MemPool>(torch_C_m, "_MemPool") + .def(py::init()) + .def_property_readonly("id", &::c10_npu::MemPool::id) + .def_property_readonly("allocator", &::c10_npu::MemPool::allocator); + shared_ptr_class_<::c10_npu::MemPoolContext>(torch_C_m, "_MemPoolContext") + .def(py::init()) + .def_static( + "active_pool", &::c10_npu::MemPoolContext::getActiveMemPool); +} diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index df23a997f1..07568a31c6 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -221,6 +221,27 @@ void RegisterNpuPluggableAllocator(PyObject* module) return torch::npu::NPUPluggableAllocator::createCustomAllocator( malloc_fn, free_fn); }); + m.def( + "_npu_beginAllocateCurrentStreamToPool", + [](c10::DeviceIndex device, c10_npu::MempoolId_t mempool_id) { + auto stream = c10_npu::getCurrentNPUStream(device); + TORCH_CHECK(stream, "Expected stream capture to be under way"); + c10_npu::NPUCachingAllocator::beginAllocateToPool( + device, mempool_id, [stream](aclrtStream target) { + return target == stream; + }); + }); + m.def( + "_npu_beginAllocateToPool", + [](c10::DeviceIndex device, c10_npu::MempoolId_t mempool_id) { + c10_npu::NPUCachingAllocator::beginAllocateToPool( + device, mempool_id, [](aclrtStream) { return true; }); + }); + m.def( + "_npu_endAllocateCurrentStreamToPool", + [](c10::DeviceIndex device, c10_npu::MempoolId_t mempool_id) { + c10_npu::NPUCachingAllocator::endAllocateToPool(device, mempool_id); + }); } PyObject* THNPModule_msTxMark(PyObject* self, PyObject* args) @@ -542,6 +563,21 @@ PyObject* THNPModule_setStream_wrap( END_HANDLE_TH_ERRORS } +PyObject* THNPModule_isCurrentStreamCapturing_wrap( + PyObject* self, + PyObject* noargs) +{ + HANDLE_TH_ERRORS + // If there's no npu context, c10_npu::currentStreamCaptureStatus returns + // CaptureStatus::None without initializing a context. 
+ if (c10_npu::currentStreamCaptureStatus() == c10_npu::CaptureStatus::None) { + Py_RETURN_FALSE; + } else { + Py_RETURN_TRUE; + } + END_HANDLE_TH_ERRORS +} + PyObject *THNPModule_is_jit_compile_false_wrap(PyObject *self, PyObject *noargs) { HANDLE_TH_ERRORS @@ -1275,6 +1311,7 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_getCurrentRawStream", (PyCFunction)THNPModule_getCurrentStream_raw, METH_O, nullptr}, {"_npu_getDefaultStream", (PyCFunction)THNPModule_getDefaultStream_wrap, METH_O, nullptr}, {"_npu_setStream", (PyCFunction)THNPModule_setStream_wrap, METH_VARARGS | METH_KEYWORDS, nullptr}, + {"_npu_isCurrentStreamCapturing", (PyCFunction)THNPModule_isCurrentStreamCapturing_wrap, METH_NOARGS, nullptr}, {"_npu_is_jit_compile_false", (PyCFunction)THNPModule_is_jit_compile_false_wrap, METH_NOARGS, nullptr}, {"_npu_setMemoryFraction", (PyCFunction) THNPModule_setMemoryFraction, METH_VARARGS, nullptr}, {"_npu_emptyCache", (PyCFunction) THNPModule_emptyCache, METH_NOARGS, nullptr}, diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index d3bf501f6e..bcd802465d 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -228,6 +228,38 @@ c10_npu::NPUCachingAllocator::SnapshotInfo NPUPluggableAllocator::snapshot() "If you need it, please file an issue describing your use case."); } +// CUDAGraph interactions +void NPUPluggableAllocator::beginAllocateToPool( + c10::DeviceIndex device, + c10_npu::MempoolId_t mempool_id, + std::function filter) +{ + TORCH_CHECK( + false, + "NPUPluggableAllocator does not yet support beginAllocateToPool. " + "If you need it, please file an issue describing your use case."); +} + +void NPUPluggableAllocator::endAllocateToPool( + c10::DeviceIndex device, + c10_npu::MempoolId_t mempool_id) +{ + TORCH_CHECK( + false, + "NPUPluggableAllocator does not yet support endAllocateToPool. " + "If you need it, please file an issue describing your use case."); +} + +void NPUPluggableAllocator::releasePool( + c10::DeviceIndex device, + c10_npu::MempoolId_t mempool_id) +{ + TORCH_CHECK( + false, + "NPUPluggableAllocator does not yet support releasePool. " + "If you need it, please file an issue describing your use case."); +} + void NPUPluggableAllocator::FreeDeviceCachedMemory(int device) { TORCH_NPU_WARN("NPUPluggableAllocator does not yet support FreeDeviceCachedMemory. 
" diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index 9050b6594a..c766619bb1 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -66,6 +66,17 @@ struct NPUPluggableAllocator void resetAccumulatedStats(int device) override; void resetPeakStats(int device) override; c10_npu::NPUCachingAllocator::SnapshotInfo snapshot() override; + + // CUDAGraph interactions + void beginAllocateToPool( + c10::DeviceIndex device, + c10_npu::MempoolId_t mempool_id, + std::function) override; + void endAllocateToPool( + c10::DeviceIndex device, + c10_npu::MempoolId_t mempool_id) override; + void releasePool(c10::DeviceIndex device, c10_npu::MempoolId_t mempool_id) override; + void FreeDeviceCachedMemory(int device) override; std::string name() override; void copy_data(void* dest, const void* src, std::size_t count) const final; diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index 6ac471873e..12d67a0bb8 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -46,6 +46,9 @@ __all__ = [ "max_memory_cached", "memory_snapshot", "memory_summary", + "MemPool", + "MemPoolContext", + "use_mem_pool", "get_allocator_backend", "NPUPluggableAllocator", "change_current_allocator", @@ -100,7 +103,12 @@ __all__ = [ "enable_deterministic_with_backward", "disable_deterministic_with_backward", "mstx", - "SyncLaunchStream" + "SyncLaunchStream", + "NPUGraph", + "graph", + "graph_pool_handle", + "is_current_stream_capturing", + "make_graphed_callables" ] from typing import Tuple, Union @@ -130,6 +138,14 @@ from .backends import * # noqa: F403 from ._backends import * # noqa: F403 from .deterministic import enable_deterministic_with_backward, disable_deterministic_with_backward # noqa: F403 +from .graphs import ( + NPUGraph, + graph, + graph_pool_handle, + is_current_stream_capturing, + make_graphed_callables, +) + # init profiler if not torch_npu._C._profiler_init(): raise RuntimeError("proflier initialization failed" + prof_error(ErrCode.UNAVAIL)) diff --git a/torch_npu/npu/graphs.py b/torch_npu/npu/graphs.py new file mode 100644 index 0000000000..e78f60f365 --- /dev/null +++ b/torch_npu/npu/graphs.py @@ -0,0 +1,468 @@ +import gc +import typing + +import torch +import torch_npu._C +from .utils import _dummy_type + + +if not hasattr(torch_npu._C, "_NPUStreamBase"): + # Define dummy base classes + torch_npu._C.__dict__["_NPUGraph"] = _dummy_type("_NPUGraph") + torch_npu._C.__dict__["_graph_pool_handle"] = _dummy_type("_graph_pool_handle") + torch_npu._C.__dict__["_npu_isCurrentStreamCapturing"] = _dummy_type( + "_npu_isCurrentStreamCapturing" + ) + +from torch_npu._C import ( # noqa: F401 + _npu_isCurrentStreamCapturing, + _NPUGraph, + _graph_pool_handle, +) + + +def is_current_stream_capturing(): + r"""Return True if NPU graph capture is underway on the current NPU stream, False otherwise. + + If a NPU context does not exist on the current device, returns False without initializing the context. + """ + return _npu_isCurrentStreamCapturing() + + +# Python shim helps Sphinx process docstrings more reliably. +def graph_pool_handle(): + r"""Return an opaque token representing the id of a graph memory pool. + + See :ref:`Graph memory management`. + + .. warning:: + This API is in beta and may change in future releases. + """ + return _graph_pool_handle() + + +# Python shim helps Sphinx process docstrings more reliably. 
+class NPUGraph(torch_npu._C._NPUGraph): + r"""Wrapper around a NPU graph. + + .. warning:: + This API is in beta and may change in future releases. + """ + + def __new__(cls): + return super().__new__(cls) + + def capture_begin(self, pool=None, capture_error_mode="global"): + r"""Begin capturing NPU work on the current stream. + + Typically, you shouldn't call ``capture_begin`` yourself. + Use :class:`~torch.npu.graph` or :func:`~torch.npu.make_graphed_callables`, + which call ``capture_begin`` internally. + + Arguments: + pool (optional): Token (returned by :func:`~torch.npu.graph_pool_handle` or + :meth:`other_Graph_instance.pool()`) that hints this graph may share memory + with the indicated pool. See :ref:`Graph memory management`. + capture_error_mode (str, optional): specifies the aclmdlCaptureMode for the graph capture stream. + Can be "global", "thread_local" or "relaxed". During npu graph capture, some actions, such as npuMalloc, + may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for + actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting + unless you're familiar with `aclmdlCaptureMode`_ + """ # noqa: B950 + super().capture_begin(pool=pool, capture_error_mode=capture_error_mode) + + def capture_end(self): + r"""End NPU graph capture on the current stream. + + After ``capture_end``, ``replay`` may be called on this instance. + + Typically, you shouldn't call ``capture_end`` yourself. + Use :class:`~torch.npu.graph` or :func:`~torch.npu.make_graphed_callables`, + which call ``capture_end`` internally. + """ + super().capture_end() + + def replay(self): + r"""Replay the NPU work captured by this graph.""" + super().replay() + + def reset(self): + r"""Delete the graph currently held by this instance.""" + super().reset() + + def pool(self): + r"""Return an opaque token representing the id of this graph's memory pool. + + This id can optionally be passed to another graph's ``capture_begin``, + which hints the other graph may share the same memory pool. + """ + return super().pool() + + +class graph: + r"""Context-manager that captures NPU work into a :class:`torch.npu.NPUGraph` object for later replay. + + See :ref:`CUDA Graphs ` for a general introduction, + detailed use, and constraints. + + Arguments: + npu_graph (torch.npu.NPUGraph): Graph object used for capture. + pool (optional): Opaque token (returned by a call to :func:`~torch.npu.graph_pool_handle()` or + :meth:`other_Graph_instance.pool()`) hinting this graph's capture + may share memory from the specified pool. See :ref:`Graph memory management`. + stream (torch.npu.Stream, optional): If supplied, will be set as the current stream in the context. + If not supplied, ``graph`` sets its own internal side stream as the current stream in the context. + capture_error_mode (str, optional): specifies the aclmdlCaptureMode for the graph capture stream. + Can be "global", "thread_local" or "relaxed". During npu graph capture, some actions, such as npuMalloc, + may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for + actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting + unless you're familiar with `aclmdlCaptureMode`_ + + .. note:: + For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture + used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture. + + .. 
warning:: + This API is in beta and may change in future releases. + """ # noqa: B950 + + default_capture_stream: typing.Optional["torch.npu.Stream"] = None + + def __init__( + self, + npu_graph, + pool=None, + stream=None, + capture_error_mode: str = "global", + ): + # Lazy-init of default_capture_stream helps avoid circular-import errors. + # Not thread safe, but graphs already have the general (explicitly documented) + # restriction that only one capture may be underway at a time in the process. + if self.__class__.default_capture_stream is None: + self.__class__.default_capture_stream = torch.npu.Stream() + + self.pool = () if pool is None else (pool,) + self.capture_stream = ( + stream if stream is not None else self.__class__.default_capture_stream + ) + if self.capture_stream is None: + raise RuntimeError("capture stream is None") + self.stream_ctx = torch.npu.stream(self.capture_stream) + self.npu_graph = npu_graph + self.capture_error_mode = capture_error_mode + + def __enter__(self): + # Free as much memory as we can for the graph + torch.npu.synchronize() + gc.collect() + torch.npu.empty_cache() + + # Stackoverflow seems comfortable with this pattern + self.stream_ctx.__enter__() + + self.npu_graph.capture_begin( + *self.pool, capture_error_mode=self.capture_error_mode + ) + + def __exit__(self, exc_type, exc_value, traceback): + self.npu_graph.capture_end() + self.stream_ctx.__exit__(exc_type, exc_value, traceback) + # returning None should propagate exceptions from either capture_end or stream_ctx.__exit__() + + +def make_graphed_callables( + callables, sample_args, num_warmup_iters=3, allow_unused_input=False, pool=None +): + r"""Accept callables (functions or :class:`nn.Module`\ s) and returns graphed versions. + + Each graphed callable's forward pass runs its source callable's + forward CUDA work as a CUDA graph inside a single autograd node. + + The graphed callable's forward pass also appends + a backward node to the autograd graph. During backward, this node runs the + callable's backward work as a CUDA graph. + + Therefore, each graphed callable should be a drop-in replacement for its source callable + in an autograd-enabled training loop. + + See :ref:`Partial-network capture` for detailed use and constraints. + + If you pass a tuple of several callables, their captures will use the same memory pool. + See :ref:`Graph memory management` for when this is appropriate. + + Arguments: + callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph. + See :ref:`Graph memory management` for when passing a tuple of callables + is appropriate. If you pass a tuple of callables, their order in the tuple must be the same order + they'll run in the live workload. + sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable. + If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors. + If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors. + num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs + 11 iterations for warm up. Default: ``3``. + allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs + (and therefore their grad is always zero) is an error. Defaults to False. 
+ pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or + :meth:`other_Graph_instance.pool()`) that hints this graph may share memory + with the indicated pool. See :ref:`Graph memory management`. + .. note:: + The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state + that's expected for the corresponding real input in the training loop. + + .. warning:: + This API is in beta and may change in future releases. + + .. warning:: + ``sample_args`` for each callable must contain only Tensors. Other types are not allowed. + + .. warning:: + Returned callables do not support higher order differentiation (e.g., double backward). + + .. warning:: + In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters + may be trainable. Buffers must have ``requires_grad=False``. + + .. warning:: + After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`, + you may not add or remove any of that Module's parameters or buffers. + + .. warning:: + :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks + registered on them at the time they are passed. However, registering hooks on modules *after* passing them + through :func:`~torch.cuda.make_graphed_callables` is allowed. + + .. warning:: + When running a graphed callable, you must pass its arguments in the same order and format + they appeared in that callable's ``sample_args``. + + .. warning:: + The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled + caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`. + """ + if torch_npu.npu.is_autocast_enabled() and torch.is_autocast_cache_enabled(): + raise RuntimeError( + "make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`." + ) + + just_one_callable = False + + if not isinstance(callables, tuple): + just_one_callable = True + callables = (callables,) + sample_args = (sample_args,) + + flatten_sample_args = [] + + for c, args in zip(callables, sample_args): + if isinstance(c, torch.nn.Module): + if len(c._backward_hooks) > 0 or len(c._forward_hooks) > 0 or len(c._forward_pre_hooks) > 0: + raise RuntimeError("Modules must not have hooks registered at the time they are passed. However, " + + "registering hooks on modules after passing them through make_graphed_callables is allowed.") + if any(b.requires_grad for b in c.buffers()): + raise RuntimeError("In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`," + + " only parameters may be trainable. All buffers must have ``requires_grad=False``.") + flatten_arg = torch.utils._pytree.arg_tree_leaves(*args) + flatten_sample_args.append(tuple(flatten_arg)) + if not all(isinstance(arg, torch.Tensor) for arg in flatten_arg): + raise RuntimeError("In the beta API, sample_args " + + "for each callable must contain only Tensors. Other types are not allowed.") + + # If a callable is an nn.Module, its graph's full input surface is the args the user explicitly + # passes to forward (ie, its sample_args) AND the module's parameter attributes. 
+ per_callable_len_user_args = [len(args) for args in flatten_sample_args] + per_callable_module_params = [ + tuple(c.parameters()) if isinstance(c, torch.nn.Module) else () + for c in callables + ] + per_callable_static_input_surfaces = [ + flatten_sample_args[i] + per_callable_module_params[i] + for i in range(len(callables)) + ] + + fwd_graphs = [torch_npu.npu.NPUGraph() for _ in range(len(callables))] + bwd_graphs = [torch_npu.npu.NPUGraph() for _ in range(len(callables))] + + mempool = graph_pool_handle() if pool is None else pool + + # Warmup + # Hopefully prevents cudnn benchmarking and other lazy-initialization cuda work + # from ending up in any captures. + torch_npu.npu.synchronize() + with torch_npu.npu.stream(torch_npu.npu.Stream()): + for func, args, static_input_surface in zip( + callables, sample_args, per_callable_static_input_surfaces + ): + grad_inputs, outputs, outputs_grad = None, None, None + for _ in range(num_warmup_iters): + outputs = torch.utils._pytree.tree_leaves(func(*args)) + outputs_grad = tuple(o for o in outputs if o.requires_grad) + if len(outputs_grad) > 0: + grad_inputs = torch.autograd.grad( + outputs=outputs_grad, + inputs=tuple( + i for i in static_input_surface if i.requires_grad + ), + grad_outputs=tuple( + torch.empty_like(o) for o in outputs if o.requires_grad + ), + only_inputs=True, + allow_unused=allow_unused_input, + ) + for v in [outputs, outputs_grad, grad_inputs]: + del v + + torch_npu.npu.synchronize() + + # All captures here share a mempool. To avoid replays corrupting each other's memory, + # the safest approach is to capture all passes in the same order they'll run: + # fwd 1, fwd 2, ... fwd N, then bwd N, bwd N-1, ... bwd 1. + + # Capture forward graphs + per_callable_static_outputs = [] + per_callable_output_unflatten_spec = [] + for func, args, fwd_graph in zip(callables, sample_args, fwd_graphs): + with torch_npu.npu.graph(fwd_graph, pool=mempool): + outputs = func(*args) + + flatten_outputs, spec = torch.utils._pytree.tree_flatten(outputs) + per_callable_static_outputs.append(tuple(flatten_outputs)) + per_callable_output_unflatten_spec.append(spec) + + # Capture backward graphs in reverse order + per_callable_static_grad_outputs = [] + per_callable_static_grad_inputs = [] + for static_input_surface, static_outputs, bwd_graph, module_params in zip( + reversed(per_callable_static_input_surfaces), + reversed(per_callable_static_outputs), + reversed(bwd_graphs), + reversed(per_callable_module_params), + ): + # For now, assumes all static_outputs require grad + static_grad_outputs = tuple( + torch.empty_like(o) if o.requires_grad else None for o in static_outputs + ) + + outputs_grad = tuple(o for o in static_outputs if o.requires_grad) + grad_inputs = None + if len(outputs_grad) > 0: + with torch_npu.npu.graph(bwd_graph, pool=mempool): + grad_inputs = torch.autograd.grad( + outputs=outputs_grad, + inputs=tuple(i for i in static_input_surface if i.requires_grad), + grad_outputs=tuple(o for o in static_grad_outputs if o is not None), + only_inputs=True, + allow_unused=allow_unused_input, + ) + + # Constructs a tuple suitable for returning from Graphed.backward: + # Pads out the actually-needed grads with Nones in gradient slots for inputs that don't require grad. + # I couldn't think of a slick one-liner for this pattern. 
+ static_grad_inputs = [] + grad_idx = 0 + for arg in static_input_surface: + if arg.requires_grad and grad_inputs is not None: + static_grad_inputs.append(grad_inputs[grad_idx]) + grad_idx += 1 + else: + static_grad_inputs.append(None) # type: ignore[arg-type] + static_grad_inputs = tuple(static_grad_inputs) # type: ignore[assignment] + + per_callable_static_grad_outputs.append(static_grad_outputs) + per_callable_static_grad_inputs.append(static_grad_inputs) + + # Reverses the most recent two lists + per_callable_static_grad_outputs.reverse() + per_callable_static_grad_inputs.reverse() + # Now for every per_callable list, per_callable_*[i] holds the stuff for the ith callable. + + def make_graphed_autograd_function( + fwd_graph, + bwd_graph, + module_params, + len_user_args, + output_unflatten_spec, + static_input_surface, + static_outputs, + static_grad_outputs, + static_grad_inputs, + ): + class Graphed(torch.autograd.Function): + @staticmethod + def forward(ctx, *inputs): + # At this stage, only the user args may (potentially) be new tensors. + for i in range(len_user_args): + if static_input_surface[i].data_ptr() != inputs[i].data_ptr(): + static_input_surface[i].copy_(inputs[i]) + fwd_graph.replay() + if not isinstance(static_outputs, tuple): + raise RuntimeError("static_outputs is not tuple.") + return tuple(o.detach() for o in static_outputs) + + @staticmethod + @torch.autograd.function.once_differentiable + def backward(ctx, *grads): + if (len(grads) != len(static_grad_inputs)): + raise RuntimeError("The length of grads" + + " is not equal with the length of static_grad_inputs.") + for g, grad in zip(static_grad_outputs, grads): + if g is not None: + # don't copy if autograd gods have been kind and the + # incoming grad is already in the right place + if g.data_ptr() != grad.data_ptr(): + g.copy_(grad) + bwd_graph.replay() + + # Input args that didn't require grad expect a None gradient. + if not isinstance(static_grad_inputs, tuple): + raise RuntimeError("static_grad_inputs is not tuple.") + return tuple( + b.detach() if b is not None else b for b in static_grad_inputs + ) + + def functionalized(*user_args): + # Runs the autograd function with inputs == all inputs to the graph that might require grad + # (explicit user args + module parameters) + # Assumes module params didn't change since capture. 
+ flatten_user_args = torch.utils._pytree.arg_tree_leaves(*user_args) + out = Graphed.apply(*(tuple(flatten_user_args) + module_params)) + return torch.utils._pytree.tree_unflatten(out, output_unflatten_spec) + + return functionalized + + # Put together the final graphed callables + ret = [] + for i, func in enumerate(callables): + graphed = make_graphed_autograd_function( + fwd_graphs[i], + bwd_graphs[i], + per_callable_module_params[i], + per_callable_len_user_args[i], + per_callable_output_unflatten_spec[i], + per_callable_static_input_surfaces[i], + per_callable_static_outputs[i], + per_callable_static_grad_outputs[i], + per_callable_static_grad_inputs[i], + ) + + if isinstance(func, torch.nn.Module): + + def make_graphed_forward(func, graph_training_state, graphed, orig_fwd): + def new_fwd(*user_args): + # If the module's training-or-eval state matches what we graphed, + # run the graph, otherwise run the original forward method + if func.training == graph_training_state: + return graphed(*user_args) + else: + return orig_fwd(*user_args) + + return new_fwd + + func.forward = make_graphed_forward(func, func.training, graphed, func.forward) # type: ignore[assignment] + ret.append(func) + else: + ret.append(graphed) + + if just_one_callable: + return ret[0] + + return tuple(ret) \ No newline at end of file diff --git a/torch_npu/npu/memory.py b/torch_npu/npu/memory.py index 9f35fc065f..0de7e4a578 100644 --- a/torch_npu/npu/memory.py +++ b/torch_npu/npu/memory.py @@ -36,13 +36,26 @@ __all__ = [ "memory_summary", "get_allocator_backend", "NPUPluggableAllocator", - "change_current_allocator" + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", ] if not hasattr(torch_npu._C, "_npu_NPUAllocator"): # Define dummy base classes torch_npu._C.__dict__["_npu_NPUAllocator"] = _dummy_type("_npu_NPUAllocator") +if not hasattr(torch_npu._C, "_MemPool"): + # Define dummy base classes + torch_npu._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch_npu._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch_npu._C.__dict__["_npu_beginAllocateToPool"] = _dummy_type( + "_npu_beginAllocateToPool" + ) + torch_npu._C.__dict__["_npu_endAllocateCurrentStreamToPool"] = _dummy_type( + "_npu_endAllocateCurrentStreamToPool" + ) @contextlib.contextmanager def _free_mutex(): @@ -631,6 +644,76 @@ def _get_current_allocator() -> _NPUAllocator: return _NPUAllocator(torch_npu._C._npu_getAllocator()) +class MemPool(torch_npu._C._MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the NPUCachingAllocator. + + Args: + allocator(torch_npu._C._npu_NPUAllocator, optional): a + torch_npu._C._npu_NPUAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the NPUCachingAllocator. + + """ + + def __init__(self, allocator: Optional[torch_npu._C._npu_NPUAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> Tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[torch_npu._C._npu_NPUAllocator]: + r"""Returns the allocator this MemPool routes allocations to""" + return super().allocator + + +class MemPoolContext(torch_npu._C._MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. 
On deletion it makes the previous pool active. + + Args: + pool(torch_npu.npu.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[torch_npu._C._MemPool]: + r"""Returns the active MemPool""" + return torch_npu._C._MemPoolContext.active_pool() + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device=None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch_npu.npu.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch_npu.npu.current_device, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch_npu.npu.current_device() if device is None else _get_device_index(device) + ) + torch_npu._C._npu_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + torch_npu._C._npu_endAllocateCurrentStreamToPool(device_index, pool.id) + del ctx + + def _record_memory_history(enabled="all", *args, **kwargs): """Enable recording of stack traces associated with memory allocations, so you can tell what allocated any piece of memory in -- Gitee From d6143db45b594fc72fe9c0df1f38347f1089f2b0 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 12 Mar 2025 02:48:19 +0000 Subject: [PATCH 133/358] !18819 Update torchair commit id Merge pull request !18819 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 65c3601a0e..7d6785c75a 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 65c3601a0e890b7913638f166668580673d0bc9a +Subproject commit 7d6785c75a31e0d7802d17e4119f3e8d519facfe -- Gitee From 97da056cbf3a6d76b7ec8479d729afdbd7802884 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 12 Mar 2025 02:48:19 +0000 Subject: [PATCH 134/358] !18819 Update torchair commit id Merge pull request !18819 from torchair_robot/v2.6.0 -- Gitee From a2d438dc188b4a74e6047757887ac334e4027360 Mon Sep 17 00:00:00 2001 From: will-devil Date: Wed, 12 Mar 2025 08:13:24 +0000 Subject: [PATCH 135/358] !18794 [Feature] fsdp2 testcase 3/N. 
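As a usage illustration for the MemPool/MemPoolContext/use_mem_pool additions above (a sketch only; it assumes the default caching allocator, i.e. MemPool(allocator=None), and that allocations are routed on the current device):

    import torch
    import torch_npu

    pool = torch_npu.npu.MemPool()              # backed by the default allocator
    with torch_npu.npu.use_mem_pool(pool):
        # Allocations made here are routed to `pool` on the current device.
        scratch = torch.empty(1 << 20, device="npu")
    # Allocations outside the context go back to the regular pool.
    other = torch.empty(1 << 20, device="npu")
    print(pool.id)                              # (int, int) id used by the allocator

Internally, use_mem_pool activates a MemPoolContext and brackets the block with _npu_beginAllocateToPool and _npu_endAllocateCurrentStreamToPool, the bindings added in Module.cpp above.
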
Merge pull request !18794 from will-devil/fsdp2-26-3 --- .../fsdp2/test_fully_shard_autograd.py | 324 +++++ .../fsdp2/test_fully_shard_comm.py | 1133 +++++++++++++++++ 2 files changed, 1457 insertions(+) create mode 100644 test/distributed/fsdp2/test_fully_shard_autograd.py create mode 100644 test/distributed/fsdp2/test_fully_shard_comm.py diff --git a/test/distributed/fsdp2/test_fully_shard_autograd.py b/test/distributed/fsdp2/test_fully_shard_autograd.py new file mode 100644 index 0000000000..97329617a7 --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_autograd.py @@ -0,0 +1,324 @@ +import collections +import copy +import functools +import itertools +import unittest +from typing import Any, List, Optional, Type, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed.fsdp import fully_shard +from torch.nn.parallel.scatter_gather import _is_namedtuple +from torch.testing._internal.common_fsdp import ( + check_sharded_parity, + DoubleLinear, + FSDPTestMultiThread, + MLP, +) +from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.distributed._tensor.common_dtensor import ( + ModelArgs, + Transformer, +) + +import torch_npu +from torch_npu.testing.common_utils import SupportedDevices +from torch_npu.testing._internal.common_fsdp import FSDPNPUTest + +torch.use_deterministic_algorithms(True) + + +class TestFullyShardAutograd(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(4, torch.npu.device_count()) + + def _reduce_1d_partial_grads( + self, module: nn.Module, group: Optional[dist.ProcessGroup] = None + ) -> None: + group = group or dist.distributed_c10d._get_default_group() + for param in module.parameters(): + if param.grad is not None: + param.grad.div_(group.size()) + + def test_unused_forward_output(self): + """ + Tests that gradients propagate when running a backward where some + forward output is not used to compute the loss. 
+ """ + self.run_subtests( + {"reshard_after_forward": [True, False, 2]}, + self._test_unused_forward_output, + ) + + def _test_unused_forward_output(self, reshard_after_forward: Union[bool, int]): + torch.manual_seed(42) + local_batch_size = 2 + global_batch_size, dim = (self.world_size * local_batch_size, 24) + model = DoubleLinear(dim=dim, use_second_linear=True) + ref_model = copy.deepcopy(model).npu() + fully_shard(model.lin1, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + + torch.manual_seed(1) # same on all ranks + for iter_idx in range(10): + # Use all forward outputs in the loss/backward for the first half + # of the iterations and only the 1st forward output for the rest + global_inp = torch.rand((global_batch_size, dim), device="npu") + local_inp = global_inp[self.rank * local_batch_size:(self.rank + 1) * local_batch_size].detach() + out1, out2 = model(local_inp) + loss = (out1 * out2).sum() if iter_idx < 3 else out1.sum() + loss.backward() + optim.step() + ref_out1, ref_out2 = ref_model(global_inp) + ref_loss = (ref_out1 * ref_out2).sum() if iter_idx < 3 else ref_out1.sum() + ref_loss.backward() + self._reduce_1d_partial_grads(ref_model) + ref_optim.step() + dist.all_reduce(loss) # partial -> replicated + self.assertEqual(loss, ref_loss) + optim.zero_grad(set_to_none=(iter_idx % 2)) + ref_optim.zero_grad(set_to_none=(iter_idx % 2)) + check_sharded_parity(self, ref_model, model) + + def test_unused_forward_module(self): + """ + Tests that gradients propagate when running a backward where some + forward module is not used to compute the loss. + """ + self.run_subtests( + {"reshard_after_forward": [True, False, 2]}, + self._test_unused_forward_module, + ) + + def _test_unused_forward_module(self, reshard_after_forward: Union[bool, int]): + torch.manual_seed(42) + local_batch_size, dim = (2, 24) + global_batch_size = self.world_size * local_batch_size + model = DoubleLinear(dim=dim, use_second_linear=False) + ref_model = copy.deepcopy(model).npu() + fully_shard(model.lin1, reshard_after_forward=reshard_after_forward) + fully_shard(model.lin2, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + + torch.manual_seed(1) # same on all ranks + for iter_idx in range(10): + global_inp = torch.rand((global_batch_size, dim), device="npu") + local_inp = global_inp[self.rank * local_batch_size:(self.rank + 1) * local_batch_size].detach() + losses: List[torch.Tensor] = [] + for _model, inp in ((ref_model, global_inp), (model, local_inp)): + losses.append(_model(inp).sum()) + losses[-1].backward() + self._reduce_1d_partial_grads(ref_model) + dist.all_reduce(losses[1]) # partial -> replicated + self.assertEqual(losses[0], losses[1]) + check_sharded_parity(self, ref_model, model) + for _optim in (optim, ref_optim): + _optim.step() + _optim.zero_grad(set_to_none=(iter_idx % 2)) + + def test_nontensor_activations(self): + """ + Tests that gradients propagate when running forward with nontensor + data structures wrapping the activations. This is mainly to test the + hook registration. 
+ """ + self.run_subtests( + {"container_type": [list, collections.namedtuple, tuple, dict]}, + self._test_nontensor_activations, + ) + + def _test_nontensor_activations(self, container_type: Type): + class Module(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.lin1 = nn.Linear(dim, dim) + self.lin2 = nn.Linear(dim, dim) + self.relu = nn.ReLU() + + def forward(self, inp: Any): + # Assume that the "0th" element of `inp` is a tensor, run some + # forward computation on it, and pack it back into the same + # data structure type as `inp` + if isinstance(inp, list): + return [self._forward(inp[0])] + elif _is_namedtuple(inp): + return type(inp)(*([self._forward(inp[0])] + list(inp[1:]))) + elif isinstance(inp, tuple): + return (self._forward(inp[0]),) + elif isinstance(inp, dict): + return {"x": self._forward(inp["x"])} + else: + raise NotImplementedError( + f"Unsupported input type {type(inp)}: {inp}" + ) + + def _forward(self, x: torch.Tensor) -> torch.Tensor: + return self.relu(self.lin2(self.relu(self.lin1(x)))) + + class ToContainerType(nn.Module): + def __init__(self, container_type: Type): + super().__init__() + self.container_type = container_type + + def forward(self, x: torch.Tensor): + if self.container_type is list: + return [x] + elif self.container_type is collections.namedtuple: + nt = collections.namedtuple("NT", "x y") + return nt(x, torch.ones_like(x)) + elif self.container_type is tuple: + return (x,) + elif self.container_type is dict: + return {"x": x} + else: + raise NotImplementedError( + f"Unsupported container type: {self.container_type}" + ) + + class FromContainerType(nn.Module): + def __init__(self, container_type: Type): + super().__init__() + self.container_type = container_type + + def forward(self, x: torch.Tensor): + if self.container_type in (list, collections.namedtuple, tuple): + return x[0] + elif self.container_type is dict: + return x["x"] + else: + raise NotImplementedError( + f"Unsupported container type: {self.container_type}" + ) + + torch.manual_seed(42) + local_batch_size, dim = (2, 24) + global_batch_size = self.world_size * local_batch_size + model = nn.Sequential( + ToContainerType(container_type), + Module(dim), + Module(dim), + Module(dim), + FromContainerType(container_type), + ) + ref_model = copy.deepcopy(model).npu() + for module in model: + fully_shard(module) + fully_shard(model) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + + torch.manual_seed(1) # same on all ranks + for iter_idx in range(10): + global_inp = torch.rand((global_batch_size, dim), device="npu") + local_inp = global_inp[self.rank * local_batch_size:(self.rank + 1) * local_batch_size].detach() + losses: List[torch.Tensor] = [] + for _model, inp in ((ref_model, global_inp), (model, local_inp)): + losses.append(_model(inp).sum()) + losses[-1].backward() + self._reduce_1d_partial_grads(ref_model) + dist.all_reduce(losses[1]) # partial -> replicated + self.assertEqual(losses[0], losses[1]) + check_sharded_parity(self, ref_model, model) + for _optim in (optim, ref_optim): + _optim.step() + _optim.zero_grad(set_to_none=(iter_idx % 2)) + + +class TestFullyShardPostAccGradHookMultiThread(FSDPTestMultiThread): + @property + def world_size(self) -> int: + return 2 + + def perThreadSetUp(self): + super().perThreadSetUp() + torch.npu.set_device(0) + + @SupportedDevices(['Ascend910B']) + def test_post_acc_grad_hook_runs(self): + param_name_to_hook_count = collections.defaultdict(int) + + 
def hook(param_name: str, param: torch.Tensor) -> None: + nonlocal param_name_to_hook_count + param_name_to_hook_count[param_name] += 1 + + model = MLP(8) + for module in (model.in_proj, model.out_proj, model): + fully_shard(module) + for param_name, param in model.named_parameters(): + param_hook = functools.partial(hook, param_name) + param.register_post_accumulate_grad_hook(param_hook) + + inp = torch.randn((2, 8), device="npu") + model(inp).sum().backward() + param_names = {param_name for param_name, _ in model.named_parameters()} + self.assertEqual(param_names, set(param_name_to_hook_count.keys())) + for param_name, count in param_name_to_hook_count.items(): + self.assertEqual(count, 1) + + +class TestFullyShardPostAccGradHookMultiProcess(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(torch.npu.device_count(), 2) + + @SupportedDevices(['Ascend910B']) + def test_post_acc_grad_hook_optim_parity(self): + """ + Tests parity of running the optimizer via the post-accumulate-grad + hook vs. normally. + """ + torch.manual_seed(42) + model_args = ModelArgs(dropout_p=0.0) + model = Transformer(model_args) + + ref_model = copy.deepcopy(model).npu() + for module in itertools.chain(ref_model.layers, [ref_model]): + fully_shard(module) + optim_kwargs = {"lr": 1e-2, "foreach": False} + ref_optim = torch.optim.AdamW(ref_model.parameters(), **optim_kwargs) + lr_scheduler_kwargs = {"step_size": 5} + ref_lr_scheduler = torch.optim.lr_scheduler.StepLR( + ref_optim, **lr_scheduler_kwargs + ) + + for module in itertools.chain(model.layers, [model]): + fully_shard(module) + param_to_optim = {} + param_to_lr_scheduler = {} + for param in model.parameters(): + param_to_optim[param] = torch.optim.AdamW([param], **optim_kwargs) + param_to_lr_scheduler[param] = torch.optim.lr_scheduler.StepLR( + param_to_optim[param], **lr_scheduler_kwargs + ) + + def optim_hook(param: nn.Parameter) -> None: + param_to_optim[param].step() + param_to_optim[param].zero_grad() + param_to_lr_scheduler[param].step() + + for param in model.parameters(): + param.register_post_accumulate_grad_hook(optim_hook) + + torch.manual_seed(42 + self.rank) + inp = torch.randint(0, model_args.vocab_size, (2, 16), device="npu") + for _ in range(10): + ref_loss = ref_model(inp).sum() + ref_loss.backward() + ref_optim.step() + ref_optim.zero_grad() + ref_lr_scheduler.step() + loss = model(inp).sum() + loss.backward() + self.assertTrue(torch.equal(ref_loss, loss)) + for ref_param, param in zip(ref_model.parameters(), model.parameters()): + self.assertTrue(torch.equal(ref_param, param)) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp2/test_fully_shard_comm.py b/test/distributed/fsdp2/test_fully_shard_comm.py new file mode 100644 index 0000000000..721ee5d0af --- /dev/null +++ b/test/distributed/fsdp2/test_fully_shard_comm.py @@ -0,0 +1,1133 @@ +import copy +import functools +import itertools +import unittest +from typing import Callable, List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from torch.distributed._composable import checkpoint, replicate +from torch.distributed.device_mesh import DeviceMesh, init_device_mesh +from torch.distributed.fsdp import ( + FSDPModule, + fully_shard, + MixedPrecisionPolicy, + OffloadPolicy, +) +from torch.distributed.fsdp._fully_shard._fsdp_collectives import ( + _div_if_needed, + _get_gradient_divide_factors, + foreach_all_gather, + foreach_all_gather_copy_out, + 
foreach_reduce, +) +from torch.distributed.fsdp._fully_shard._fsdp_common import FSDPMeshInfo, TrainingState +from torch.distributed.fsdp._fully_shard._fsdp_init import ( + _get_post_forward_mesh_info, + _init_default_fully_shard_mesh, +) +from torch.distributed.fsdp._fully_shard._fsdp_param import ShardedState +from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup +from torch.distributed.tensor import DTensor +from torch.distributed.tensor.debug import CommDebugMode +from torch.distributed.tensor.experimental import implicit_replication +from torch.testing._internal.common_fsdp import ( + check_sharded_parity, + DoubleLinear, + FSDPTest, + FSDPTestMultiThread, + MLP, + patch_post_backward, + patch_reshard, + patch_unshard, +) +from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.distributed._tensor.common_dtensor import ( + ModelArgs, + Transformer, + TransformerBlock, +) + +import torch_npu +from torch_npu.testing.common_utils import SupportedDevices +from torch_npu.testing._internal.common_fsdp import FSDPNPUTest + + +c10d_ops = torch.ops.c10d + +# For recording FSDP events like unshard or post-backward +EventType = Tuple[str, str, TrainingState] + + +class TestFullyShardCollectiveOps(FSDPTestMultiThread): + @property + def world_size(self) -> int: + return 128 + + def perThreadSetUp(self): + super().perThreadSetUp() + torch.npu.set_device(0) + + @property + def device(self) -> torch.device: + return torch.device("npu:0") + + def _get_param_sizes(self) -> List[torch.Size]: + # For world size 128, the fp32 all-gather and reduce-scatter testing + # requires ~0.22 GB + return [ + torch.Size([17, 257]), + torch.Size([17]), + torch.Size([64, 312]), + torch.Size([64]), + torch.Size([64, 64]), + torch.Size([512, 64]), + torch.Size([256]), + torch.Size([64, 297]), + ] + + def _init_params(self, param_sizes: List[torch.Size]) -> List[nn.Parameter]: + torch.manual_seed(42) + orig_params = [ + nn.Parameter(torch.randn(size, device=self.device)) for size in param_sizes + ] + # Since seed is per process, not per thread, we broadcast to ensure the + # same original parameters across ranks + for orig_param in orig_params: + dist.broadcast(orig_param, src=0) + return orig_params + + def _init_fsdp_param_group( + self, params: List[nn.Parameter], reshard_after_forward: Union[bool, int] + ): + module = nn.ParameterList([param.detach().clone() for param in params]) + mesh_info = FSDPMeshInfo(_init_default_fully_shard_mesh(), shard_mesh_dim=0) + post_forward_mesh_info = _get_post_forward_mesh_info( + reshard_after_forward, mesh_info + ) + fsdp_param_group = FSDPParamGroup( + list(module.parameters()), + (module,), + mesh_info, + post_forward_mesh_info, + self.device, + None, # shard_placement_fn + MixedPrecisionPolicy(), + OffloadPolicy(), + ) + fsdp_param_group.lazy_init() + return fsdp_param_group + + @SupportedDevices(['Ascend910B']) + def test_all_gather_fp32(self): + param_sizes = self._get_param_sizes() + default_stream = torch.npu.current_stream() + stream1, stream2 = torch.npu.Stream(), torch.npu.Stream() + for async_op, streams, reshard_after_forward in itertools.product( + (False, True), + ((default_stream, default_stream), (stream1, stream2)), + (True, 8), + ): + all_gather_copy_in_stream, all_gather_stream = streams + # Save test time by only testing reshard after forward as an int + # for non-async and non-default streams (like in pre-backward) + if type(reshard_after_forward) is int and ( + async_op or all_gather_stream is 
default_stream + ): + continue + self._test_all_gather( + param_sizes, + reshard_after_forward=reshard_after_forward, + async_op=async_op, + all_gather_copy_in_stream=all_gather_copy_in_stream, + all_gather_stream=all_gather_stream, + ) + + def _test_all_gather( + self, + param_sizes: List[torch.Size], + reshard_after_forward: Union[bool, int], + async_op: bool, + all_gather_copy_in_stream: torch.npu.Stream, + all_gather_stream: torch.npu.Stream, + ): + def all_gather(fsdp_param_group: FSDPParamGroup, group: dist.ProcessGroup): + all_gather_result = foreach_all_gather( + fsdp_param_group.fsdp_params, + group, + async_op=async_op, + all_gather_copy_in_stream=all_gather_copy_in_stream, + all_gather_stream=all_gather_stream, + device=self.device, + ) + foreach_all_gather_copy_out(all_gather_result, fsdp_params, group) + # Transition to unsharded state to register unsharded parameters + for fsdp_param in fsdp_param_group.fsdp_params: + fsdp_param.init_unsharded_param() + fsdp_param_group._to_unsharded() + + def check_all_gathered_params( + orig_params: List[nn.Parameter], module: nn.Module + ): + for orig_param, param in zip(orig_params, module.parameters()): + self.assertIsInstance(param, torch.Tensor) + self.assertIsInstance(param, nn.Parameter) + self.assertEqual(param, orig_param.to(param.dtype)) + + # Set up the reference parameters and construct the FSDP group + orig_params = self._init_params(param_sizes) + fsdp_param_group = self._init_fsdp_param_group( + orig_params, reshard_after_forward + ) + fsdp_params = fsdp_param_group.fsdp_params + module = fsdp_param_group.modules[0] + + # Sanity check that the parameter sharding is as expected + for orig_param, param in zip(orig_params, module.parameters()): + self.assertTrue(isinstance(param, DTensor)) + self.assertEqual(param.full_tensor(), orig_param) + + # Run the foreach all-gather (including copy-in and copy-out) + all_gather(fsdp_param_group, fsdp_param_group.mesh_info.shard_process_group) + + # Check all-gather correctness + check_all_gathered_params(orig_params, module) + + # For reshard after after forward as an int, further test emulating the + # pre-backward all-gather + if type(reshard_after_forward) is not int: + return + fsdp_param_group._to_sharded_post_forward() + all_gather( + fsdp_param_group, + fsdp_param_group.post_forward_mesh_info.shard_process_group, + ) + check_all_gathered_params(orig_params, module) + + @SupportedDevices(['Ascend910B']) + def test_reduce_scatter_fp32(self): + param_sizes = self._get_param_sizes() + default_stream = torch.npu.current_stream() + stream = torch.npu.Stream() + for reduce_scatter_stream in (default_stream, stream): + self._test_reduce_scatter( + param_sizes, + reduce_scatter_stream=reduce_scatter_stream, + reduce_scatter_dtype=torch.float32, + ) + + @SupportedDevices(['Ascend910B']) + def test_reduce_scatter_fp16(self): + param_sizes = self._get_param_sizes() + default_stream = torch.npu.current_stream() + stream = torch.npu.Stream() + for reduce_scatter_stream in (default_stream, stream): + self._test_reduce_scatter( + param_sizes, + reduce_scatter_stream=reduce_scatter_stream, + reduce_scatter_dtype=torch.float16, + ) + + def _test_reduce_scatter( + self, + param_sizes: List[torch.Size], + reduce_scatter_stream: torch.npu.Stream, + reduce_scatter_dtype: torch.dtype, + ): + # Set up the reference parameters and construct the FSDP group + orig_params = self._init_params(param_sizes) + fsdp_param_group = self._init_fsdp_param_group(orig_params, True) + fsdp_params = 
fsdp_param_group.fsdp_params + fsdp_param_group.comm_ctx.lazy_init(self.device) + + # Run one unshard to initialize metadata + fsdp_param_group.unshard() + fsdp_param_group.wait_for_unshard() + fsdp_param_group.reshard() + + # Run the foreach reduce-scatter (including copy-in and view-out) + torch.manual_seed(42) + unsharded_grads = [torch.ones_like(param) * self.rank for param in orig_params] + group = fsdp_param_group.mesh_info.shard_process_group + self.assertEqual(group.size(), self.world_size) + all_reduce_stream = torch.npu.Stream() + ( + reduce_scatter_input, + reduce_scatter_event, + post_reduce_event, + _, + _, + _, + ) = foreach_reduce( + fsdp_params, + unsharded_grads, + group, + reduce_scatter_stream, + orig_dtype=orig_params[0].dtype, + reduce_dtype=reduce_scatter_dtype, + device=self.device, + reduce_scatter_reduce_op=None, + all_reduce_group=None, + all_reduce_stream=all_reduce_stream, + all_reduce_grads=True, + partial_reduce_output=None, + ) + torch.npu.current_stream().wait_event(post_reduce_event) + + # Check reduce-scatter correctness + predivide_factor, postdivide_factor = _get_gradient_divide_factors( + group, None, reduce_scatter_dtype + ) + reduced_grads = [grad.detach().clone() for grad in unsharded_grads] + for grad in reduced_grads: + _div_if_needed(grad, predivide_factor) + dist.all_reduce( + grad, + group=group, + op=dist.ReduceOp.AVG if predivide_factor is None else dist.ReduceOp.SUM, + ) + _div_if_needed(grad, postdivide_factor) + for fsdp_param, reduced_grad in zip(fsdp_params, reduced_grads): + sharded_grad = fsdp_param.sharded_param.grad + self.assertIsInstance(sharded_grad, DTensor) + self.assertEqual(sharded_grad.full_tensor(), reduced_grad) + + +class TestFullyShardCommunication(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(4, torch.npu.device_count()) + + @SupportedDevices(['Ascend910B']) + def test_fully_shard_communication_count(self): + """ + Tests that FSDP issues the expected number of all-gathers and + reduce-scatters during forward and backward. 
+ """ + self.run_subtests( + {"reshard_after_forward": [True, False, 2]}, + self._test_communication_count, + ) + + def _test_communication_count( + self, + reshard_after_forward: Union[bool, int], + ): + torch.manual_seed(42) + model_args = ModelArgs() + model = Transformer(model_args) + fully_shard_fn = functools.partial( + fully_shard, reshard_after_forward=reshard_after_forward + ) + num_blocks = 0 + for module in model.modules(): + if isinstance(module, TransformerBlock): + fully_shard_fn(module) + num_blocks += 1 + fully_shard_fn(model) + # We construct `num_blocks` plus 1 FSDP states/communication groups + + torch.manual_seed(42 + self.rank) + inp = torch.randint(0, model_args.vocab_size, (2, 16), device="npu") + with CommDebugMode() as fwd_comm_mode: + loss = model(inp) + fwd_comm_counts = fwd_comm_mode.get_comm_counts() + self.assertEqual(len(fwd_comm_counts), 1) + self.assertEqual(fwd_comm_counts[c10d_ops._allgather_base_], num_blocks + 1) + with CommDebugMode() as bwd_comm_mode: + loss.sum().backward() + bwd_comm_counts = bwd_comm_mode.get_comm_counts() + if reshard_after_forward is False: + self.assertEqual(len(bwd_comm_counts), 1) + else: + # The root always does not reshard after forward + self.assertEqual(len(bwd_comm_counts), 2) + self.assertEqual(bwd_comm_counts[c10d_ops._allgather_base_], num_blocks) + self.assertEqual( + bwd_comm_counts[c10d_ops._reduce_scatter_base_], num_blocks + 1 + ) + + @SupportedDevices(['Ascend910B']) + def test_manual_reshard_with_reshard_after_forward_false(self): + """ + Tests that we can manually call ``reshard`` on FSDP modules that were + initialized with ``reshard_after_forward=False`` and still run unshard. + """ + torch.manual_seed(42) + model_args = ModelArgs() + model = Transformer(model_args) + for module in model.modules(): + if isinstance(module, TransformerBlock): + fully_shard(module, reshard_after_forward=False) + model = fully_shard(model, reshard_after_forward=False) + num_fsdp_modules = sum( + isinstance(module, FSDPModule) for module in model.modules() + ) + + torch.manual_seed(42 + self.rank) + inp = torch.randint(0, model_args.vocab_size, (2, 16), device="npu") + with CommDebugMode() as fwd_comm_mode: + loss = model(inp) + fwd_comm_counts = fwd_comm_mode.get_comm_counts() + self.assertEqual(len(fwd_comm_counts), 1) + self.assertEqual(fwd_comm_counts[c10d_ops._allgather_base_], num_fsdp_modules) + + for module in model.modules(): + if isinstance(module, FSDPModule): + module.reshard() + + with CommDebugMode() as bwd_comm_mode: + loss.sum().backward() + bwd_comm_counts = bwd_comm_mode.get_comm_counts() + self.assertEqual(len(bwd_comm_counts), 2) + self.assertEqual(bwd_comm_counts[c10d_ops._allgather_base_], num_fsdp_modules) + self.assertEqual( + bwd_comm_counts[c10d_ops._reduce_scatter_base_], num_fsdp_modules + ) + + +class TestFullyShardPrefetch(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(4, torch.npu.device_count()) + + @SupportedDevices(['Ascend910B']) + def test_fully_shard_backward_prefetch(self): + # Activation checkpointing should not affect the expected FSDP events + self.run_subtests( + { + "reshard_after_forward": [True, False, 2], + "checkpoint_impl": [None, "utils", "composable"], + }, + self._test_backward_prefetch_forward_backward, + ) + self.run_subtests( + { + "reshard_after_forward": [True, False, 2], + "checkpoint_impl": [None, "utils", "composable"], + }, + self._test_backward_prefetch_multi_forward, + ) + self._test_backward_prefetch_unused_in_backward(True) + + def 
_test_backward_prefetch_forward_backward( + self, reshard_after_forward: Union[bool, int], checkpoint_impl: Optional[str] + ): + n_layers = 3 + model, optim, inp = self._init_transformer( + n_layers, reshard_after_forward, checkpoint_impl + ) + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + # Check the order for normal 1 forward, 1 backward, 1 optimizer step + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + for iter_idx in range(3): + loss = model(inp) + expected_events = [ + ("unshard", "", TrainingState.FORWARD), # root + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + loss.sum().backward() + expected_events = [ + # Root does not reshard after forward so there is no + # unshard event for it in backward + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + # Explicit backward prefetching moves the unshards early + # by one module (note how swapping each unshard down one + # event would give the natural event order) + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + if reshard_after_forward is False: + # No reshard after forward means no backward unshards + expected_events = [e for e in expected_events if e[0] != "unshard"] + self.assertEqual(events, expected_events) + events.clear() + optim.step() + optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + + def _test_backward_prefetch_multi_forward( + self, reshard_after_forward: Union[bool, int], checkpoint_impl: Optional[str] + ): + n_layers = 3 + model, optim, inp = self._init_transformer( + n_layers, reshard_after_forward, checkpoint_impl + ) + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + # Check the order for multiple forwards before 1 backward + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + loss1 = model(inp) + loss2 = model(inp) + expected_events = [ + ("unshard", "", TrainingState.FORWARD), # root + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + # Root does not reshard after forward so there is not another + # unshard event for it + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ] + if reshard_after_forward is False: + # No reshard after forward means no second set of unshards + expected_events = expected_events[:-3] + self.assertEqual(events, expected_events) + events.clear() + (loss1 + loss2).sum().backward() + expected_events = [ + # Same as the single forward/backward case except the root's + # post-backward does not run until the end of backward in the + # final callback (since the input not 
requiring gradient means + # that we do not have a tensor on which to hook for + # post-backward) + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ] + if reshard_after_forward is False: + # No reshard after forward means no backward unshards + expected_events = [e for e in expected_events if e[0] != "unshard"] + # However, the post-backward reshards, so the second set of + # unshards will run as real ops + expected_events += [ + # Repeat the same pattern except with the root's post-backward + # at the end since the final callback runs + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + + def _test_backward_prefetch_unused_in_backward( + self, reshard_after_forward: Union[bool, int] + ): + """ + Test a model with a linear module then a split into two linear modules, + where we run backward through one path first before the other, meaning + that (1) only one linear of the two split is used per backward and (2) + the initial shared linear is used in both backwards. + """ + dim = 8 + model = nn.Sequential(nn.Linear(dim, dim), DoubleLinear(dim)) + fully_shard(model[0], reshard_after_forward=reshard_after_forward) + fully_shard(model[1].lin1, reshard_after_forward=reshard_after_forward) + fully_shard(model[1].lin2, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + inp = torch.randn((4, dim), device="npu") + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + loss1, loss2 = model(inp) + expected_events = [ + # Root has no parameters, so it does not have an unshard + ("unshard", "0", TrainingState.FORWARD), + ("unshard", "1.lin1", TrainingState.FORWARD), + ("unshard", "1.lin2", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + + model.set_is_last_backward(False) + loss2.sum().backward(retain_graph=True) + expected_events = [ + ("unshard", "1.lin2", TrainingState.PRE_BACKWARD), + # NOTE: This `1.lin1` unshard is a mistargeted prefetch. + ("unshard", "1.lin1", TrainingState.PRE_BACKWARD), + ("post_backward", "1.lin2", TrainingState.POST_BACKWARD), + ("unshard", "0", TrainingState.PRE_BACKWARD), + ("post_backward", "0", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + + model.set_is_last_backward(True) + loss1.sum().backward() + expected_events = [ + # NOTE: `1.lin1` is already unsharded from the mistargeted + # prefetch in the first backward. 
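+ # Only `1.lin1` (plus the shared `0`) runs post-backward in this second
+ # backward, because only the `loss1` path is used here; `1.lin2` already
+ # ran its post-backward during the first backward above.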
+ # Prefetch `0` + ("unshard", "0", TrainingState.PRE_BACKWARD), + ("post_backward", "1.lin1", TrainingState.POST_BACKWARD), + ("post_backward", "0", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + + @SupportedDevices(['Ascend910B']) + def test_set_modules_to_forward_prefetch(self): + n_layers = 4 + reshard_after_forward = True + checkpoint_impl = "utils" + model, _, inp = self._init_transformer( + n_layers, reshard_after_forward, checkpoint_impl + ) + + def set_forward_prefetch(model: Transformer, num_to_prefetch: int) -> None: + # Use model-specific knowledge to configure forward prefetching: + # each transformer block (layer) prefetches for the next few + for i, layer in enumerate(model.layers): + if i >= len(model.layers) - num_to_prefetch: + break + layers_to_prefetch = [ + model.layers[i + j] for j in range(1, num_to_prefetch + 1) + ] + layer.set_modules_to_forward_prefetch(layers_to_prefetch) + + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + reshard_with_record = self._get_reshard_with_record( + FSDPParamGroup.reshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + expected_backward_events = [ + # Default backward prefetching + ("unshard", "layers.3", TrainingState.PRE_BACKWARD), + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("reshard", "layers.3", TrainingState.POST_BACKWARD), + ("post_backward", "layers.3", TrainingState.POST_BACKWARD), + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("reshard", "layers.2", TrainingState.POST_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("reshard", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("reshard", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("reshard", "", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + with patch_unshard(unshard_with_record), patch_reshard( + reshard_with_record + ), patch_post_backward(post_backward_with_record): + set_forward_prefetch(model, num_to_prefetch=1) + loss = model(inp) + expected_forward_events = [ + ("unshard", "", TrainingState.FORWARD), + # `layers.i` prefetches `layers.i+1` + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("reshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ("reshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.3", TrainingState.FORWARD), + ("reshard", "layers.2", TrainingState.FORWARD), + ("reshard", "layers.3", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_forward_events) + events.clear() + loss.sum().backward() + self.assertEqual(events, expected_backward_events) + events.clear() + + set_forward_prefetch(model, num_to_prefetch=2) + loss = model(inp) + expected_forward_events = [ + ("unshard", "", TrainingState.FORWARD), + # `layers.i` prefetches `layers.i+1` and `layers.i+2` + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ("reshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.3", TrainingState.FORWARD), + ("reshard", "layers.1", TrainingState.FORWARD), + ("reshard", 
"layers.2", TrainingState.FORWARD), + ("reshard", "layers.3", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_forward_events) + events.clear() + loss.sum().backward() + self.assertEqual(events, expected_backward_events) + events.clear() + + @SupportedDevices(['Ascend910B']) + def test_set_modules_to_backward_prefetch(self): + n_layers = 4 + reshard_after_forward = True + checkpoint_impl = "utils" + model, _, inp = self._init_transformer( + n_layers, reshard_after_forward, checkpoint_impl + ) + + def set_backward_prefetch(model: Transformer, num_to_prefetch: int) -> None: + # Use model-specific knowledge to configure backward prefetching: + # each transformer block (layer) prefetches for the previous few + for i, layer in enumerate(model.layers): + if i < num_to_prefetch: + continue + layers_to_prefetch = [ + model.layers[i - j] for j in range(1, num_to_prefetch + 1) + ] + layer.set_modules_to_backward_prefetch(layers_to_prefetch) + + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + reshard_with_record = self._get_reshard_with_record( + FSDPParamGroup.reshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + expected_forward_events = [ + # Default forward prefetching + ("unshard", "", TrainingState.FORWARD), # root + ("unshard", "layers.0", TrainingState.FORWARD), + ("reshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1", TrainingState.FORWARD), + ("reshard", "layers.1", TrainingState.FORWARD), + ("unshard", "layers.2", TrainingState.FORWARD), + ("reshard", "layers.2", TrainingState.FORWARD), + ("unshard", "layers.3", TrainingState.FORWARD), + ("reshard", "layers.3", TrainingState.FORWARD), + ] + with patch_unshard(unshard_with_record), patch_reshard( + reshard_with_record + ), patch_post_backward(post_backward_with_record): + set_backward_prefetch(model, num_to_prefetch=1) + loss = model(inp) + self.assertEqual(events, expected_forward_events) + events.clear() + loss.sum().backward() + expected_backward_events = [ + # Root prefetches `layers.3` per default + ("unshard", "layers.3", TrainingState.PRE_BACKWARD), + # `layers.i` prefetches for `layers.i-1` (same as default) + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("reshard", "layers.3", TrainingState.POST_BACKWARD), + ("post_backward", "layers.3", TrainingState.POST_BACKWARD), + ("unshard", "layers.1", TrainingState.PRE_BACKWARD), + ("reshard", "layers.2", TrainingState.POST_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("reshard", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("reshard", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("reshard", "", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_backward_events) + events.clear() + + set_backward_prefetch(model, num_to_prefetch=2) + loss = model(inp) + self.assertEqual(events, expected_forward_events) + events.clear() + loss.sum().backward() + expected_backward_events = [ + # Root prefetches `layers.3` per default + ("unshard", "layers.3", TrainingState.PRE_BACKWARD), + # `layers.i` prefetches for `layers.i-1` and `layers.i-2` + ("unshard", "layers.2", TrainingState.PRE_BACKWARD), + ("unshard", "layers.1", 
TrainingState.PRE_BACKWARD), + ("reshard", "layers.3", TrainingState.POST_BACKWARD), + ("post_backward", "layers.3", TrainingState.POST_BACKWARD), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ("reshard", "layers.2", TrainingState.POST_BACKWARD), + ("post_backward", "layers.2", TrainingState.POST_BACKWARD), + ("reshard", "layers.1", TrainingState.POST_BACKWARD), + ("post_backward", "layers.1", TrainingState.POST_BACKWARD), + ("reshard", "layers.0", TrainingState.POST_BACKWARD), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ("reshard", "", TrainingState.POST_BACKWARD), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + self.assertEqual(events, expected_backward_events) + events.clear() + + @SupportedDevices(['Ascend910B']) + def test_fully_shard_multi_module_backward_prefetch(self): + n_layers = 5 + model_args = ModelArgs(n_layers=n_layers, checkpoint_activations=True) + model = Transformer(model_args) + for i in range(n_layers): + if i == 0: + fully_shard(model.layers[i]) + elif i % 2 == 1: + fully_shard([model.layers[i], model.layers[i + 1]]) + fully_shard([model.tok_embeddings, model.pos_embeddings]) + fully_shard([model.norm, model.output], reshard_after_forward=False) + fully_shard(model) + optim = torch.optim.AdamW(model.parameters(), lr=1e-2) + + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + inp = torch.randint( + 0, model_args.vocab_size, (2, model_args.max_seq_len), device="npu" + ) + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + for iter_idx in range(3): + loss = model(inp) + expected_events = [ + ( + "unshard", + "tok_embeddings, pos_embeddings", + TrainingState.FORWARD, + ), + ("unshard", "layers.0", TrainingState.FORWARD), + ("unshard", "layers.1, layers.2", TrainingState.FORWARD), + ("unshard", "layers.3, layers.4", TrainingState.FORWARD), + ("unshard", "norm, output", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + loss.sum().backward() + expected_events = [ + # (norm, output) does not reshard after forward, so there is + # no unshard to begin backward + ("unshard", "layers.3, layers.4", TrainingState.PRE_BACKWARD), + ("post_backward", "norm, output", TrainingState.POST_BACKWARD), + ("unshard", "layers.1, layers.2", TrainingState.PRE_BACKWARD), + ( + "post_backward", + "layers.3, layers.4", + TrainingState.POST_BACKWARD, + ), + ("unshard", "layers.0", TrainingState.PRE_BACKWARD), + ( + "post_backward", + "layers.1, layers.2", + TrainingState.POST_BACKWARD, + ), + ( + "unshard", + "tok_embeddings, pos_embeddings", + TrainingState.PRE_BACKWARD, + ), + ("post_backward", "layers.0", TrainingState.POST_BACKWARD), + ( + "post_backward", + "tok_embeddings, pos_embeddings", + TrainingState.POST_BACKWARD, + ), + ] + events.clear() + optim.step() + optim.zero_grad() + + @SupportedDevices(['Ascend910B']) + def test_fully_shard_multi_module_unused_module(self): + class ModuleWithUnusedLinear(nn.Module): + def __init__(self) -> None: + super().__init__() + self.unused_lin = nn.Linear(1, 1) + self.lin = nn.Linear(16, 16) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return nn.functional.relu(self.lin(x)) + + model = nn.Sequential( + ModuleWithUnusedLinear(), ModuleWithUnusedLinear(), nn.Linear(16, 16) + ) + fully_shard([model[0].unused_lin, model[0].lin], 
reshard_after_forward=True) + fully_shard([model[1].unused_lin, model[1].lin], reshard_after_forward=True) + fully_shard(model) + optim = torch.optim.AdamW(model.parameters(), lr=1e-2) + + events: List[EventType] = [] + unshard_with_record = self._get_unshard_with_record( + FSDPParamGroup.unshard, events + ) + post_backward_with_record = self._get_post_backward_with_record( + FSDPParamGroup.post_backward, events + ) + inp = torch.randn((2, 16), device="npu") + with patch_unshard(unshard_with_record), patch_post_backward( + post_backward_with_record + ): + for iter_idx in range(3): + loss = model(inp) + expected_events = [ + ("unshard", "", TrainingState.FORWARD), + ("unshard", "0.unused_lin, 0.lin", TrainingState.FORWARD), + ("unshard", "1.unused_lin, 1.lin", TrainingState.FORWARD), + ] + self.assertEqual(events, expected_events) + events.clear() + loss.sum().backward() + expected_events = [ + # Since both `model[0]` and `model[1]` have unused modules + # that never ran forward, they do not reshard after forward + # despite setting it to `True`. Check that there are no + # unshards in backward. + ( + "post_backward", + "1.unused_lin, 1.lin", + TrainingState.POST_BACKWARD, + ), + ( + "post_backward", + "0.unused_lin, 0.lin", + TrainingState.POST_BACKWARD, + ), + ("post_backward", "", TrainingState.POST_BACKWARD), + ] + events.clear() + optim.step() + optim.zero_grad() + + def test_backward_misprefetch(self): + torch.manual_seed(42) + model = MLP(dim=16, device="npu") + ref_model = copy.deepcopy(model) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + fully_shard(model.in_proj) + fully_shard(model.out_proj) + fully_shard(model) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + + # Backward should run through `out_proj` -> `in_proj`, so if `in_proj` + # prefetches for `out_proj`, then this is a misprefetch, as `out_proj` + # should not be needed anymore for backward. 
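+ # The parity check against `ref_model` in the loop below verifies that such a
+ # misprefetch only wastes communication (an early all-gather for `out_proj`)
+ # and does not change the computed loss.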
+ model.in_proj.set_modules_to_backward_prefetch([model.out_proj]) + + torch.manual_seed(self.rank + 1) + inp = torch.randn((2, 16), device="npu") + for _ in range(3): + ref_optim.zero_grad() + ref_loss = ref_model(inp).sum() + ref_loss.backward() + for param in ref_model.parameters(): + dist.all_reduce(param.grad, op=dist.ReduceOp.AVG) + ref_optim.step() + optim.zero_grad() + loss = model(inp).sum() + loss.backward() + optim.step() + self.assertEqual(ref_loss, loss) + + def _init_transformer( + self, + n_layers: int, + reshard_after_forward: Union[bool, int], + checkpoint_impl: Optional[str], + ): + model_args = ModelArgs( + n_layers=n_layers, checkpoint_activations=(checkpoint_impl == "utils") + ) + model = Transformer(model_args) + for module in model.modules(): + if isinstance(module, TransformerBlock): + if checkpoint_impl == "composable": + checkpoint(module) + fully_shard(module, reshard_after_forward=reshard_after_forward) + fully_shard(model, reshard_after_forward=reshard_after_forward) + optim = torch.optim.Adam(model.parameters(), lr=1e-2) + inp = torch.randint( + 0, model_args.vocab_size, (2, model_args.max_seq_len), device="npu" + ) + return model, optim, inp + + def _get_unshard_with_record( + self, orig_unshard: Callable, events: List[EventType] + ) -> Callable: + def unshard_with_record(self, *args, **kwargs): + nonlocal events + if ( + self._all_gather_result is None + and self._sharded_state != ShardedState.UNSHARDED + ): # skip no-ops + events.append(("unshard", self._module_fqn, self._training_state)) + return orig_unshard(self, *args, **kwargs) + + return unshard_with_record + + def _get_reshard_with_record( + self, orig_reshard: Callable, events: List[EventType] + ) -> Callable: + def reshard_with_record(self, *args, **kwargs): + nonlocal events + if ( + self._training_state == TrainingState.FORWARD + and not self._reshard_after_forward + ): # skip no-ops + return None + events.append(("reshard", self._module_fqn, self._training_state)) + return orig_reshard(self, *args, **kwargs) + + return reshard_with_record + + def _get_post_backward_with_record( + self, orig_post_backward: Callable, events: List[EventType] + ) -> Callable: + def post_backward_with_record(self, *args, **kwargs): + nonlocal events + ret = orig_post_backward(self, *args, **kwargs) + # Use training state after running post-backward to check that the + # state is transitioned to `POST_BACKWARD` as expected + events.append(("post_backward", self._module_fqn, self._training_state)) + return ret + + return post_backward_with_record + + +class TestFullyShardUnshardMultiProcess(FSDPNPUTest): + @property + def world_size(self) -> int: + return min(torch.npu.device_count(), 2) + + def test_unshard_async(self): + class ReduceModule(nn.Module): + def __init__(self, dim: int, mesh: DeviceMesh): + super().__init__() + self.mesh = mesh + self.weight = nn.Parameter(torch.randn(dim, dim)) + + def forward(self, x: torch.Tensor): + y = F.relu(x @ self.weight) + # NOTE: This all-reduce is not differentiable and is included + # to exercise the overlap. 
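+ # Overlap pattern under test: ReduceModel.forward launches this async
+ # all-reduce, then calls mlp1/mlp2/mlp3.unshard(async_op=True) so the FSDP
+ # all-gather for the matching MLP runs while the all-reduce is still in
+ # flight; MLPs.forward waits on the returned work handles before using y1-y3.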
+ work = dist.all_reduce(y, group=self.mesh.get_group(), async_op=True) + return y, work + + class MLPs(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.mlp1 = MLP(dim) + self.mlp2 = MLP(dim) + self.mlp3 = MLP(dim) + + def forward(self, ys: List[torch.Tensor], works: List[dist.Work]): + (y1, y2, y3), (work1, work2, work3) = ys, works + work1.wait() + z1 = self.mlp1(y1) + work2.wait() + z2 = self.mlp2(y2) + work3.wait() + z3 = self.mlp3(y3) + return z1 + z2 + z3 + + class ReduceModel(nn.Module): + def __init__(self, dim: int, mesh: DeviceMesh): + super().__init__() + self.reduce_module1 = ReduceModule(dim, mesh) + self.reduce_module2 = ReduceModule(dim, mesh) + self.reduce_module3 = ReduceModule(dim, mesh) + self.mlps = MLPs(dim) + + def forward(self, x: torch.Tensor): + y1, work1 = self.reduce_module1(x) + if isinstance(self.mlps.mlp1, FSDPModule): + self.mlps.mlp1.unshard(async_op=True) + y2, work2 = self.reduce_module2(x) + if isinstance(self.mlps.mlp2, FSDPModule): + self.mlps.mlp2.unshard(async_op=True) + y3, work3 = self.reduce_module3(x) + if isinstance(self.mlps.mlp3, FSDPModule): + self.mlps.mlp3.unshard(async_op=True) + return self.mlps([y1, y2, y3], [work1, work2, work3]) + + mesh = init_device_mesh("npu", (self.world_size,)) + batch_size, dim = 2, 8 + torch.manual_seed(42) + ref_model = replicate(ReduceModel(dim, mesh).npu()) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + torch.manual_seed(42) + model = ReduceModel(dim, mesh) + fully_shard(model.mlps.mlp1, reshard_after_forward=False) + fully_shard(model.mlps.mlp2, reshard_after_forward=False) + fully_shard(model.mlps.mlp3, reshard_after_forward=False) + fully_shard(model.mlps) + replicate(model.npu()) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True) + torch.manual_seed(42 + self.rank + 1) + inp = torch.randn((batch_size, dim), device="npu") + for _ in range(10): + losses: List[torch.Tensor] = [] + for _model, _optim in ((ref_model, ref_optim), (model, optim)): + losses.append(_model(inp).sum()) + losses[-1].backward() + with implicit_replication(): + _optim.step() + _optim.zero_grad() + self.assertEqual(losses[0], losses[1]) + + +class TestFullyShardUnshardMultiThread(FSDPTestMultiThread): + @property + def world_size(self) -> int: + return 2 + + def perThreadSetUp(self): + super().perThreadSetUp() + torch.npu.set_device(0) + + @SupportedDevices(['Ascend910B']) + def test_unshard_no_param_group(self): + # Check that we can call `unshard()` on a module with no parameter + # group / no managed parameters without erroring + model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4)) + for lin in model: + fully_shard(lin) + fully_shard(model) + handle = model.unshard(async_op=True) + handle.wait() + + @SupportedDevices(['Ascend910B']) + def test_unshard_without_lazy_init(self): + torch.manual_seed(42) + model = MLP(4) + for param in model.parameters(): + dist.broadcast(param, src=0) + ref_model = copy.deepcopy(model) + fully_shard(model) + model.unshard() # no lazy init yet + for ref_param, param in zip(ref_model.parameters(), model.parameters()): + self.assertEqual(ref_param, param) + + +if __name__ == "__main__": + run_tests() -- Gitee From 31e2fd60e1295b9b3ff2322dec6a4c4fda997dbd Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 12 Mar 2025 09:45:33 +0000 Subject: [PATCH 136/358] !18852 Update op_plugin commit id Merge pull request !18852 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/third_party/op-plugin b/third_party/op-plugin index 86b640511d..187f0023e1 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 86b640511dfc78798cff7690fef8deacc7cb048b +Subproject commit 187f0023e179e24b286f5ed00f3c5a39e99932a4 -- Gitee From b755609754c1c6eb666b77c5892cea780fadc3fe Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Wed, 12 Mar 2025 10:57:31 +0000 Subject: [PATCH 137/358] !18857 aclgraph change name Merge pull request !18857 from jiangpengfei/v2.6.0 --- third_party/acl/inc/acl/acl_mdl.h | 6 +-- torch_npu/csrc/core/npu/NPUGraph.cpp | 10 ++--- torch_npu/csrc/core/npu/NPUGraphsUtils.h | 2 +- .../csrc/core/npu/interface/AclInterface.cpp | 40 +++++++++---------- .../csrc/core/npu/interface/AclInterface.h | 6 +-- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/third_party/acl/inc/acl/acl_mdl.h b/third_party/acl/inc/acl/acl_mdl.h index 247ccc9f57..78dcabb8f1 100755 --- a/third_party/acl/inc/acl/acl_mdl.h +++ b/third_party/acl/inc/acl/acl_mdl.h @@ -1523,7 +1523,7 @@ ACL_FUNC_VISIBILITY const char *aclmdlGetTensorRealName(const aclmdlDesc *modelD * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlBeginCapture(aclrtStream stream, aclmdlCaptureMode mode); +ACL_FUNC_VISIBILITY aclError aclmdlCaptureBegin(aclrtStream stream, aclmdlCaptureMode mode); /** * @ingroup AscendCL @@ -1534,7 +1534,7 @@ ACL_FUNC_VISIBILITY aclError aclmdlBeginCapture(aclrtStream stream, aclmdlCaptur * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlGetCaptureInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); +ACL_FUNC_VISIBILITY aclError aclmdlCaptureGetInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); /** * @ingroup AscendCL @@ -1544,7 +1544,7 @@ ACL_FUNC_VISIBILITY aclError aclmdlGetCaptureInfo(aclrtStream stream, aclmdlCapt * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlEndCapture(aclrtStream stream, uint32_t *modelId); +ACL_FUNC_VISIBILITY aclError aclmdlCaptureEnd(aclrtStream stream, uint32_t *modelId); /** * @ingroup AscendCL diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index b53b0340ee..53eb51e998 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -20,7 +20,7 @@ constexpr int kSynchronizeBusyWaitMillis = 10; MempoolId_t graph_pool_handle() { // Sets just the second value, to distinguish it from MempoolId_ts created from - // aclmdlGetCaptureInfo id_s in capture_begin. + // aclmdlCaptureGetInfo id_s in capture_begin. 
auto new_pool = c10_npu::MemPool(); return new_pool.id(); } @@ -110,7 +110,7 @@ void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) c10_npu::NPUCachingAllocator::beginAllocateToPool(capture_dev_, mempool_id_, [this](aclrtStream stream) { aclmdlCaptureStatus status; uint32_t model_id; - NPU_CHECK_ERROR(c10_npu::acl::AclmdlGetCaptureInfo(stream, &status, &model_id)); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureGetInfo(stream, &status, &model_id)); return status == aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE && model_id == model_id_; }); @@ -125,10 +125,10 @@ void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) // cudaStreamCaptureModeGlobal is the most conservative option to // prevent potentially unsafe CUDA API calls during capture. - NPU_CHECK_ERROR(c10_npu::acl::AclmdlBeginCapture(capture_stream_, capture_mode)); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureBegin(capture_stream_, capture_mode)); aclmdlCaptureStatus status; - NPU_CHECK_ERROR(c10_npu::acl::AclmdlGetCaptureInfo(stream, &status, &model_id_)); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureGetInfo(stream, &status, &model_id_)); TORCH_INTERNAL_ASSERT(status == aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE); } @@ -140,7 +140,7 @@ void NPUGraph::capture_end() "Capture must end on the same stream it began on."); uint32_t model_id; - NPU_CHECK_ERROR(c10_npu::acl::AclmdlEndCapture(capture_stream_, &model_id)); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureEnd(capture_stream_, &model_id)); c10_npu::NPUCachingAllocator::endAllocateToPool(capture_dev_, mempool_id_); diff --git a/torch_npu/csrc/core/npu/NPUGraphsUtils.h b/torch_npu/csrc/core/npu/NPUGraphsUtils.h index 0d23b4b019..ec7cc6b82b 100644 --- a/torch_npu/csrc/core/npu/NPUGraphsUtils.h +++ b/torch_npu/csrc/core/npu/NPUGraphsUtils.h @@ -74,7 +74,7 @@ inline CaptureStatus currentStreamCaptureStatusMayInitCtx() aclmdlCaptureStatus is_capturing{ACL_MODEL_CAPTURE_STATUS_NONE}; uint32_t modelId; NPU_CHECK_ERROR( - c10_npu::acl::AclmdlGetCaptureInfo(c10_npu::getCurrentNPUStream(), &is_capturing, &modelId)); + c10_npu::acl::AclmdlCaptureGetInfo(c10_npu::getCurrentNPUStream(), &is_capturing, &modelId)); return CaptureStatus(is_capturing); } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 5235d048bb..dc6c7381bd 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -69,9 +69,9 @@ LOAD_FUNCTION(aclrtPeekAtLastError) LOAD_FUNCTION(aclrtSynchronizeDevice) LOAD_FUNCTION(aclrtSynchronizeDeviceWithTimeout) LOAD_FUNCTION(aclrtEventGetTimestamp) -LOAD_FUNCTION(aclmdlBeginCapture) -LOAD_FUNCTION(aclmdlGetCaptureInfo) -LOAD_FUNCTION(aclmdlEndCapture) +LOAD_FUNCTION(aclmdlCaptureBegin) +LOAD_FUNCTION(aclmdlCaptureGetInfo) +LOAD_FUNCTION(aclmdlCaptureEnd) LOAD_FUNCTION(aclmdlDebugPrint) LOAD_FUNCTION(aclmdlExecuteAsync) LOAD_FUNCTION(aclmdlUnload) @@ -708,39 +708,39 @@ aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp) return func(event, timestamp); } -aclError AclmdlBeginCapture(aclrtStream stream, aclmdlCaptureMode mode) +aclError AclmdlCaptureBegin(aclrtStream stream, aclmdlCaptureMode mode) { - typedef aclError (*AclmdlBeginCapture)(aclrtStream, aclmdlCaptureMode); - static AclmdlBeginCapture func = nullptr; + typedef aclError (*AclmdlCaptureBegin)(aclrtStream, aclmdlCaptureMode); + static AclmdlCaptureBegin func = nullptr; if (func == nullptr) { - func = 
(AclmdlBeginCapture) GET_FUNC(aclmdlBeginCapture); + func = (AclmdlCaptureBegin) GET_FUNC(aclmdlCaptureBegin); } - TORCH_CHECK(func, "Failed to find function aclmdlBeginCapture", PTA_ERROR(ErrCode::NOT_FOUND)); + TORCH_CHECK(func, "Failed to find function aclmdlCaptureBegin", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream, mode); } -aclError AclmdlGetCaptureInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId) +aclError AclmdlCaptureGetInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId) { - typedef aclError (*AclmdlGetCaptureInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); - static AclmdlGetCaptureInfo func = nullptr; + typedef aclError (*AclmdlCaptureGetInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); + static AclmdlCaptureGetInfo func = nullptr; if (func == nullptr) { - func = (AclmdlGetCaptureInfo) GET_FUNC(aclmdlGetCaptureInfo); + func = (AclmdlCaptureGetInfo) GET_FUNC(aclmdlCaptureGetInfo); } - TORCH_CHECK(func, "Failed to find function aclmdlGetCaptureInfo", PTA_ERROR(ErrCode::NOT_FOUND)); + TORCH_CHECK(func, "Failed to find function aclmdlCaptureGetInfo", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream, status, modelId); } -aclError AclmdlEndCapture(aclrtStream stream, uint32_t *modelId) +aclError AclmdlCaptureEnd(aclrtStream stream, uint32_t *modelId) { - typedef aclError (*AclmdlEndCapture)(aclrtStream, uint32_t *); - static AclmdlEndCapture func = nullptr; + typedef aclError (*AclmdlCaptureEnd)(aclrtStream, uint32_t *); + static AclmdlCaptureEnd func = nullptr; if (func == nullptr) { - func = (AclmdlEndCapture) GET_FUNC(aclmdlEndCapture); + func = (AclmdlCaptureEnd) GET_FUNC(aclmdlCaptureEnd); } - TORCH_CHECK(func, "Failed to find function aclmdlEndCapture", PTA_ERROR(ErrCode::NOT_FOUND)); + TORCH_CHECK(func, "Failed to find function aclmdlCaptureEnd", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream, modelId); } @@ -792,8 +792,8 @@ bool IsCaptureSupported() (GetSocVersion() >= SocVersion::Ascend910_9391); if (default_support_capture && !have_load_func) { have_load_func = true; - typedef aclError (*AclmdlGetCaptureInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); - static AclmdlGetCaptureInfo func = (AclmdlGetCaptureInfo) GET_FUNC(aclmdlGetCaptureInfo); + typedef aclError (*AclmdlCaptureGetInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); + static AclmdlCaptureGetInfo func = (AclmdlCaptureGetInfo) GET_FUNC(aclmdlCaptureGetInfo); is_support = (func != nullptr); } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 596281f649..96a080eb0c 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -182,11 +182,11 @@ aclError AclrtSynchronizeDeviceWithTimeout(void); aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp); -aclError AclmdlBeginCapture(aclrtStream stream, aclmdlCaptureMode mode); +aclError AclmdlCaptureBegin(aclrtStream stream, aclmdlCaptureMode mode); -aclError AclmdlGetCaptureInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); +aclError AclmdlCaptureGetInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); -aclError AclmdlEndCapture(aclrtStream stream, uint32_t *modelId); +aclError AclmdlCaptureEnd(aclrtStream stream, uint32_t *modelId); aclError AclmdlDebugPrint(uint32_t modelId); -- Gitee From a76b6004e04f9b2fe6740f20f40ea64a3a0f8e18 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Wed, 12 Mar 2025 13:15:49 +0000 
Subject: [PATCH 138/358] !18861 check if in capture mode Merge pull request !18861 from jiangpengfei/v2.6.0 --- torch_npu/csrc/core/npu/NPUGraph.cpp | 4 ++++ torch_npu/csrc/core/npu/NPUGraphsUtils.h | 2 ++ torch_npu/csrc/framework/OpCommand.cpp | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index 53eb51e998..be5f367cdd 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -127,6 +127,8 @@ void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) // prevent potentially unsafe CUDA API calls during capture. NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureBegin(capture_stream_, capture_mode)); + c10_npu::is_stream_capturing.store(true); + aclmdlCaptureStatus status; NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureGetInfo(stream, &status, &model_id_)); TORCH_INTERNAL_ASSERT(status == aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE); @@ -142,6 +144,8 @@ void NPUGraph::capture_end() uint32_t model_id; NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureEnd(capture_stream_, &model_id)); + c10_npu::is_stream_capturing.store(false); + c10_npu::NPUCachingAllocator::endAllocateToPool(capture_dev_, mempool_id_); TORCH_CHECK(model_id == model_id_, "Invalid end capture model id: ", model_id); diff --git a/torch_npu/csrc/core/npu/NPUGraphsUtils.h b/torch_npu/csrc/core/npu/NPUGraphsUtils.h index ec7cc6b82b..395f27a049 100644 --- a/torch_npu/csrc/core/npu/NPUGraphsUtils.h +++ b/torch_npu/csrc/core/npu/NPUGraphsUtils.h @@ -10,6 +10,8 @@ namespace c10_npu { +static std::atomic<bool> is_stream_capturing(false); + using CaptureId_t = unsigned long long; // first is set if the instance is created by NPUGraph::capture_begin. diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index f8551f178b..ecaadcb804 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -133,7 +133,7 @@ OpCommand& OpCommand::Output( void OpCommand::Run() { // Check for npu graph - if (aclCmd->CheckCustomHandlerNull()) { + if (c10_npu::is_stream_capturing.load() && aclCmd->CheckCustomHandlerNull()) { c10_npu::assertNotCapturing("Cannot run aclop operators"); } -- Gitee From 7c9f5c435402845027a03120880a45c84a6ff502 Mon Sep 17 00:00:00 2001 From: zhangqiongwen Date: Thu, 13 Mar 2025 01:10:26 +0000 Subject: [PATCH 139/358] !18807 fsdp:_unsafe_set_version_counter patched Merge pull request !18807 from zhangqiongwen/v2.6.0_fsdp_fix --- torch_npu/csrc/core/npu/NPURecovery.cpp | 25 +++ torch_npu/distributed/fsdp/_add_fsdp_patch.py | 188 +++++++++++++++++- .../distributed/fsdp/_fsdp_collectives.py | 12 ++ 3 files changed, 223 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPURecovery.cpp b/torch_npu/csrc/core/npu/NPURecovery.cpp index c2f6b2680f..c1697c35ed 100644 --- a/torch_npu/csrc/core/npu/NPURecovery.cpp +++ b/torch_npu/csrc/core/npu/NPURecovery.cpp @@ -4,6 +4,8 @@ #include #endif +#include + #include "torch_npu/csrc/core/npu/DeviceUtils.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUException.h" @@ -107,6 +109,25 @@ void check_and_update_npu_tensor_for_copy(const at::TensorList& dsts, const at:: return; } +void _unsafe_set_npu_version_counter(const std::vector<at::Tensor>& tensors, const std::vector<int64_t>& versions) +{ + auto tensors_len = tensors.size(); + auto versions_len = versions.size(); + TORCH_CHECK( + tensors_len == versions_len, + "tensors_len is not equal to 
versions_len", + "tensors_len=", + tensors_len, + ", versions_len=", + versions_len, + PTA_ERROR(ErrCode::PARAM)); + for (const auto i : c10::irange(tensors_len)) { + auto vc = torch::autograd::impl::version_counter(tensors[i]); + vc.set_version(versions[i]); + } + return; +} + #ifndef BUILD_LIBTORCH void bind_npu_recovery_functions(PyObject* module) { @@ -129,6 +150,10 @@ void bind_npu_recovery_functions(PyObject* module) m.def("_recovery_all_npu_stream", [](int device) -> void { return c10_npu::recovery_all_npu_streams(device); }); + m.def("_unsafe_set_npu_version_counter", + [](const std::vector& tensors, const std::vector& versions) -> void { + return _unsafe_set_npu_version_counter(tensors, versions); + }); } #endif diff --git a/torch_npu/distributed/fsdp/_add_fsdp_patch.py b/torch_npu/distributed/fsdp/_add_fsdp_patch.py index cc3d5fcfb9..7405620686 100644 --- a/torch_npu/distributed/fsdp/_add_fsdp_patch.py +++ b/torch_npu/distributed/fsdp/_add_fsdp_patch.py @@ -1,8 +1,18 @@ +from typing import Tuple, Union, cast + import torch from torch import distributed as dist -from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup - +from torch._dynamo import tensor_version_op +from torch._prims import _make_prim, RETURN_TYPE +from torch.autograd.grad_mode import _unsafe_preserve_version_counter +from torch.distributed.device_mesh import _get_device_handle +from torch.distributed.fsdp._fully_shard._fsdp_collectives import AllGatherResult +from torch.distributed.fsdp._fully_shard._fsdp_common import compiled_autograd_enabled, TrainingState +from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam, ShardedState +from torch.distributed.fsdp._fully_shard._fsdp_param_group import FSDPParamGroup, AllGatherState +from torch.profiler import record_function import torch_npu +from torch_npu.utils._error_code import ErrCode, pta_error def _patched_finalize_backward(self): @@ -23,5 +33,179 @@ def _patched_finalize_backward(self): self._post_forward_indices.clear() +class _patched_unsafe_preserve_version_counter(_unsafe_preserve_version_counter): + r"""DO NOT USE THIS UNLESS YOU KNOW EXACTLY WHAT YOU'RE DOING. + + This context manager can lead to arbitrary silent-correctness issues in any other part of your code + (even the ones not touched directly by the context manager)! + + Ordinarily, autograd will track mutations to tensors by incrementing it's `._version` attribute. + This is generally important for correctness, as for example, mutating a tensor that autograd has saved + for the backwards pass can result in incorrect gradients, and autograd uses the version counter to detect + and error out in this situation. + + However, there are rare instances where it might be useful to hide mutations from autograd. For example: + if a tensor is very large, and you'd like to free its memory by storing it elsewhere, and re-populate + the tensor right before it is needed by autograd. + + Args: + tensor (torch.Tensor): the tensor in question, that you would like to preserve the version counter of. + + .. note:: + This API does not apply to :ref:`forward-mode AD `. + + """ + + def __init__(self, tensors: Union[torch.Tensor, Tuple[torch.Tensor, ...]]) -> None: + self.tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tensors + if not isinstance(self.tensors, tuple): + raise TypeError("Input must be tuple tensors." 
+ pta_error(ErrCode.TYPE)) + self.prev_versions = tuple(t._version for t in self.tensors) + + def __exit__(self, *args) -> None: + torch_npu._C._unsafe_set_npu_version_counter(self.tensors, self.prev_versions) + + +_patched_unsafe_set_npu_version_counter = _make_prim( + schema="_unsafe_set_npu_version_counter(Tensor[] tensors, SymInt[] versions) -> ()", + return_type=RETURN_TYPE.NEW, + meta=lambda self, version: None, + impl_aten=torch_npu._C._unsafe_set_npu_version_counter, + doc="Tracable+SymInt version of torch_npu._C._unsafe_set_npu_version_counter", +) + + +def _patched_unsafe_set_version_counter_functional(ctx, tensors, version): + torch_npu._C._unsafe_set_npu_version_counter(tensors, version) + + +@torch.no_grad() +def foreach_all_gather_copy_out_npu( + all_gather_result: AllGatherResult, + fsdp_params: list[FSDPParam], + group: dist.ProcessGroup, +) -> None: + ( + all_gather_output, + all_gather_event, + all_gather_work, + param_all_gather_input_dtypes, + param_all_gather_input_numels, + all_gather_input_split_sizes, + ) = all_gather_result + _dtype, device = all_gather_output.dtype, all_gather_output.device + device_handle = _get_device_handle(device.type) + if all_gather_event is not None: # sync op + device_handle.current_stream().wait_event(all_gather_event) + if isinstance(all_gather_work, dist.distributed_c10d.Work): # async op + all_gather_work.wait() + world_size, device = group.size(), all_gather_output.device + split_with_sizes_out: list[torch.Tensor] = [] + shard_i_copy_infos: list[tuple[FSDPParam, list[torch.Tensor]]] = [] + for all_gather_input_numels, all_gather_input_dtypes, fsdp_param in zip( + param_all_gather_input_numels, param_all_gather_input_dtypes, fsdp_params + ): + # NOTE: Under compile, make sure we always recreate all_gather_outputs + # per AllGather. See [Note: Invariants for torch.compile Traceable FSDP2]. 
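+ # In eager mode force_recreate stays False, so the existing
+ # all_gather_outputs are reused and only alloc_all_gather_outputs()
+ # is called below.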
+ force_recreate = compiled_autograd_enabled() + fsdp_param.init_all_gather_outputs( + all_gather_input_numels, + all_gather_input_dtypes, + world_size, + device, + force_recreate=force_recreate, + ) + if not force_recreate: + fsdp_param.alloc_all_gather_outputs() + param_all_gather_outputs = fsdp_param.all_gather_outputs + if fsdp_param.fsdp_placement.dim != 0: + # Copy to a temporary and then chunk-cat into the final all-gather + # output tensors + param_all_gather_outputs = [ + torch.empty_like(t) for t in param_all_gather_outputs + ] + shard_i_copy_infos.append((fsdp_param, param_all_gather_outputs)) + split_with_sizes_out.extend(param_all_gather_outputs) + all_gather_output = all_gather_output.view(world_size, -1) + if all_gather_output.dtype == torch.uint8: + out = [t.view(world_size, -1).view(torch.uint8) for t in split_with_sizes_out] + else: + out = [t.view(world_size, -1) for t in split_with_sizes_out] + with torch.autograd._unsafe_preserve_version_counter(tuple(out)): + torch.ops.fsdp.split_with_sizes_copy( + all_gather_output, all_gather_input_split_sizes, dim=1, out=out + ) + + for fsdp_param, param_all_gather_outputs in shard_i_copy_infos: + # Chunk-cat from the temporary to the final all-gather output tensors + shard_dim = fsdp_param.fsdp_placement.dim + + with torch.autograd._unsafe_preserve_version_counter( + tuple(fsdp_param.all_gather_outputs) + ): + for param_all_gather_output, target_all_gather_output in zip( + param_all_gather_outputs, fsdp_param.all_gather_outputs + ): + padded_sharded_size = ( + fsdp_param.padded_sharded_param_size + if fsdp_param.sharded_state == ShardedState.SHARDED + else cast( + torch.Tensor, fsdp_param._sharded_post_forward_param_data + ).size() + ) + pre_param_size = list(padded_sharded_size) + pre_param_size[0] *= world_size + chunks = torch.chunk( + param_all_gather_output.view(pre_param_size), world_size, dim=0 + ) + post_param_size = list(padded_sharded_size) + post_param_size[shard_dim] *= world_size + cat_out = target_all_gather_output.view(post_param_size) + torch.cat(chunks, dim=shard_dim, out=cat_out) + + +def patched_wait_for_unshard(self): + """ + 1. In forward with implict prefetching, to overlap the current copy-out + with the next all-gather, we save a reference to the current all-gather + result to free after the next copy-out. + 2. Otherwise (explicit prefetching or in backward), we free the + all-gather result immediately after the current copy-out since we can + already overlap the current copy-out with the previous reduce-scatter. 
+ """ + if not self._all_gather_result: + return # no preceding unshard + async_op = self._all_gather_result.all_gather_work is not None + if self._training_state == TrainingState.FORWARD: # implicit prefetch + if prev_all_gather_state := self.comm_ctx.all_gather_state: + self._wait_all_gather_streams_on_event(prev_all_gather_state.event) + self.comm_ctx.all_gather_state = None # free the all-gather result + with record_function(self._with_fqn("FSDP::all_gather_copy_out")): + foreach_all_gather_copy_out_npu( + self._all_gather_result, + self.fsdp_params, + self._all_gather_process_group, + ) + for fsdp_param in self.fsdp_params: + fsdp_param.init_unsharded_param() + self._to_unsharded() + all_gather_copy_out_event = self.device_handle.Event() + all_gather_copy_out_event.record() + if not async_op and self._training_state == TrainingState.FORWARD: + # Defer free to allow for overlap of this copy-out with next + # all-gather collective + self.comm_ctx.all_gather_state = AllGatherState( + self._all_gather_result, all_gather_copy_out_event + ) + else: + self._wait_all_gather_streams_on_event(all_gather_copy_out_event) + self._all_gather_result = None # free unless saved in `all_gather_state` + + def _apply_fsdp_patch(): FSDPParamGroup.finalize_backward = _patched_finalize_backward + FSDPParamGroup.wait_for_unshard = patched_wait_for_unshard + tensor_version_op._unsafe_set_version_counter_functional = _patched_unsafe_set_version_counter_functional + tensor_version_op._unsafe_set_version_counter = _patched_unsafe_set_npu_version_counter + _unsafe_preserve_version_counter.__init__ = _patched_unsafe_preserve_version_counter.__init__ + _unsafe_preserve_version_counter.__exit__ = _patched_unsafe_preserve_version_counter.__exit__ diff --git a/torch_npu/distributed/fsdp/_fsdp_collectives.py b/torch_npu/distributed/fsdp/_fsdp_collectives.py index a1c203ffe0..1e7a886095 100644 --- a/torch_npu/distributed/fsdp/_fsdp_collectives.py +++ b/torch_npu/distributed/fsdp/_fsdp_collectives.py @@ -38,3 +38,15 @@ def all_gather_copy_in_npu( with torch.no_grad(): torch._foreach_copy_(foreach_copy_dsts, all_gather_inputs) return all_gather_input, all_gather_output + + +@torch.library.impl(lib, "split_with_sizes_copy", "PrivateUse1") +def split_with_sizes_copy( + all_gather_output: torch.Tensor, + all_gather_input_split_sizes: List[int], + dim: int, + out: List[torch.Tensor], +) -> None: + torch.split_with_sizes_copy( + all_gather_output, all_gather_input_split_sizes, dim=dim, out=out + ) -- Gitee From f15395aff30e02bfcbc9b241b5a936a17ec9e921 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Thu, 13 Mar 2025 02:03:55 +0000 Subject: [PATCH 140/358] !18869 !18773 Fix Cpu Affinity Merge pull request !18869 from jiangpengfei/v2.6.0 --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 42d80b0792..843f6606f6 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -191,8 +191,6 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) GetAffinityInfo(); - SetThreadAffinity(device_id_); - init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); -- Gitee From f16e1d1cd1c37e0d56f5d08b52482601058775ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 13 Mar 2025 02:15:20 +0000 Subject: [PATCH 141/358] =?UTF-8?q?!18841=20Add=20distributed=20security?= 
=?UTF-8?q?=20note=20Merge=20pull=20request=20!18841=20from=20=E7=8E=8B?= =?UTF-8?q?=E8=B6=85/v2.6.0=5Fsec?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- SECURITYNOTE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/SECURITYNOTE.md b/SECURITYNOTE.md index ccc955b2b8..55ce2863b0 100644 --- a/SECURITYNOTE.md +++ b/SECURITYNOTE.md @@ -62,6 +62,7 @@ torch_npu支持源码编译安装,在编译时会下载依赖第三方库并 1. 建议用户结合运行环境资源状况编写对应训练脚本。若训练脚本与资源状况不匹配,如数据集加载内存大小超出内存容量限制、训练脚本在本地生成数据超过磁盘空间大小等情况,可能引发错误并导致进程意外退出。 2. PyTorch和torch_npu在运行异常时会退出进程并打印报错信息,属于正常现象。建议用户根据报错提示定位具体错误原因,包括设定算子同步执行、查看CANN日志、解析生成的Core Dump文件等方式。 +3. PyTorch和torch_npu的分布式特性仅适用于内部通信。出于性能考虑,这些分布式特性不包含任何授权协议,并且会发送未加密的消息。关于PyTorch分布式特性的详细说明及安全注意事项,可参考[using-distributed-features](https://github.com/pytorch/pytorch/security#using-distributed-features)。 ## 公网地址声明 -- Gitee From 56d8e363860a3b95542f0d5fedc44614118c33f3 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 13 Mar 2025 03:00:32 +0000 Subject: [PATCH 142/358] !18880 Update op_plugin commit id Merge pull request !18880 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 187f0023e1..077f2113e5 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 187f0023e179e24b286f5ed00f3c5a39e99932a4 +Subproject commit 077f2113e5056f24996efcd6cd1cdfd3db547f05 -- Gitee From 254cca3a187cdac5fbd98307cf517aee746ad966 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 13 Mar 2025 03:00:32 +0000 Subject: [PATCH 143/358] !18880 Update op_plugin commit id Merge pull request !18880 from pta-robot/v2.6.0 -- Gitee From 3638240c0f45567accaa41f5b8981a5e50ed955d Mon Sep 17 00:00:00 2001 From: wangzixuan <617225691@qq.com> Date: Thu, 13 Mar 2025 03:18:55 +0000 Subject: [PATCH 144/358] !18620 export db Merge pull request !18620 from wangzixuan/dev-2.6.0 --- .../analysis/prof_common_func/_constant.py | 3 +- .../analysis/prof_common_func/_db_manager.py | 52 +++++++ .../analysis/prof_config/_parser_config.py | 28 +--- .../prof_config/_parser_deps_config.py | 25 +--- .../prof_db_parse/_basic_db_parser.py | 100 +++++++++++++ .../prof_db_parse/_communication_db_parser.py | 51 +++---- .../prof_view/prof_db_parse/_db_parser.py | 134 +++++++----------- .../prof_db_parse/_fwk_api_db_parser.py | 71 +++++----- .../prof_db_parse/_gc_record_db_parser.py | 21 +-- .../prof_db_parse/_memory_db_parser.py | 38 ++--- .../prof_db_parse/_step_info_db_parser.py | 56 ++++---- .../_trace_step_time_db_parser.py | 59 +++----- 12 files changed, 334 insertions(+), 304 deletions(-) create mode 100644 torch_npu/profiler/analysis/prof_view/prof_db_parse/_basic_db_parser.py diff --git a/torch_npu/profiler/analysis/prof_common_func/_constant.py b/torch_npu/profiler/analysis/prof_common_func/_constant.py index 3fa82e7ede..1ce2c1fbbd 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_constant.py +++ b/torch_npu/profiler/analysis/prof_common_func/_constant.py @@ -188,7 +188,8 @@ class Constant(object): MEMORY_PREPARE = "memory_prepare" MEMORY_TIMELINE_PARSER = "memory_timeline" - DB_PARSER = "cann_db" + DB_PARSER = "torch_db" + BASIC_DB_PARSER = "basic_db" FWK_API_DB_PARSER = "fwk_api_db" MEMORY_DB_PARSER = "memory_db" STEP_INFO_DB_PARSER = "step_info_db" diff --git a/torch_npu/profiler/analysis/prof_common_func/_db_manager.py b/torch_npu/profiler/analysis/prof_common_func/_db_manager.py index acc1d89eea..593a44dd78 100644 --- 
a/torch_npu/profiler/analysis/prof_common_func/_db_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_db_manager.py @@ -3,6 +3,7 @@ import sqlite3 from ._constant import Constant, print_warn_msg, print_error_msg from ._file_manager import FileManager +from ._singleton import Singleton __all__ = [] @@ -167,3 +168,54 @@ class DbManager: print_error_msg("SQLite Error: %s" % " ".join(err.args)) return [] return res + + +class BasicDb: + def __init__(self) -> None: + self.db_path = None + self.conn = None + self.curs = None + + def init(self, db_path: str) -> None: + if self.db_path is None: + self.db_path = db_path + + def create_connect_db(self) -> bool: + if self.conn and self.curs: + return True + self.conn, self.curs = DbManager.create_connect_db(self.db_path) + return True if (self.conn and self.curs) else False + + def get_db_path(self) -> str: + return self.db_path + + def close(self) -> None: + self.db_path = None + DbManager.destroy_db_connect(self.conn, self.curs) + + def judge_table_exist(self, table_name: str) -> bool: + return DbManager.judge_table_exist(self.curs, table_name) + + def create_table_with_headers(self, table_name: str, headers: list) -> None: + DbManager.create_table_with_headers(self.conn, self.curs, table_name, headers) + + def insert_data_into_table(self, table_name: str, data: list) -> None: + DbManager.insert_data_into_table(self.conn, table_name, data) + + def fetch_all_data(self, sql: str) -> list: + return DbManager.fetch_all_data(self.curs, sql) + + def fetch_one_data(self, sql: str) -> list: + return DbManager.fetch_one_data(self.curs, sql) + + +@Singleton +class TorchDb(BasicDb): + def __init__(self) -> None: + super().__init__() + + +@Singleton +class AnalysisDb(BasicDb): + def __init__(self) -> None: + super().__init__() diff --git a/torch_npu/profiler/analysis/prof_config/_parser_config.py b/torch_npu/profiler/analysis/prof_config/_parser_config.py index f5f5cddfa1..986ff3bb30 100644 --- a/torch_npu/profiler/analysis/prof_config/_parser_config.py +++ b/torch_npu/profiler/analysis/prof_config/_parser_config.py @@ -28,13 +28,7 @@ from ..prof_view._memory_view_parser import MemoryViewParser from ..prof_view._integrate_parser import IntegrateParser from ..prof_view._communication_parser import CommunicationParser from ..prof_view._memory_timeline_parser import MemoryTimelineParser -from ..prof_view.prof_db_parse._fwk_api_db_parser import FwkApiDbParser -from ..prof_view.prof_db_parse._memory_db_parser import MemoryDbParser from ..prof_view.prof_db_parse._db_parser import DbParser -from ..prof_view.prof_db_parse._step_info_db_parser import StepInfoDbParser -from ..prof_view.prof_db_parse._communication_db_parser import CommunicationDbParser -from ..prof_view.prof_db_parse._trace_step_time_db_parser import TraceStepTimeDbParser -from ..prof_view.prof_db_parse._gc_record_db_parser import GCRecordDbParser __all__ = [] @@ -59,14 +53,11 @@ class ParserConfig: Constant.Db: { Constant.TENSORBOARD_TRACE_HANDLER: [ CANNExportParser, - DbParser, CANNTimelineParser, CANNAnalyzeParser, - FwkApiDbParser, TreeBuildParser, MemoryPrepareParser, - MemoryDbParser, - GCRecordDbParser + DbParser ] } } @@ -97,17 +88,11 @@ class ParserConfig: Constant.Db: { Constant.TENSORBOARD_TRACE_HANDLER: [ CANNExportParser, - DbParser, CANNTimelineParser, CANNAnalyzeParser, - FwkApiDbParser, TreeBuildParser, MemoryPrepareParser, - MemoryDbParser, - StepInfoDbParser, - CommunicationDbParser, - TraceStepTimeDbParser, - GCRecordDbParser + DbParser ] } } @@ -120,8 +105,7 @@ class 
ParserConfig: Constant.EXPORT_MEMORY_TIMELINE: [MemoryTimelineParser] }, Constant.Db: { - Constant.TENSORBOARD_TRACE_HANDLER: [CANNExportParser, DbParser, FwkApiDbParser, MemoryDbParser, - GCRecordDbParser] + Constant.TENSORBOARD_TRACE_HANDLER: [CANNExportParser, DbParser] } } @@ -146,10 +130,4 @@ class ParserConfig: # db parser DbParser: Constant.DB_PARSER, - FwkApiDbParser: Constant.FWK_API_DB_PARSER, - MemoryDbParser: Constant.MEMORY_DB_PARSER, - StepInfoDbParser: Constant.STEP_INFO_DB_PARSER, - CommunicationDbParser: Constant.COMMUNICATION_DB_PARSER, - TraceStepTimeDbParser: Constant.TRACE_STEP_TIME_DB_PARSER, - GCRecordDbParser: Constant.GC_RECORD_DB_PARSER } diff --git a/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py b/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py index 5353d4c877..51cd44ba02 100644 --- a/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py +++ b/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py @@ -54,20 +54,8 @@ class ParserDepsConfig: Constant.MEMORY_PREPARE: {Constant.MODE: ConcurrentMode.PTHREAD, Constant.DEPS: [Constant.TREE_BUILD_PARSER]}, Constant.DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.CANN_EXPORT_PARSER]}, - Constant.FWK_API_DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.DB_PARSER]}, - Constant.MEMORY_DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.DB_PARSER, Constant.MEMORY_PREPARE, Constant.FWK_API_DB_PARSER]}, - Constant.STEP_INFO_DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.DB_PARSER, Constant.TREE_BUILD_PARSER]}, - Constant.COMMUNICATION_DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.DB_PARSER, Constant.CANN_ANALYZE_PARSER, - Constant.STEP_INFO_DB_PARSER]}, - Constant.TRACE_STEP_TIME_DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.DB_PARSER, Constant.STEP_INFO_DB_PARSER]}, - Constant.GC_RECORD_DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.DB_PARSER]}, + Constant.DEPS: [Constant.CANN_EXPORT_PARSER, Constant.MEMORY_PREPARE, + Constant.TREE_BUILD_PARSER]}, Constant.MEMORY_TIMELINE_PARSER: {} } @@ -77,13 +65,6 @@ class ParserDepsConfig: Constant.MEMORY_VIEW_PARSER: {Constant.MODE: ConcurrentMode.SUB_PROCESS, Constant.DEPS: []}, Constant.STACK_VIEW_PARSER: {Constant.MODE: ConcurrentMode.SUB_PROCESS, Constant.DEPS: []}, Constant.CANN_EXPORT_PARSER: {Constant.MODE: ConcurrentMode.SUB_PROCESS, Constant.DEPS: []}, - Constant.DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.CANN_EXPORT_PARSER]}, - Constant.FWK_API_DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.DB_PARSER]}, - Constant.MEMORY_DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.DB_PARSER, Constant.FWK_API_DB_PARSER]}, - Constant.GC_RECORD_DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, - Constant.DEPS: [Constant.DB_PARSER]}, + Constant.DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, Constant.DEPS: [Constant.CANN_EXPORT_PARSER]}, Constant.MEMORY_TIMELINE_PARSER: {} } diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_basic_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_basic_db_parser.py new file mode 100644 index 0000000000..940fa6417d --- /dev/null +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_basic_db_parser.py @@ -0,0 +1,100 @@ +import os +import re +import shutil +import json + 
+from ...prof_common_func._log import ProfilerLogger +from ...prof_common_func._utils import collect_env_vars +from ...prof_common_func._path_manager import ProfilerPathManager +from ...prof_common_func._file_manager import FileManager +from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager +from ...prof_common_func._db_manager import TorchDb +from ...prof_common_func._host_info import get_host_info +from .._base_parser import BaseParser +from ..._profiler_config import ProfilerConfig + +__all__ = [] + + +class BasicDbParser(BaseParser): + def __init__(self, name: str, param_dict: dict): + super().__init__(name, param_dict) + self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) + ProfilerLogger.init(self._profiler_path, "BasicDbParser") + self.logger = ProfilerLogger.get_instance() + + def run(self, deps_data: dict): + try: + cann_db_path = self.get_cann_db_path() + if cann_db_path: + shutil.move(cann_db_path, TorchDb().get_db_path()) + self.create_ascend_db() + self.save_rank_info_to_db() + self.save_host_info_to_db() + self.save_env_vars_info_to_db() + self.save_profiler_metadata_to_db() + except Exception as error: + self.logger.error("Failed to generate basic db file. Error: %s", str(error), exc_info=True) + return Constant.FAIL, "" + return Constant.SUCCESS, "" + + def get_cann_db_path(self): + if not self._cann_path: + return "" + db_patten = '^msprof_\d+\.db$' + for cann_file in os.listdir(self._cann_path): + file_path = os.path.join(self._cann_path, cann_file) + if re.match(db_patten, cann_file): + try: + FileManager.check_db_file_vaild(file_path) + except RuntimeError: + self.logger.warning("Invalid cann db file. file name is: %s", cann_file) + continue + return file_path + return "" + + def create_ascend_db(self): + if not TorchDb().create_connect_db(): + raise RuntimeError(f"Failed to connect to db file: {TorchDb().get_db_path()}") + + def save_rank_info_to_db(self): + if ProfilerConfig().rank_id == -1: + return + TorchDb().create_table_with_headers(DbConstant.TABLE_RANK_DEVICE_MAP, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_RANK_DEVICE_MAP)) + TorchDb().insert_data_into_table(DbConstant.TABLE_RANK_DEVICE_MAP, + [[ProfilerConfig().rank_id, ProfilerPathManager.get_device_id(self._cann_path)]]) + + def save_host_info_to_db(self): + if TorchDb().judge_table_exist(DbConstant.TABLE_HOST_INFO): + return + host_info = get_host_info() + TorchDb().create_table_with_headers(DbConstant.TABLE_HOST_INFO, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_HOST_INFO)) + TorchDb().insert_data_into_table(DbConstant.TABLE_HOST_INFO, + [[host_info.get('host_uid'), host_info.get('host_name')]]) + + def save_env_vars_info_to_db(self): + env_vars_dict = collect_env_vars() + TorchDb().create_table_with_headers(DbConstant.TABLE_META_DATA, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_META_DATA)) + TorchDb().insert_data_into_table(DbConstant.TABLE_META_DATA, + [['ENV_VARIABLES', json.dumps(env_vars_dict.get('ENV_VARIABLES'))]]) + + def save_profiler_metadata_to_db(self): + profiler_metadata_path = os.path.join(self._profiler_path, Constant.PROFILER_META_DATA) + if not os.path.exists(profiler_metadata_path): + self.logger.warning("Can not find profiler_metadata.json, path is: %s", profiler_metadata_path) + return + profiler_metadata = FileManager.file_read_all(profiler_metadata_path) + try: + profiler_metadata = json.loads(profiler_metadata) + except json.JSONDecodeError as e: + self.logger.warning("profiler_metadata.json parse 
failed, error is: %s", str(e)) + return + data = [ + [str(key), json.dumps(value)] for key, value in profiler_metadata.items() + ] + TorchDb().create_table_with_headers(DbConstant.TABLE_META_DATA, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_META_DATA)) + TorchDb().insert_data_into_table(DbConstant.TABLE_META_DATA, data) diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py index 29bf8259a3..d51533c477 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from enum import Enum from ...prof_parse._cann_file_parser import CANNDataEnum, CANNFileParser from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager +from ...prof_common_func._db_manager import AnalysisDb from ...prof_common_func._constant import convert_us2ns from ...prof_common_func._db_manager import DbManager from .._communication_parser import CommunicationParser @@ -73,8 +73,6 @@ class CommunicationDbParser(CommunicationParser): super().__init__(name, param_dict) self.cann_comm_db_conn = None self.cann_comm_db_curs = None - self.analysis_db_conn = None - self.analysis_db_curs = None ProfilerLogger.init(self._profiler_path, "CommunicationDbParser") self.logger = ProfilerLogger.get_instance() @@ -82,10 +80,9 @@ class CommunicationDbParser(CommunicationParser): try: self._init_step_list(deps_data) self.generate_view() - except Exception as e: - self.logger.error("Failed to generate communication table, error: %s", str(e), exc_info=True) + except Exception as error: + self.logger.error("Failed to generate communication table, error: %s", str(error), exc_info=True) DbManager.destroy_db_connect(self.cann_comm_db_conn, self.cann_comm_db_curs) - DbManager.destroy_db_connect(self.analysis_db_conn, self.analysis_db_curs) return Constant.FAIL, None return Constant.SUCCESS, None @@ -110,6 +107,7 @@ class CommunicationDbParser(CommunicationParser): self.save_communication_db_data(band_width_data, matrix_data, time_data, output_path) def get_communication_db_data(self, db_path: str): + # Process the data from the original analysis.db band_width_data, matrix_data, time_data = [], [], [] conn, curs = DbManager.create_connect_db(db_path) if not (conn and curs): @@ -119,18 +117,18 @@ class CommunicationDbParser(CommunicationParser): self.cann_comm_db_curs = curs if DbManager.judge_table_exist(curs, DbConstant.TABLE_ANALYZER_BANDWIDTH): sql = "select hccl_op_name, group_name, transport_type, transit_size, transit_time, " \ - "bandwidth, large_packet_ratio, package_size, count, total_duration" \ - " from {};".format(DbConstant.TABLE_ANALYZER_BANDWIDTH) + "bandwidth, large_packet_ratio, package_size, count, total_duration " \ + "from {};".format(DbConstant.TABLE_ANALYZER_BANDWIDTH) band_width_data = DbManager.fetch_all_data(curs, sql) if DbManager.judge_table_exist(curs, DbConstant.TABLE_ANALYZER_MATRIX): sql = "select hccl_op_name, group_name, src_rank, dst_rank, "\ - "transport_type, transit_size, transit_time, bandwidth" \ - " from {};".format(DbConstant.TABLE_ANALYZER_MATRIX) + "transport_type, transit_size, transit_time, bandwidth " \ + "from {};".format(DbConstant.TABLE_ANALYZER_MATRIX) matrix_data = DbManager.fetch_all_data(curs, sql) if
DbManager.judge_table_exist(curs, DbConstant.TABLE_ANALYZER_TIME): sql = "select hccl_op_name, group_name, start_timestamp, elapse_time, "\ - "transit_time, wait_time, synchronization_time, idle_time" \ - " from {};".format(DbConstant.TABLE_ANALYZER_TIME) + "transit_time, wait_time, synchronization_time, idle_time " \ + "from {};".format(DbConstant.TABLE_ANALYZER_TIME) time_data = DbManager.fetch_all_data(curs, sql) DbManager.destroy_db_connect(conn, curs) return band_width_data, matrix_data, time_data @@ -218,21 +216,16 @@ class CommunicationDbParser(CommunicationParser): reformat_data += extract_data_from_dict(step, self.COLLECTIVE, collective_dict) return reformat_data - def save_communication_db_data(self, band_width_data, matrix_data, time_data, output_path): - db_path = os.path.join(output_path, DbConstant.DB_ANALYSIS) - conn, curs = DbManager.create_connect_db(db_path) - if not (conn and curs): - self.logger.warning("Failed to connect to db file: %s", db_path) + def save_communication_db_data(self, band_width_data, matrix_data, time_data): + if not AnalysisDb().create_connect_db(): + self.logger.warning("Failed to connect to db file: %s", AnalysisDb().get_db_path()) return - self.analysis_db_conn = conn - self.analysis_db_curs = curs - DbManager.create_table_with_headers(conn, curs, DbConstant.TABLE_ANALYZER_BANDWIDTH, - TableColumnsManager.TableColumns.get(DbConstant.TABLE_ANALYZER_BANDWIDTH)) - DbManager.insert_data_into_table(conn, DbConstant.TABLE_ANALYZER_BANDWIDTH, band_width_data) - DbManager.create_table_with_headers(conn, curs, DbConstant.TABLE_ANALYZER_MATRIX, - TableColumnsManager.TableColumns.get(DbConstant.TABLE_ANALYZER_MATRIX)) - DbManager.insert_data_into_table(conn, DbConstant.TABLE_ANALYZER_MATRIX, matrix_data) - DbManager.create_table_with_headers(conn, curs, DbConstant.TABLE_ANALYZER_TIME, - TableColumnsManager.TableColumns.get(DbConstant.TABLE_ANALYZER_TIME)) - DbManager.insert_data_into_table(conn, DbConstant.TABLE_ANALYZER_TIME, time_data) - DbManager.destroy_db_connect(conn, curs) + AnalysisDb().create_table_with_headers(DbConstant.TABLE_ANALYZER_BANDWIDTH, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_ANALYZER_BANDWIDTH)) + AnalysisDb().insert_data_into_table(DbConstant.TABLE_ANALYZER_BANDWIDTH, band_width_data) + AnalysisDb().create_table_with_headers(DbConstant.TABLE_ANALYZER_MATRIX, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_ANALYZER_MATRIX)) + AnalysisDb().insert_data_into_table(DbConstant.TABLE_ANALYZER_MATRIX, matrix_data) + AnalysisDb().create_table_with_headers(DbConstant.TABLE_ANALYZER_TIME, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_ANALYZER_TIME)) + AnalysisDb().insert_data_into_table(DbConstant.TABLE_ANALYZER_TIME, time_data) diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py index d4e63b210b..a2c36511a1 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py @@ -1,101 +1,63 @@ import os -import re -import shutil -import json -from ...prof_common_func._utils import collect_env_vars -from ...prof_common_func._path_manager import ProfilerPathManager -from ...prof_common_func._file_manager import FileManager -from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager, print_warn_msg -from ...prof_common_func._db_manager import DbManager -from ...prof_common_func._host_info import get_host_info -from 
...prof_common_func._log import ProfilerLogger + +from ._basic_db_parser import BasicDbParser +from ._communication_db_parser import CommunicationDbParser +from ._fwk_api_db_parser import FwkApiDbParser +from ._gc_record_db_parser import GCRecordDbParser +from ._memory_db_parser import MemoryDbParser +from ._step_info_db_parser import StepInfoDbParser +from ._trace_step_time_db_parser import TraceStepTimeDbParser +from ...prof_common_func._constant import Constant, DbConstant, print_error_msg +from ...prof_common_func._db_manager import TorchDb, AnalysisDb from .._base_parser import BaseParser from ..._profiler_config import ProfilerConfig +from ...prof_common_func._log import ProfilerLogger +from ...prof_common_func._path_manager import ProfilerPathManager __all__ = [] class DbParser(BaseParser): + PYTORCH_DB_MAP = { + Constant.BASIC_DB_PARSER: BasicDbParser, + Constant.FWK_API_DB_PARSER: FwkApiDbParser, + Constant.MEMORY_DB_PARSER: MemoryDbParser, + Constant.GC_RECORD_DB_PARSER: GCRecordDbParser, + } + + ANALYSIS_DB_MAP = { + Constant.STEP_INFO_DB_PARSER: StepInfoDbParser, + Constant.COMMUNICATION_DB_PARSER: CommunicationDbParser, + Constant.TRACE_STEP_TIME_DB_PARSER: TraceStepTimeDbParser, + } + def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - self._cann_path = ProfilerPathManager.get_cann_path(self._profiler_path) - self._ascend_db_path = os.path.join(self._output_path, DbConstant.DB_ASCEND_PYTORCH_PROFILER) - self._conn = None - self._cur = None ProfilerLogger.init(self._profiler_path, "DbParser") self.logger = ProfilerLogger.get_instance() - def run(self, depth_data: dict): + def run(self, deps_data: dict): + ProfilerConfig().load_info(self._profiler_path) + torch_db_path = DbConstant.DB_ASCEND_PYTORCH_PROFILER + if ProfilerConfig().rank_id != -1: + torch_db_path = f"ascend_pytorch_profiler_{ProfilerConfig().rank_id}.db" + TorchDb().init(os.path.join(self._output_path, torch_db_path)) + AnalysisDb().init(os.path.join(self._output_path, DbConstant.DB_ANALYSIS)) + + parser_db_map = self.PYTORCH_DB_MAP + if ProfilerPathManager.get_cann_path(self._profiler_path) and ProfilerConfig().get_level() != "Level_none": + parser_db_map = {**self.PYTORCH_DB_MAP, **self.ANALYSIS_DB_MAP} try: - ProfilerConfig().load_info(self._profiler_path) - if ProfilerConfig().rank_id != -1: - self._ascend_db_path = os.path.join(self._output_path, f"ascend_pytorch_profiler_{ProfilerConfig().rank_id}.db") - cann_db_path = self.get_cann_db_path() - if cann_db_path: - shutil.move(cann_db_path, self._ascend_db_path) - self.create_ascend_db() - self.save_rank_info_to_db() - self.save_host_info_to_db() - self.save_env_vars_info_to_db() - self.save_profiler_metadata_to_db() - DbManager.destroy_db_connect(self._conn, self._cur) - except RuntimeError as e: - self.logger.error("Failed to generate ascend_pytorch_profiler db file, error: %s", str(e), exc_info=True) - DbManager.destroy_db_connect(self._conn, self._cur) + for name, parser in parser_db_map.items(): + parser_obj = parser(name, self._param_dict) + parser_status, parser_data = parser_obj.run(deps_data) + deps_data.setdefault(name, parser_data) + if parser_status != Constant.SUCCESS: + print_error_msg(f"DB parser failed, parser is {name}") + except Exception as error: + self.logger.error("Failed to generate ascend_pytorch_profiler db file, error: %s", str(error), exc_info=True) return Constant.FAIL, "" - return Constant.SUCCESS, self._ascend_db_path - - def get_cann_db_path(self): - if not self._cann_path: - return "" - 
db_patten = '^msprof_\d+\.db$' - for cann_file in os.listdir(self._cann_path): - file_path = os.path.join(self._cann_path, cann_file) - if re.match(db_patten, cann_file): - try: - FileManager.check_db_file_vaild(file_path) - except RuntimeError: - print_warn_msg("Invalid cann db file.") - continue - return file_path - return "" - - def create_ascend_db(self): - self._conn, self._cur = DbManager.create_connect_db(self._ascend_db_path) - if not (self._conn and self._cur): - raise RuntimeError(f"Failed to connect to db file: {self._ascend_db_path}") - - def save_rank_info_to_db(self): - if ProfilerConfig().rank_id == -1: - return - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_RANK_DEVICE_MAP, TableColumnsManager.TableColumns.get(DbConstant.TABLE_RANK_DEVICE_MAP)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_RANK_DEVICE_MAP, [[ProfilerConfig().rank_id, ProfilerPathManager.get_device_id(self._cann_path)]]) - - def save_host_info_to_db(self): - if DbManager.judge_table_exist(self._cur, DbConstant.TABLE_HOST_INFO): - return - _host_info = get_host_info() - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_HOST_INFO, TableColumnsManager.TableColumns.get(DbConstant.TABLE_HOST_INFO)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_HOST_INFO, [[_host_info.get('host_uid'), _host_info.get('host_name')]]) - - def save_env_vars_info_to_db(self): - env_vars_dict = collect_env_vars() - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_META_DATA, TableColumnsManager.TableColumns.get(DbConstant.TABLE_META_DATA)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_META_DATA, - [['ENV_VARIABLES', json.dumps(env_vars_dict.get('ENV_VARIABLES'))]]) - - def save_profiler_metadata_to_db(self): - profiler_metadata_path = os.path.join(self._profiler_path, Constant.PROFILER_META_DATA) - if not os.path.exists(profiler_metadata_path): - return - profiler_metadata = FileManager.file_read_all(profiler_metadata_path) - try: - profiler_metadata = json.loads(profiler_metadata) - except json.JSONDecodeError as e: - self.logger.warning("profiler_metadata.json parse failed. 
%s", str(e)) - return - data = [ - [str(key), json.dumps(value)] for key, value in profiler_metadata.items() - ] - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_META_DATA, TableColumnsManager.TableColumns.get(DbConstant.TABLE_META_DATA)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_META_DATA, data) + finally: + TorchDb().close() + AnalysisDb().close() + return Constant.SUCCESS, "" diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py index 6572f4a472..c9a7d5aff1 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py @@ -1,8 +1,9 @@ from enum import Enum -from ...prof_common_func._db_manager import DbManager +from ...prof_common_func._db_manager import TorchDb from ...prof_common_func._id_manager import Str2IdManager, ConnectionIdManager, CallChainIdManager from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager from .._base_parser import BaseParser +from ...prof_common_func._log import ProfilerLogger from ...prof_parse._fwk_file_parser import FwkFileParser __all__ = [] @@ -54,24 +55,21 @@ class FwkApiDbParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - self._conn = None - self._cur = None - self._db_path = "" self._max_cann_connection_id = 0 self._fwk_apis = [] - + ProfilerLogger.init(self._profiler_path, "FwkApiDbParser") + self.logger = ProfilerLogger.get_instance() + def run(self, deps_data: dict): try: - self._db_path = deps_data.get(Constant.DB_PARSER, "") self.init_db_connect() self.set_start_string_id() self.get_max_cann_id() fwk_api_data = FwkFileParser(self._profiler_path).get_fwk_api() self.get_api_data_for_db(fwk_api_data) self.save_api_data_to_db() - except Exception as e: - logging.error("Failed to generate framework api table, error: %s", str(e), exc_info=True) - DbManager.destroy_db_connect(self._conn, self._cur) + except Exception as error: + self.logger.error("Failed to generate framework api table, error: %s", str(error), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None @@ -138,8 +136,9 @@ class FwkApiDbParser(BaseParser): ApiType.MSTX_OP.value]) def get_mstx_mark_op_connection_ids_with_cann_api(self, task_enqueues: list, task_dequeues: list, mstx_mark_apis: list): - sql = "select startNs, endNs, globalTid, connectionId from {} order by startNs".format(DbConstant.TABLE_MSTX_EVENTS) - cann_tx_apis = DbManager.fetch_all_data(self._cur, sql) + sql = "select startNs, endNs, globalTid, connectionId from {} order by startNs".format( + DbConstant.TABLE_MSTX_EVENTS) + cann_tx_apis = TorchDb().fetch_all_data(sql) if not cann_tx_apis: raise RuntimeWarning("Failed to get msprof_tx apis") mstx_mark_apis.sort(key=lambda x: x[TorchOpDataOri.START_NS.value]) @@ -150,14 +149,16 @@ class FwkApiDbParser(BaseParser): def get_torch_op_connection_ids_with_cann_api(self, task_enqueues: list, task_dequeues: list, torch_op_apis: list): sql = "select id from {} where value = 'launch'".format(DbConstant.TABLE_STRING_IDS) - node_launch_str_ids = DbManager.fetch_one_data(self._cur, sql) + node_launch_str_ids = TorchDb().fetch_one_data(sql) node_launch_str_id = 0 if node_launch_str_ids and node_launch_str_ids[0]: node_launch_str_id = node_launch_str_ids[0] else: raise RuntimeWarning("Failed to find node launch str id") - sql = 
"select startNs, endNs, globalTid, connectionId from {} where name = {} and type = 10000 order by startNs".format(DbConstant.TABLE_CANN_API, node_launch_str_id) # 10000 : node level - node_lauch_apis = DbManager.fetch_all_data(self._cur, sql) + sql = "select startNs, endNs, globalTid, connectionId from {} " \ + "where name = {} and type = 10000 order by startNs" \ + .format(DbConstant.TABLE_CANN_API, node_launch_str_id) # 10000 : node level + node_lauch_apis = TorchDb().fetch_all_data(sql) if not node_lauch_apis: raise RuntimeWarning("Failed to get node launch apis") torch_op_apis.sort(key=lambda x: x[TorchOpDataOri.START_NS.value]) @@ -229,29 +230,31 @@ class FwkApiDbParser(BaseParser): def set_start_string_id(self): Str2IdManager().set_start_id(DbConstant.START_STRING_ID_FWK_API) - + def get_max_cann_id(self): - if not DbManager.judge_table_exist(self._cur, DbConstant.TABLE_CANN_API): + if not TorchDb().judge_table_exist(DbConstant.TABLE_CANN_API): return sql = "select max(connectionId) from {}".format(DbConstant.TABLE_CANN_API) - connectionIds = DbManager.fetch_one_data(self._cur, sql) + connectionIds = TorchDb().fetch_one_data(sql) if connectionIds and connectionIds[0]: self._max_cann_connection_id = connectionIds[0] + 1 - + def init_db_connect(self) -> None: - self._conn, self._cur = DbManager.create_connect_db(self._db_path) - if not (self._conn and self._cur): - raise RuntimeError(f"Failed to connect to db file: {self._db_path}") + if not TorchDb().create_connect_db(): + raise RuntimeError(f"Failed to connect to db file: {TorchDb().get_db_path()}") def save_fwk_api(self): if not self._fwk_apis: return - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_PYTORCH_API, TableColumnsManager.TableColumns.get(DbConstant.TABLE_PYTORCH_API)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_PYTORCH_API, self._fwk_apis) + TorchDb().create_table_with_headers(DbConstant.TABLE_PYTORCH_API, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_PYTORCH_API)) + TorchDb().insert_data_into_table(DbConstant.TABLE_PYTORCH_API, self._fwk_apis) def save_string_ids(self): - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_STRING_IDS, TableColumnsManager.TableColumns.get(DbConstant.TABLE_STRING_IDS)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_STRING_IDS, Str2IdManager().get_all_string_2_id_data()) + TorchDb().create_table_with_headers(DbConstant.TABLE_STRING_IDS, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_STRING_IDS)) + TorchDb().insert_data_into_table(DbConstant.TABLE_STRING_IDS, + Str2IdManager().get_all_string_2_id_data()) def sava_connection_ids(self): connection_ids = ConnectionIdManager().get_all_connection_ids() @@ -261,8 +264,9 @@ class FwkApiDbParser(BaseParser): for index, conn_ids in connection_ids.items(): for conn_id in conn_ids: save_connection_ids.append([index, conn_id]) - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_CONNECTION_IDS, TableColumnsManager.TableColumns.get(DbConstant.TABLE_CONNECTION_IDS)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_CONNECTION_IDS, save_connection_ids) + TorchDb().create_table_with_headers(DbConstant.TABLE_CONNECTION_IDS, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_CONNECTION_IDS)) + TorchDb().insert_data_into_table(DbConstant.TABLE_CONNECTION_IDS, save_connection_ids) def save_callchain_ids(self): callchain_ids = CallChainIdManager().get_all_callchain_id() @@ -272,19 +276,21 @@ class 
FwkApiDbParser(BaseParser): for index, callstack_ids in callchain_ids.items(): for callstack_id in callstack_ids: save_callchain_ids.append([index] + callstack_id) - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_PYTORCH_CALLCHAINS, TableColumnsManager.TableColumns.get(DbConstant.TABLE_PYTORCH_CALLCHAINS)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_PYTORCH_CALLCHAINS, save_callchain_ids) + TorchDb().create_table_with_headers(DbConstant.TABLE_PYTORCH_CALLCHAINS, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_PYTORCH_CALLCHAINS)) + TorchDb().insert_data_into_table(DbConstant.TABLE_PYTORCH_CALLCHAINS, save_callchain_ids) def save_enum_api_types_to_db(self): - if not DbManager.judge_table_exist(self._cur, DbConstant.TABLE_ENUM_API_TYPE): - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_ENUM_API_TYPE, TableColumnsManager.TableColumns.get(DbConstant.TABLE_ENUM_API_TYPE)) + if not TorchDb().judge_table_exist(DbConstant.TABLE_ENUM_API_TYPE): + TorchDb().create_table_with_headers(DbConstant.TABLE_ENUM_API_TYPE, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_ENUM_API_TYPE)) api_types = [ (ApiType.TORCH_OP.value, 'op'), (ApiType.TASK_QUEUE.value, 'queue'), (ApiType.PYTHON_TRACE.value, 'trace'), (ApiType.MSTX_OP.value, 'mstx') ] - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_ENUM_API_TYPE, api_types) + TorchDb().insert_data_into_table(DbConstant.TABLE_ENUM_API_TYPE, api_types) def save_api_data_to_db(self): self.save_fwk_api() @@ -292,4 +298,3 @@ class FwkApiDbParser(BaseParser): self.sava_connection_ids() self.save_callchain_ids() self.save_enum_api_types_to_db() - DbManager.destroy_db_connect(self._conn, self._cur) \ No newline at end of file diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py index 76a67b41bf..a570e909e3 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_gc_record_db_parser.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from ...prof_common_func._db_manager import DbManager from ...prof_common_func._log import ProfilerLogger +from ...prof_common_func._db_manager import TorchDb from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager from ...prof_parse._fwk_file_parser import FwkFileParser from .._base_parser import BaseParser @@ -25,32 +25,25 @@ class GCRecordDbParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - self._conn = None - self._cur = None - self._db_path = "" self._gc_record_data = [] ProfilerLogger.init(self._profiler_path, "GCRecordDbParser") self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: - self._db_path = deps_data.get(Constant.DB_PARSER, "") self.init_db_connect() self._gc_record_data = FwkFileParser(self._profiler_path).get_gc_record_db_data() self.save_gc_record_data_to_db() - except Exception as e: - self.logger.error("Failed to generate gc record table, error: %s", str(e), exc_info=True) - DbManager.destroy_db_connect(self._conn, self._cur) + except Exception as error: + self.logger.error("Failed to generate gc record table, error: %s", str(error), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None def init_db_connect(self) -> None: - self._conn, self._cur = DbManager.create_connect_db(self._db_path) - if not (self._conn and self._cur): - raise RuntimeError(f"Failed to connect to db file: {self._db_path}") + if not TorchDb().create_connect_db(): + raise RuntimeError(f"Failed to connect to db file: {TorchDb().get_db_path()}") def save_gc_record_data_to_db(self): if self._gc_record_data: - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_GC_RECORD, TableColumnsManager.TableColumns.get(DbConstant.TABLE_GC_RECORD)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_GC_RECORD, self._gc_record_data) - DbManager.destroy_db_connect(self._conn, self._cur) + TorchDb().create_table_with_headers(DbConstant.TABLE_GC_RECORD, TableColumnsManager.TableColumns.get(DbConstant.TABLE_GC_RECORD)) + TorchDb().insert_data_into_table(DbConstant.TABLE_GC_RECORD, self._gc_record_data) diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py index 09ca81a73d..6afff95c70 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py @@ -1,13 +1,10 @@ -import os - from enum import Enum from collections import namedtuple from ...prof_parse._fwk_file_parser import FwkFileParser from .._memory_prepare_parser import MemoryPrepareParser -from ...prof_common_func._db_manager import DbManager +from ...prof_common_func._db_manager import TorchDb from ...prof_common_func._id_manager import Str2IdManager from ...prof_common_func._path_manager import ProfilerPathManager -from ...prof_parse._cann_file_parser import CANNFileParser, CANNDataEnum from ...prof_common_func._constant import Constant, DbConstant, TableColumnsManager from ...prof_common_func._log import ProfilerLogger from .._base_parser import BaseParser @@ -57,9 +54,6 @@ class GeOpMemRecordsOri(Enum): class MemoryDbParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - self._conn = None - self._cur = None - self._db_path = "" self._pta_op_memory_data = [] self._ge_op_memory_data = [] self._pta_memory_bean_list = [] @@ -82,32 +76,29 @@ class 
MemoryDbParser(BaseParser): def run(self, deps_data: dict): try: - self._db_path = deps_data.get(Constant.DB_PARSER, "") self.init_db_connect() self.set_start_string_id() self._pta_op_memory_data = deps_data.get(Constant.MEMORY_PREPARE, {}).get("memory_data", {}).get(Constant.Db, []) self._pta_memory_bean_list = deps_data.get(Constant.MEMORY_PREPARE, {}).get("pta_record_list", []) self.init_pta_memory_data() self.save_memory_data_to_db() - except Exception as e: - self.logger.error("Failed to generate memory_record table or op_memory table, error: %s", str(e), exc_info=True) - DbManager.destroy_db_connect(self._conn, self._cur) + except Exception as error: + self.logger.error("Failed to generate memory_record table or op_memory table, error: %s", str(error), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None def init_db_connect(self): - self._conn, self._cur = DbManager.create_connect_db(self._db_path) - if not (self._conn and self._cur): - raise RuntimeError(f"Failed to connect to db file: {self._db_path}") + if not TorchDb().create_connect_db(): + raise RuntimeError(f"Failed to connect to db file: {TorchDb().get_db_path()}") def set_start_string_id(self): Str2IdManager().set_start_id(DbConstant.START_STRING_ID_MEMORY) def get_ge_memory_data(self): - if not DbManager.judge_table_exist(self._cur, DbConstant.TABLE_NPU_OP_MEM): + if not TorchDb().judge_table_exist(DbConstant.TABLE_NPU_OP_MEM): return sql = "select operatorName, addr, type, size, timestampNs, totalAllocate, totalReserve, deviceId from {}".format(DbConstant.TABLE_NPU_OP_MEM) - ge_mem_records = DbManager.fetch_all_data(self._cur, sql) + ge_mem_records = TorchDb().fetch_all_data(sql) record_type_dict = {} for index, mem_record in enumerate(ge_mem_records): if ge_mem_records[index][GeOpMemRecordsOri.TYPE.value] in record_type_dict: @@ -117,7 +108,7 @@ class MemoryDbParser(BaseParser): ge_mem_records[index] = tuple(record) continue sql = "select value from {} where id = {}".format(DbConstant.TABLE_STRING_IDS, ge_mem_records[index][GeOpMemRecordsOri.TYPE.value]) - record_type = DbManager.fetch_one_data(self._cur, sql) + record_type = TorchDb().fetch_one_data(sql) if record_type and record_type[0]: if record_type[0] == "release": record = list(ge_mem_records[index]) @@ -181,8 +172,8 @@ class MemoryDbParser(BaseParser): return for memory in self._pta_op_memory_data: memory[OpMemoryTableRow.NAME.value] = Str2IdManager().get_id_from_str(memory[OpMemoryTableRow.NAME.value]) - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_OPERATOR_MEMORY, TableColumnsManager.TableColumns.get(DbConstant.TABLE_OPERATOR_MEMORY)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_OPERATOR_MEMORY, self._pta_op_memory_data + self._ge_op_memory_data) + TorchDb().create_table_with_headers(DbConstant.TABLE_OPERATOR_MEMORY, TableColumnsManager.TableColumns.get(DbConstant.TABLE_OPERATOR_MEMORY)) + TorchDb().insert_data_into_table(DbConstant.TABLE_OPERATOR_MEMORY, self._pta_op_memory_data + self._ge_op_memory_data) def get_pta_memort_record_list(self): if not self._pta_memory_bean_list: @@ -231,8 +222,8 @@ class MemoryDbParser(BaseParser): self.get_pta_ge_record_list() if not self._record_list: return - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_MEMORY_RECORD, TableColumnsManager.TableColumns.get(DbConstant.TABLE_MEMORY_RECORD)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_MEMORY_RECORD, self._record_list) + 
TorchDb().create_table_with_headers(DbConstant.TABLE_MEMORY_RECORD, TableColumnsManager.TableColumns.get(DbConstant.TABLE_MEMORY_RECORD)) + TorchDb().insert_data_into_table(DbConstant.TABLE_MEMORY_RECORD, self._record_list) def init_pta_memory_data(self): if not ProfilerPathManager.get_cann_path(self._profiler_path): @@ -243,12 +234,11 @@ class MemoryDbParser(BaseParser): self._pta_memory_bean_list = pta_data.get("pta_record_list", []) def save_strings_id(self): - DbManager.create_table_with_headers(self._conn, self._cur, DbConstant.TABLE_STRING_IDS, TableColumnsManager.TableColumns.get(DbConstant.TABLE_STRING_IDS)) - DbManager.insert_data_into_table(self._conn, DbConstant.TABLE_STRING_IDS, Str2IdManager().get_all_string_2_id_data()) + TorchDb().create_table_with_headers(DbConstant.TABLE_STRING_IDS, TableColumnsManager.TableColumns.get(DbConstant.TABLE_STRING_IDS)) + TorchDb().insert_data_into_table(DbConstant.TABLE_STRING_IDS, Str2IdManager().get_all_string_2_id_data()) def save_memory_data_to_db(self): self.get_ge_memory_data() self.save_memory_record_data_to_db() self.save_op_memory_data_to_db() self.save_strings_id() - DbManager.destroy_db_connect(self._conn, self._cur) \ No newline at end of file diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py index fc871036a3..df3b8fea4f 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_step_info_db_parser.py @@ -12,12 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import logging - from .._base_parser import BaseParser from ...prof_bean._torch_op_node import TorchOpNode -from ...prof_common_func._constant import DbConstant, Constant, TableColumnsManager, print_error_msg, print_warn_msg -from ...prof_common_func._db_manager import DbManager +from ...prof_common_func._constant import DbConstant, Constant, TableColumnsManager, print_warn_msg +from ...prof_common_func._db_manager import TorchDb +from ...prof_common_func._log import ProfilerLogger __all__ = [] @@ -28,42 +27,39 @@ class StepInfoDbParser(BaseParser): def __init__(self, name: str, param_dict: dict): super().__init__(name, param_dict) - self.db_conn = None - self.db_curs = None - self._db_path = "" + ProfilerLogger.init(self._profiler_path, "StepInfoDbParser") + self.logger = ProfilerLogger.get_instance() def run(self, deps_data: dict): try: - self._db_path = deps_data.get(Constant.DB_PARSER, "") torch_op_node = deps_data.get(Constant.TREE_BUILD_PARSER, []) step_range = self.get_step_range(torch_op_node[0] if torch_op_node else None) - except Exception as e: - logging.error("Failed to get step info from db, error: %s", str(e), exc_info=True) - DbManager.destroy_db_connect(self.db_conn, self.db_curs) + except Exception as error: + self.logger.error("Failed to get step info from db, error: %s", str(error), exc_info=True) return Constant.FAIL, [] return Constant.SUCCESS, step_range - def get_api_data_in_time_range(self, begin_ts, end_ts, db_cur) -> list: - if not DbManager.judge_table_exist(db_cur, DbConstant.TABLE_CANN_API): + def get_api_data_in_time_range(self, begin_ts, end_ts) -> list: + if not TorchDb().judge_table_exist(DbConstant.TABLE_CANN_API): print_warn_msg("Failed to get api data from db.") return [] sql = f"select connectionId from {DbConstant.TABLE_CANN_API} " \ f"where type={self.NODE_LEVEL} and {begin_ts} <= startNs and endNs <= {end_ts}" - return DbManager.fetch_all_data(db_cur, sql) + return TorchDb().fetch_all_data(sql) - def get_all_api_data(self, db_cur) -> list: - if not DbManager.judge_table_exist(db_cur, DbConstant.TABLE_CANN_API): + def get_all_api_data(self) -> list: + if not TorchDb().judge_table_exist(DbConstant.TABLE_CANN_API): print_warn_msg("Failed to get api data from db.") return [] sql = f"select connectionId from {DbConstant.TABLE_CANN_API} where type={self.NODE_LEVEL}" - return DbManager.fetch_all_data(db_cur, sql) + return TorchDb().fetch_all_data(sql) - def get_task_info_from_api(self, api_data, db_cur) -> dict: - if not DbManager.judge_table_exist(db_cur, DbConstant.TABLE_TASK): + def get_task_info_from_api(self, api_data) -> dict: + if not TorchDb().judge_table_exist(DbConstant.TABLE_TASK): print_warn_msg("Failed to get task data from db.") return {} sql = f"select startNs, endNs, connectionId, globalTaskId from {DbConstant.TABLE_TASK}" - task_data = DbManager.fetch_all_data(db_cur, sql) + task_data = TorchDb().fetch_all_data(sql) api_connection_ids = {info[0] for info in api_data} api_task_info = {} for task_info in task_data: @@ -75,19 +71,16 @@ class StepInfoDbParser(BaseParser): step_node_list = [] if root_node is not None: step_node_list = [node for node in root_node.child_node_list if node.is_profiler_step()] - conn, curs = DbManager.create_connect_db(self._db_path) - if not (conn and curs): - print_warn_msg(f"Failed to connect to db file: {self._db_path}") + if not TorchDb().create_connect_db(): + print_warn_msg(f"Failed to connect to db file: {TorchDb().get_db_path()}") return [] - self.db_conn = conn - self.db_curs = curs step_range = [] if not 
step_node_list: start_time = 0 end_time = float('inf') step_id = None - api_data = self.get_all_api_data(curs) - task_info = self.get_task_info_from_api(api_data, curs) + api_data = self.get_all_api_data() + task_info = self.get_task_info_from_api(api_data) device_start_ts = min(info['startNs'] for info in task_info.values()) if task_info else start_time device_end_ts = max(info['endNs'] for info in task_info.values()) if task_info else Constant.INVALID_VALUE step_range.append( @@ -102,8 +95,8 @@ class StepInfoDbParser(BaseParser): else: for step_node in step_node_list: step_id = step_node.event.name.split("#")[-1] - api_data = self.get_api_data_in_time_range(step_node.start_time, step_node.end_time, curs) - task_info = self.get_task_info_from_api(api_data, curs) + api_data = self.get_api_data_in_time_range(step_node.start_time, step_node.end_time) + task_info = self.get_task_info_from_api(api_data) device_start_ts = \ min(info['startNs'] for info in task_info.values()) if task_info else step_node.start_time device_end_ts = \ @@ -118,7 +111,6 @@ class StepInfoDbParser(BaseParser): } ) self.save_step_time(step_node_list) - DbManager.destroy_db_connect(conn, curs) return step_range def save_step_time(self, step_node_list: list) -> None: @@ -127,5 +119,5 @@ class StepInfoDbParser(BaseParser): step_time_list = [] for step_node in step_node_list: step_time_list.append([step_node.event.name.split("#")[-1], step_node.start_time, step_node.end_time]) - DbManager.create_table_with_headers(self.db_conn, self.db_curs, DbConstant.TABLE_STEP_TIME, TableColumnsManager.TableColumns.get(DbConstant.TABLE_STEP_TIME)) - DbManager.insert_data_into_table(self.db_conn, DbConstant.TABLE_STEP_TIME, step_time_list) + TorchDb().create_table_with_headers(DbConstant.TABLE_STEP_TIME, TableColumnsManager.TableColumns.get(DbConstant.TABLE_STEP_TIME)) + TorchDb().insert_data_into_table(DbConstant.TABLE_STEP_TIME, step_time_list) diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py index 6b7e1bd37e..97a164b73d 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_trace_step_time_db_parser.py @@ -12,15 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import os -import logging from enum import Enum from .._base_parser import BaseParser from ...prof_common_func._constant import Constant, print_warn_msg from ...prof_common_func._constant import DbConstant, TableColumnsManager -from ...prof_common_func._db_manager import DbManager +from ...prof_common_func._db_manager import AnalysisDb, TorchDb from ...prof_common_func._constant import convert_ns2us_float +from ...prof_common_func._log import ProfilerLogger from ...prof_common_func._time_range_calculator import CommunicationTimeRange, RangeCaculator from ...prof_parse._fwk_file_parser import FwkFileParser @@ -41,11 +39,8 @@ class TraceStepTimeDbParser(BaseParser): self.string_id_map = {} self.compute_task_info = {} self.communication_op_info = [] - self.task_db_con = None - self.task_db_curs = None - self.analysis_db_con = None - self.analysis_db_curs = None - self.db_path = "" + ProfilerLogger.init(self._profiler_path, "TraceStepTimeDbParser") + self.logger = ProfilerLogger.get_instance() @staticmethod def get_e2e_time(task_time_list): @@ -66,29 +61,21 @@ class TraceStepTimeDbParser(BaseParser): return (first_task_start_ts - first_fwk_op.ts) if first_fwk_op else 0 return first_task_start_ts - step_info.get(Constant.FWK_START_TS, 0) - def save_step_trace_db_data(self, output_path, step_trace_data): - db_path = os.path.join(output_path, DbConstant.DB_ANALYSIS) - conn, curs = DbManager.create_connect_db(db_path) - if not (conn and curs): - print_warn_msg(f"Failed to connect to db file: {db_path}") + def save_step_trace_db_data(self, step_trace_data): + if not AnalysisDb().create_connect_db(): + print_warn_msg(f"Failed to connect to db file: {AnalysisDb().get_db_path()}") return - self.analysis_db_con = conn - self.analysis_db_curs = curs - DbManager.create_table_with_headers(conn, curs, DbConstant.TABLE_STEP_TRACE_TIME, - TableColumnsManager.TableColumns.get(DbConstant.TABLE_STEP_TRACE_TIME)) - DbManager.insert_data_into_table(conn, DbConstant.TABLE_STEP_TRACE_TIME, step_trace_data) - DbManager.destroy_db_connect(conn, curs) + AnalysisDb().create_table_with_headers(DbConstant.TABLE_STEP_TRACE_TIME, + TableColumnsManager.TableColumns.get(DbConstant.TABLE_STEP_TRACE_TIME)) + AnalysisDb().insert_data_into_table(DbConstant.TABLE_STEP_TRACE_TIME, step_trace_data) def run(self, deps_data: dict): try: - self.db_path = deps_data.get(Constant.DB_PARSER, "") self._init_step_range(deps_data) self._init_task_info_from_db() self.generate_view() - except Exception as e: - logging.error("Failed to generate step_trace_time table, error: %s", str(e), exc_info=True) - DbManager.destroy_db_connect(self.task_db_con, self.task_db_curs) - DbManager.destroy_db_connect(self.analysis_db_con, self.analysis_db_curs) + except Exception as error: + self.logger.error("Failed to generate step_trace_time table, error: %s", str(error), exc_info=True) return Constant.FAIL, None return Constant.SUCCESS, None @@ -130,30 +117,26 @@ class TraceStepTimeDbParser(BaseParser): step_time_data = [step['compute'], step['comunNotOverlp'], step['Overlp'], step['comun'], step['free'], step['stage'], step['bubble'], step['comunNotOverlpRec'], step['prepare']] reformat_time.append([step['step'], ] + [convert_ns2us_float(data) for data in step_time_data]) - self.save_step_trace_db_data(self._output_path, reformat_time) + self.save_step_trace_db_data(reformat_time) def _init_step_range(self, deps_data: dict): self.step_range = deps_data.get(Constant.STEP_INFO_DB_PARSER, []) def _init_task_info_from_db(self): - conn, curs = 
DbManager.create_connect_db(self.db_path) - if not (conn and curs): - print_warn_msg(f"Failed to connect to db file: {self.db_path}") + if not TorchDb().create_connect_db(): + print_warn_msg(f"Failed to connect to db file: {TorchDb().get_db_path()}") return - self.task_db_con = conn - self.task_db_curs = curs - if DbManager.judge_table_exist(curs, DbConstant.TABLE_STRING_IDS): + if TorchDb().judge_table_exist(DbConstant.TABLE_STRING_IDS): sql = "select id, value from {}".format(DbConstant.TABLE_STRING_IDS) - string_id_data = DbManager.fetch_all_data(curs, sql) + string_id_data = TorchDb().fetch_all_data(sql) self.string_id_map = {data[0]: data[1] for data in string_id_data} - if DbManager.judge_table_exist(curs, DbConstant.TABLE_COMPUTE_TASK_INFO): + if TorchDb().judge_table_exist(DbConstant.TABLE_COMPUTE_TASK_INFO): sql = "select name, globalTaskId from {}".format(DbConstant.TABLE_COMPUTE_TASK_INFO) - compute_task_data = DbManager.fetch_all_data(curs, sql) + compute_task_data = TorchDb().fetch_all_data(sql) self.compute_task_info = {data[1]: data[0] for data in compute_task_data} - if DbManager.judge_table_exist(curs, DbConstant.TABLE_COMMUNICATION_OP): + if TorchDb().judge_table_exist(DbConstant.TABLE_COMMUNICATION_OP): sql = "select opName, startNs, endNs from {}".format(DbConstant.TABLE_COMMUNICATION_OP) - self.communication_op_info = DbManager.fetch_all_data(curs, sql) - DbManager.destroy_db_connect(conn, curs) + self.communication_op_info = TorchDb().fetch_all_data(sql) def _get_compute_data_in_step(self, step_info): compute_data = [] -- Gitee From c0139c555d50a3b0201fac3215da2a1efe9c343a Mon Sep 17 00:00:00 2001 From: xudaohong Date: Thu, 13 Mar 2025 05:35:18 +0000 Subject: [PATCH 145/358] !18790 [fix] third_party compile cannot find hccl.h Merge pull request !18790 from xudaohong/v2.6.0 --- third_party/hccl/inc/hccl/hccl.h | 4 ++-- torch_npu/csrc/distributed/HCCLUtils.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/third_party/hccl/inc/hccl/hccl.h b/third_party/hccl/inc/hccl/hccl.h index b060c6857f..4ccda684b3 100644 --- a/third_party/hccl/inc/hccl/hccl.h +++ b/third_party/hccl/inc/hccl/hccl.h @@ -6,8 +6,8 @@ #ifndef HCCL_H_ #define HCCL_H_ -#include "hccl_types.h" -#include +#include "third_party/hccl/inc/hccl/hccl_types.h" +#include "third_party/acl/inc/acl/acl.h" #ifdef __cplusplus extern "C" { diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index 3cf22db173..e39c59643f 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -9,8 +9,8 @@ #include #include -#include "hccl/hccl.h" -#include "hccl/hccl_types.h" +#include "third_party/hccl/inc/hccl/hccl.h" +#include "third_party/hccl/inc/hccl/hccl_types.h" #define HCCL_CHECK_ERROR(err_code, ...) 
\ do { \ -- Gitee From 2f562833543008dcd0b97debd8096c25968076a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Thu, 13 Mar 2025 06:13:47 +0000 Subject: [PATCH 146/358] =?UTF-8?q?!18883=20register=20lccl=20backend=20Me?= =?UTF-8?q?rge=20pull=20request=20!18883=20from=20=E9=97=AB=E9=B9=8F?= =?UTF-8?q?=E5=85=A8/v2.6.0lccl?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/__init__.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 76f0903b23..81db1cd90b 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -228,11 +228,6 @@ def _new_process_group_hccl_helper(dist_backend_opts, pg_options): return torch_npu._C._distributed_c10d.ProcessGroupHCCL(store, group_rank, group_size, pg_options) -# init and register hccl backend -torch.distributed.Backend.register_backend("hccl", lambda dist_backend_opts, pg_options: - _new_process_group_hccl_helper(dist_backend_opts, pg_options), extended_api=True, devices=["npu"]) - - def _new_process_group_lccl_helper(dist_backend_opts, pg_options): store = dist_backend_opts.store group_rank = dist_backend_opts.group_rank @@ -240,6 +235,23 @@ def _new_process_group_lccl_helper(dist_backend_opts, pg_options): return torch_npu._C._distributed_c10d.ProcessGroupLCCL(store, group_rank, group_size) +def _register_distributed_backend_for_npu(): + # init and register lccl backend + torch.distributed.Backend.register_backend("lccl", lambda dist_backend_opts, pg_options: + _new_process_group_lccl_helper(dist_backend_opts, pg_options), extended_api=True, devices=["npu"]) + + # init and register hccl backend + # Note: The hccl backend must be registered last. + # This is because the "Backend.default_device_backend_map" variable is refreshed during each registration process. + # Therefore, it is essential to register the hccl backend last. 
+ torch.distributed.Backend.register_backend("hccl", lambda dist_backend_opts, pg_options: + _new_process_group_hccl_helper(dist_backend_opts, pg_options), extended_api=True, devices=["npu"]) + + +# init and register distributed backend +_register_distributed_backend_for_npu() + + # set default device type for gradient checkpointing DefaultDeviceType.set_device_type("npu") del DefaultDeviceType -- Gitee From 9b74a8a590def4bcb5572623c3d54c678269878a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B5=E9=9D=9E=E5=87=A1?= Date: Thu, 13 Mar 2025 06:17:56 +0000 Subject: [PATCH 147/358] =?UTF-8?q?!18705=20Avoid=20memory=20occupation=20?= =?UTF-8?q?by=20the=20get=5Fdevice=5Fname=20interface.=20Merge=20pull=20re?= =?UTF-8?q?quest=20!18705=20from=20=E9=82=B5=E9=9D=9E=E5=87=A1/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/npu/Module.cpp | 13 +++++++++++++ torch_npu/npu/utils.py | 4 +--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 07568a31c6..03aecd7adb 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -80,6 +80,16 @@ void RegisterNPUDeviceProperties(PyObject* module) []() { return c10_npu::NPUCachingAllocator::isHistoryEnabled(); }); } +std::string GetDeviceName() +{ + const char* device_name = c10_npu::acl::AclrtGetSocName(); + if (device_name == nullptr) { + ASCEND_LOGE("NPU get device name fail."); + return ""; + } + return std::string(device_name); +} + NPUDeviceProp* GetDeviceProperties(int64_t deviceid) { const char* device_name; @@ -103,6 +113,9 @@ void BindGetDeviceProperties(PyObject* module) m.def("_npu_getDeviceProperties", [](int deviceid) -> NPUDeviceProp* { return GetDeviceProperties(deviceid); }, py::return_value_policy::reference); + m.def("_npu_getDeviceName", []() -> std::string { + return GetDeviceName(); + }, py::return_value_policy::reference); } NPUDeviceMem memory; diff --git a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index f4ce430d97..8e2b7b9222 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -67,9 +67,7 @@ def get_device_name(device_name=None): device_id = _get_device_index(device_name, optional=True) if device_id < 0 or device_id >= device_count(): raise AssertionError("Invalid device id" + pta_error(ErrCode.VALUE)) - torch_npu.npu._lazy_init() - device_prop = torch_npu._C._npu_getDeviceProperties(device_id) - return device_prop.name + return torch_npu._C._npu_getDeviceName() def get_device_properties(device_name=None): -- Gitee From 6623b3d97138df3190a33228a449f77b3e2b34a2 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Thu, 13 Mar 2025 07:27:35 +0000 Subject: [PATCH 148/358] !18815 [2/N] cleancode (torch_npu/csrc/aten) Merge pull request !18815 from dilililiwhy/cleancode_aten_260_part2 --- torch_npu/csrc/aten/common/TensorFactories.h | 65 +-- torch_npu/csrc/aten/common/TensorShape.cpp | 232 ++++++----- torch_npu/csrc/aten/common/ToKernelNpu.cpp | 111 ++--- .../csrc/aten/mirror/NPUMemoryOverlap.cpp | 54 ++- .../csrc/aten/mirror/NPUTensorIterator.cpp | 387 +++++++++--------- .../csrc/aten/mirror/NPUTypeProperties.cpp | 99 ++--- .../csrc/aten/mirror/NPUTypeProperties.h | 14 +- .../aten/ops/CopyFromAndResizeKernelNpu.cpp | 17 +- .../aten/ops/FlattenDenseTensorsKernelNpu.cpp | 24 +- torch_npu/csrc/aten/ops/FullKernelNpu.cpp | 22 +- .../aten/ops/HasCompatibleShallowCopyType.cpp | 25 +- .../csrc/aten/ops/StreamAndEventKernelNpu.cpp | 13 +- 12 files changed, 532 
insertions(+), 531 deletions(-) diff --git a/torch_npu/csrc/aten/common/TensorFactories.h b/torch_npu/csrc/aten/common/TensorFactories.h index 22e09f35bd..d84f201370 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.h +++ b/torch_npu/csrc/aten/common/TensorFactories.h @@ -6,18 +6,20 @@ namespace at_npu { namespace native { -inline void check_size_nonnegative(c10::IntArrayRef& size) { - for (auto& x : size) { - TORCH_CHECK( - x >= 0, - "Trying to create tensor with negative dimension ", - x, - ": ", - size, OPS_ERROR(ErrCode::VALUE)); - } +inline void check_size_nonnegative(c10::IntArrayRef& size) +{ + for (auto& x : size) { + TORCH_CHECK( + x >= 0, + "Trying to create tensor with negative dimension ", + x, + ": ", + size, OPS_ERROR(ErrCode::VALUE)); + } } -inline void check_args(int64_t row, int64_t col, const c10::TensorOptions& options) { +inline void check_args(int64_t row, int64_t col, const c10::TensorOptions& options) +{ TORCH_CHECK(row >= 0, "row must be non-negative, got", row, OPS_ERROR(ErrCode::VALUE)); TORCH_CHECK(col >= 0, "col must be non-negative, got", col, OPS_ERROR(ErrCode::VALUE)); if (options.has_layout()) { @@ -28,27 +30,28 @@ inline void check_args(int64_t row, int64_t col, const c10::TensorOptions& optio } } -inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { - // number of elements in the first row of the tril - auto m_first_row = offset > 0 ? - std::min(col, 1 + offset) : // upper bounded by col - row + offset > 0; // either 0 or 1 - // number of elements in the last row of the tril, bounded by [0, col] - auto m_last_row = std::max(0, std::min(col, row + offset)); - // number of rows, bounded by [0, row] - auto n_row_all = std::max(0, std::min(row, row + offset)); - auto n_row_trapezoid = (m_last_row - m_first_row + 1); - - // calculate # of elements in the top trapezoid - auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1; - - // calculate # of elements in the bottom rectangle if there is any - auto diff_row = n_row_all - n_row_trapezoid; - if (diff_row > 0) { - tril_size += diff_row * col; - } - - return tril_size; +inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) +{ + // number of elements in the first row of the tril + auto m_first_row = offset > 0 ? + std::min(col, 1 + offset) : // upper bounded by col + row + offset > 0; // either 0 or 1 + // number of elements in the last row of the tril, bounded by [0, col] + auto m_last_row = std::max(0, std::min(col, row + offset)); + // number of rows, bounded by [0, row] + auto n_row_all = std::max(0, std::min(row, row + offset)); + auto n_row_trapezoid = (m_last_row - m_first_row + 1); + + // calculate # of elements in the top trapezoid + auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1; + + // calculate # of elements in the bottom rectangle if there is any + auto diff_row = n_row_all - n_row_trapezoid; + if (diff_row > 0) { + tril_size += diff_row * col; + } + + return tril_size; } } // namespace native diff --git a/torch_npu/csrc/aten/common/TensorShape.cpp b/torch_npu/csrc/aten/common/TensorShape.cpp index b5b8581940..6245d9e1f4 100644 --- a/torch_npu/csrc/aten/common/TensorShape.cpp +++ b/torch_npu/csrc/aten/common/TensorShape.cpp @@ -24,47 +24,50 @@ namespace { // Named type instead of a pair/tuple so that we can be sure to // construct the vectors in place and get NRVO. 
struct InferUnsqueezeGeometryResult { - at::DimVector sizes; - at::DimVector strides; - InferUnsqueezeGeometryResult(c10::IntArrayRef tensor_sizes, c10::IntArrayRef tensor_strides) - : sizes(tensor_sizes.begin(), tensor_sizes.end()), strides(tensor_strides.begin(), tensor_strides.end()) {} + at::DimVector sizes; + at::DimVector strides; + InferUnsqueezeGeometryResult(c10::IntArrayRef tensor_sizes, c10::IntArrayRef tensor_strides) + : sizes(tensor_sizes.begin(), tensor_sizes.end()), strides(tensor_strides.begin(), tensor_strides.end()) {} }; } -InferUnsqueezeGeometryResult inferUnsqueezeGeometry(const at::Tensor& tensor, int64_t dim) { - InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); - int64_t new_stride = dim >= tensor.dim() ? 1 : result.sizes[dim] * result.strides[dim]; - result.sizes.insert(result.sizes.begin() + dim, 1); - result.strides.insert(result.strides.begin() + dim, new_stride); +InferUnsqueezeGeometryResult inferUnsqueezeGeometry(const at::Tensor& tensor, int64_t dim) +{ + InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); + int64_t new_stride = dim >= tensor.dim() ? 1 : result.sizes[dim] * result.strides[dim]; + result.sizes.insert(result.sizes.begin() + dim, 1); + result.strides.insert(result.strides.begin() + dim, new_stride); - return result; + return result; } -std::tuple inferSqueezeGeometry(const at::Tensor &tensor) { - at::DimVector sizes; - at::DimVector strides; +std::tuple inferSqueezeGeometry(const at::Tensor &tensor) +{ + at::DimVector sizes; + at::DimVector strides; - for (const auto d : c10::irange(tensor.dim())) { - if (tensor.sizes()[d] != 1) { - sizes.push_back(tensor.sizes()[d]); - strides.push_back(tensor.strides()[d]); + for (const auto d : c10::irange(tensor.dim())) { + if (tensor.sizes()[d] != 1) { + sizes.push_back(tensor.sizes()[d]); + strides.push_back(tensor.strides()[d]); + } } - } - return std::make_tuple(std::move(sizes), std::move(strides)); + return std::make_tuple(std::move(sizes), std::move(strides)); } -std::tuple inferSqueezeGeometry(const at::Tensor& tensor, int64_t dim) { - at::DimVector sizes; - at::DimVector strides; +std::tuple inferSqueezeGeometry(const at::Tensor& tensor, int64_t dim) +{ + at::DimVector sizes; + at::DimVector strides; - for (const auto d : c10::irange(tensor.dim())) { - if (d != dim || tensor.sizes()[dim] != 1) { - sizes.push_back(tensor.sizes()[d]); - strides.push_back(tensor.strides()[d]); + for (const auto d : c10::irange(tensor.dim())) { + if (d != dim || tensor.sizes()[dim] != 1) { + sizes.push_back(tensor.sizes()[d]); + strides.push_back(tensor.strides()[d]); + } } - } - return std::make_tuple(std::move(sizes), std::move(strides)); + return std::make_tuple(std::move(sizes), std::move(strides)); } namespace at_npu { @@ -73,114 +76,121 @@ namespace native { at::Tensor alias_with_sizes_and_strides_npu( const at::Tensor& self, const c10::IntArrayRef sizes, - const c10::IntArrayRef strides) { - at::Tensor self_; - if (self.is_quantized()) { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype(), - get_qtensorimpl(self)->quantizer()); - auto* self_tmp_ = self_.unsafeGetTensorImpl(); - self_tmp_->set_storage_offset(self.storage_offset()); - self_tmp_->set_sizes_and_strides(sizes, strides); - } else { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype()); - auto* self_tmp_ = self_.unsafeGetTensorImpl(); - 
self_tmp_->set_storage_offset(self.storage_offset()); - self_tmp_->set_sizes_and_strides(sizes, strides); - } - at::namedinference::propagate_names(self_, self); - return self_; + const c10::IntArrayRef strides) +{ + at::Tensor self_; + if (self.is_quantized()) { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype(), + get_qtensorimpl(self)->quantizer()); + auto* self_tmp_ = self_.unsafeGetTensorImpl(); + self_tmp_->set_storage_offset(self.storage_offset()); + self_tmp_->set_sizes_and_strides(sizes, strides); + } else { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype()); + auto* self_tmp_ = self_.unsafeGetTensorImpl(); + self_tmp_->set_storage_offset(self.storage_offset()); + self_tmp_->set_sizes_and_strides(sizes, strides); + } + at::namedinference::propagate_names(self_, self); + return self_; } -at::Tensor NPUNativeFunctions::view(const at::Tensor& self, c10::IntArrayRef size) { - auto inferred_size = at::infer_size(size, self.numel()); - auto stride = - at::detail::computeStride(self.sizes(), self.strides(), inferred_size); - TORCH_CHECK( - stride.has_value(), - "view size is " - "not compatible with input tensor's size and stride (at least one dimension" - " spans across two contiguous subspaces). Use .reshape(...) instead.", OPS_ERROR(ErrCode::PARAM)); - auto stride_value = *stride; - auto dst = self; - return alias_with_sizes_and_strides_npu(dst, inferred_size, stride_value); +at::Tensor NPUNativeFunctions::view(const at::Tensor& self, c10::IntArrayRef size) +{ + auto inferred_size = at::infer_size(size, self.numel()); + auto stride = at::detail::computeStride(self.sizes(), self.strides(), inferred_size); + TORCH_CHECK( + stride.has_value(), + "view size is " + "not compatible with input tensor's size and stride (at least one dimension" + " spans across two contiguous subspaces). Use .reshape(...) instead.", OPS_ERROR(ErrCode::PARAM)); + auto stride_value = *stride; + auto dst = self; + return alias_with_sizes_and_strides_npu(dst, inferred_size, stride_value); } at::Tensor NPUNativeFunctions::as_strided( const at::Tensor& self, c10::IntArrayRef size, c10::IntArrayRef stride, - c10::optional storage_offset_) { - auto dst = self; - if (InferFormat::IsDefiniteTensorWhenMetaDataChanges(dst, size) && !FormatHelper::IsOpInputBaseFormat(dst)) { - TORCH_WARN_ONCE("current tensor is running as_strided, don't perform inplace operations on the returned value." - " If you encounter this warning and have precision issues," - " you can try torch.npu.config.allow_internal_format = False to resolve precision issues.") - dst = FormatCastHelper::ApplyBaseFormatTensorBy(dst); - } - auto storage_offset = storage_offset_.value_or(dst.storage_offset()); - auto result = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(dst.storage()), - dst.key_set(), - dst.dtype()); - setStrided(result, size, stride, storage_offset); - return result; + c10::optional storage_offset_) +{ + auto dst = self; + if (InferFormat::IsDefiniteTensorWhenMetaDataChanges(dst, size) && !FormatHelper::IsOpInputBaseFormat(dst)) { + TORCH_WARN_ONCE("current tensor is running as_strided, don't perform inplace operations on the returned value." 
+ " If you encounter this warning and have precision issues," + " you can try torch.npu.config.allow_internal_format = False to resolve precision issues.") + dst = FormatCastHelper::ApplyBaseFormatTensorBy(dst); + } + auto storage_offset = storage_offset_.value_or(dst.storage_offset()); + auto result = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(dst.storage()), + dst.key_set(), + dst.dtype()); + setStrided(result, size, stride, storage_offset); + return result; } const at::Tensor& NPUNativeFunctions::as_strided__symint( const at::Tensor& self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, - c10::optional storage_offset_) { - at::Tensor result = self; - if (InferFormat::IsDefiniteTensorWhenMetaDataChanges(result, c10::asIntArrayRefUnchecked(size)) && - !FormatHelper::IsOpInputBaseFormat(result)) { - TORCH_WARN_ONCE("current tensor is running as_strided, don't perform inplace operations on the returned value." - " If you encounter this warning and have precision issues," - " you can try torch.npu.config.allow_internal_format = False to resolve precision issues.") - result = FormatCastHelper::CovertSelfToBaseFormat(result); - } - auto storage_offset = storage_offset_.value_or(result.storage_offset()); - at::native::setStrided(result, size, stride, storage_offset); - return result; + c10::optional storage_offset_) +{ + at::Tensor result = self; + if (InferFormat::IsDefiniteTensorWhenMetaDataChanges(result, c10::asIntArrayRefUnchecked(size)) && + !FormatHelper::IsOpInputBaseFormat(result)) { + TORCH_WARN_ONCE("current tensor is running as_strided, don't perform inplace operations on the returned value." + " If you encounter this warning and have precision issues," + " you can try torch.npu.config.allow_internal_format = False to resolve precision issues.") + result = FormatCastHelper::CovertSelfToBaseFormat(result); + } + auto storage_offset = storage_offset_.value_or(result.storage_offset()); + at::native::setStrided(result, size, stride, storage_offset); + return result; } -at::Tensor NPUNativeFunctions::unsqueeze(const at::Tensor& self, int64_t dim) { +at::Tensor NPUNativeFunctions::unsqueeze(const at::Tensor& self, int64_t dim) +{ dim = at::maybe_wrap_dim(dim, self.dim() + 1); auto g = inferUnsqueezeGeometry(self, dim); return self.as_strided(g.sizes, g.strides); } -at::Tensor NPUNativeFunctions::squeeze(const at::Tensor& self) { - auto g = inferSqueezeGeometry(self); - at::Tensor result = self.as_strided(std::get<0>(g), std::get<1>(g)); - auto maybe_outnames = at::namedinference::compute_squeeze_outnames(self); - at::namedinference::propagate_names_if_nonempty(result, maybe_outnames); - return result; +at::Tensor NPUNativeFunctions::squeeze(const at::Tensor& self) +{ + auto g = inferSqueezeGeometry(self); + at::Tensor result = self.as_strided(std::get<0>(g), std::get<1>(g)); + auto maybe_outnames = at::namedinference::compute_squeeze_outnames(self); + at::namedinference::propagate_names_if_nonempty(result, maybe_outnames); + return result; } -at::Tensor NPUNativeFunctions::squeeze(const at::Tensor& self, int64_t dim) { - int64_t dims = self.dim(); - dim = at::maybe_wrap_dim(dim, dims); - if (dims == 0 || self.sizes()[dim] != 1) { - return self.as_strided(self.sizes(), self.strides()); - } - auto g = inferSqueezeGeometry(self, dim); - auto result = self.as_strided(std::get<0>(g), std::get<1>(g)); - at::namedinference::propagate_names_except(result, self, {dim}); - return result; +at::Tensor NPUNativeFunctions::squeeze(const at::Tensor& self, int64_t dim) +{ + 
int64_t dims = self.dim(); + dim = at::maybe_wrap_dim(dim, dims); + if (dims == 0 || self.sizes()[dim] != 1) { + return self.as_strided(self.sizes(), self.strides()); + } + auto g = inferSqueezeGeometry(self, dim); + auto result = self.as_strided(std::get<0>(g), std::get<1>(g)); + at::namedinference::propagate_names_except(result, self, {dim}); + return result; } -at::Tensor NPUNativeFunctions::_reshape_alias(const at::Tensor& self, at::IntArrayRef sizes, at::IntArrayRef strides) { - return self.view(sizes); +at::Tensor NPUNativeFunctions::_reshape_alias(const at::Tensor& self, at::IntArrayRef sizes, at::IntArrayRef strides) +{ + return self.view(sizes); } } // namespace native diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp index c4b9a941af..59025bbf97 100644 --- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp @@ -14,48 +14,50 @@ namespace native { // representing the current device) and return the corresponding c10::Device // according to the actual device at the time of this function call. No-op // if the device_index is set. -static inline c10::Device ensure_has_index(c10::Device device) { - if (device.is_cpu() || device.has_index()) { - return device; - } - const c10::impl::DeviceGuardImplInterface* impl = - c10::impl::getDeviceGuardImpl(device.type()); - return impl->getDevice(); +static inline c10::Device ensure_has_index(c10::Device device) +{ + if (device.is_cpu() || device.has_index()) { + return device; + } + const c10::impl::DeviceGuardImplInterface* impl = + c10::impl::getDeviceGuardImpl(device.type()); + return impl->getDevice(); } static inline at::Tensor to_impl_npu( const at::Tensor& self, const c10::TensorOptions& options, bool non_blocking, - bool copy) { - auto memory_format = options.memory_format_opt().value_or( - c10::MemoryFormat::Contiguous); // Here cpu's default value is Preserve - if (self.dtype() == options.dtype() && self.layout() == options.layout() && - self.device() == options.device() && !copy && - (memory_format == c10::MemoryFormat::Preserve || - self.suggest_memory_format() == memory_format)) { - return self; - } + bool copy) +{ + auto memory_format = options.memory_format_opt().value_or( + c10::MemoryFormat::Contiguous); // Here cpu's default value is Preserve + if (self.dtype() == options.dtype() && + self.layout() == options.layout() && + self.device() == options.device() && !copy && + (memory_format == c10::MemoryFormat::Preserve || self.suggest_memory_format() == memory_format)) { + return self; + } bool pin_out = non_blocking && torch_npu::utils::is_npu(self) && options.device().is_cpu() && (options.layout() == c10::kStrided); - if (memory_format == c10::MemoryFormat::Preserve) { - if (self.is_non_overlapping_and_dense()) { - // Copy all strides - auto r = at::empty_strided( - self.sizes(), self.strides(), options.memory_format(c10::nullopt).pinned_memory(pin_out)); - r.copy_(self, non_blocking); - return r; - } else { - memory_format = self.suggest_memory_format(); + if (memory_format == c10::MemoryFormat::Preserve) { + if (self.is_non_overlapping_and_dense()) { + // Copy all strides + auto r = at::empty_strided( + self.sizes(), self.strides(), options.memory_format(c10::nullopt).pinned_memory(pin_out)); + r.copy_(self, non_blocking); + return r; + } else { + memory_format = self.suggest_memory_format(); + } } - } - // See Note [Explicit nullopt c10::MemoryFormat argument] - auto r = at::empty( - self.sizes(), 
options.memory_format(memory_format).pinned_memory(pin_out), c10::nullopt); - r.copy_(self, non_blocking); - return r; + // See Note [Explicit nullopt c10::MemoryFormat argument] + auto r = at::empty( + self.sizes(), options.memory_format(memory_format).pinned_memory(pin_out), c10::nullopt); + r.copy_(self, non_blocking); + return r; } at::Tensor NPUNativeFunctions::_to_copy( @@ -134,13 +136,13 @@ at::Tensor NPUNativeFunctions::to( bool copy, c10::optional optional_memory_format) { - device = ensure_has_index(device); - return to_impl_npu( - self, - self.options().device(device).dtype(dtype).memory_format( - optional_memory_format), - non_blocking, - copy); + device = ensure_has_index(device); + return to_impl_npu( + self, + self.options().device(device).dtype(dtype).memory_format( + optional_memory_format), + non_blocking, + copy); } at::Tensor NPUNativeFunctions::to( @@ -148,17 +150,18 @@ at::Tensor NPUNativeFunctions::to( at::ScalarType dtype, bool non_blocking, bool copy, - c10::optional optional_memory_format) { - if (self.dtype() == dtype) { - return self; - } - if (at::ScalarType::Double == dtype) { - TORCH_NPU_WARN_ONCE( - "Warning: Device do not support double dtype now, " - "dtype cast repalce with float."); - } - dtype = (at::ScalarType::Double == dtype) ? at::ScalarType::Float : dtype; - return custom_ops::npu_dtype_cast(self, dtype); + c10::optional optional_memory_format) +{ + if (self.dtype() == dtype) { + return self; + } + if (at::ScalarType::Double == dtype) { + TORCH_NPU_WARN_ONCE( + "Warning: Device do not support double dtype now, " + "dtype cast repalce with float."); + } + dtype = (at::ScalarType::Double == dtype) ? at::ScalarType::Float : dtype; + return custom_ops::npu_dtype_cast(self, dtype); } at::Tensor NPUNativeFunctions::to( @@ -166,10 +169,12 @@ at::Tensor NPUNativeFunctions::to( const at::Tensor& other, bool non_blocking, bool copy, - c10::optional optional_memory_format) { - auto options = other.options(); - return to_impl_npu( - self, options.memory_format(optional_memory_format), non_blocking, copy); + c10::optional optional_memory_format) +{ + auto options = other.options(); + return to_impl_npu( + self, options.memory_format(optional_memory_format), non_blocking, copy); } + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp index 4863285950..2454731c80 100644 --- a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp +++ b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp @@ -2,13 +2,16 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include -namespace at_npu { namespace native { +namespace at_npu { +namespace native { -MemOverlap has_internal_overlap(const at::Tensor& tensor) { - return has_internal_overlap(tensor.unsafeGetTensorImpl()); +MemOverlap has_internal_overlap(const at::Tensor& tensor) +{ + return has_internal_overlap(tensor.unsafeGetTensorImpl()); } -MemOverlap has_internal_overlap(at::TensorImpl* t) { +MemOverlap has_internal_overlap(at::TensorImpl* t) +{ AT_ASSERT(t->layout() == at::kStrided, PTA_ERROR(ErrCode::PARAM)); if (t->is_contiguous()) { @@ -26,22 +29,26 @@ MemOverlap has_internal_overlap(at::TensorImpl* t) { return MemOverlap::TOO_HARD; } -void assert_no_internal_overlap(const at::Tensor& t) { - assert_no_internal_overlap(t.unsafeGetTensorImpl()); +void assert_no_internal_overlap(const at::Tensor& t) +{ + assert_no_internal_overlap(t.unsafeGetTensorImpl()); } -void assert_no_internal_overlap(at::TensorImpl* t) { - 
TORCH_CHECK(has_internal_overlap(t) != MemOverlap::YES, - "unsupported operation: more than one element of the written-to tensor " - "refers to a single memory location. Please clone() the tensor before " - "performing the operation.", PTA_ERROR(ErrCode::NOT_SUPPORT)); +void assert_no_internal_overlap(at::TensorImpl* t) +{ + TORCH_CHECK(has_internal_overlap(t) != MemOverlap::YES, + "unsupported operation: more than one element of the written-to tensor " + "refers to a single memory location. Please clone() the tensor before " + "performing the operation.", PTA_ERROR(ErrCode::NOT_SUPPORT)); } -MemOverlapStatus get_overlap_status(const at::Tensor& a, const at::Tensor& b) { - return get_overlap_status(a.unsafeGetTensorImpl(), b.unsafeGetTensorImpl()); +MemOverlapStatus get_overlap_status(const at::Tensor& a, const at::Tensor& b) +{ + return get_overlap_status(a.unsafeGetTensorImpl(), b.unsafeGetTensorImpl()); } -MemOverlapStatus get_overlap_status(const at::TensorImpl* a, const at::TensorImpl* b) { +MemOverlapStatus get_overlap_status(const at::TensorImpl* a, const at::TensorImpl* b) +{ if (a == b) { return MemOverlapStatus::FULL; } @@ -66,15 +73,18 @@ MemOverlapStatus get_overlap_status(const at::TensorImpl* a, const at::TensorImp return MemOverlapStatus::NO; } -void assert_no_partial_overlap(const at::Tensor& a, const at::Tensor& b) { - assert_no_partial_overlap(a.unsafeGetTensorImpl(), b.unsafeGetTensorImpl()); +void assert_no_partial_overlap(const at::Tensor& a, const at::Tensor& b) +{ + assert_no_partial_overlap(a.unsafeGetTensorImpl(), b.unsafeGetTensorImpl()); } -void assert_no_partial_overlap(at::TensorImpl* a, at::TensorImpl* b) { - TORCH_CHECK(get_overlap_status(a, b) != MemOverlapStatus::PARTIAL, - "unsupported operation: some elements of the input tensor and " - "the written-to tensor refer to a single memory location. " - "Please clone() the tensor before performing the operation.", PTA_ERROR(ErrCode::NOT_SUPPORT)); +void assert_no_partial_overlap(at::TensorImpl* a, at::TensorImpl* b) +{ + TORCH_CHECK(get_overlap_status(a, b) != MemOverlapStatus::PARTIAL, + "unsupported operation: some elements of the input tensor and " + "the written-to tensor refer to a single memory location. 
" + "Please clone() the tensor before performing the operation.", PTA_ERROR(ErrCode::NOT_SUPPORT)); } -}} \ No newline at end of file +} +} diff --git a/torch_npu/csrc/aten/mirror/NPUTensorIterator.cpp b/torch_npu/csrc/aten/mirror/NPUTensorIterator.cpp index ef9531e7e4..4e82f0be2a 100644 --- a/torch_npu/csrc/aten/mirror/NPUTensorIterator.cpp +++ b/torch_npu/csrc/aten/mirror/NPUTensorIterator.cpp @@ -2,233 +2,212 @@ #include "torch_npu/csrc/aten/mirror/NPUTensorIterator.h" #include "torch_npu/csrc/core/npu/NPUException.h" -namespace at_npu +namespace at_npu { +namespace native { + +std::tuple NPUTensorIterator::binary_op( + at::Tensor &out, + const at::Tensor &a, + const at::Tensor &b, + bool check_mem_overlap) { - namespace native - { + auto iter = NPUTensorIterator(); + iter.add_output(out); + iter.add_input(a); + iter.add_input(b); + iter.promote_common_dtype(); + iter.compute_types(); + auto common_type = iter.common_dtype(); + auto common_shape = a.sizes(); + return std::tie(common_type, common_shape); +} - std::tuple NPUTensorIterator::binary_op( - at::Tensor &out, - const at::Tensor &a, - const at::Tensor &b, - bool check_mem_overlap) - { - auto iter = NPUTensorIterator(); - iter.add_output(out); - iter.add_input(a); - iter.add_input(b); - iter.promote_common_dtype(); - iter.compute_types(); - auto common_type = iter.common_dtype(); - auto common_shape = a.sizes(); - return std::tie(common_type, common_shape); +std::tuple NPUTensorIterator::binary_op( + const at::Tensor &a, + const c10::Scalar b) +{ + at::ScalarType scalar_type; + if (b.isFloatingPoint()) { + scalar_type = at::ScalarType::Float; + } else if (b.isBoolean()) { + scalar_type = at::ScalarType::Bool; + } else if (b.isComplex()) { + scalar_type = at::ScalarType::ComplexFloat; + } else { + AT_ASSERT(b.isIntegral(false), OPS_ERROR(ErrCode::PARAM)); + scalar_type = at::ScalarType::Int; } - - std::tuple NPUTensorIterator::binary_op( - const at::Tensor &a, - const c10::Scalar b) - { - at::ScalarType scalar_type; - if (b.isFloatingPoint()) - { - scalar_type = at::ScalarType::Float; - } - else if (b.isBoolean()) - { - scalar_type = at::ScalarType::Bool; - } - else if (b.isComplex()) - { - scalar_type = at::ScalarType::ComplexFloat; - } - else - { - AT_ASSERT(b.isIntegral(false), OPS_ERROR(ErrCode::PARAM)); - scalar_type = at::ScalarType::Int; - } - if (a.scalar_type() == at::ScalarType::Half) - { - scalar_type = at::ScalarType::Half; - } - if (a.scalar_type() == at::ScalarType::BFloat16) - { - scalar_type = at::ScalarType::BFloat16; - } - if (a.scalar_type() != scalar_type) - { - scalar_type = result_type(a.scalar_type(), scalar_type); - } - auto common_shape = a.sizes(); - return std::tie(scalar_type, common_shape); + if (a.scalar_type() == at::ScalarType::Half) { + scalar_type = at::ScalarType::Half; } - - std::tuple NPUTensorIterator::comparison_op( - at::Tensor &out, - const at::Tensor &a, - const at::Tensor &b, - bool check_mem_overlap) - { - auto iter = NPUTensorIterator(); - iter.add_output(out); - iter.add_input(a); - iter.add_input(b); - iter.compute_common_dtype_only_for_inputs(); - iter.compute_types(); - auto common_type = iter.common_dtype(); - auto common_shape = a.sizes(); - return std::tie(common_type, common_shape); + if (a.scalar_type() == at::ScalarType::BFloat16) { + scalar_type = at::ScalarType::BFloat16; } - - std::tuple NPUTensorIterator::unary_op( - at::Tensor &out, - const at::Tensor &a, - bool check_mem_overlap) - { - auto iter = NPUTensorIterator(); - iter.add_output(out); - iter.add_input(a); - 
iter.num_outputs_ = 1; - iter.compute_types(); - auto common_type = iter.common_dtype(); - auto common_shape = a.sizes(); - return std::tie(common_type, common_shape); + if (a.scalar_type() != scalar_type) { + scalar_type = result_type(a.scalar_type(), scalar_type); } + auto common_shape = a.sizes(); + return std::tie(scalar_type, common_shape); +} - void NPUTensorIterator::nullary_op(at::Tensor &out) - { - auto iter = NPUTensorIterator(); - iter.add_output(out); - iter.compute_types(); - } +std::tuple NPUTensorIterator::comparison_op( + at::Tensor &out, + const at::Tensor &a, + const at::Tensor &b, + bool check_mem_overlap) +{ + auto iter = NPUTensorIterator(); + iter.add_output(out); + iter.add_input(a); + iter.add_input(b); + iter.compute_common_dtype_only_for_inputs(); + iter.compute_types(); + auto common_type = iter.common_dtype(); + auto common_shape = a.sizes(); + return std::tie(common_type, common_shape); +} - std::tuple NPUTensorIterator::reduce_op(at::Tensor &out, const at::Tensor &a) - { - TORCH_INTERNAL_ASSERT(out.defined(), OPS_ERROR(ErrCode::PARAM)); - auto iter = NPUTensorIterator(); - iter.add_output(out); - iter.add_input(a); - iter.promote_npu_output_dtypes_ = true; - iter.is_reduction_ = true; - // (Ascend): This is only really necessary for arg{min,max} - iter.compute_common_dtype_only_for_inputs(); - iter.compute_types(); - auto common_type = iter.common_dtype(); - auto common_shape = a.sizes(); - return std::tie(common_type, common_shape); - } +std::tuple NPUTensorIterator::unary_op( + at::Tensor &out, + const at::Tensor &a, + bool check_mem_overlap) +{ + auto iter = NPUTensorIterator(); + iter.add_output(out); + iter.add_input(a); + iter.num_outputs_ = 1; + iter.compute_types(); + auto common_type = iter.common_dtype(); + auto common_shape = a.sizes(); + return std::tie(common_type, common_shape); +} - std::tuple NPUTensorIterator::reduce_op( - at::Tensor &out1, - at::Tensor &out2, - const at::Tensor &a) - { - TORCH_INTERNAL_ASSERT(out1.defined(), OPS_ERROR(ErrCode::PARAM)); - TORCH_INTERNAL_ASSERT(out2.defined(), OPS_ERROR(ErrCode::PARAM)); - TORCH_CHECK(out1.dim() == out2.dim(), "reduce_op(): expected both outputs to have same number of dims, but output1 has ", out1.dim(), - " and output2 has ", out2.dim(), OPS_ERROR(ErrCode::PARAM)); - TORCH_CHECK(out1.sizes() == out2.sizes(), "reduce_op(): expected both outputs to have same sizes, but output1 has ", out1.sizes(), - " and output2 has ", out2.sizes(), OPS_ERROR(ErrCode::PARAM)); - TORCH_CHECK(out1.strides() == out2.strides(), "reduce_op(): expected both outputs to have same strides, but output1 has ", out1.strides(), - " and output2 has ", out2.strides(), OPS_ERROR(ErrCode::PARAM)); - auto iter = NPUTensorIterator(); - iter.add_output(out1); - iter.add_output(out2); - iter.add_input(a); - iter.promote_npu_output_dtypes_ = true; - iter.is_reduction_ = true; - iter.compute_types(); - auto common_type = iter.common_dtype(); - auto common_shape = a.sizes(); - return std::tie(common_type, common_shape); - } +void NPUTensorIterator::nullary_op(at::Tensor &out) +{ + auto iter = NPUTensorIterator(); + iter.add_output(out); + iter.compute_types(); +} + +std::tuple NPUTensorIterator::reduce_op(at::Tensor &out, const at::Tensor &a) +{ + TORCH_INTERNAL_ASSERT(out.defined(), OPS_ERROR(ErrCode::PARAM)); + auto iter = NPUTensorIterator(); + iter.add_output(out); + iter.add_input(a); + iter.promote_npu_output_dtypes_ = true; + iter.is_reduction_ = true; + // (Ascend): This is only really necessary for arg{min,max} + 
iter.compute_common_dtype_only_for_inputs(); + iter.compute_types(); + auto common_type = iter.common_dtype(); + auto common_shape = a.sizes(); + return std::tie(common_type, common_shape); +} + +std::tuple NPUTensorIterator::reduce_op( + at::Tensor &out1, + at::Tensor &out2, + const at::Tensor &a) +{ + TORCH_INTERNAL_ASSERT(out1.defined(), OPS_ERROR(ErrCode::PARAM)); + TORCH_INTERNAL_ASSERT(out2.defined(), OPS_ERROR(ErrCode::PARAM)); + TORCH_CHECK(out1.dim() == out2.dim(), + "reduce_op(): expected both outputs to have same number of dims, but output1 has ", out1.dim(), + " and output2 has ", out2.dim(), OPS_ERROR(ErrCode::PARAM)); + TORCH_CHECK(out1.sizes() == out2.sizes(), + "reduce_op(): expected both outputs to have same sizes, but output1 has ", out1.sizes(), + " and output2 has ", out2.sizes(), OPS_ERROR(ErrCode::PARAM)); + TORCH_CHECK(out1.strides() == out2.strides(), + "reduce_op(): expected both outputs to have same strides, but output1 has ", out1.strides(), + " and output2 has ", out2.strides(), OPS_ERROR(ErrCode::PARAM)); + auto iter = NPUTensorIterator(); + iter.add_output(out1); + iter.add_output(out2); + iter.add_input(a); + iter.promote_npu_output_dtypes_ = true; + iter.is_reduction_ = true; + iter.compute_types(); + auto common_type = iter.common_dtype(); + auto common_shape = a.sizes(); + return std::tie(common_type, common_shape); +} - static std::tuple compute_common_type_(at::ArrayRef operands) - { - // See [Result type computation] in NPUTensorIterator.h - auto common_type = at::ScalarType::Undefined; - bool all_same_type = true; - for (const auto &op : operands) - { - if (!op.tensor.defined()) - continue; - // don't handle scalars - if (op.tensor.dim() > 0) - { - at::ScalarType current = op.current_dtype; - if (current == at::ScalarType::Undefined) - { - all_same_type = false; - break; - } - if (common_type == at::ScalarType::Undefined) - { - common_type = current; - } - if (common_type != current) - { - all_same_type = false; - break; - } +static std::tuple compute_common_type_(at::ArrayRef operands) +{ + // See [Result type computation] in NPUTensorIterator.h + auto common_type = at::ScalarType::Undefined; + bool all_same_type = true; + for (const auto &op : operands) { + if (!op.tensor.defined()) + continue; + // don't handle scalars + if (op.tensor.dim() > 0) { + at::ScalarType current = op.current_dtype; + if (current == at::ScalarType::Undefined) { + all_same_type = false; + break; + } + if (common_type == at::ScalarType::Undefined) { + common_type = current; } - else - { + if (common_type != current) { all_same_type = false; break; } + } else { + all_same_type = false; + break; } - if (all_same_type) - { - return std::make_tuple(common_type, true); - } - - ResultTypeState state = {}; - for (const auto &op : operands) - { - state = update_result_type_state(op.tensor, state); - } - auto dtype = result_type(state); - - auto result = std::make_tuple(dtype, false); - TORCH_INTERNAL_ASSERT(dtype != at::ScalarType::Undefined, OPS_ERROR(ErrCode::TYPE)); - return result; + } + if (all_same_type) { + return std::make_tuple(common_type, true); } - std::tuple NPUTensorIterator::compute_common_type() - { - return compute_common_type_(operands_); + ResultTypeState state = {}; + for (const auto &op : operands) { + state = update_result_type_state(op.tensor, state); } + auto dtype = result_type(state); + + auto result = std::make_tuple(dtype, false); + TORCH_INTERNAL_ASSERT(dtype != at::ScalarType::Undefined, OPS_ERROR(ErrCode::TYPE)); + return result; +} + +std::tuple 
NPUTensorIterator::compute_common_type() +{ + return compute_common_type_(operands_); +} - void NPUTensorIterator::compute_types() - { - bool missing_dtypes = false; - bool missing_output_dtypes = false; - common_dtype_ = dtype(); - for (auto &op : operands_) - { - if (!op.tensor.defined() && !op.is_type_defined()) - { - missing_dtypes = true; - if (op.is_output) - { - missing_output_dtypes = true; - } +void NPUTensorIterator::compute_types() +{ + bool missing_dtypes = false; + bool missing_output_dtypes = false; + common_dtype_ = dtype(); + for (auto &op : operands_) { + if (!op.tensor.defined() && !op.is_type_defined()) { + missing_dtypes = true; + if (op.is_output) { + missing_output_dtypes = true; + } } - } + } - if (common_dtype_strategy_ == CommonDTypeStrategy::PROMOTE_INPUTS) - { - TORCH_CHECK(!missing_output_dtypes, "unable to compute and promote common dtype based only on inputs if there are missing dtypes for outputs", - OPS_ERROR(ErrCode::TYPE)); - } - bool compute_common_dtype = (common_dtype_strategy_ != CommonDTypeStrategy::NONE); - bool compute_common_dtype_only_for_inputs = (common_dtype_strategy_ == CommonDTypeStrategy::PROMOTE_INPUTS); - if (missing_dtypes || compute_common_dtype) - { - auto operands = compute_common_dtype_only_for_inputs ? at::ArrayRef(operands_).slice(noutputs()) : operands_; + if (common_dtype_strategy_ == CommonDTypeStrategy::PROMOTE_INPUTS) { + TORCH_CHECK(!missing_output_dtypes, + "unable to compute and promote common dtype based only on inputs if there are missing dtypes for outputs", + OPS_ERROR(ErrCode::TYPE)); + } + bool compute_common_dtype = (common_dtype_strategy_ != CommonDTypeStrategy::NONE); + bool compute_common_dtype_only_for_inputs = (common_dtype_strategy_ == CommonDTypeStrategy::PROMOTE_INPUTS); + if (missing_dtypes || compute_common_dtype) { + auto operands = compute_common_dtype_only_for_inputs ? 
+ at::ArrayRef(operands_).slice(noutputs()) : operands_; auto common_type = compute_common_type_(operands); common_dtype_ = std::get<0>(common_type); - } } +} - } // namespace native +} // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/aten/mirror/NPUTypeProperties.cpp b/torch_npu/csrc/aten/mirror/NPUTypeProperties.cpp index 9ede25e5e0..5e376a5947 100644 --- a/torch_npu/csrc/aten/mirror/NPUTypeProperties.cpp +++ b/torch_npu/csrc/aten/mirror/NPUTypeProperties.cpp @@ -1,76 +1,63 @@ #include "torch_npu/csrc/aten/mirror/NPUTypeProperties.h" -namespace at_npu -{ - namespace native - { +namespace at_npu { +namespace native { - static inline at::ScalarType promote_skip_undefined(at::ScalarType a, at::ScalarType b) - { - if (a == at::ScalarType::Undefined) - { +static inline at::ScalarType promote_skip_undefined(at::ScalarType a, at::ScalarType b) +{ + if (a == at::ScalarType::Undefined) { return b; - } - if (b == at::ScalarType::Undefined) - { + } + if (b == at::ScalarType::Undefined) { return a; - } - return promoteTypes(a, b); } + return promoteTypes(a, b); +} - static inline at::ScalarType combine_categories(at::ScalarType higher, at::ScalarType lower) - { - if (isFloatingType(higher)) - { +static inline at::ScalarType combine_categories(at::ScalarType higher, at::ScalarType lower) +{ + if (isFloatingType(higher)) { return higher; - } - if (higher == at::ScalarType::Bool || isFloatingType(lower)) - { + } + if (higher == at::ScalarType::Bool || isFloatingType(lower)) { return promote_skip_undefined(higher, lower); - } - if (higher != at::ScalarType::Undefined) - { + } + if (higher != at::ScalarType::Undefined) { return higher; - } - return lower; } + return lower; +} - ResultTypeState update_result_type_state(const at::Tensor &tensor, const ResultTypeState &in_state) - { - if (!tensor.defined()) - { +ResultTypeState update_result_type_state(const at::Tensor &tensor, const ResultTypeState &in_state) +{ + if (!tensor.defined()) { return in_state; - } - ResultTypeState new_state = in_state; - at::ScalarType current = tensor.scalar_type(); - if (tensor.unsafeGetTensorImpl()->is_wrapped_number() && isFloatingType(current)) - { + } + ResultTypeState new_state = in_state; + at::ScalarType current = tensor.scalar_type(); + if (tensor.unsafeGetTensorImpl()->is_wrapped_number() && isFloatingType(current)) { current = c10::typeMetaToScalarType(at::get_default_dtype()); - } - if (tensor.dim() > 0) - { + } + if (tensor.dim() > 0) { new_state.dimResult = promote_skip_undefined(in_state.dimResult, current); - } - else if (tensor.unsafeGetTensorImpl()->is_wrapped_number()) - { + } else if (tensor.unsafeGetTensorImpl()->is_wrapped_number()) { new_state.wrappedResult = promote_skip_undefined(in_state.wrappedResult, current); - } - else - { + } else { new_state.zeroResult = promote_skip_undefined(in_state.zeroResult, current); - } - - return new_state; } - at::ScalarType result_type(const ResultTypeState &in_state) - { - return combine_categories(in_state.dimResult, combine_categories(in_state.zeroResult, in_state.wrappedResult)); - } + return new_state; +} - at::ScalarType result_type(at::ScalarType a, at::ScalarType b) - { - return promote_skip_undefined(a, b); - } - } -} \ No newline at end of file +at::ScalarType result_type(const ResultTypeState &in_state) +{ + return combine_categories(in_state.dimResult, combine_categories(in_state.zeroResult, in_state.wrappedResult)); +} + +at::ScalarType result_type(at::ScalarType a, at::ScalarType b) +{ + return promote_skip_undefined(a, b); +} + +} 
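// Worked example of the promotion rules above (assuming the default dtype is
// Float): a Half tensor with dim > 0 sets dimResult = Half, while a wrapped
// Python float scalar sets wrappedResult = Float; combine_categories() then
// returns the dimensioned Half result, so the scalar does not promote the
// output to Float.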
+} diff --git a/torch_npu/csrc/aten/mirror/NPUTypeProperties.h b/torch_npu/csrc/aten/mirror/NPUTypeProperties.h index 9f53a56f51..cb2452ba6a 100644 --- a/torch_npu/csrc/aten/mirror/NPUTypeProperties.h +++ b/torch_npu/csrc/aten/mirror/NPUTypeProperties.h @@ -4,18 +4,20 @@ #include -namespace at_npu { namespace native { +namespace at_npu { +namespace native { struct ResultTypeState { - at::ScalarType dimResult = at::ScalarType::Undefined; - at::ScalarType wrappedResult = at::ScalarType::Undefined; - at::ScalarType zeroResult = at::ScalarType::Undefined; + at::ScalarType dimResult = at::ScalarType::Undefined; + at::ScalarType wrappedResult = at::ScalarType::Undefined; + at::ScalarType zeroResult = at::ScalarType::Undefined; }; ResultTypeState update_result_type_state(const at::Tensor& tensor, const ResultTypeState& in_state); at::ScalarType result_type(const ResultTypeState& state); at::ScalarType result_type(at::ScalarType a, at::ScalarType b); -}} +} +} -#endif // __NATIVE_NPU_UTILS_NPU_TYPE_PROPERIES__ \ No newline at end of file +#endif // __NATIVE_NPU_UTILS_NPU_TYPE_PROPERIES__ diff --git a/torch_npu/csrc/aten/ops/CopyFromAndResizeKernelNpu.cpp b/torch_npu/csrc/aten/ops/CopyFromAndResizeKernelNpu.cpp index 8c7d87d3a1..6f1abc8911 100644 --- a/torch_npu/csrc/aten/ops/CopyFromAndResizeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/CopyFromAndResizeKernelNpu.cpp @@ -4,14 +4,15 @@ namespace at_npu { namespace native { -at::Tensor NPUNativeFunctions::_copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) { - TORCH_CHECK(self.sizes() == dst.sizes(), "_copy_from_and_resize now only support copy with same size!", - OPS_ERROR(ErrCode::NOT_SUPPORT)); - TORCH_CHECK(self.is_cpu() && dst.device().is_privateuseone(), - "_copy_from_and_resize now only support copy from cpu tensor to npu tensor, but got src tensor device is ", - self.device(), " and dst device is ", dst.device(), OPS_ERROR(ErrCode::NOT_SUPPORT)); - dst.copy_(self); - return dst; +at::Tensor NPUNativeFunctions::_copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) +{ + TORCH_CHECK(self.sizes() == dst.sizes(), + "_copy_from_and_resize now only support copy with same size!", OPS_ERROR(ErrCode::NOT_SUPPORT)); + TORCH_CHECK(self.is_cpu() && dst.device().is_privateuseone(), + "_copy_from_and_resize now only support copy from cpu tensor to npu tensor, but got src tensor device is ", + self.device(), " and dst device is ", dst.device(), OPS_ERROR(ErrCode::NOT_SUPPORT)); + dst.copy_(self); + return dst; } } diff --git a/torch_npu/csrc/aten/ops/FlattenDenseTensorsKernelNpu.cpp b/torch_npu/csrc/aten/ops/FlattenDenseTensorsKernelNpu.cpp index 7f9108ddd7..e2be317874 100644 --- a/torch_npu/csrc/aten/ops/FlattenDenseTensorsKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/FlattenDenseTensorsKernelNpu.cpp @@ -4,18 +4,18 @@ namespace at_npu { namespace native { -at::Tensor NPUNativeFunctions::flatten_dense_tensors(at::TensorList tensors) { - static auto cast_back_to_ori_format = [](const at::Tensor& t) { - return custom_ops::npu_format_cast(t, - torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); - }; - static auto flatten = [](const at::Tensor& t) { - return cast_back_to_ori_format(t).contiguous().view({-1}); - }; - if (tensors.size() == 1) { - return flatten(tensors[0]); - } - return at::cat(c10::fmap(tensors, flatten)); +at::Tensor NPUNativeFunctions::flatten_dense_tensors(at::TensorList tensors) +{ + static auto cast_back_to_ori_format = [](const at::Tensor& t) { + return custom_ops::npu_format_cast(t, 
torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); + }; + static auto flatten = [](const at::Tensor& t) { + return cast_back_to_ori_format(t).contiguous().view({-1}); + }; + if (tensors.size() == 1) { + return flatten(tensors[0]); + } + return at::cat(c10::fmap(tensors, flatten)); } } diff --git a/torch_npu/csrc/aten/ops/FullKernelNpu.cpp b/torch_npu/csrc/aten/ops/FullKernelNpu.cpp index 9ed2a7010f..43682288ba 100644 --- a/torch_npu/csrc/aten/ops/FullKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/FullKernelNpu.cpp @@ -7,14 +7,15 @@ namespace at_npu { namespace native { -at::Tensor& NPUNativeFunctions::full_out(at::IntArrayRef size, const at::Scalar& fill_value, at::Tensor& out) { - OpPreparation::CheckOut( - {}, - out, - out, - size); - out.fill_(fill_value); - return out; +at::Tensor& NPUNativeFunctions::full_out(at::IntArrayRef size, const at::Scalar& fill_value, at::Tensor& out) +{ + OpPreparation::CheckOut( + {}, + out, + out, + size); + out.fill_(fill_value); + return out; } at::Tensor NPUNativeFunctions::full( @@ -24,7 +25,8 @@ at::Tensor NPUNativeFunctions::full( c10::optional dtype_opt, c10::optional layout_opt, c10::optional device_opt, - c10::optional pin_memory_opt) { + c10::optional pin_memory_opt) +{ c10::TensorOptions option = c10::TensorOptions().dtype(dtype_opt) .device(device_opt) .layout(layout_opt) @@ -47,4 +49,4 @@ at::Tensor NPUNativeFunctions::full( } } -} \ No newline at end of file +} diff --git a/torch_npu/csrc/aten/ops/HasCompatibleShallowCopyType.cpp b/torch_npu/csrc/aten/ops/HasCompatibleShallowCopyType.cpp index e2aace0cc7..59c1b3aedb 100644 --- a/torch_npu/csrc/aten/ops/HasCompatibleShallowCopyType.cpp +++ b/torch_npu/csrc/aten/ops/HasCompatibleShallowCopyType.cpp @@ -7,20 +7,21 @@ namespace native { // True if `self` and `from` have compatible tensor type so that `from`'s // TensorImpl can be copied to `self`. 
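// For example, under the key-set check below a CPU tensor and an NPU
// (DispatchKey::PrivateUse1) tensor are both counted as dense, so a shallow
// copy between them is reported as compatible even though their key sets differ.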
-bool _has_compatible_shallow_copy_type(const at::Tensor &self, - const at::Tensor &from) { - c10::DispatchKeySet self_key = self.key_set(); - c10::DispatchKeySet from_key = from.key_set(); - auto is_dense = [](c10::DispatchKeySet ts) { - return ts.has(c10::DispatchKey::CPU) || - ts.has(c10::DispatchKey::PrivateUse1); - }; - return (self_key == from_key) || (is_dense(self_key) && is_dense(from_key)); +bool _has_compatible_shallow_copy_type( + const at::Tensor &self, + const at::Tensor &from) +{ + c10::DispatchKeySet self_key = self.key_set(); + c10::DispatchKeySet from_key = from.key_set(); + auto is_dense = [](c10::DispatchKeySet ts) { + return ts.has(c10::DispatchKey::CPU) || ts.has(c10::DispatchKey::PrivateUse1); + }; + return (self_key == from_key) || (is_dense(self_key) && is_dense(from_key)); } TORCH_LIBRARY_IMPL(aten, CatchAll, m) { - m.impl("_has_compatible_shallow_copy_type", - TORCH_FN(_has_compatible_shallow_copy_type)); + m.impl("_has_compatible_shallow_copy_type", TORCH_FN(_has_compatible_shallow_copy_type)); } + } // namespace native -} // namespace at_npu \ No newline at end of file +} // namespace at_npu diff --git a/torch_npu/csrc/aten/ops/StreamAndEventKernelNpu.cpp b/torch_npu/csrc/aten/ops/StreamAndEventKernelNpu.cpp index eea566e55c..3309f47c3c 100644 --- a/torch_npu/csrc/aten/ops/StreamAndEventKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/StreamAndEventKernelNpu.cpp @@ -4,12 +4,13 @@ namespace at_npu { namespace native { -void NPUNativeFunctions::record_stream(at::Tensor& self, c10::Stream stream) { - struct c10::StreamData3 data = stream.pack3(); - c10_npu::NPUCachingAllocator::recordStream( - self.storage().data_ptr(), - c10_npu::NPUStream::unpack3( - data.stream_id, data.device_index, data.device_type)); +void NPUNativeFunctions::record_stream(at::Tensor& self, c10::Stream stream) +{ + struct c10::StreamData3 data = stream.pack3(); + c10_npu::NPUCachingAllocator::recordStream( + self.storage().data_ptr(), + c10_npu::NPUStream::unpack3( + data.stream_id, data.device_index, data.device_type)); } } // namespace native -- Gitee From 6328cb8297747cdeb99768de7ffc854c9a4a60ee Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 13 Mar 2025 09:15:33 +0000 Subject: [PATCH 149/358] !18894 Update op_plugin commit id Merge pull request !18894 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 077f2113e5..c0f8bf1798 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 077f2113e5056f24996efcd6cd1cdfd3db547f05 +Subproject commit c0f8bf1798410c72e72953d3e5cfa453c5f453ed -- Gitee From c3a0ed542a3667df9d2e56dde4abc87d6f8df739 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 13 Mar 2025 09:15:33 +0000 Subject: [PATCH 150/358] !18894 Update op_plugin commit id Merge pull request !18894 from pta-robot/v2.6.0 -- Gitee From 57ede2e3bb67c87c479ab02fca41fed1370c337d Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 13 Mar 2025 11:00:36 +0000 Subject: [PATCH 151/358] !18903 Update op_plugin commit id Merge pull request !18903 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c0f8bf1798..657e14922e 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c0f8bf1798410c72e72953d3e5cfa453c5f453ed +Subproject commit 657e14922e40980da9ab1c9f6df589a553cd293f -- Gitee From 
e870bd4660e70af1503b2a9fc1f7c2619772e03f Mon Sep 17 00:00:00 2001 From: Yuanhao Ji Date: Thu, 13 Mar 2025 11:03:37 +0000 Subject: [PATCH 152/358] !18278 [v2.6.0][Docs] Fix minial version desp for autoloading Merge pull request !18278 from Yuanhao Ji/fix/docs/260 --- README.md | 2 +- README.zh.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4a6f737bea..7fab4df860 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ You can quickly experience **Ascend NPU** by the following simple examples. ```diff import torch -- import torch_npu # No longer needed in torch_npu 2.6.0 and later versions +- import torch_npu # No longer needed in torch_npu 2.5.1 and later versions x = torch.randn(2, 2).npu() y = torch.randn(2, 2).npu() diff --git a/README.zh.md b/README.zh.md index 29ec7d6d46..04689e4b71 100644 --- a/README.zh.md +++ b/README.zh.md @@ -131,7 +131,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh ```diff import torch -- import torch_npu # torch_npu2.6.0及以后版本可以不用手动导包 +- import torch_npu # torch_npu2.5.1及以后版本可以不用手动导包 x = torch.randn(2, 2).npu() y = torch.randn(2, 2).npu() -- Gitee From de9bfe0ef586e7f2687e3d59edd639fa55048703 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 13 Mar 2025 14:15:33 +0000 Subject: [PATCH 153/358] !18921 Update op_plugin commit id Merge pull request !18921 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 657e14922e..89acff2ca3 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 657e14922e40980da9ab1c9f6df589a553cd293f +Subproject commit 89acff2ca318b2dee4814e6aa4be603448f85262 -- Gitee From 3f55dfcdc21e44ce94f978fb80f2484cb057ceb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E6=B5=B7=E6=9D=B0?= Date: Fri, 14 Mar 2025 06:06:10 +0000 Subject: [PATCH 154/358] =?UTF-8?q?!18951=20add=20npu=5Fkronecker=5Fquant?= =?UTF-8?q?=20in=20PublicAPI=20Merge=20pull=20request=20!18951=20from=20?= =?UTF-8?q?=E9=99=88=E6=B5=B7=E6=9D=B0/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/allowlist_for_publicAPI.json | 1 + 1 file changed, 1 insertion(+) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 5d0c43090b..5e71e5330a 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2851,6 +2851,7 @@ "npu_quant_scatter_", "scatter_update", "scatter_update_", + "npu_kronecker_quant", "npu_moe_compute_expert_tokens", "npu_moe_gating_top_k_softmax", "npu_moe_init_routing", -- Gitee From 7ae98c2252a19a7694d23e33b6da1dc8506f0715 Mon Sep 17 00:00:00 2001 From: liuyun Date: Fri, 14 Mar 2025 06:48:10 +0000 Subject: [PATCH 155/358] !18937 add gemma_rms_norm Merge pull request !18937 from liuyun/v2.6.0 --- test/allowlist_for_publicAPI.json | 1 + 1 file changed, 1 insertion(+) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 5e71e5330a..21a43d8c4f 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2838,6 +2838,7 @@ "npu_dynamic_quant_asymmetric", "npu_yolo_boxes_encode", "npu_yolo_boxes_encode", + "npu_gemma_rms_norm", "npu_weight_quant_batchmatmul", "npu_transpose", "npu_trans_quant_param", -- Gitee From 5d90c808a1671fea87ed9f0b34820f8367a6e52b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B2=88=E7=8F=88=E9=9D=93?= Date: Fri, 14 Mar 2025 07:14:08 +0000 Subject: [PATCH 
156/358] =?UTF-8?q?!18930=20add=20npu=5Fbatch=5Fgather=5Fm?= =?UTF-8?q?atmul=20in=20allowlist=20Merge=20pull=20request=20!18930=20from?= =?UTF-8?q?=20=E6=B2=88=E7=8F=88=E9=9D=93/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/allowlist_for_publicAPI.json | 1 + 1 file changed, 1 insertion(+) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 21a43d8c4f..3385fe5e0c 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2814,6 +2814,7 @@ "npu_alloc_float_status", "npu_apply_adam", "npu_advance_step_flashattn", + "npu_batch_gather_matmul", "npu_bert_apply_adam", "npu_clear_float_status", "npu_cross_entropy_loss", -- Gitee From bdcb91bfe93dc3d6de3b5489923a6bb9b2c77923 Mon Sep 17 00:00:00 2001 From: liushiyu0214 Date: Fri, 14 Mar 2025 07:33:29 +0000 Subject: [PATCH 157/358] !18963 add_npu_dequant_bias Merge pull request !18963 from liushiyu0214/v2.6.0 --- test/allowlist_for_publicAPI.json | 1 + 1 file changed, 1 insertion(+) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 3385fe5e0c..7d498ce029 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2834,6 +2834,7 @@ "npu_quant_scatter", "npu_scatter_nd_update_", "npu_swiglu", + "npu_dequant_bias", "npu_group_quant", "npu_dynamic_quant", "npu_dynamic_quant_asymmetric", -- Gitee From 38f58e0360e7633a2f45c3dcc8ee4ca1b0b6853b Mon Sep 17 00:00:00 2001 From: wgb Date: Fri, 14 Mar 2025 07:41:49 +0000 Subject: [PATCH 158/358] !18900 device check support for kwargs parameters Merge pull request !18900 from wgb/2.6_copy --- codegen/custom_functions.py | 1 + codegen/utils.py | 3 ++- test/custom_ops/test_npu_anti_quant.py | 24 ++++++++++++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/codegen/custom_functions.py b/codegen/custom_functions.py index 20e84f7d63..69411e46fe 100644 --- a/codegen/custom_functions.py +++ b/codegen/custom_functions.py @@ -146,6 +146,7 @@ def compute_op_definition(f: NativeFunction): candidate_args = itertools.chain( f.func.arguments.out, f.func.arguments.flat_positional, + f.func.arguments.flat_kwarg_only, ) device_check = RegisterDispatchKey.gen_device_check( f.device_check, list(candidate_args), name diff --git a/codegen/utils.py b/codegen/utils.py index 39f013437c..a6719a4752 100644 --- a/codegen/utils.py +++ b/codegen/utils.py @@ -57,6 +57,7 @@ GLOBAL_OPAPI_INFO_CACHE = set() CUSTOM_YAML_NAME = "npu_native_functions_by_codegen.yaml" FIELDS_TO_USE = ["func", "tags", "dispatch", "device_check"] DEVICE_NOCHECK_SET = set() +DEVICE_CHECK_NOTSUPPORT_TYPE = {"Tensor[]?"} class PathManager: @@ -542,7 +543,7 @@ def gen_device_check( device_check += "(void)common_device; // Suppress unused variable warning\n" for arg in args: # Only tensor like arguments are eligible - if arg.type.is_tensor_like(): + if arg.type.is_tensor_like() and str(arg.type) not in DEVICE_CHECK_NOTSUPPORT_TYPE: device_check += \ f"""c10::impl::check_and_update_common_device(common_device, {arg.name}, "{method_name}", "{arg.name}");\n""" return device_check diff --git a/test/custom_ops/test_npu_anti_quant.py b/test/custom_ops/test_npu_anti_quant.py index fd9067c4dd..c702ec9887 100644 --- a/test/custom_ops/test_npu_anti_quant.py +++ b/test/custom_ops/test_npu_anti_quant.py @@ -4,8 +4,8 @@ import torch import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -from torch_npu.testing.common_utils import 
create_common_tensor -DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] +from torch_npu.testing.common_utils import create_common_tensor, SupportedDevices +from torch_npu.testing.common_distributed import skipIfUnsupportMultiNPU class TestAntiQuant(TestCase): @@ -27,8 +27,7 @@ class TestAntiQuant(TestCase): output = torch_npu.npu_anti_quant(input_x, scale, offset=offset, dst_dtype=dst_dtype, src_dtype=src_dtype) return output.cpu().detach() - @unittest.skipIf(DEVICE_NAME != 'Ascend910B', - "OP `AscendAntiQuant` is only supported on 910B, skip this ut for this device type!") + @SupportedDevices(['Ascend910B']) def test_anti_quant(self, device="npu"): shape_format = [ [[np.int8, -1, [10, 100]], [np.float32, -1, [100]], [np.float32, -1, [100]], torch.float16, None], @@ -54,5 +53,22 @@ class TestAntiQuant(TestCase): custom_output = custom_output.to(torch.float32) self.assertRtolEqual(npu_output, custom_output) + @skipIfUnsupportMultiNPU(2) + @SupportedDevices(['Ascend910B']) + def test_anti_quant_device_check(self, device="npu"): + shape_format = [ + [[np.int8, -1, [10, 100]], [np.float32, -1, [100]], [np.float32, -1, [100]], torch.float16, None], + ] + + for item in shape_format: + _, npu_input_x = create_common_tensor(item[0], -127, 127) + _, npu_scale = create_common_tensor(item[1], -100, 100) + _, npu_offset = (None, None) if item[2] is None else create_common_tensor(item[2], -100, 100) + + msg = "Expected all tensors to be on the same device, but found at least two devices, npu:" + with self.assertRaisesRegex(RuntimeError, msg): + self.npu_op_exec(npu_input_x, npu_scale, npu_offset.to("npu:1"), *item[3:]) + + if __name__ == "__main__": run_tests() -- Gitee From ed920b1da05485faea7901dfa74851cabcf1ce85 Mon Sep 17 00:00:00 2001 From: wangjie Date: Fri, 14 Mar 2025 07:56:33 +0000 Subject: [PATCH 159/358] !18957 [PORF] Profiler bugfix for CannPackageManager Merge pull request !18957 from wangjie/profiler_fix_cann_manager_260 --- torch_npu/profiler/analysis/_profiling_parser.py | 4 ++-- .../analysis/prof_common_func/_cann_package_manager.py | 8 +++++++- torch_npu/profiler/profiler_interface.py | 6 ------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/torch_npu/profiler/analysis/_profiling_parser.py b/torch_npu/profiler/analysis/_profiling_parser.py index 5848840ecd..451b37fc82 100644 --- a/torch_npu/profiler/analysis/_profiling_parser.py +++ b/torch_npu/profiler/analysis/_profiling_parser.py @@ -66,7 +66,7 @@ class ProfilingParser: return if not ProfilerPathManager.get_cann_path(self._profiler_path): return - if not CannPackageManager.SUPPORT_EXPORT_DB: + if not CannPackageManager.is_support_export_db(): raise RuntimeError("Current CANN package version does not support export db. 
" "If you want to export db, you can install supported CANN package version.") @@ -103,7 +103,7 @@ class ProfilingParser: parser_config = ParserConfig.ONLY_FWK_CONFIG if ProfilerPathManager.get_cann_path(self._profiler_path): CANNFileParser(self._profiler_path).del_summary_and_timeline_data() - CANNFileParser(self._profiler_path).del_output_path_data() + CANNFileParser(self._profiler_path).del_output_path_data() if ProfilerConfig().get_level() == "Level_none": parser_config = ParserConfig.LEVEL_NONE_CONFIG else: diff --git a/torch_npu/profiler/analysis/prof_common_func/_cann_package_manager.py b/torch_npu/profiler/analysis/prof_common_func/_cann_package_manager.py index 9e1f72771f..2694dfae35 100644 --- a/torch_npu/profiler/analysis/prof_common_func/_cann_package_manager.py +++ b/torch_npu/profiler/analysis/prof_common_func/_cann_package_manager.py @@ -19,4 +19,10 @@ def check_cann_package_support_export_db() -> bool: class CannPackageManager: - SUPPORT_EXPORT_DB = check_cann_package_support_export_db() + SUPPORT_EXPORT_DB = None + + @classmethod + def is_support_export_db(cls) -> bool: + if cls.SUPPORT_EXPORT_DB is None: + cls.SUPPORT_EXPORT_DB = check_cann_package_support_export_db() + return cls.SUPPORT_EXPORT_DB diff --git a/torch_npu/profiler/profiler_interface.py b/torch_npu/profiler/profiler_interface.py index 9acafb6072..4c6e676787 100644 --- a/torch_npu/profiler/profiler_interface.py +++ b/torch_npu/profiler/profiler_interface.py @@ -35,7 +35,6 @@ from .analysis.prof_common_func._utils import collect_env_vars, no_exception_fun from .analysis.prof_common_func._path_manager import ProfilerPathManager from .analysis.prof_common_func._log import ProfilerLogger from ..utils._path_manager import PathManager -from .analysis.prof_common_func._cann_package_manager import CannPackageManager __all__ = ['supported_activities'] @@ -190,11 +189,6 @@ class _ProfInterface: if ProfilerActivity.NPU not in self.activities and self.experimental_config is not None: print_warn_msg("Experimental config will not be uesd while ProfilerActivity.NPU is not set.") - if ProfilerActivity.NPU in self.activities and Constant.Db in self.experimental_config.export_type: - if not CannPackageManager.SUPPORT_EXPORT_DB: - raise RuntimeError("Current cann package does not support export db. 
" - "If you want to export db, you can install supported CANN package version.") - if ProfilerActivity.CPU not in self.activities and self.experimental_config.with_gc: print_warn_msg("GC detect will not take effect while ProfilerActivity.CPU is not set.") -- Gitee From ce4852668a8f29fe0f55835b59ff00f60cb0b513 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 14 Mar 2025 09:00:40 +0000 Subject: [PATCH 160/358] !18973 Update op_plugin commit id Merge pull request !18973 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 89acff2ca3..c8c845492d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 89acff2ca318b2dee4814e6aa4be603448f85262 +Subproject commit c8c845492d0d4d7b3957748a632167a2f467913a -- Gitee From d0eff547677950dbfb86325e23f375aba7fb63aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Fri, 14 Mar 2025 09:09:29 +0000 Subject: [PATCH 161/358] =?UTF-8?q?!18835=20Fix=20jit=20compile=20set=20Me?= =?UTF-8?q?rge=20pull=20request=20!18835=20from=20=E5=A7=9C=E6=80=A1?= =?UTF-8?q?=E6=96=87/v2.6.0=5Flz?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/framework/interface/EnvVariables.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index bcf973d26d..347601aea8 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -45,10 +45,20 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) +auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); + REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { - NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); + if (acl_op_init_mode == 0) { + NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); + } else if (GET_OPTION_WITH_CACHE(isJitDisable) != ("disable" == val)) { + TORCH_CHECK(acl_op_init_mode != 2, + "Jit compile set is disabled! If you want to set, ", + "please change the environment variable ACL_OP_INIT_MODE to 0 or 1.", + PTA_ERROR(ErrCode::NOT_SUPPORT)); + NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); + } SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? 
true : false); }) -- Gitee From 873be6a4c043f824b6910c706f2d52a5e33869b4 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Fri, 14 Mar 2025 09:29:13 +0000 Subject: [PATCH 162/358] !18971 [3/N] cleancode (torch_npu/csrc/aten) Merge pull request !18971 from dilililiwhy/cleancode_aten_260_part3 --- torch_npu/csrc/aten/common/CopyKernel.cpp | 347 +++++++++--------- torch_npu/csrc/aten/common/CopyKernelNpu.cpp | 4 +- .../csrc/aten/common/CopyMemoryKernel.cpp | 5 +- .../csrc/aten/common/LocalScalarDenseNpu.cpp | 3 +- torch_npu/csrc/aten/common/ResizeNpu.cpp | 6 +- .../csrc/aten/common/TensorFactories.cpp | 12 +- .../csrc/aten/common/TensorProperties.cpp | 6 +- .../csrc/aten/mirror/NPUTensorIterator.h | 41 ++- 8 files changed, 232 insertions(+), 192 deletions(-) diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index 34c7563570..53d1b3c18e 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -23,21 +23,23 @@ namespace { // needs to ensure that the parameters are correct. // the caller should ensure the tensor.is_npu == true -bool is_same_format(const at::Tensor& a, const at::Tensor& b) { - bool isSameFormat = FormatHelper::GetFormat(a) == FormatHelper::GetFormat(b); - if (!isSameFormat) { - bool isBaseFormat = - FormatHelper::IsBaseFormatType(a) && FormatHelper::IsBaseFormatType(b); - return isBaseFormat; - } - return true; +bool is_same_format(const at::Tensor& a, const at::Tensor& b) +{ + bool isSameFormat = FormatHelper::GetFormat(a) == FormatHelper::GetFormat(b); + if (!isSameFormat) { + bool isBaseFormat = + FormatHelper::IsBaseFormatType(a) && FormatHelper::IsBaseFormatType(b); + return isBaseFormat; + } + return true; } -bool try_to_optimize_copy_with_any_format(at::Tensor& self, const at::Tensor& src) { - // Some Ops support inputs with 5HD/NZ format, Transdata is redundant - // Record: - // Op:Reshape; SliceD || Supportformat: 5HD/NZ - return TransContiguous::ContiguousOptimizeWithAnyFormat(self, src); +bool try_to_optimize_copy_with_any_format(at::Tensor& self, const at::Tensor& src) +{ + // Some Ops support inputs with 5HD/NZ format, Transdata is redundant + // Record: + // Op:Reshape; SliceD || Supportformat: 5HD/NZ + return TransContiguous::ContiguousOptimizeWithAnyFormat(self, src); } // the dst and src are same format now @@ -47,35 +49,37 @@ void copy_d2d_last_method( at::Tensor& self, const at::Tensor& src, bool same_type, - bool non_blocking) { - // general copy method but Low performance - RECORD_FUNCTION("contiguous_d_ViewCopy", std::vector({src})); - op_plugin::npu_view_copy(self, src, non_blocking); + bool non_blocking) +{ + // general copy method but Low performance + RECORD_FUNCTION("contiguous_d_ViewCopy", std::vector({src})); + op_plugin::npu_view_copy(self, src, non_blocking); } // the dst and src are same format now -void copy_d2d_dtype_format(at::Tensor& self, const at::Tensor& src, bool non_blocking) { - // Note: Src & Self have the same format. - if (try_to_optimize_copy_with_any_format(self, src)) { - return; - } +void copy_d2d_dtype_format(at::Tensor& self, const at::Tensor& src, bool non_blocking) +{ + // Note: Src & Self have the same format. + if (try_to_optimize_copy_with_any_format(self, src)) { + return; + } - if (!FormatHelper::IsBaseFormatType(self)) { // 必须要非NCHW的才行? 
- if (can_use_memcpy(self, src)) { - RECORD_FUNCTION( - "d2dCopyAsync with format", std::vector({src})); - return copy_d2d_by_memcpy(self, src); + if (!FormatHelper::IsBaseFormatType(self)) { // 必须要非NCHW的才行? + if (can_use_memcpy(self, src)) { + RECORD_FUNCTION( + "d2dCopyAsync with format", std::vector({src})); + return copy_d2d_by_memcpy(self, src); + } } - } - if (!FormatHelper::IsBaseFormatType(self)) { - at::Tensor src_4D = FormatCastHelper::ApplyBaseFormatTensorBy(src); - at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self); - copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking); - NPUNativeFunctions::npu_format_cast_(self, dst_4D); - return; - } - copy_d2d_dtype_baseformat(self, src, non_blocking); + if (!FormatHelper::IsBaseFormatType(self)) { + at::Tensor src_4D = FormatCastHelper::ApplyBaseFormatTensorBy(src); + at::Tensor dst_4D = FormatCastHelper::ApplyBaseFormatTensorBy(self); + copy_d2d_dtype_baseformat(dst_4D, src_4D, non_blocking); + NPUNativeFunctions::npu_format_cast_(self, dst_4D); + return; + } + copy_d2d_dtype_baseformat(self, src, non_blocking); } // the format of dst and src is base format now @@ -85,7 +89,8 @@ void copy_between_host_and_device( at::Tensor& dst, const at::Tensor& src, aclrtMemcpyKind kind, - bool non_blocking) { + bool non_blocking) +{ int64_t nbytes = dst.numel() * dst.element_size(); c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream(); @@ -124,11 +129,12 @@ void copy_between_host_and_device( void copy_h2d_baseformat_dtype_contigous( at::Tensor& dst, const at::Tensor& src, - bool non_blocking) { - c10_npu::OptionalNPUGuard device_guard; - device_guard.set_device(dst.device()); - aclrtMemcpyKind kind = aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE; - copy_between_host_and_device(dst, src, kind, non_blocking); + bool non_blocking) +{ + c10_npu::OptionalNPUGuard device_guard; + device_guard.set_device(dst.device()); + aclrtMemcpyKind kind = aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE; + copy_between_host_and_device(dst, src, kind, non_blocking); } // the format of dst and src is baseformat now @@ -137,11 +143,12 @@ void copy_h2d_baseformat_dtype_contigous( void copy_d2h_baseformat_dtype_contigous( at::Tensor& dst, const at::Tensor& src, - bool non_blocking) { - c10_npu::OptionalNPUGuard device_guard; - device_guard.set_device(src.device()); - aclrtMemcpyKind kind = aclrtMemcpyKind::ACL_MEMCPY_DEVICE_TO_HOST; - copy_between_host_and_device(dst, src, kind, non_blocking); + bool non_blocking) +{ + c10_npu::OptionalNPUGuard device_guard; + device_guard.set_device(src.device()); + aclrtMemcpyKind kind = aclrtMemcpyKind::ACL_MEMCPY_DEVICE_TO_HOST; + copy_between_host_and_device(dst, src, kind, non_blocking); } // the format of dst and src is baseformat now @@ -149,57 +156,60 @@ void copy_h2d_baseformat( at::Tensor& dst, const at::Tensor& src, bool non_blocking, - bool dst_must_be_contiguous = false) { - bool same_type = (src.dtype() == dst.dtype()); - bool same_size = (src.sizes() == dst.sizes()); - bool dst_is_contiguous = dst_must_be_contiguous ? true : dst.is_contiguous(); - if (same_type && dst_is_contiguous && src.is_contiguous() && same_size) { - copy_h2d_baseformat_dtype_contigous(dst, src, non_blocking); - return; - } + bool dst_must_be_contiguous = false) +{ + bool same_type = (src.dtype() == dst.dtype()); + bool same_size = (src.sizes() == dst.sizes()); + bool dst_is_contiguous = dst_must_be_contiguous ? 
true : dst.is_contiguous(); + if (same_type && dst_is_contiguous && src.is_contiguous() && same_size) { + copy_h2d_baseformat_dtype_contigous(dst, src, non_blocking); + return; + } - at::Tensor dst_contig = dst_is_contiguous ? dst : at::empty_like(dst); - at::Tensor src_contig; - if (!same_type) { - src_contig = src.to(dst.dtype()).expand_as(dst).contiguous(); - } else { - src_contig = src.expand_as(dst).contiguous(); - } - // perform a same-dtype copy on contiguous tensors - TORCH_INTERNAL_ASSERT(dst_contig.sizes().equals(src_contig.sizes())); - TORCH_INTERNAL_ASSERT(dst_contig.scalar_type() == src_contig.scalar_type()); - copy_h2d_baseformat_dtype_contigous(dst_contig, src_contig, non_blocking); - // if necessary, copy back into dst - if (!dst_contig.is_same(dst)) { - TORCH_INTERNAL_ASSERT(dst_contig.device() == dst.device()); - copy_d2d_dtype(dst, dst_contig, non_blocking); - } + at::Tensor dst_contig = dst_is_contiguous ? dst : at::empty_like(dst); + at::Tensor src_contig; + if (!same_type) { + src_contig = src.to(dst.dtype()).expand_as(dst).contiguous(); + } else { + src_contig = src.expand_as(dst).contiguous(); + } + // perform a same-dtype copy on contiguous tensors + TORCH_INTERNAL_ASSERT(dst_contig.sizes().equals(src_contig.sizes())); + TORCH_INTERNAL_ASSERT(dst_contig.scalar_type() == src_contig.scalar_type()); + copy_h2d_baseformat_dtype_contigous(dst_contig, src_contig, non_blocking); + // if necessary, copy back into dst + if (!dst_contig.is_same(dst)) { + TORCH_INTERNAL_ASSERT(dst_contig.device() == dst.device()); + copy_d2d_dtype(dst, dst_contig, non_blocking); + } } // the format of dst and src is baseformat now -void copy_d2h_baseformat(at::Tensor& dst, const at::Tensor& src, bool non_blocking) { - bool same_type = (src.dtype() == dst.dtype()); - bool same_size = (src.sizes() == dst.sizes()); - bool dst_is_contiguous = dst.is_contiguous(); - if (same_type && dst_is_contiguous && src.is_contiguous() && same_size) { - copy_d2h_baseformat_dtype_contigous(dst, src, non_blocking); - return; - } - at::Tensor dst_contig = - (dst_is_contiguous && same_type) ? dst : at::empty_like(dst, src.dtype()); - at::Tensor src_contig = src.expand_as(dst).contiguous(); - // perform a same-dtype copy on contiguous tensors - TORCH_INTERNAL_ASSERT(dst_contig.sizes().equals(src_contig.sizes())); - TORCH_INTERNAL_ASSERT(dst_contig.scalar_type() == src_contig.scalar_type()); - copy_d2h_baseformat_dtype_contigous(dst_contig, src_contig, non_blocking); - // if necessary, copy back into dst - if (!dst_contig.is_same(dst)) { - TORCH_INTERNAL_ASSERT(dst_contig.device() == dst.device()); - dst.copy_(dst_contig, non_blocking); // h2h, use cpu copy - } +void copy_d2h_baseformat(at::Tensor& dst, const at::Tensor& src, bool non_blocking) +{ + bool same_type = (src.dtype() == dst.dtype()); + bool same_size = (src.sizes() == dst.sizes()); + bool dst_is_contiguous = dst.is_contiguous(); + if (same_type && dst_is_contiguous && src.is_contiguous() && same_size) { + copy_d2h_baseformat_dtype_contigous(dst, src, non_blocking); + return; + } + at::Tensor dst_contig = + (dst_is_contiguous && same_type) ? 
dst : at::empty_like(dst, src.dtype()); + at::Tensor src_contig = src.expand_as(dst).contiguous(); + // perform a same-dtype copy on contiguous tensors + TORCH_INTERNAL_ASSERT(dst_contig.sizes().equals(src_contig.sizes())); + TORCH_INTERNAL_ASSERT(dst_contig.scalar_type() == src_contig.scalar_type()); + copy_d2h_baseformat_dtype_contigous(dst_contig, src_contig, non_blocking); + // if necessary, copy back into dst + if (!dst_contig.is_same(dst)) { + TORCH_INTERNAL_ASSERT(dst_contig.device() == dst.device()); + dst.copy_(dst_contig, non_blocking); // h2h, use cpu copy + } } -void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) { +void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) +{ c10_npu::NPUGuard guard(self.device()); if (!FormatHelper::IsBaseFormatType(self)) { at::Tensor dst = OpPreparation::ApplyTensorWithSizes(self.sizes(), self.options()); @@ -210,7 +220,8 @@ void copy_h2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) { copy_h2d_baseformat(self, src, non_blocking); } -void copy_d2h(at::Tensor& self, const at::Tensor& src, bool non_blocking) { +void copy_d2h(at::Tensor& self, const at::Tensor& src, bool non_blocking) +{ c10_npu::NPUGuard guard(src.device()); if (!FormatHelper::IsBaseFormatType(src)) { at::Tensor src_4D = FormatCastHelper::ApplyBaseFormatTensorBy(src); @@ -222,30 +233,32 @@ void copy_d2h(at::Tensor& self, const at::Tensor& src, bool non_blocking) { } // namespace // the caller should guarantee that the format and dtype are same -bool can_use_memcpy(at::Tensor& dst, const at::Tensor& src) { - if (StorageDescHelper::IsSameDesc(dst, src)) { - // Make sure that the metadata are same. - if (!dst.sizes().equals(src.sizes())) { - return false; - } - if (!dst.strides().equals(src.strides())) { - return false; - } - // Make sure that copy the whole memory. - // we just need to compare one of them, because of the NpuStorageDesc - // and metadata(sizes and stride) of src and dst are same. - if (StorageDescHelper::GetValidMemorySize(src) != src.numel()) { - return false; - } - if ((dst.storage_offset() != 0) || (src.storage_offset() != 0)) { - return false; +bool can_use_memcpy(at::Tensor& dst, const at::Tensor& src) +{ + if (StorageDescHelper::IsSameDesc(dst, src)) { + // Make sure that the metadata are same. + if (!dst.sizes().equals(src.sizes())) { + return false; + } + if (!dst.strides().equals(src.strides())) { + return false; + } + // Make sure that copy the whole memory. + // we just need to compare one of them, because of the NpuStorageDesc + // and metadata(sizes and stride) of src and dst are same. 
+ if (StorageDescHelper::GetValidMemorySize(src) != src.numel()) { + return false; + } + if ((dst.storage_offset() != 0) || (src.storage_offset() != 0)) { + return false; + } + return true; } - return true; - } - return false; + return false; } -void copy_d2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) { +void copy_d2d(at::Tensor& self, const at::Tensor& src, bool non_blocking) +{ c10_npu::NPUGuard guard(src.device()); // p2p enable and synchronize self stream auto self_device_idx = self.device().index(); @@ -296,7 +309,8 @@ at::Tensor copy_d2d_format_cast(at::Tensor& dst, const at::Tensor& src) } // the dst and src are same dtype now -void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking) { +void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking) +{ if (!is_same_format(self, src)) { auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; if (src.is_contiguous() && FormatHelper::IsBaseFormatType(src) && src_desc.base_sizes_.size() == 1) { @@ -325,66 +339,69 @@ void copy_d2d_dtype(at::Tensor& self, const at::Tensor& src, bool non_blocking) void copy_d2d_dtype_baseformat( at::Tensor& self, const at::Tensor& src, - bool non_blocking) { - if (!self.is_contiguous()) { - // Contiguous/discontiguous source tensor copy to discontiguous self tensor - return copy_d2d_last_method(self, src, true, non_blocking); - } + bool non_blocking) +{ + if (!self.is_contiguous()) { + // Contiguous/discontiguous source tensor copy to discontiguous self tensor + return copy_d2d_last_method(self, src, true, non_blocking); + } - if (!src.is_contiguous()) { - // Discontiguous source tensor copy to contiguous self tensor - if (TransContiguous::ContiguousOptimizeWithBaseFormat(self, src)) { - // Optimized trans-contiguous method - return; + if (!src.is_contiguous()) { + // Discontiguous source tensor copy to contiguous self tensor + if (TransContiguous::ContiguousOptimizeWithBaseFormat(self, src)) { + // Optimized trans-contiguous method + return; + } else { + // General trans-contiguous method + RECORD_FUNCTION("contiguous_d_AsStrided", std::vector({src})); + op_plugin::npu_stride_copy_out(src, src.sizes(), src.strides(), src.storage_offset(), self); + return; + } } else { - // General trans-contiguous method - RECORD_FUNCTION("contiguous_d_AsStrided", std::vector({src})); - op_plugin::npu_stride_copy_out(src, src.sizes(), src.strides(), src.storage_offset(), self); - return; - } - } else { - // Contiguous source tensor copy to contiguous self tensor - int64_t numel = self.numel(); - if (numel == src.numel()) { - RECORD_FUNCTION("d2dCopyAsync", std::vector({src})); - ASCEND_LOGD("copy contiguous tensor inside device"); - return copy_d2d_by_memcpy(self, src, numel); + // Contiguous source tensor copy to contiguous self tensor + int64_t numel = self.numel(); + if (numel == src.numel()) { + RECORD_FUNCTION("d2dCopyAsync", std::vector({src})); + ASCEND_LOGD("copy contiguous tensor inside device"); + return copy_d2d_by_memcpy(self, src, numel); + } } - } - // such as discontiguous tensor copy to unmatched tensor - copy_d2d_last_method(self, src, true, non_blocking); + // such as discontiguous tensor copy to unmatched tensor + copy_d2d_last_method(self, src, true, non_blocking); } -bool try_to_optimize_copy_with_any_format(at::Tensor& self, const at::Tensor& src) { - // Some Ops support inputs with 5HD/NZ format, Transdata is redundant - // Record: - // Op:Reshape; SliceD || Supportformat: 5HD/NZ - return 
TransContiguous::ContiguousOptimizeWithAnyFormat(self, src); +bool try_to_optimize_copy_with_any_format(at::Tensor& self, const at::Tensor& src) +{ + // Some Ops support inputs with 5HD/NZ format, Transdata is redundant + // Record: + // Op:Reshape; SliceD || Supportformat: 5HD/NZ + return TransContiguous::ContiguousOptimizeWithAnyFormat(self, src); } -at::Tensor& NPUNativeFunctions::copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { - if (self.numel() == 0) { - return self; - } - // save tensor dim name - c10::optional names = src.opt_names(); - if (names.has_value()) { - internal_set_names_inplace(self, names); - } +at::Tensor& NPUNativeFunctions::copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) +{ + if (self.numel() == 0) { + return self; + } + // save tensor dim name + c10::optional names = src.opt_names(); + if (names.has_value()) { + internal_set_names_inplace(self, names); + } - if (torch_npu::utils::is_npu(self)) { - if (torch_npu::utils::is_npu(src)) { - copy_d2d(self, src, non_blocking); + if (torch_npu::utils::is_npu(self)) { + if (torch_npu::utils::is_npu(src)) { + copy_d2d(self, src, non_blocking); + } else { + copy_h2d(self, src, non_blocking); + } } else { - copy_h2d(self, src, non_blocking); - } - } else { - if (torch_npu::utils::is_npu(src)) { - copy_d2h(self, src, non_blocking); + if (torch_npu::utils::is_npu(src)) { + copy_d2h(self, src, non_blocking); + } } - } - return self; + return self; } } // namespace native -} // namespace at_npu \ No newline at end of file +} // namespace at_npu diff --git a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp index 944e8e2e6d..8122b50166 100644 --- a/torch_npu/csrc/aten/common/CopyKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/CopyKernelNpu.cpp @@ -19,7 +19,8 @@ namespace native { // the dst and src have same elemsize // if exceptCopySize is not defined, we will copy dst storage size // so: caller should make sure that the storage size of src and dst are reasonable. 
-void copy_d2d_by_memcpy(at::Tensor& dst, const at::Tensor& src, int64_t exceptSize) { +void copy_d2d_by_memcpy(at::Tensor& dst, const at::Tensor& src, int64_t exceptSize) +{ c10_npu::NPUGuard guard(src.device()); int64_t size = exceptSize; auto dst_mem_size = StorageDescHelper::GetMemorySize(dst); @@ -49,5 +50,6 @@ void copy_d2d_by_memcpy(at::Tensor& dst, const at::Tensor& src, int64_t exceptSi size * dst.element_size(), ACL_MEMCPY_DEVICE_TO_DEVICE)); } + } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp b/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp index b8a76a6441..f7e1db3638 100644 --- a/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp @@ -13,7 +13,8 @@ namespace at_npu { namespace native { -at::Tensor& NPUNativeFunctions::copy_memory_(at::Tensor& self, const at::Tensor& src, bool non_blocking) { +at::Tensor& NPUNativeFunctions::copy_memory_(at::Tensor& self, const at::Tensor& src, bool non_blocking) +{ c10_npu::NPUGuard guard(src.device()); AT_ASSERT(torch_npu::utils::is_npu(src), "copy_memory_ only support npu tensor", OPS_ERROR(ErrCode::PARAM)); AT_ASSERT( @@ -67,4 +68,4 @@ at::Tensor& NPUNativeFunctions::copy_memory_(at::Tensor& self, const at::Tensor& } } // namespace native -} // namespace at_npu \ No newline at end of file +} // namespace at_npu diff --git a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp index a912fde74e..775d95cbfa 100644 --- a/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp +++ b/torch_npu/csrc/aten/common/LocalScalarDenseNpu.cpp @@ -10,7 +10,8 @@ namespace at_npu { namespace native { -c10::Scalar NPUNativeFunctions::_local_scalar_dense(const at::Tensor& self) { +c10::Scalar NPUNativeFunctions::_local_scalar_dense(const at::Tensor& self) +{ c10::Scalar r; AT_DISPATCH_ALL_TYPES_AND3( at::ScalarType::Half, diff --git a/torch_npu/csrc/aten/common/ResizeNpu.cpp b/torch_npu/csrc/aten/common/ResizeNpu.cpp index fe6d438545..510c97b2fa 100644 --- a/torch_npu/csrc/aten/common/ResizeNpu.cpp +++ b/torch_npu/csrc/aten/common/ResizeNpu.cpp @@ -37,7 +37,8 @@ inline const at::Tensor& resize_named_tensor_( const at::Tensor& NPUNativeFunctions::resize_( const at::Tensor& self, c10::IntArrayRef size, - c10::optional format) { + c10::optional format) +{ if (self.has_names()) { return resize_named_tensor_(self, size, format); } @@ -55,7 +56,8 @@ const at::Tensor& NPUNativeFunctions::resize_( const at::Tensor& NPUNativeFunctions::resize_as_( const at::Tensor& self, const at::Tensor& the_template, - c10::optional format) { + c10::optional format) +{ TORCH_CHECK( !(self.is_sparse() || the_template.is_sparse()), "NPU does not support sparse tensors.", OPS_ERROR(ErrCode::NOT_SUPPORT)); diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index e0e2235b07..c1b701f455 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -606,9 +606,10 @@ at::Tensor tensor_npu(c10::ArrayRef values, const c10::TensorOptions &options { auto result = at::empty(values.size(), options); AT_ASSERT(result.is_contiguous(), OPS_ERROR(ErrCode::VALUE)); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(result.scalar_type(), "tensor_npu", [&] - { std::copy( - values.begin(), values.end(), result.template data_ptr()); }); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX(result.scalar_type(), "tensor_npu", [&] { + std::copy( + values.begin(), values.end(), 
result.template data_ptr()); + }); return result; } @@ -646,9 +647,8 @@ at::Tensor NPUNativeFunctions::clone( // clone with base formats auto baseSelf = OpPreparation::ApplyTensorWithSizes(src.sizes(), src.options()); at::Tensor baseSrc = src; - if (!FormatHelper::IsBaseFormatType(src)) - { - baseSrc = FormatCastHelper::ApplyBaseFormatTensorBy(src); + if (!FormatHelper::IsBaseFormatType(src)) { + baseSrc = FormatCastHelper::ApplyBaseFormatTensorBy(src); } copy_d2d_dtype_baseformat(baseSelf, baseSrc, false); return baseSelf; diff --git a/torch_npu/csrc/aten/common/TensorProperties.cpp b/torch_npu/csrc/aten/common/TensorProperties.cpp index c4ad2f2759..faeb37153c 100644 --- a/torch_npu/csrc/aten/common/TensorProperties.cpp +++ b/torch_npu/csrc/aten/common/TensorProperties.cpp @@ -7,7 +7,8 @@ namespace at_npu { namespace native { -at::Tensor NPUNativeFunctions::contiguous(const at::Tensor& self, c10::MemoryFormat memory_format) { +at::Tensor NPUNativeFunctions::contiguous(const at::Tensor& self, c10::MemoryFormat memory_format) +{ if (self.is_contiguous(memory_format)) { return self; } @@ -18,7 +19,8 @@ at::Tensor NPUNativeFunctions::contiguous(const at::Tensor& self, c10::MemoryFor return self.clone(memory_format); } -bool NPUNativeFunctions::is_set_to(const at::Tensor& self, const at::Tensor& src) { +bool NPUNativeFunctions::is_set_to(const at::Tensor& self, const at::Tensor& src) +{ if (self.storage().unsafeGetStorageImpl() == src.storage().unsafeGetStorageImpl() && self.storage_offset() == src.storage_offset() && self.dim() == src.dim() && NPUNativeFunctions::get_storage_size(self) == NPUNativeFunctions::get_storage_size(src) && diff --git a/torch_npu/csrc/aten/mirror/NPUTensorIterator.h b/torch_npu/csrc/aten/mirror/NPUTensorIterator.h index ce282e745f..5b6a0427eb 100644 --- a/torch_npu/csrc/aten/mirror/NPUTensorIterator.h +++ b/torch_npu/csrc/aten/mirror/NPUTensorIterator.h @@ -14,7 +14,8 @@ namespace native { struct NPUOperandInfo { using StrideVector = c10::SmallVector; NPUOperandInfo() {} - explicit NPUOperandInfo(const at::Tensor& t) : tensor(t) { + explicit NPUOperandInfo(const at::Tensor& t) : tensor(t) + { if (t.defined()) { target_dtype = t.scalar_type(); current_dtype = target_dtype; @@ -22,15 +23,18 @@ struct NPUOperandInfo { validate(); } NPUOperandInfo(const at::Tensor& t, at::ScalarType dtype) - : tensor(t), target_dtype(dtype), current_dtype(t.scalar_type()) { + : tensor(t), target_dtype(dtype), current_dtype(t.scalar_type()) + { validate(); } - bool is_type_defined() const { + bool is_type_defined() const + { return target_dtype != at::ScalarType::Undefined; } - void validate() { + void validate() + { TORCH_CHECK( !tensor.defined() || tensor.layout() == at::kStrided, "unsupported tensor layout: ", tensor.layout(), OPS_ERROR(ErrCode::TYPE)); @@ -81,39 +85,50 @@ public: at::Tensor& out2, const at::Tensor& a); - int noutputs() const { + int noutputs() const + { return num_outputs_; } - c10::IntArrayRef strides(int arg) const { + c10::IntArrayRef strides(int arg) const + { return operands_[arg].stride_bytes; } - at::ScalarType dtype(int arg = 0) const { + + at::ScalarType dtype(int arg = 0) const + { return operands_[arg].current_dtype; } - at::ScalarType common_dtype() const { + + at::ScalarType common_dtype() const + { return common_dtype_; } - const c10::SmallVector GetOperandInfo() const { + const c10::SmallVector GetOperandInfo() const + { return operands_; } // Construction - void add_output(const at::Tensor& output) { + void add_output(const at::Tensor& output) + { 
operands_.emplace_back(output); num_outputs_++; } - void add_input(const at::Tensor& input) { + void add_input(const at::Tensor& input) + { operands_.emplace_back(input); } - void promote_common_dtype() { + void promote_common_dtype() + { common_dtype_strategy_ = CommonDTypeStrategy::PROMOTE; } - void compute_common_dtype_only_for_inputs() { + void compute_common_dtype_only_for_inputs() + { common_dtype_strategy_ = CommonDTypeStrategy::PROMOTE_INPUTS; } -- Gitee From 900bf78cc06c8b57fb7c9724552f66347b8636a9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 14 Mar 2025 10:45:35 +0000 Subject: [PATCH 163/358] !18985 Update op_plugin commit id Merge pull request !18985 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c8c845492d..e64623e4df 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c8c845492d0d4d7b3957748a632167a2f467913a +Subproject commit e64623e4dfbac6891089a3560e106af46ce75136 -- Gitee From e7b72a9a692ef854b0c304050abdbef2523bb9a6 Mon Sep 17 00:00:00 2001 From: Gallium Date: Sat, 15 Mar 2025 02:57:45 +0000 Subject: [PATCH 164/358] !18914 stop dynolog when exit Merge pull request !18914 from Gallium/bug_fix_2.6.0 --- .../_dynamic_monitor_proxy.py | 21 +++++++++++++++++++ .../_dynamic_profiler_monitor.py | 21 ++++++------------- torch_npu/profiler/dynamic_profile.py | 8 +++++++ 3 files changed, 35 insertions(+), 15 deletions(-) create mode 100644 torch_npu/profiler/_dynamic_profiler/_dynamic_monitor_proxy.py diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_monitor_proxy.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_monitor_proxy.py new file mode 100644 index 0000000000..18e0e51a5d --- /dev/null +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_monitor_proxy.py @@ -0,0 +1,21 @@ +from ..analysis.prof_common_func._singleton import Singleton +from ._dynamic_profiler_utils import DynamicProfilerUtils + + +@Singleton +class PyDynamicMonitorProxySingleton(): + def __init__(self): + self._proxy = None + self._load_proxy() + + def _load_proxy(self): + if not self._proxy: + try: + from IPCMonitor import PyDynamicMonitorProxy + self._proxy = PyDynamicMonitorProxy() + except Exception as e: + dynamic_profiler_utils.stdout_log(f"Import IPCMonitro module failed :{e}!", + dynamic_profiler_utils.LoggerLevelEnum.WARNING) + + def get_proxy(self): + return self._proxy \ No newline at end of file diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py index eb95cce837..2a23115df1 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py @@ -8,6 +8,7 @@ import multiprocessing from ._dynamic_profiler_config_context import ConfigContext from ._dynamic_profiler_utils import DynamicProfilerUtils from ._dynamic_profiler_monitor_shm import DynamicProfilerShareMemory +from ._dynamic_monitor_proxy import PyDynamicMonitorProxySingleton class DynamicProfilerMonitor: @@ -109,14 +110,9 @@ class DynamicProfilerMonitor: def _call_dyno_monitor(self, json_data: dict): json_data = {key: str(value) for key, value in json_data.items()} - try: - from IPCMonitor import PyDynamicMonitorProxy - except Exception as e: - dynamic_profiler_utils.stdout_log(f"Import IPCMonitro module failed :{e}!", - dynamic_profiler_utils.LoggerLevelEnum.WARNING) - return - py = 
PyDynamicMonitorProxy() - py.enable_dyno_npu_monitor(json_data) + py_dyno_monitor = PyDynamicMonitorProxySingleton().get_proxy() + if py_dyno_monitor: + py_dyno_monitor.enable_dyno_npu_monitor(json_data) def worker_func(params_dict): @@ -189,14 +185,9 @@ def worker_dyno_func(params_dict): max_size = params_dict.get("max_size") dynamic_profiler_utils = params_dict.get("dynamic_profiler_utils") - try: - from IPCMonitor import PyDynamicMonitorProxy - except Exception as e: - dynamic_profiler_utils.stdout_log(f"Import IPCMonitor module failed: {e}!", - dynamic_profiler_utils.LoggerLevelEnum.WARNING) + py_dyno_monitor = PyDynamicMonitorProxySingleton().get_proxy() + if not py_dyno_monitor: return - - py_dyno_monitor = PyDynamicMonitorProxy() ret = py_dyno_monitor.init_dyno(rank_id) if not ret: dynamic_profiler_utils.out_log("Init dynolog failed !", dynamic_profiler_utils.LoggerLevelEnum.WARNING) diff --git a/torch_npu/profiler/dynamic_profile.py b/torch_npu/profiler/dynamic_profile.py index 511a6f4cf8..99bfd76f72 100644 --- a/torch_npu/profiler/dynamic_profile.py +++ b/torch_npu/profiler/dynamic_profile.py @@ -13,6 +13,8 @@ from .analysis.prof_common_func._file_manager import FileManager from ._dynamic_profiler._dynamic_profiler_utils import DynamicProfilerUtils from ._dynamic_profiler._dynamic_profiler_monitor import DynamicProfilerMonitor from ._dynamic_profiler._dynamic_profiler_config_context import ConfigContext +from ._dynamic_profiler._dynamic_monitor_proxy import PyDynamicMonitorProxySingleton + __all__ = [ 'init', @@ -45,6 +47,7 @@ class _DynamicProfile: self._dynamic_monitor = DynamicProfilerMonitor() self.repeat_init = True atexit.register(self._clean_resource) + atexit.register(self._finalize_dynolog) def _clean_resource(self): if self.prof is not None: @@ -55,6 +58,11 @@ class _DynamicProfile: DynamicProfilerUtils.LoggerLevelEnum.WARNING) self._dynamic_monitor.clean_resource() + def _finalize_dynolog(self): + py_dyno_monitor = PyDynamicMonitorProxySingleton().get_proxy() + if py_dyno_monitor: + py_dyno_monitor.finalize_dyno() + def _dynamic_profiler_valid(self): prof_cfg_ctx = self._dynamic_monitor.shm_to_prof_conf_context() return prof_cfg_ctx -- Gitee From 919915d7513dcd89d5ce7ed98a34b1bd5fd6248f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 15 Mar 2025 03:00:36 +0000 Subject: [PATCH 165/358] !18996 Update op_plugin commit id Merge pull request !18996 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index e64623e4df..27834de325 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit e64623e4dfbac6891089a3560e106af46ce75136 +Subproject commit 27834de325c6a178940c490ce4f737f0e3c6a12d -- Gitee From 96b68a5b9362655808bfa101f7045400dd215d7e Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Sat, 15 Mar 2025 05:59:49 +0000 Subject: [PATCH 166/358] !18993 Skip some failed cases Merge pull request !18993 from yuhaiyan/v2.6.0-dev1 --- .../.pytorch-disabled-tests.json | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/test/unsupported_test_cases/.pytorch-disabled-tests.json b/test/unsupported_test_cases/.pytorch-disabled-tests.json index ba602892c4..66c1d7e75a 100644 --- a/test/unsupported_test_cases/.pytorch-disabled-tests.json +++ b/test/unsupported_test_cases/.pytorch-disabled-tests.json @@ -31219,5 +31219,21 @@ "test_unary_op_out_casting_npu_int64_complex64 (__main__.TestTypePromotionPRIVATEUSE1)": ["", 
["910A"]], "test_unsupported_dtypes (__main__.TestTEFuserDynamic)": ["", [""]], "test_unsupported_dtypes (__main__.TestTEFuserStatic)": ["", [""]], - "test_autocast_fast_dtype (__main__.TestTorchAutocast)": ["", [""]] + "test_autocast_fast_dtype (__main__.TestTorchAutocast)": ["", [""]], + "test_embedding_padding_idx_npu_float32 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_embedding_padding_idx_npu_float16 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_embedding_bag_non_contiguous_weight_npu_int32_int32_float32 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_embedding_bag_non_contiguous_weight_npu_int32_int32_bfloat16 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_embedding_bag_non_contiguous_weight_npu_int32_int64_bfloat16 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_embedding_bag_non_contiguous_weight_npu_int32_int64_float32 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_embedding_bag_non_contiguous_weight_npu_int64_int32_float32 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_embedding_bag_non_contiguous_weight_npu_int64_int32_bfloat16 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_embedding_bag_non_contiguous_weight_npu_int64_int64_float32 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_embedding_bag_non_contiguous_weight_npu_int64_int64_bfloat16 (__main__.TestEmbeddingNNDeviceTypePRIVATEUSE1)": ["", [""]], + "test_baddbmm_large_input_2_1000_1000_1000_npu_float16 (__main__.TestMatmulPRIVATEUSE1)": ["", [""]], + "test_variant_consistency_eager_unbind_copy_npu_complex64 (__main__.TestCommonPRIVATEUSE1)": ["", [""]], + "test_conj_view__refs_unbind_copy_npu_complex64 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]], + "test_conj_view_unbind_copy_npu_complex64 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]], + "test_neg_conj_view__refs_unbind_copy_npu_complex128 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]], + "test_neg_conj_view_unbind_copy_npu_complex128 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]] } -- Gitee From 0816a01a87ad1834eda96b7fea3c039c68d4d5f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Sat, 15 Mar 2025 06:33:16 +0000 Subject: [PATCH 167/358] =?UTF-8?q?!18874=20CleanCode=20Fix=20Merge=20pull?= =?UTF-8?q?=20request=20!18874=20from=20=E5=8F=B6=E5=AD=90=E5=87=A1/v2.6.0?= =?UTF-8?q?=5Fcleancode=5F0307?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/Init.cpp | 21 ++- .../csrc/distributed/ProcessGroupHCCL.cpp | 18 +- .../csrc/distributed/ProcessGroupHCCL.hpp | 9 +- .../csrc/distributed/default_comm_hooks.cpp | 9 +- torch_npu/csrc/distributed/reducer.cpp | 174 ++++++++++++------ torch_npu/csrc/distributed/reducer.hpp | 6 +- torch_npu/csrc/distributed/reducer_npu.cpp | 9 +- 7 files changed, 164 insertions(+), 82 deletions(-) diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 3d7e8f0c9e..2b671852d0 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -95,7 +95,8 @@ using intrusive_ptr_no_gil_destructor_class_ = class BroadcastWork { public: - inline std::vector cast_tensors(at::TensorList tensors) { + inline std::vector cast_tensors(at::TensorList tensors) + { static auto cast_back_to_ori_format = [](const at::Tensor &t) { return at_npu::native::custom_ops::npu_format_cast(t, 
torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); }; @@ -114,7 +115,8 @@ public: work_ = process_group->broadcast(flat_tensor_, broadcastOptions); } - void finish() { + void finish() + { work_->wait(); auto output_tensors = torch::utils::unflatten_dense_tensors( flat_tensor_.front(), cast_tensors_); @@ -149,7 +151,8 @@ void broadcast_coalesced( c10::intrusive_ptr process_group, at::TensorList tensors, size_t buffer_size, - int rank) { + int rank) +{ // Coalesce tensors into buckets taking into account the maximum buffer size. // This routine is multi-device aware, so the tensors can be split across // multiple devices and can contain a mix of CPU and CUDA tensors. @@ -184,7 +187,8 @@ void broadcast_coalesced( void _register_comm_hook( c10d_npu::Reducer& reducer, py::object state, - py::object comm_hook) { + py::object comm_hook) +{ reducer.register_comm_hook(std::make_unique<::c10d::PythonCommHook>( std::move(state), std::move(comm_hook))); } @@ -194,11 +198,13 @@ void _register_comm_hook( // function of the reducer input to set the hook type. void _register_builtin_comm_hook( c10d_npu::Reducer& reducer, - ::c10d::BuiltinCommHookType comm_hook_type) { + ::c10d::BuiltinCommHookType comm_hook_type) +{ reducer.register_builtin_comm_hook(comm_hook_type); } -PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) { +PyObject* c10d_npu_init(PyObject* _unused, PyObject* noargs) +{ auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C")); if (!torch_npu_C_module) { throw python_error(); @@ -536,7 +542,8 @@ static PyMethodDef methods[] = { // NOLINT #endif {nullptr, nullptr, 0, nullptr}}; -PyMethodDef* python_functions() { +PyMethodDef* python_functions() +{ return methods; } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 3ebc40365d..83ca0143bc 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -473,7 +473,8 @@ bool ProcessGroupHCCL::WorkHCCL::isCompleted() return exception() || finishedNPUExecutionInternal(); } -bool ProcessGroupHCCL::WorkHCCL::isStarted() { +bool ProcessGroupHCCL::WorkHCCL::isStarted() +{ checkAndSetException(); return exception() || startedNPUExecutionInternal(); } @@ -518,7 +519,8 @@ bool ProcessGroupHCCL::WorkHCCL::finishedNPUExecution() return finishedNPUExecutionInternal(); } -bool ProcessGroupHCCL::WorkHCCL::startedNPUExecutionInternal() const { +bool ProcessGroupHCCL::WorkHCCL::startedNPUExecutionInternal() const +{ try { for (const auto i : c10::irange(devices_.size())) { // Checking the work's corresponding ASCEND events' status @@ -636,7 +638,8 @@ void ProcessGroupHCCL::WorkHCCL::checkDispatch() } } -void ProcessGroupHCCL::WorkHCCL::synchronize() { +void ProcessGroupHCCL::WorkHCCL::synchronize() +{ // Call Synchronize without a timeout. We use this method to avoid adding a // timeout argument to the public synchronize API. 
synchronizeInternal(kNoTimeout); @@ -1228,17 +1231,20 @@ void ProcessGroupHCCL::workCleanupLoop() } std::exception_ptr ProcessGroupHCCL::WorkHCCL::checkForHCCLErrors( - const std::vector>& hcclComms) const { + const std::vector>& hcclComms) const +{ return checkForHCCLErrorsInternal(hcclComms); } std::exception_ptr ProcessGroupHCCL::checkForHCCLErrors( - const std::vector>& hcclComms) { + const std::vector>& hcclComms) +{ return checkForHCCLErrorsInternal(hcclComms); } std::exception_ptr ProcessGroupHCCL::checkForHCCLErrorsInternal( - const std::vector>& hcclComms) { + const std::vector>& hcclComms) +{ for (const auto& hcclComm : hcclComms) { HcclResult hcclAsyncErr = hcclComm->checkForHcclError(); if (hcclAsyncErr != HCCL_SUCCESS) { diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 8306bee956..a7fdf30551 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -253,7 +253,8 @@ public: // return intrusive_ptr of the object static c10::intrusive_ptr create( bool is_high_priority_stream = false, - std::chrono::milliseconds timeout = kNoTimeout) { + std::chrono::milliseconds timeout = kNoTimeout) + { return c10::make_intrusive(is_high_priority_stream); } @@ -300,11 +301,13 @@ public: ~ProcessGroupHCCL() override; - c10::intrusive_ptr getOptions() { + c10::intrusive_ptr getOptions() + { return options_; } - const std::string getBackendName() const { + const std::string getBackendName() const + { return std::string(HCCL_BACKEND_NAME); } c10::intrusive_ptr broadcast( diff --git a/torch_npu/csrc/distributed/default_comm_hooks.cpp b/torch_npu/csrc/distributed/default_comm_hooks.cpp index ffb7428893..a74c0b5044 100644 --- a/torch_npu/csrc/distributed/default_comm_hooks.cpp +++ b/torch_npu/csrc/distributed/default_comm_hooks.cpp @@ -11,7 +11,8 @@ namespace c10d { c10::intrusive_ptr AllReduceCommHook::runHook( - GradBucket& bucket) { + GradBucket& bucket) +{ std::vector tensors = {bucket.getBufferRef()}; // Apply the division first to avoid overflow, especially for FP16. tensors[0] /= state_->getSize(); @@ -19,7 +20,8 @@ c10::intrusive_ptr AllReduceCommHook::runHook( } c10::intrusive_ptr FP16CompressCommHook::runHook( - GradBucket& bucket) { + GradBucket& bucket) +{ auto compressed_tensor = bucket.getBufferRef().to(torch::kFloat16); // Apply the division first to avoid overflow. compressed_tensor /= state_->getSize(); @@ -46,7 +48,8 @@ c10::intrusive_ptr FP16CompressCommHook::runHook( return allreduce_fut->then(decompress, allreduce_fut->elementType()); } -c10::intrusive_ptr _AllReduceBySumCommHook::runHook(GradBucket& bucket) { +c10::intrusive_ptr _AllReduceBySumCommHook::runHook(GradBucket& bucket) +{ std::vector tensors = {bucket.getBufferRef()}; return state_->allreduce(tensors)->getFuture(); } diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index 7c27cc3829..dcee8d4a88 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -253,7 +253,8 @@ Reducer::Reducer( // Additionally, there are also some built-in C++ hook implementations that can // be specified by calling `register_builtin_comm_hook` from Python API. -Reducer::~Reducer() noexcept(false) { +Reducer::~Reducer() noexcept(false) +{ // Remove all hooks on variables registered by this Reducer. This is necessary // to make DDP failure recoverable. 
Otherwise, multiple Reducer instances // (from recoveries) will add their hooks to the original model, and those @@ -267,24 +268,29 @@ Reducer::~Reducer() noexcept(false) { } } -bool Reducer::dynamic_graph_find_unused() { +bool Reducer::dynamic_graph_find_unused() +{ return !static_graph_ && find_unused_parameters_; } -bool Reducer::static_graph_first_iteration() { +bool Reducer::static_graph_first_iteration() +{ return static_graph_ && num_iterations_ == 1; } -bool Reducer::static_graph_after_first_iteration() { +bool Reducer::static_graph_after_first_iteration() +{ return static_graph_ && num_iterations_ > 1; } -bool Reducer::ddp_graph_static() { +bool Reducer::ddp_graph_static() +{ std::lock_guard lock(mutex_); return ddp_graph_static_; } -void Reducer::initialize_local_used_map() { +void Reducer::initialize_local_used_map() +{ const auto variable_count = params_.size(); at::TensorOptions options = options.dtype(at::kInt); @@ -303,7 +309,8 @@ void Reducer::initialize_local_used_map() { void Reducer::check_grad_layout( const at::Tensor& grad, - const at::Tensor& bucket_view) { + const at::Tensor& bucket_view) +{ // Ensure that the gradient type matches the bucket type. REDUCER_CHECK( grad.options().type_equal(bucket_view.options()), @@ -334,7 +341,8 @@ void Reducer::check_grad_layout( } -void Reducer::mark_variable_ready_dense(size_t variable_index) { +void Reducer::mark_variable_ready_dense(size_t variable_index) +{ const auto replica_index = 0; const auto& bucket_index = variable_locators_[variable_index]; auto& bucket = buckets_[bucket_index.bucket_index]; @@ -416,7 +424,8 @@ void Reducer::mark_variable_ready_dense(size_t variable_index) { }); } -void Reducer::mark_variable_ready_sparse(size_t variable_index) { +void Reducer::mark_variable_ready_sparse(size_t variable_index) +{ const auto replica_index = 0; const auto& bucket_index = variable_locators_[variable_index]; auto& bucket = buckets_[bucket_index.bucket_index]; @@ -449,7 +458,8 @@ void Reducer::mark_variable_ready_sparse(size_t variable_index) { } std::vector Reducer::get_grad_buckets( - bool return_zero_tensors) const { + bool return_zero_tensors) const +{ std::lock_guard lock(mutex_); std::vector gradBuckets; gradBuckets.reserve(buckets_.size()); @@ -471,18 +481,21 @@ std::vector Reducer::get_grad_buckets( } void Reducer::set_forward_pass_work_handle( c10::intrusive_ptr forwardPassWorkHandle, - bool useStaticWorldSize) { + bool useStaticWorldSize) +{ std::lock_guard lock(mutex_); forwardPassWorkHandle_.workHandle = std::move(forwardPassWorkHandle); forwardPassWorkHandle_.useStaticWorldSize = useStaticWorldSize; } -at::Tensor Reducer::get_local_used_map_on_device() const { +at::Tensor Reducer::get_local_used_map_on_device() const +{ std::lock_guard lock(mutex_); return local_used_map_dev_; } -void Reducer::push_rebuilt_params_for_all_indices() { +void Reducer::push_rebuilt_params_for_all_indices() +{ std::lock_guard lock(mutex_); if (!should_rebuild_buckets() || !rebuilt_param_indices_.empty()) { return; @@ -493,12 +506,14 @@ void Reducer::push_rebuilt_params_for_all_indices() { } } -void Reducer::push_rebuilt_params(const size_t& index) { +void Reducer::push_rebuilt_params(const size_t& index) +{ rebuilt_params_.push_back(params_[index]); rebuilt_param_indices_.push_back(index); } -void Reducer::set_divide_factor() { +void Reducer::set_divide_factor() +{ // If it was scheduled, wait on allreduce in forward pass that tells us // division factor based on no. of currently participating processes. 
if (div_factor_ == kUnsetDivFactor) { @@ -517,7 +532,8 @@ void Reducer::set_divide_factor() { // Right now delay_all_reduce is only called when static_graph_=true and // num_iterations_==1. -void Reducer::delay_all_reduce() { +void Reducer::delay_all_reduce() +{ std::lock_guard lock(this->mutex_); if (should_collect_runtime_stats()) { @@ -555,14 +571,16 @@ void Reducer::delay_all_reduce() { finalize_backward(); } -void Reducer::set_logger(std::weak_ptr logger) { +void Reducer::set_logger(std::weak_ptr logger) +{ logger_ = logger; } // The function `autograd_hook` is called after the gradient for a // model parameter has been accumulated into its gradient tensor. // This function is only to be called from the autograd thread. -void Reducer::autograd_hook(size_t index) { +void Reducer::autograd_hook(size_t index) +{ std::lock_guard lock(this->mutex_); // Ignore if we don't expect to be called. // This may be the case if the user wants to accumulate gradients @@ -644,7 +662,8 @@ void Reducer::autograd_hook(size_t index) { } } -void Reducer::all_reduce_local_used_map() { +void Reducer::all_reduce_local_used_map() +{ // See Note [Skip allreducing local_used_map_dev] // H2D from local_used_map_ to local_used_map_dev_ local_used_map_dev_.copy_(local_used_map_, true); @@ -652,7 +671,8 @@ void Reducer::all_reduce_local_used_map() { local_used_work_ = process_group_->allreduce(temp_local_used_map_dev_vec_); } -at::Tensor& Reducer::get_param_from_index(size_t index) { +at::Tensor& Reducer::get_param_from_index(size_t index) +{ const auto& bucket_index = variable_locators_[index]; auto& bucket = buckets_[bucket_index.bucket_index]; auto& replica = bucket.replicas[0]; @@ -663,7 +683,8 @@ at::Tensor& Reducer::get_param_from_index(size_t index) { return variable; } -void Reducer::checkAndRaiseMarkedTwiceError(size_t index) { +void Reducer::checkAndRaiseMarkedTwiceError(size_t index) +{ // Something is wrong if all variables contained in this bucket replica have // already been marked as ready. // We don't expect the same variable to be marked ready twice. @@ -732,7 +753,8 @@ void Reducer::checkAndRaiseMarkedTwiceError(size_t index) { } } -void Reducer::mark_variable_ready(size_t variable_index) { +void Reducer::mark_variable_ready(size_t variable_index) +{ REDUCER_CHECK(variable_index < variable_locators_.size(), logger_, "Out of range variable index.", DIST_ERROR(ErrCode::PARAM)); @@ -795,7 +817,8 @@ void Reducer::mark_variable_ready(size_t variable_index) { } c10::intrusive_ptr Reducer::run_comm_hook( - c10d::GradBucket& grad_bucket) { + c10d::GradBucket& grad_bucket) +{ if (comm_hook_ == nullptr) { return run_allreduce_hook(grad_bucket); } else { @@ -804,12 +827,14 @@ c10::intrusive_ptr Reducer::run_comm_hook( } c10::intrusive_ptr Reducer::run_allreduce_hook( - c10d::GradBucket& grad_bucket) { + c10d::GradBucket& grad_bucket) +{ c10d::_AllReduceBySumCommHook allreduce_hook(process_group_); return allreduce_hook.runHook(grad_bucket); } -void Reducer::all_reduce_bucket(Bucket& bucket) { +void Reducer::all_reduce_bucket(Bucket& bucket) +{ std::vector tensors; tensors.reserve(bucket.replicas.size()); for (const auto& replica : bucket.replicas) { @@ -833,7 +858,8 @@ void Reducer::all_reduce_bucket(Bucket& bucket) { std::vector Reducer::get_variables_for_bucket( size_t bucket_index, - const Bucket& bucket) const { + const Bucket& bucket) const +{ // Check if we have cached mapping previously. 
if (has_rebuilt_bucket_ && cached_variables_for_bucket_.find(bucket_index) != @@ -865,7 +891,8 @@ std::vector Reducer::get_variables_for_bucket( } // Called when the bucket at the specified index is ready to be reduced. -void Reducer::mark_bucket_ready(size_t bucket_index) { +void Reducer::mark_bucket_ready(size_t bucket_index) +{ TORCH_INTERNAL_ASSERT(bucket_index >= next_bucket_, DIST_ERROR(ErrCode::PARAM)); @@ -890,7 +917,8 @@ void Reducer::mark_bucket_ready(size_t bucket_index) { } } -void Reducer::install_futures(c10::List> futs) { +void Reducer::install_futures(c10::List> futs) +{ // Append instead of overwrite so that this method can be called multiple // times in one iteration. if (!installed_futures_) { @@ -902,7 +930,8 @@ void Reducer::install_futures(c10::List> void Reducer::initialize_buckets( std::vector> bucket_indices, - std::vector per_bucket_sizes) { + std::vector per_bucket_sizes) +{ // If initialize_buckets is called inside DDP constructor, then // it does not matter rpc context ptr is nullptr or not, as grad // will not be mutated. @@ -1074,7 +1103,8 @@ void Reducer::initialize_buckets( // (see Note: "Gradient Layout Contract" in initialize_buckets). void Reducer::initialize_bucket_views( Reducer::BucketReplica& replica, - at::Tensor& contents) { + at::Tensor& contents) +{ for (const auto i : c10::irange(replica.variables.size())) { auto& v = replica.variables[i]; const auto offset = replica.offsets[i]; @@ -1130,7 +1160,8 @@ void Reducer::populate_bucket_views_out( } } -void Reducer::prepare_for_forward() { +void Reducer::prepare_for_forward() +{ std::lock_guard lock(mutex_); num_iterations_++; if (should_collect_runtime_stats()) { @@ -1138,7 +1169,8 @@ void Reducer::prepare_for_forward() { } } -void Reducer::reset_bucket_counting() { +void Reducer::reset_bucket_counting() +{ next_bucket_ = 0; // Reset num_buckets_ready_ at the beginning of backward computation // in each iteration. @@ -1163,7 +1195,8 @@ void Reducer::reset_bucket_counting() { // done immediately because the model output may be ignored, and we only // want to start performing reductions on `torch.autograd.backward()`. void Reducer::search_unused_parameters( - const std::vector& outputs) { + const std::vector& outputs) +{ std::unordered_set seen; std::vector queue; @@ -1243,7 +1276,8 @@ void Reducer::search_unused_parameters( } void Reducer::prepare_for_backward( - const std::vector& outputs) { + const std::vector& outputs) +{ std::lock_guard lock(mutex_); backward_compute_start_time_ = current_time_in_nanos(); @@ -1279,7 +1313,8 @@ void Reducer::copy_bucket_to_grad( torch::autograd::Variable& variable, Reducer::BucketReplica& replica, size_t intra_bucket_index, - bool global_unused) { + bool global_unused) +{ const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; runGradCallbackForVariable(variable, [&](auto& grad) { // If a parameter is globally unused, we keep its grad untouched. 
@@ -1302,7 +1337,8 @@ void Reducer::copy_bucket_to_grad( }); } -std::vector Reducer::getUnmarkedParamsForIteration() { +std::vector Reducer::getUnmarkedParamsForIteration() +{ std::vector unMarkedParamNames; for (const auto& it : param_names_) { if (perIterationReadyParams_.find(it.first) == @@ -1313,7 +1349,8 @@ std::vector Reducer::getUnmarkedParamsForIteration() { return unMarkedParamNames; } -std::vector Reducer::getUnmarkedParamIndicesForIteration() { +std::vector Reducer::getUnmarkedParamIndicesForIteration() +{ std::vector unmarked_param_indices; const auto variable_count = params_.size(); for (const auto variable_index : c10::irange(variable_count)) { @@ -1326,7 +1363,8 @@ std::vector Reducer::getUnmarkedParamIndicesForIteration() { } // A bucket with one or more dense tensors needs to be unflattened. -void Reducer::finalize_bucket_dense(Bucket& bucket) { +void Reducer::finalize_bucket_dense(Bucket& bucket) +{ size_t replica_index = 0; auto& replica = bucket.replicas[replica_index]; for (const auto intra_bucket_index : c10::irange(replica.variables.size())) { @@ -1415,7 +1453,8 @@ void Reducer::finalize_bucket_dense(Bucket& bucket) { } } -void Reducer::finalize_backward() { +void Reducer::finalize_backward() +{ // No longer expect autograd hooks to fire after this function returns. TORCH_INTERNAL_ASSERT(expect_autograd_hooks_, DIST_ERROR(ErrCode::INTERNAL)); @@ -1510,7 +1549,8 @@ void Reducer::finalize_backward() { void Reducer::runGradCallbackForVariable( at::Tensor& variable, - GradCallback&& cb) { + GradCallback&& cb) +{ #ifdef _WIN32 cb(variable.mutable_grad()); #else @@ -1525,7 +1565,8 @@ void Reducer::runGradCallbackForVariable( } #ifndef _WIN32 -void Reducer::RpcContext::set(ContextPtr&& new_context_ptr) { +void Reducer::RpcContext::set(ContextPtr&& new_context_ptr) +{ // We should set 'new_context_ptr' even if it's nullptr. That means the // reducer is under a local backward run. const auto new_context_raw_ptr = new_context_ptr.get(); @@ -1539,7 +1580,8 @@ void Reducer::RpcContext::set(ContextPtr&& new_context_ptr) { #endif void Reducer::sync_bucket_indices( - std::vector>& bucket_indices) { + std::vector>& bucket_indices) +{ auto num_buckets = bucket_indices.size(); std::vector bucket_sizes; bucket_sizes.reserve(num_buckets); @@ -1611,7 +1653,8 @@ void Reducer::sync_bucket_indices( } } -bool Reducer::rebuild_buckets() { +bool Reducer::rebuild_buckets() +{ // Ensure reduction for previous backwards pass is finished. If user's model // has unused parameters for example, this will raise an error recommending to // run with find_unused_parameters=True, instead of the size mismatch @@ -1693,7 +1736,8 @@ bool Reducer::rebuild_buckets() { } // See Note [DDP Communication Hook] -void Reducer::register_comm_hook(std::unique_ptr iface) { +void Reducer::register_comm_hook(std::unique_ptr iface) +{ REDUCER_CHECK( comm_hook_ == nullptr, logger_, @@ -1704,7 +1748,8 @@ void Reducer::register_comm_hook(std::unique_ptr iface) } // See Note [DDP Communication Hook] -void Reducer::register_builtin_comm_hook(c10d::BuiltinCommHookType comm_hook_type) { +void Reducer::register_builtin_comm_hook(c10d::BuiltinCommHookType comm_hook_type) +{ REDUCER_CHECK( comm_hook_ == nullptr, logger_, @@ -1725,7 +1770,8 @@ void Reducer::register_builtin_comm_hook(c10d::BuiltinCommHookType comm_hook_typ } } -void Reducer::ensure_prior_reduction_finished() { +void Reducer::ensure_prior_reduction_finished() +{ // Check that any prior reduction has finished. 
// The variable `require_finalize_` is true until all gradients // have been computed and reduction of all buckets has been kicked off. @@ -1823,15 +1869,18 @@ void Reducer::ensure_prior_reduction_finished() { } } -void Reducer::set_ddp_runtime_logging_sample_rate(int sample_rate) { +void Reducer::set_ddp_runtime_logging_sample_rate(int sample_rate) +{ ddp_runtime_logging_sample_rate_ = sample_rate; } -int Reducer::get_ddp_runtime_logging_sample_rate() { +int Reducer::get_ddp_runtime_logging_sample_rate() +{ return ddp_runtime_logging_sample_rate_; } -bool Reducer::should_collect_runtime_stats() { +bool Reducer::should_collect_runtime_stats() +{ if (num_iterations_ > 0 && (num_iterations_ <= 10 || num_iterations_ % get_ddp_runtime_logging_sample_rate() == 0)) { @@ -1840,37 +1889,43 @@ bool Reducer::should_collect_runtime_stats() { return false; } -void Reducer::record_forward_compute_start_time() { +void Reducer::record_forward_compute_start_time() +{ if (timer_) { timer_->record(Timer::Event::kForwardStart); } } -void Reducer::record_backward_compute_start_time() { +void Reducer::record_backward_compute_start_time() +{ if (timer_) { timer_->record(Timer::Event::kBackwardComputeStart); } } -void Reducer::record_backward_compute_end_time() { +void Reducer::record_backward_compute_end_time() +{ if (timer_) { timer_->record(Timer::Event::kBackwardComputeEnd); } } -void Reducer::record_backward_comm_start_time() { +void Reducer::record_backward_comm_start_time() +{ if (timer_) { timer_->record(Timer::Event::kBackwardCommStart); } } -void Reducer::record_backward_comm_end_time() { +void Reducer::record_backward_comm_end_time() +{ if (timer_) { timer_->record(Timer::Event::kBackwardCommEnd); } } -void Reducer::set_static_graph() { +void Reducer::set_static_graph() +{ std::lock_guard lock(mutex_); REDUCER_CHECK( num_iterations_ == 0, @@ -1901,7 +1956,8 @@ struct BucketKey { } }; -inline bool operator==(const BucketKey& lhs, const BucketKey& rhs) { +inline bool operator==(const BucketKey& lhs, const BucketKey& rhs) +{ return lhs.type == rhs.type && lhs.device == rhs.device; } @@ -1912,7 +1968,8 @@ std::tuple>, std::vector> compute_bucket const std::vector& bucket_size_limits, const std::vector& expect_sparse_gradient, const std::vector& tensor_indices, - const c10::optional>& logger) { + const c10::optional>& logger) +{ // Either expect_sparse_gradient is not specified or it has as many elements // as the vector with tensors. TORCH_INTERNAL_ASSERT(expect_sparse_gradient.empty() || @@ -2036,7 +2093,8 @@ std::tuple>, std::vector> compute_bucket void verify_params_across_processes( const c10::intrusive_ptr& process_group, const std::vector& params, - const c10::optional>& logger) { + const c10::optional>& logger) +{ size_t i = 0; for (const auto& t : params) { i += static_cast(2 * t.dim()); diff --git a/torch_npu/csrc/distributed/reducer.hpp b/torch_npu/csrc/distributed/reducer.hpp index e6c59e5a78..f25cab26ee 100644 --- a/torch_npu/csrc/distributed/reducer.hpp +++ b/torch_npu/csrc/distributed/reducer.hpp @@ -153,7 +153,8 @@ public: // Returns the relative time in nanoseconds when gradients were ready, // with respect to the time `prepare_for_backward` was called. The // vector is for parameters for a single model replica. - std::vector get_backward_stats() const { + std::vector get_backward_stats() const + { return backward_stats_; } @@ -194,7 +195,8 @@ public: // Returns true if we should rebuild buckets, else false. 
We only rebuild // buckets once after the first iteration and never rebuild them if // find_unused_parameters_. - inline bool should_rebuild_buckets() const { + inline bool should_rebuild_buckets() const + { return (static_graph_ || !find_unused_parameters_) && !has_rebuilt_bucket_; } diff --git a/torch_npu/csrc/distributed/reducer_npu.cpp b/torch_npu/csrc/distributed/reducer_npu.cpp index 0cb0ca7a46..1e80f93f4f 100644 --- a/torch_npu/csrc/distributed/reducer_npu.cpp +++ b/torch_npu/csrc/distributed/reducer_npu.cpp @@ -14,14 +14,16 @@ class NpuTimer : public c10d::Timer { public: explicit NpuTimer(c10::Device dev) : device(dev) {} - void record(Event event) override { + void record(Event event) override + { // Parent class sets the host-side time Timer::record(event); c10_npu::NPUGuard g(device); getEvent(event).record(); } - c10::optional measureDifference(Event start, Event end) override { + c10::optional measureDifference(Event start, Event end) override + { // Currently elapsed_time does not support the return of negative values. // So measureDifference is only calculated when the debug level is detail. if (debug_level() != DebugLevel::Detail) { @@ -73,7 +75,8 @@ private: c10_npu::NPUEvent backward_comm_start = c10_npu::NPUEvent(ACL_EVENT_TIME_LINE); c10_npu::NPUEvent backward_comm_end = c10_npu::NPUEvent(ACL_EVENT_TIME_LINE); - c10_npu::NPUEvent& getEvent(Event event) { + c10_npu::NPUEvent& getEvent(Event event) + { switch (event) { case Event::kForwardStart: return forward_start; -- Gitee From 660a5f174a67e1c3a22026c72af2f831e9d308de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Sat, 15 Mar 2025 07:49:34 +0000 Subject: [PATCH 168/358] =?UTF-8?q?!19006=20cleancode=20Merge=20pull=20req?= =?UTF-8?q?uest=20!19006=20from=20=E6=9D=9C=E9=87=91=E8=88=AA/v2.6.0-clean?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/framework/FormatHelper.cpp | 6 +- torch_npu/csrc/framework/InferFormat.cpp | 2 +- torch_npu/csrc/framework/NPUDefine.h | 23 +- torch_npu/csrc/framework/OpCmdHelper.h | 2 +- torch_npu/csrc/framework/OpCommand.cpp | 283 +++++++++--------- torch_npu/csrc/framework/OpCommand.h | 10 +- torch_npu/csrc/framework/OpParamMaker.cpp | 24 +- torch_npu/csrc/framework/OpParamMaker.h | 2 - .../csrc/framework/StorageDescHelper.cpp | 14 +- torch_npu/csrc/framework/StorageDescHelper.h | 4 +- 10 files changed, 181 insertions(+), 189 deletions(-) diff --git a/torch_npu/csrc/framework/FormatHelper.cpp b/torch_npu/csrc/framework/FormatHelper.cpp index 5dbe462877..ac1610b020 100644 --- a/torch_npu/csrc/framework/FormatHelper.cpp +++ b/torch_npu/csrc/framework/FormatHelper.cpp @@ -253,7 +253,7 @@ FormatShape InferShapeNDToNZ(c10::IntArrayRef dims, size_t itemsize) FormatShape res; // sum(keepdim = false) may make tensor dim = 0 FormatShape dim; - for (int i = 0; i < dims.size(); i++) { + for (size_t i = 0; i < dims.size(); i++) { dim.emplace_back(dims[i]); } @@ -265,7 +265,7 @@ FormatShape InferShapeNDToNZ(c10::IntArrayRef dims, size_t itemsize) dim.emplace_back(1); } - int i = 0; + size_t i = 0; for (; i < dim.size() - 2; i++) { res.emplace_back(dim[i]); } @@ -483,7 +483,7 @@ FormatShape InferShapeofND(c10::IntArrayRef dims, size_t itemsize) { FormatShape res; res.resize(dims.size()); - for (int j = 0; j < dims.size(); j++) { + for (size_t j = 0; j < dims.size(); j++) { res[j] = dims[j]; } return res; diff --git a/torch_npu/csrc/framework/InferFormat.cpp 
b/torch_npu/csrc/framework/InferFormat.cpp index c405adcb65..8af918ed15 100644 --- a/torch_npu/csrc/framework/InferFormat.cpp +++ b/torch_npu/csrc/framework/InferFormat.cpp @@ -1,8 +1,8 @@ #include "torch_npu/csrc/core/npu/register/OptionsManager.h" -#include "torch_npu/csrc/framework/InferFormat.h" #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/framework/InferFormat.h" namespace at_npu { namespace native { diff --git a/torch_npu/csrc/framework/NPUDefine.h b/torch_npu/csrc/framework/NPUDefine.h index 59adb09eec..17196d5591 100644 --- a/torch_npu/csrc/framework/NPUDefine.h +++ b/torch_npu/csrc/framework/NPUDefine.h @@ -9,13 +9,7 @@ namespace at_npu { namespace native { struct ACL_PARAMS { - ACL_PARAMS() - { - input_desc = nullptr; - input_data_buf = nullptr; - output_desc = nullptr; - output_data_buf = nullptr; - } + ACL_PARAMS() : input_desc(nullptr), input_data_buf(nullptr), output_desc(nullptr), output_data_buf(nullptr) {} int input_num{0}; const aclTensorDesc **input_desc; @@ -27,19 +21,10 @@ struct ACL_PARAMS { struct ACL_DYNAMIC_PARAMS { ACL_DYNAMIC_PARAMS() + : input_desc(nullptr), input_data_buf(nullptr), output_desc(nullptr), output_data_buf(nullptr), + inputDims(nullptr), outputDims(nullptr), inputFormats(nullptr), outputFormats(nullptr), + compile_input_desc(nullptr), compile_output_desc(nullptr), hasAttr(false) { - input_desc = nullptr; - input_data_buf = nullptr; - output_desc = nullptr; - output_data_buf = nullptr; - inputDims = nullptr; - outputDims = nullptr; - inputFormats = nullptr; - outputFormats = nullptr; - compile_input_desc = nullptr; - compile_output_desc = nullptr; - - hasAttr = false; } int input_num = 0; diff --git a/torch_npu/csrc/framework/OpCmdHelper.h b/torch_npu/csrc/framework/OpCmdHelper.h index a4bee5629e..0f320f4d34 100644 --- a/torch_npu/csrc/framework/OpCmdHelper.h +++ b/torch_npu/csrc/framework/OpCmdHelper.h @@ -23,7 +23,7 @@ public: static std::tuple CovertNPUTensorWithZeroDimToAclInput(const at::Tensor &tensor, const string &descName); - static std::tuple CovertScalarToAclInput(const at::Tensor &tensor, + static std::tuple CovertScalarToAclInput(const at::Tensor &aclInput, at::ScalarType type); static std::tuple CovertToAclOutput(const at::Tensor &tensor, diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index ecaadcb804..01b894f0d0 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -43,95 +43,85 @@ OpCommand::OpCommand() aclCmd->SetCustomHandler(nullptr); } -OpCommand& OpCommand::Name(const string &name) { +OpCommand& OpCommand::Name(const string &name) +{ aclCmd->SetName(name); return *this; } -OpCommand& OpCommand::SetCustomHandler(PROC_FUNC func) { - aclCmd->SetCustomHandler(func); - return *this; +OpCommand& OpCommand::SetCustomHandler(PROC_FUNC func) +{ + aclCmd->SetCustomHandler(func); + return *this; } -OpCommand& OpCommand::DynamicInputReg( - DynamicInputRegFunc func, - DyNumAndIndex num_and_index) { - return *this; -} +OpCommand& OpCommand::DynamicInputReg(DynamicInputRegFunc func, DyNumAndIndex num_and_index) { return *this; } -OpCommand& OpCommand::Expect(UnifiedResult unified_result) { - commonType = unified_result.common_type; - resultTypeDefined = unified_result.result_type_defined; - commonShape = unified_result.common_shape; - return *this; +OpCommand& OpCommand::Expect(UnifiedResult unified_result) +{ + commonType = 
unified_result.common_type; + resultTypeDefined = unified_result.result_type_defined; + commonShape = unified_result.common_shape; + return *this; } -OpCommand& OpCommand::Input() { - return AddNoneTensor(); -} +OpCommand& OpCommand::Input() { return AddNoneTensor(); } -OpCommand& OpCommand::Input( - const at::Tensor &input, - const string &descName, - const c10::optional &sensitive_format, - const string &realData) { - return AddTensorInput( - Contiguous(input), c10::ScalarType::Undefined, descName, realData); +OpCommand& OpCommand::Input(const at::Tensor &input, const string &descName, + const c10::optional &sensitive_format, const string &realData) +{ + return AddTensorInput(Contiguous(input), c10::ScalarType::Undefined, descName, realData); } -OpCommand& OpCommand::InputWithoutContiguous( - const at::Tensor &input, - const string &descName, - const string &realData) { - if (input.storage_offset() != 0) { - TORCH_NPU_WARN_ONCE( - "[Check][offset] Check input storage_offset[%ld] = 0 failed, result is untrustworthy", - input.storage_offset()); - } - return AddTensorInput(const_cast(input)); +OpCommand& OpCommand::InputWithoutContiguous(const at::Tensor &input, const string &descName, const string &realData) +{ + if (input.storage_offset() != 0) { + TORCH_NPU_WARN_ONCE("[Check][offset] Check input storage_offset[%ld] = 0 failed, result is untrustworthy", + input.storage_offset()); + } + return AddTensorInput(const_cast(input)); } -OpCommand& OpCommand::Input(const c10::IntArrayRef &dimListRef, at::ScalarType toType, - CompileType compileType, const string& realDtype, const string& descName) { - return Input(dimListRef, dimListRef.size(), toType, compileType, realDtype, descName); +OpCommand& OpCommand::Input(const c10::IntArrayRef &dimListRef, at::ScalarType toType, CompileType compileType, + const string &realDtype, const string &descName) +{ + return Input(dimListRef, dimListRef.size(), toType, compileType, realDtype, descName); } -OpCommand& OpCommand::Input(const c10::ArrayRef &dimListRef, at::IntArrayRef realShape, - at::ScalarType toType, CompileType compileType, const string& realDtype) { - return Input(dimListRef, realShape, toType, compileType, realDtype); +OpCommand& OpCommand::Input(const c10::ArrayRef &dimListRef, at::IntArrayRef realShape, at::ScalarType toType, + CompileType compileType, const string &realDtype) +{ + return Input(dimListRef, realShape, toType, compileType, realDtype); } -OpCommand& OpCommand::Input(const c10::Scalar &input, const at::ScalarType type, - CompileType compileType) { - const auto &scalarTensor = CreateScalarTensor(input, type); - return AddHostTensorInput(scalarTensor, compileType); +OpCommand& OpCommand::Input(const c10::Scalar &input, const at::ScalarType type, CompileType compileType) +{ + const auto &scalarTensor = CreateScalarTensor(input, type); + return AddHostTensorInput(scalarTensor, compileType); } OpCommand& OpCommand::Inputs(const at::TensorList &inputs) { - for (auto &input : inputs) - { - this->Input(input); - } - return *this; + for (auto &input : inputs) { + this->Input(input); + } + return *this; } -OpCommand& OpCommand::InputScalarToNPUTensor( - const c10::Scalar& input, - const at::ScalarType type) { - return AddScalarInput(input, type); +OpCommand& OpCommand::InputScalarToNPUTensor(const c10::Scalar &input, const at::ScalarType type) +{ + return AddScalarInput(input, type); } -OpCommand& OpCommand::Output( - at::Tensor &output, - const string &descName, - const c10::optional &sensitive_format, - const string &realType) { - 
outputTensor.emplace_back(output); - return AddOutput(output, realType); +OpCommand& OpCommand::Output(at::Tensor &output, const string &descName, + const c10::optional &sensitive_format, const string &realType) +{ + outputTensor.emplace_back(output); + return AddOutput(output, realType); } -void OpCommand::Run() { +void OpCommand::Run() +{ // Check for npu graph if (c10_npu::is_stream_capturing.load() && aclCmd->CheckCustomHandlerNull()) { c10_npu::assertNotCapturing("Cannot run aclop operators"); @@ -222,25 +212,26 @@ void OpCommand::RunOpApi(const string &op_name, PROC_FUNC func, bool sync) } } -OpCommand& OpCommand::Sync(c10::SmallVector &index) { - sync_index = index; - if (!index.empty()) { - sync = true; - } - return *this; +OpCommand& OpCommand::Sync(c10::SmallVector &index) +{ + sync_index = index; + if (!index.empty()) { + sync = true; + } + return *this; } -OpCommand& OpCommand::Sync() { - c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream(); - NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(stream)); - return *this; +OpCommand& OpCommand::Sync() +{ + c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream(); + NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeStreamWithTimeout(stream)); + return *this; } -OpCommand& OpCommand::AddTensorInput(at::Tensor &tensor, - at::ScalarType forceScaleType, - const string &descName, - const string &realData) { - std::tuple res; +OpCommand& OpCommand::AddTensorInput(at::Tensor &tensor, at::ScalarType forceScaleType, const string &descName, + const string &realData) +{ + std::tuple res; if (commonType.has_value() && commonType.value() != tensor.scalar_type()) { tensor = custom_ops::npu_dtype_cast(tensor, commonType.value()); } @@ -262,37 +253,42 @@ OpCommand& OpCommand::AddHostTensorInput( const at::Tensor &tensor, CompileType compileType, const string& realDtype, - const string& descName) { - std::tuple res = OpCmdHelper::CovertHostTensorToAclInput( - tensor, - tensor.scalar_type(), - compileType, - realDtype, - descName); - aclCmd->AddInput(std::get<0>(res), std::get<1>(res), tensor); - return *this; + const string& descName) +{ + std::tuple res = + OpCmdHelper::CovertHostTensorToAclInput( + tensor, + tensor.scalar_type(), + compileType, + realDtype, + descName); + aclCmd->AddInput(std::get<0>(res), std::get<1>(res), tensor); + return *this; } -OpCommand& OpCommand::AddNoneTensor() { - AclTensorDescMaker desc; - auto aclDesc = desc.Create(ACL_DT_UNDEFINED, ACL_FORMAT_UNDEFINED).Get(); - AclTensorBufferMaker buffer(nullptr, 0); - aclCmd->AddInput(aclDesc, buffer.Get()); - return *this; +OpCommand& OpCommand::AddNoneTensor() +{ + AclTensorDescMaker desc; + auto aclDesc = desc.Create(ACL_DT_UNDEFINED, ACL_FORMAT_UNDEFINED).Get(); + AclTensorBufferMaker buffer(nullptr, 0); + aclCmd->AddInput(aclDesc, buffer.Get()); + return *this; } -OpCommand& OpCommand::AddScalarInput(const c10::Scalar& input, at::ScalarType type) { - at::ScalarType type_bk = type; - if (commonType.has_value()) { - type_bk = commonType.value(); - } - at::Tensor aclInput = CopyHostToDevice(input, type_bk); - auto res = OpCmdHelper::CovertScalarToAclInput(aclInput, type_bk); - aclCmd->AddInput(std::get<0>(res), std::get<1>(res)); - return *this; +OpCommand& OpCommand::AddScalarInput(const c10::Scalar& input, at::ScalarType type) +{ + at::ScalarType type_bk = type; + if (commonType.has_value()) { + type_bk = commonType.value(); + } + at::Tensor aclInput = CopyHostToDevice(input, type_bk); + auto res = OpCmdHelper::CovertScalarToAclInput(aclInput, type_bk); + 
aclCmd->AddInput(std::get<0>(res), std::get<1>(res)); + return *this; } -OpCommand& OpCommand::AddOutput(at::Tensor &output, const string &realType) { +OpCommand& OpCommand::AddOutput(at::Tensor &output, const string &realType) +{ if (resultTypeDefined == false && commonType.has_value() && commonType.value() != output.scalar_type()) { output = custom_ops::npu_dtype_cast(output, commonType.value()); } @@ -303,17 +299,20 @@ OpCommand& OpCommand::AddOutput(at::Tensor &output, const string &realType) { // 由于format_contiguous会生成新Tensor,为了保证其在生命周期内有效,故而放到对象中存储 // 同下,CopyScalarToDevice也有同样问题 -at::Tensor& OpCommand::Contiguous(const at::Tensor &input) { - storage.emplace_back(std::move(NpuUtils::format_contiguous_add_copy_optimize(input))); - return storage.back(); +at::Tensor& OpCommand::Contiguous(const at::Tensor &input) +{ + storage.emplace_back(std::move(NpuUtils::format_contiguous_add_copy_optimize(input))); + return storage.back(); } -at::Tensor OpCommand::CopyHostToDevice(const c10::Scalar& scalar, at::ScalarType type) { - auto tensor = scalar_to_tensor(scalar).to(type); - return CopyHostToDevice(tensor); +at::Tensor OpCommand::CopyHostToDevice(const c10::Scalar& scalar, at::ScalarType type) +{ + auto tensor = scalar_to_tensor(scalar).to(type); + return CopyHostToDevice(tensor); } -at::Tensor OpCommand::CopyHostToDevice(const at::Tensor& cpuTensor) { +at::Tensor OpCommand::CopyHostToDevice(const at::Tensor& cpuTensor) +{ at::Tensor cpuPinMemTensor = cpuTensor.pin_memory(); int deviceIndex = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&deviceIndex)); @@ -327,43 +326,47 @@ at::Tensor OpCommand::CopyHostToDevice(const at::Tensor& cpuTensor) { } at::Tensor& OpCommand::CreateHostTensor( - void *data, at::IntArrayRef size, + void *data, + at::IntArrayRef size, const c10::TensorOptions &options, - at::ScalarType toType) { - at::ScalarType dtype = options.dtype().toScalarType(); - auto cpuTensor = at::empty(size, options); - std::memcpy(cpuTensor.data_ptr(), data, elementSize(dtype) * cpuTensor.numel()); - if (toType != dtype) { - cpuTensor = cpuTensor.to(toType); - } - - storage.emplace_back(std::move(cpuTensor)); - return storage.back(); + at::ScalarType toType) +{ + at::ScalarType dtype = options.dtype().toScalarType(); + auto cpuTensor = at::empty(size, options); + std::memcpy(cpuTensor.data_ptr(), data, elementSize(dtype) * cpuTensor.numel()); + if (toType != dtype) { + cpuTensor = cpuTensor.to(toType); + } + + storage.emplace_back(std::move(cpuTensor)); + return storage.back(); } -bool OpCommand::ScalarIsInLimits(const c10::Scalar &scalar, at::ScalarType type) { - bool scalar_flag = false; - if (at::isFloatingType(type)) { - auto value = scalar.to(); - scalar_flag = value <= floating_limits_map[type][0] && value >= floating_limits_map[type][1]; - } else if (at::isIntegralType(type)) { - auto value = scalar.to(); - scalar_flag = value <= integral_limits_map[type][0] && value >= integral_limits_map[type][1]; - } - return scalar_flag; +bool OpCommand::ScalarIsInLimits(const c10::Scalar &scalar, at::ScalarType type) +{ + bool scalar_flag = false; + if (at::isFloatingType(type)) { + auto value = scalar.to(); + scalar_flag = value <= floating_limits_map[type][0] && value >= floating_limits_map[type][1]; + } else if (at::isIntegralType(type)) { + auto value = scalar.to(); + scalar_flag = value <= integral_limits_map[type][0] && value >= integral_limits_map[type][1]; + } + return scalar_flag; } -at::Tensor& OpCommand::CreateScalarTensor(const c10::Scalar &scalar, at::ScalarType type) { - if 
(commonType.has_value()) { - type = commonType.value(); - } - - if (ScalarIsInLimits(scalar, type)) { - storage.emplace_back(at::detail::scalar_tensor_static(scalar, type, at::kCPU)); - } else { - storage.emplace_back(scalar_to_tensor(scalar).to(type)); - } - return storage.back(); +at::Tensor& OpCommand::CreateScalarTensor(const c10::Scalar &scalar, at::ScalarType type) +{ + if (commonType.has_value()) { + type = commonType.value(); + } + + if (ScalarIsInLimits(scalar, type)) { + storage.emplace_back(at::detail::scalar_tensor_static(scalar, type, at::kCPU)); + } else { + storage.emplace_back(scalar_to_tensor(scalar).to(type)); + } + return storage.back(); } } // namespace native diff --git a/torch_npu/csrc/framework/OpCommand.h b/torch_npu/csrc/framework/OpCommand.h index c0bca85698..9144b5ddba 100644 --- a/torch_npu/csrc/framework/OpCommand.h +++ b/torch_npu/csrc/framework/OpCommand.h @@ -15,11 +15,11 @@ namespace native { // get common dtype and shape from op adapter layer struct UnifiedResult { - c10::optional common_type = c10::nullopt; - c10::optional common_shape = c10::nullopt; - // judge result tensor's dtype is defined or not. - // if result's dtype is defined, result_type_defined is true and result's dtype remains unchanged. - bool result_type_defined = false; + c10::optional common_type = c10::nullopt; + c10::optional common_shape = c10::nullopt; + // judge result tensor's dtype is defined or not. + // if result's dtype is defined, result_type_defined is true and result's dtype remains unchanged. + bool result_type_defined = false; }; class TORCH_NPU_API OpCommand { diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index fab27ad75a..90c4fc4fed 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -1,17 +1,17 @@ #include -#include "torch_npu/csrc/core/npu/NPUQueue.h" #include #include "torch_npu/csrc/core/npu/CachingHostAllocator.h" #include "torch_npu/csrc/core/npu/NPUEventManager.h" +#include "torch_npu/csrc/core/npu/NPUQueue.h" #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" +#include "torch_npu/csrc/distributed/HCCLUtils.hpp" +#include "torch_npu/csrc/framework/OpCmdHelper.h" +#include "torch_npu/csrc/framework/OpParamMaker.h" #include "torch_npu/csrc/framework/aoe/AoeUtils.h" +#include "torch_npu/csrc/framework/interface/HcclInterface.h" #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/framework/OpParamMaker.h" -#include "torch_npu/csrc/framework/OpCmdHelper.h" -#include "torch_npu/csrc/framework/interface/HcclInterface.h" -#include "torch_npu/csrc/distributed/HCCLUtils.hpp" #ifndef BUILD_LIBTORCH #include @@ -19,6 +19,9 @@ namespace at_npu { namespace native { + +static bool deterministicaclnn_oldstatus = false; + void OpAttrMaker::Set(aclopAttr *attr, const string &name, bool value) { aclopSetAttrBool(attr, name.c_str(), value); @@ -100,7 +103,8 @@ inline void SetDeterministicOption(bool deterministicAlgorithmsStatus, bool isOp { if (deterministicaclnn_oldstatus != deterministicAlgorithmsStatus) { if (!isOpapi) { - NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_DETERMINISTIC, deterministicAlgorithmsStatus ? "1" : "0")); + NPU_CHECK_ERROR( + AclSetCompileopt(aclCompileOpt::ACL_OP_DETERMINISTIC, deterministicAlgorithmsStatus ? "1" : "0")); } NPU_CHECK_ERROR( AclrtCtxSetSysParamOpt(aclSysParamOpt::ACL_OPT_DETERMINISTIC, deterministicAlgorithmsStatus ? 
1 : 0)); @@ -131,7 +135,7 @@ void OpCommandImpl::Run( ASCEND_LOGD("Op %s Run.", opName.c_str()); RECORD_FUNCTION(opName, std::vector({})); #ifndef BUILD_LIBTORCH - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { // we need to release GIL for NPU to compile op. Py_BEGIN_ALLOW_THREADS; ACL_REQUIRE_OK_OP(InnerRun(opName, execParam, sync, sync_index, outputTensor), opName.c_str()); @@ -149,7 +153,7 @@ void OpCommandImpl::RunOpApi(const string &op_name, PROC_FUNC func) ASCEND_LOGD("Op %s Run.", op_name.c_str()); RECORD_FUNCTION(op_name, std::vector({})); #ifndef BUILD_LIBTORCH - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { // we need to release GIL for NPU to compile op. Py_BEGIN_ALLOW_THREADS; ACL_REQUIRE_OK_OP(InnerRunOpApi(op_name, func), op_name.c_str()); @@ -225,7 +229,7 @@ aclError OpCommandImpl::InnerRun( params.attr, ACL_ENGINE_SYS, ACL_COMPILE_SYS, - NULL, + nullptr, stream); OPS_CHECK_ERROR(ret, name.c_str()); } else { @@ -241,7 +245,7 @@ aclError OpCommandImpl::InnerRun( params.attr, ACL_ENGINE_SYS, ACL_COMPILE_SYS, - NULL, + nullptr, stream); OPS_CHECK_ERROR(ret, name.c_str()); for (size_t i = 0; i < sync_index.size(); i++) { diff --git a/torch_npu/csrc/framework/OpParamMaker.h b/torch_npu/csrc/framework/OpParamMaker.h index 45921f36d0..4d13a9d532 100644 --- a/torch_npu/csrc/framework/OpParamMaker.h +++ b/torch_npu/csrc/framework/OpParamMaker.h @@ -402,8 +402,6 @@ private: void SetDeterministic(bool isOpapi = true); void SetDeterministicOps(bool deterministicAlgorithmsStatus); -static bool deterministicaclnn_oldstatus = false; - } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp index 24f0f83c4d..ea47f6620e 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.cpp +++ b/torch_npu/csrc/framework/StorageDescHelper.cpp @@ -1,9 +1,10 @@ +#include + #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/InferFormat.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" -#include namespace at_npu { namespace native { @@ -99,8 +100,8 @@ void StorageDescHelper::SetDesc(at::Tensor &dst, const c10::IntArrayRef& size, c bool StorageDescHelper::CheckDescInit(const c10::Storage &storage) { - return ACL_FORMAT_UNDEFINED != - torch_npu::NPUBridge::GetNpuStorageImpl(storage.unsafeGetStorageImpl())->npu_desc_.origin_format_; + return torch_npu::NPUBridge::GetNpuStorageImpl(storage.unsafeGetStorageImpl())->npu_desc_.origin_format_ != + ACL_FORMAT_UNDEFINED; } void StorageDescHelper::GetDescForSerialization(const at::Tensor &tensor, @@ -163,7 +164,8 @@ void StorageDescHelper::SetDescForSerialization(const at::Tensor &tensor, auto str_to_small_vector = [](std::string str) -> c10::SmallVector { int start = 0; - while ((start < static_cast(str.size())) && (str[start++] != '/')); + while ((start < static_cast(str.size())) && (str[start++] != '/')) { + } int end = start; c10::SmallVector vec; while (end < static_cast(str.size())) { @@ -253,9 +255,9 @@ torch_npu::NPUStorageDesc StorageDescHelper::SetDesc(const caffe2::TypeMeta &dty return npu_desc; } -int64_t StorageDescHelper::GetMemorySize(const torch_npu::NPUStorageDesc &desc) +int64_t StorageDescHelper::GetMemorySize(const torch_npu::NPUStorageDesc &dst) { - const auto &physical_size = FormatHelper::GetStorageSizes(desc); + const auto &physical_size = FormatHelper::GetStorageSizes(dst); return 
c10::multiply_integers(physical_size); } diff --git a/torch_npu/csrc/framework/StorageDescHelper.h b/torch_npu/csrc/framework/StorageDescHelper.h index d6bf8ab506..6497ee1a88 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.h +++ b/torch_npu/csrc/framework/StorageDescHelper.h @@ -38,8 +38,8 @@ public: static bool CheckDescInit(const c10::Storage &storage); // For Serialization to Get and Set NpuStorageDesc - static void GetDescForSerialization(const at::Tensor &dst, std::unordered_map &desc_map); - static void SetDescForSerialization(const at::Tensor &dst, std::unordered_map &desc_map); + static void GetDescForSerialization(const at::Tensor &tensor, std::unordered_map &desc_map); + static void SetDescForSerialization(const at::Tensor &tensor, std::unordered_map &desc_map); static void CopyDesc(at::Tensor &dst, const at::Tensor &src); static void CopyDesc(at::Tensor &dst, const c10::Storage &src); -- Gitee From 94efe233a725a562ca796f2f0432444241b60948 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 15 Mar 2025 13:45:36 +0000 Subject: [PATCH 169/358] !19036 Update op_plugin commit id Merge pull request !19036 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 27834de325..04d2dfa19c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 27834de325c6a178940c490ce4f737f0e3c6a12d +Subproject commit 04d2dfa19c51839857ead5bf6bc15d7c6fd1eee0 -- Gitee From ba689e1172456ae6c591613263e1b21576a57d5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Mon, 17 Mar 2025 01:14:49 +0000 Subject: [PATCH 170/358] =?UTF-8?q?!19027=20cleancode=20Merge=20pull=20req?= =?UTF-8?q?uest=20!19027=20from=20=E6=9D=9C=E9=87=91=E8=88=AA/v2.6.0-clean?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/framework/utils/CalcuOpUtil.cpp | 24 +++++++++---------- torch_npu/csrc/framework/utils/CalcuOpUtil.h | 2 +- .../framework/utils/ForceJitCompileList.cpp | 7 +++--- .../framework/utils/ForceJitCompileList.h | 4 ++-- .../framework/utils/NpuStorageOffsetGuard.h | 2 +- torch_npu/csrc/framework/utils/NpuUtils.cpp | 9 +++---- torch_npu/csrc/framework/utils/NpuUtils.h | 5 ++-- .../csrc/framework/utils/OpPreparation.cpp | 24 +++++++++---------- .../csrc/framework/utils/OpPreparation.h | 6 ++--- 9 files changed, 40 insertions(+), 43 deletions(-) diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp index 6fb935b1d5..05cd1371ea 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.cpp @@ -1,25 +1,24 @@ #include +#include "third_party/acl/inc/acl/acl_base.h" +#include "third_party/acl/inc/acl/acl_rt.h" #include "torch_npu/csrc/aten/mirror/NPUMemoryOverlap.h" -#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" -#include "torch_npu/csrc/core/npu/NpuVariables.h" +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/npu/interface/AclInterface.h" #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" -#include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" +#include 
"torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/framework/InferFormat.h" -#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/contiguous/ReshapeOpt.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" #include "torch_npu/csrc/framework/interface/EnvVariables.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "third_party/acl/inc/acl/acl_base.h" -#include "third_party/acl/inc/acl/acl_rt.h" -#include "torch_npu/csrc/core/npu/NPUFunctions.h" namespace { constexpr float EPSILON = 1e-6; @@ -91,7 +90,7 @@ constexpr aclDataType kATenScalarTypeToAclDataTypeTable[static_cast(at: AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(ENUM_PAIR_FUNC) #undef DEFINE_ENUM -static std::map STRING_SCALAR_TYPE_TO_ACL_TYPE_MAP = { +static std::map STRING_SCALAR_TYPE_TO_ACL_TYPE_MAP = { {"uint16", ACL_UINT16}, {"uint8", ACL_UINT8}, {"uint64", ACL_UINT64}, {"string", ACL_STRING}}; aclError AclrtMemcpyAsyncParamCheck( @@ -119,7 +118,7 @@ aclDataType CalcuOpUtil::ConvertToAclDataType(const at::ScalarType &data_type) return acl_dtype; } -aclDataType CalcuOpUtil::ConvertToAclDataType(const at::ScalarType &data_type, const string &realDataType) +aclDataType CalcuOpUtil::ConvertToAclDataType(const at::ScalarType &data_type, const std::string &realDataType) { auto acl_dtype = kATenScalarTypeToAclDataTypeTable[static_cast(data_type)]; TORCH_CHECK(acl_dtype != ACL_DT_UNDEFINED, @@ -251,8 +250,9 @@ int64_t CalcuOpUtil::GetTensorNpuFormat(const at::Tensor &tensor) void CalcuOpUtil::CheckMemoryOverLaps(c10::ArrayRef inputs, c10::ArrayRef outputs) { for (const auto i : c10::irange(outputs.size())) { - if (!outputs[i].defined()) + if (!outputs[i].defined()) { continue; + } assert_no_internal_overlap(outputs[i]); @@ -273,7 +273,7 @@ float CalcuOpUtil::GetScalarFloatValue(const c10::Scalar &scalar) if (scalar.isFloatingPoint()) { value = scalar.toFloat(); } else { - value = (float)scalar.toInt(); + value = static_cast(scalar.toInt()); } return value; @@ -302,7 +302,7 @@ static std::unordered_map ACL_CUBE_MATH_TYPE_MAP = { int8_t CalcuOpUtil::GetCubeMathType(bool allowHf32) { bool allowFp32ToFp16 = native::env::IsAllowFP32ToFP16(); - uint8_t CubeMathTypeCode = ((uint8_t)allowHf32 << 1) + (uint8_t)allowFp32ToFp16; + uint8_t CubeMathTypeCode = (static_cast(allowHf32) << 1) + static_cast(allowFp32ToFp16); auto iter = ACL_CUBE_MATH_TYPE_MAP.find(CubeMathTypeCode); if (iter == ACL_CUBE_MATH_TYPE_MAP.end()) { return ALLOW_FP32_DOWN_PRECISION; diff --git a/torch_npu/csrc/framework/utils/CalcuOpUtil.h b/torch_npu/csrc/framework/utils/CalcuOpUtil.h index e79094bca3..3d2a925e48 100644 --- a/torch_npu/csrc/framework/utils/CalcuOpUtil.h +++ b/torch_npu/csrc/framework/utils/CalcuOpUtil.h @@ -62,7 +62,7 @@ namespace native { class CalcuOpUtil { public: static aclDataType ConvertToAclDataType(const at::ScalarType &data_type); - static aclDataType ConvertToAclDataType(const at::ScalarType &data_type, const string &realDataType); + static aclDataType ConvertToAclDataType(const at::ScalarType &data_type, const std::string &realDataType); static c10::Scalar ConvertTensorToScalar(const at::Tensor &tensor); static at::Tensor CopyScalarToDevice(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type); static at::Tensor CopyTensorHostToDevice(const at::Tensor &cpu_tensor); diff --git 
a/torch_npu/csrc/framework/utils/ForceJitCompileList.cpp b/torch_npu/csrc/framework/utils/ForceJitCompileList.cpp index 1424c248fb..58c64c6e1d 100644 --- a/torch_npu/csrc/framework/utils/ForceJitCompileList.cpp +++ b/torch_npu/csrc/framework/utils/ForceJitCompileList.cpp @@ -1,4 +1,3 @@ -#include #include #include #include "torch_npu/csrc/core/npu/npu_log.h" @@ -31,15 +30,17 @@ void ForceJitCompileList::RegisterJitlist(const std::string &jitlist) std::string token; while (end != std::string::npos) { token = value.substr(start, end - start); - if (!token.empty()) + if (!token.empty()) { jit_list_.emplace(token); + } start = end + delimiter.size(); end = value.find(delimiter, start); } // if start + end > value.size(), substring only split(start, value.size() - start) token = value.substr(start, end); - if (!token.empty()) + if (!token.empty()) { jit_list_.emplace(token); + } DisplayJitlist(); return; } diff --git a/torch_npu/csrc/framework/utils/ForceJitCompileList.h b/torch_npu/csrc/framework/utils/ForceJitCompileList.h index f4bef71072..d19eadad0c 100644 --- a/torch_npu/csrc/framework/utils/ForceJitCompileList.h +++ b/torch_npu/csrc/framework/utils/ForceJitCompileList.h @@ -1,8 +1,8 @@ #ifndef __PLUGIN_NATIVE_UTILS_JITCOMPILELIST__ #define __PLUGIN_NATIVE_UTILS_JITCOMPILELIST__ -#include #include +#include using std::string; using std::vector; @@ -13,7 +13,7 @@ namespace native { class ForceJitCompileList { public: static ForceJitCompileList &GetInstance(); - void RegisterJitlist(const std::string &blacklist); + void RegisterJitlist(const std::string &jitlist); bool Inlist(const std::string &opName) const; void DisplayJitlist() const; ~ForceJitCompileList() = default; diff --git a/torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h b/torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h index e2032c9b31..ea2150ceef 100644 --- a/torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h +++ b/torch_npu/csrc/framework/utils/NpuStorageOffsetGuard.h @@ -1,7 +1,7 @@ #ifndef __NPU_STORAGE_GUARD__ #define __NPU_STORAGE_GUARD__ #include -#include +#include #include "torch_npu/csrc/framework/utils/NpuUtils.h" diff --git a/torch_npu/csrc/framework/utils/NpuUtils.cpp b/torch_npu/csrc/framework/utils/NpuUtils.cpp index e2f6c338c2..f805b489dc 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.cpp +++ b/torch_npu/csrc/framework/utils/NpuUtils.cpp @@ -1,22 +1,19 @@ #include #include -#include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" -#include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/framework/contiguous/ContiguousOpt.h" -#include "torch_npu/csrc/framework/interface/EnvVariables.h" #include "torch_npu/csrc/framework/interface/MsProfilerInterface.h" #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" -#include "torch_npu/csrc/framework/utils/OpPreparation.h" #include "torch_npu/csrc/framework/utils/OpAdapter.h" -#include "torch_npu/csrc/aten/CustomFunctions.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/profiler/npu_profiler.h" #endif diff --git 
a/torch_npu/csrc/framework/utils/NpuUtils.h b/torch_npu/csrc/framework/utils/NpuUtils.h index 3d122f2424..0a2d63c267 100644 --- a/torch_npu/csrc/framework/utils/NpuUtils.h +++ b/torch_npu/csrc/framework/utils/NpuUtils.h @@ -1,20 +1,19 @@ #ifndef __PULGIN_NATIVE_NPU_UTILS_NUP_UTILS__ #define __PULGIN_NATIVE_NPU_UTILS_NUP_UTILS__ -#include +#include #include #include #include #include "torch_npu/csrc/core/npu/npu_log.h" -#include "third_party/acl/inc/ge/ge_error_codes.h" #include "third_party/acl/inc/acl/acl.h" #include "third_party/acl/inc/acl/acl_base.h" #include "third_party/acl/inc/acl/acl_op.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" -#include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" #include "torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.h" +#include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" using std::string; using std::vector; diff --git a/torch_npu/csrc/framework/utils/OpPreparation.cpp b/torch_npu/csrc/framework/utils/OpPreparation.cpp index ac0a9841c5..6777eee4d1 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.cpp +++ b/torch_npu/csrc/framework/utils/OpPreparation.cpp @@ -1,14 +1,14 @@ #include #include "torch_npu/csrc/framework/utils/OpPreparation.h" -#include "torch_npu/csrc/framework/FormatHelper.h" -#include "torch_npu/csrc/framework/InferFormat.h" -#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" -#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/aten/CustomFunctions.h" -#include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" +#include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" +#include "torch_npu/csrc/framework/FormatHelper.h" +#include "torch_npu/csrc/framework/InferFormat.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #ifndef BUILD_LIBTORCH #include "torch_npu/csrc/profiler/utils.h" #endif @@ -97,7 +97,7 @@ aclDataType OpPreparation::convert_to_acl_data_type(const at::ScalarType &data_t return CalcuOpUtil::ConvertToAclDataType(data_type); } -aclDataType OpPreparation::convert_to_acl_data_type(const at::ScalarType &data_type, const string &realDataType) +aclDataType OpPreparation::convert_to_acl_data_type(const at::ScalarType &data_type, const std::string &realDataType) { return CalcuOpUtil::ConvertToAclDataType(data_type, realDataType); } @@ -272,10 +272,10 @@ at::Tensor OpPreparation::apply_tensor_with_format(c10::IntArrayRef sizes, "Expected all tensors to be on the same device. 
" "Expected NPU tensor, please check whether the input tensor device is correct.", OPS_ERROR(ErrCode::TYPE)); - auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format); - if (options.dtype_opt() == at::ScalarType::Double && !FormatHelper::IsBaseFormatType((aclFormat)format)) { + auto fixFormat = InferFormat::GuessStorageFormat(sizes, static_cast(format)); + if (options.dtype_opt() == at::ScalarType::Double && !FormatHelper::IsBaseFormatType(static_cast(format))) { ASCEND_LOGW("NPU don't support create double dtype tensor with inner format, repalce with base format."); - fixFormat = FormatHelper::GetBaseFormat((aclFormat)format); + fixFormat = FormatHelper::GetBaseFormat(static_cast(format)); } return NPUNativeFunctions::unsafe_empty_with_format(sizes, c10::optTypeMetaToScalarType(options.dtype_opt()), @@ -335,8 +335,8 @@ void OpPreparation::CheckOut(const std::initializer_list &input, bool is_read_write = false; // check if output is also an input - for (const auto &input : inputs) { - if (output.is_same(input)) { + for (const auto &local_input : inputs) { + if (output.is_same(local_input)) { is_read_write = true; break; } @@ -477,7 +477,7 @@ at::Tensor OpPreparation::ApplyTensorWithFormat(c10::IntArrayRef sizes, "Expected all tensors to be on the same device. " "Expected NPU tensor, please check whether the input tensor device is correct.", OPS_ERROR(ErrCode::TYPE)); - auto fixFormat = InferFormat::GuessStorageFormat(sizes, (aclFormat)format); + auto fixFormat = InferFormat::GuessStorageFormat(sizes, static_cast(format)); return NPUNativeFunctions::unsafe_empty_with_format(sizes, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), diff --git a/torch_npu/csrc/framework/utils/OpPreparation.h b/torch_npu/csrc/framework/utils/OpPreparation.h index 509d16d885..74ac303898 100644 --- a/torch_npu/csrc/framework/utils/OpPreparation.h +++ b/torch_npu/csrc/framework/utils/OpPreparation.h @@ -21,7 +21,7 @@ public: static UnifiedResult reduce_op_check(at::Tensor &out1, at::Tensor &out2, const at::Tensor &a); // From CalcuOpUtil part static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type); - static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type, const string &realDataType); + static aclDataType convert_to_acl_data_type(const at::ScalarType &data_type, const std::string &realDataType); static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type); static at::Tensor copy_scalar_to_device(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type, const c10::Device device); @@ -83,8 +83,8 @@ public: static at::Tensor apply_tensor_without_format(const at::Tensor &src); static at::Tensor apply_tensor_without_format(const at::Tensor &src, c10::IntArrayRef sizes); static at::Tensor apply_tensor_without_format(c10::IntArrayRef sizes, const c10::TensorOptions &options); - static at::Tensor unsafe_empty_workspace(uint64_t size); - static at::Tensor unsafe_empty_workspace(uint64_t size, aclrtStream stream); + static at::Tensor unsafe_empty_workspace(uint64_t workspace_size); + static at::Tensor unsafe_empty_workspace(uint64_t workspace_size, aclrtStream stream); // DEPRECATED: ApplyTensorWithSizes will be deprecated, please use apply_tensor_with_sizes instead. static at::Tensor ApplyTensorWithSizes(c10::IntArrayRef sizes, const c10::TensorOptions &options); // DEPRECATED: CheckMemory will be deprecated, please use check_memory instead. 
-- Gitee From 1e42a5436a2dafd79357c77a25f707158357ca3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 17 Mar 2025 01:20:49 +0000 Subject: [PATCH 171/358] =?UTF-8?q?!19025=20Distributed:=20CleanCode=20fix?= =?UTF-8?q?=20Merge=20pull=20request=20!19025=20from=20=E7=8E=8B=E8=B6=85/?= =?UTF-8?q?v2.6.0=5Fclean0314?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/HcclCompile.h | 2 ++ .../csrc/distributed/ParallelTcpStore.cpp | 2 +- .../csrc/distributed/ProcessGroupHCCL.cpp | 25 ++++++++++--------- .../csrc/distributed/ProcessGroupHCCL.hpp | 6 ++--- .../csrc/distributed/ProcessGroupLCCL.hpp | 2 +- torch_npu/csrc/distributed/StoreClient.cpp | 4 +-- torch_npu/csrc/distributed/reducer.cpp | 22 ++++++++-------- torch_npu/csrc/distributed/reducer.hpp | 4 +-- 8 files changed, 35 insertions(+), 32 deletions(-) diff --git a/torch_npu/csrc/distributed/HcclCompile.h b/torch_npu/csrc/distributed/HcclCompile.h index f762cb8baf..8204dd9555 100644 --- a/torch_npu/csrc/distributed/HcclCompile.h +++ b/torch_npu/csrc/distributed/HcclCompile.h @@ -1,3 +1,5 @@ +#pragma once + #include #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" diff --git a/torch_npu/csrc/distributed/ParallelTcpStore.cpp b/torch_npu/csrc/distributed/ParallelTcpStore.cpp index 13c6e2c1bc..59d07278e6 100644 --- a/torch_npu/csrc/distributed/ParallelTcpStore.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpStore.cpp @@ -50,7 +50,7 @@ ParallelStoreServer::ParallelStoreServer(std::string initKey, const std::string } ParallelStoreServer::ParallelStoreServer(const std::string localSocketPath, CallBackFn callback) noexcept - : localSocketPath_(std::move(localSocketPath)), callback_(std::move(callback)) + : localSocketPath_(localSocketPath), callback_(std::move(callback)) { auto threadNum = 1U; auto listenThreadNum = 1U; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 83ca0143bc..de31e9444a 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -54,7 +54,7 @@ using hcclUs = std::chrono::steady_clock::time_point; #define DURATION_US(x) (std::chrono::duration_cast(x)) #define TIME_NOW() ({ std::chrono::steady_clock::now(); }) -#define MAX_GROUP_NAME_LEN 128 +constexpr int32_t MAX_GROUP_NAME_LEN = 128; // HCCL ReduceOp mapping std::map hcclOp = { @@ -1330,7 +1330,7 @@ void ProcessGroupHCCL::recordDataVol(std::string opName, const std::string dataV { std::ofstream outfile; std::stringstream fileName; - std::string commName = getHcclCommNameWithoutInit(currRank, hcclComms); + std::string commName = getHcclCommNameWithoutInit(hcclComms); auto master_addr = getenv("MASTER_ADDR"); auto hccl_algo = getenv("HCCL_ALGO"); TORCH_CHECK(master_addr != nullptr, "Unable to fetch master IP addr, environment variable is null.", DIST_ERROR(ErrCode::NOT_FOUND)); @@ -1444,7 +1444,7 @@ bool ProcessGroupHCCL::recordHcclStatus(const std::string path, bool end, bool e void ProcessGroupHCCL::recordComm(std::string filename, std::string opName, const int currRank, std::vector>& hcclComms) { std::ofstream outfile; - std::string commName = getHcclCommNameWithoutInit(currRank, hcclComms); + std::string commName = getHcclCommNameWithoutInit(hcclComms); if (isFileExists(filename)) { try { outfile.open(filename, std::ios::app); @@ -1545,7 +1545,7 @@ void ProcessGroupHCCL::createHCCLComm( default: 
throw std::runtime_error( "create/get the HCCL Communicator failed for comm type:" + - std::to_string((int)commType) + DIST_ERROR(ErrCode::PARAM)); + std::to_string(static_cast(commType)) + DIST_ERROR(ErrCode::PARAM)); } // Creates the HCCL streams @@ -1752,7 +1752,7 @@ std::vector>& ProcessGroupHCCL::createHCCLComm( int64_t ProcessGroupHCCL::getStreamId(bool p2p, int peer) { - int device; + int device = -1; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); std::vector devices = {at::Device(c10::DeviceType::PrivateUse1, device)}; auto key = getKeyFromDevices(devices); @@ -2136,7 +2136,7 @@ std::string ProcessGroupHCCL::getHcclCommName(int rankid, bool init_comm) return std::string(commName); } -std::string ProcessGroupHCCL::getHcclCommNameWithoutInit(int rankid, std::vector>& hcclComms) +std::string ProcessGroupHCCL::getHcclCommNameWithoutInit(std::vector>& hcclComms) { TORCH_CHECK(hcclComms.size() == 1, "expect hcclComms.size() = 1, but hcclComms.size() = ", hcclComms.size(), DIST_ERROR(ErrCode::VALUE)); @@ -2386,7 +2386,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( } char* global_rank = getenv("RANK"); TORCH_CHECK(global_rank != nullptr, "Unable to fetch global rank for NSLB.", DIST_ERROR(ErrCode::NOT_FOUND)); - recordDataVol(opTypeToString(opType), std::to_string(dataVol), atoi(global_rank), hcclComms); + recordDataVol(opTypeToString(opType), std::to_string(dataVol), strtol(global_rank, nullptr, 10), hcclComms); } if (op_id_ >= nslb_num) { nslb_is_end = true; @@ -2548,7 +2548,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( } char* global_rank = getenv("RANK"); TORCH_CHECK(global_rank != nullptr, "Unable to fetch global rank for NSLB.", DIST_ERROR(ErrCode::NOT_FOUND)); - recordDataVol(opTypeToString(opType), std::to_string(dataVol), atoi(global_rank), hcclComms); + recordDataVol(opTypeToString(opType), std::to_string(dataVol), strtol(global_rank, nullptr, 10), hcclComms); } if (op_id_ >= nslb_num) { nslb_is_end = true; @@ -2664,7 +2664,8 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( { c10_npu::CaptureStatus capture_status = c10_npu::currentStreamCaptureStatusMayInitCtx(); const auto devices = getDeviceList(tensors); - int p2pRank = 0, p2pTargetRank = 0; + int p2pRank = 0; + int p2pTargetRank = 0; bool isSendRecvSelf = false; std::string key; @@ -2728,7 +2729,7 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( char* global_rank = getenv("RANK"); TORCH_CHECK(global_rank != nullptr, "Unable to fetch global rank for NSLB.", DIST_ERROR(ErrCode::NOT_FOUND)); - recordDataVol(opTypeToString(opType), std::to_string(dataVol), atoi(global_rank), hcclComms); + recordDataVol(opTypeToString(opType), std::to_string(dataVol), strtol(global_rank, nullptr, 10), hcclComms); } if (op_id_ >= nslb_num) { nslb_is_end = true; @@ -3229,7 +3230,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( c10d::OpType::REDUCE); } -#define ADDRESS_ALIGNMENT_BYTE 512 +constexpr int64_t ADDRESS_ALIGNMENT_BYTE = 512; at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) { at::Tensor inter_tensors = at::reshape(tensors, {1, tensors.numel()}); @@ -3251,7 +3252,7 @@ at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) inter_tensors = op_plugin::constant_pad_nd(inter_tensors, {0, num_add}, 0); - if (transflag == true) { + if (transflag) { inter_tensors = at_npu::native::custom_ops::npu_dtype_cast(inter_tensors, at::ScalarType::Bool); } } diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index a7fdf30551..7785d14d1c 
100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -50,7 +50,7 @@ enum class HcclCommType { P2P = 1 }; -enum WatchdogStatus { +enum class WatchdogStatus { INIT = 0, RUN = 1, STOP = 2 @@ -306,7 +306,7 @@ public: return options_; } - const std::string getBackendName() const + const std::string getBackendName() const override { return std::string(HCCL_BACKEND_NAME); } @@ -459,7 +459,7 @@ public: void abortAndClearHcclComm(c10::optional abortReason); - std::string getHcclCommNameWithoutInit(int rankid, std::vector>& hcclComms); + std::string getHcclCommNameWithoutInit(std::vector>& hcclComms); // Return the global ranks of a PG const std::vector& groupRanks() const; diff --git a/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp index 197c2c4874..df4e5cd73f 100644 --- a/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp @@ -105,7 +105,7 @@ public: ~ProcessGroupLCCL() override; - const std::string getBackendName() const + const std::string getBackendName() const override { return LCCL_BACKEND_NAME; } diff --git a/torch_npu/csrc/distributed/StoreClient.cpp b/torch_npu/csrc/distributed/StoreClient.cpp index 2fbaec89f7..8be1fd5831 100644 --- a/torch_npu/csrc/distributed/StoreClient.cpp +++ b/torch_npu/csrc/distributed/StoreClient.cpp @@ -32,10 +32,10 @@ namespace c10d { namespace torch_npu { Client::Client(const std::string host, uint16_t port, const std::chrono::milliseconds timeout) noexcept - : host_{ std::move(host) }, port_{ port }, socketFd_(-1), timeout_{ timeout } + : host_{ host }, port_{ port }, socketFd_(-1), timeout_{ timeout } {} Client::Client(const std::string localSocketPath, const std::chrono::milliseconds timeout) noexcept - : localSocketPath_ { std::move(localSocketPath)}, socketFd_(-1), timeout_{ timeout } + : localSocketPath_ { localSocketPath }, socketFd_(-1), timeout_{ timeout } {} int Client::Connect() noexcept diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index dcee8d4a88..cb342cf333 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -863,7 +863,7 @@ std::vector Reducer::get_variables_for_bucket( // Check if we have cached mapping previously. 
if (has_rebuilt_bucket_ && cached_variables_for_bucket_.find(bucket_index) != - cached_variables_for_bucket_.end()) { + cached_variables_for_bucket_.end()) { return cached_variables_for_bucket_[bucket_index]; } std::vector variables_for_bucket; @@ -1620,7 +1620,7 @@ void Reducer::sync_bucket_indices( num_buckets = static_cast(indices_accessor[indices_accessor_Index]); // Broadcast bucket_sizes - auto bucket_sizes_tensor = at::empty({(int64_t)num_buckets}, at::kInt); + auto bucket_sizes_tensor = at::empty({static_cast(num_buckets)}, at::kInt); auto bucket_sizes_accessor = bucket_sizes_tensor.accessor(); for (const auto i : c10::irange(num_buckets)) { // For rank != 0, it is possible that local num buckets bucket_sizes.size() @@ -1628,7 +1628,7 @@ void Reducer::sync_bucket_indices( bucket_sizes_accessor[i] = static_cast(bucket_sizes.at(std::min(i, (bucket_sizes.size() - 1)))); } - auto bucket_sizes_tensor_device = at::empty({(int64_t)num_buckets}, options); + auto bucket_sizes_tensor_device = at::empty({static_cast(num_buckets)}, options); bucket_sizes_tensor_device.copy_(bucket_sizes_tensor, true); std::vector bucket_sizes_tensor_list = { bucket_sizes_tensor_device}; @@ -2064,14 +2064,14 @@ std::tuple>, std::vector> compute_bucket std::sort(result.begin(), result.end(), [](const std::tuple, size_t>& a, const std::tuple, size_t>& b) { - auto indices_a = std::get<0>(a); - auto indices_b = std::get<0>(b); - const auto amin = - std::min_element(indices_a.begin(), indices_a.end()); - const auto bmin = - std::min_element(indices_b.begin(), indices_b.end()); - return *amin < *bmin; - }); + auto indices_a = std::get<0>(a); + auto indices_b = std::get<0>(b); + const auto amin = + std::min_element(indices_a.begin(), indices_a.end()); + const auto bmin = + std::min_element(indices_b.begin(), indices_b.end()); + return *amin < *bmin; + }); } // Return bucket indices and size limits as separate entries in tuple, as diff --git a/torch_npu/csrc/distributed/reducer.hpp b/torch_npu/csrc/distributed/reducer.hpp index f25cab26ee..dd0c133c24 100644 --- a/torch_npu/csrc/distributed/reducer.hpp +++ b/torch_npu/csrc/distributed/reducer.hpp @@ -344,7 +344,7 @@ protected: value, ""); #endif - void runGradCallbackForVariable(at::Tensor& variable, GradCallback && cb); + void runGradCallbackForVariable(at::Tensor& variable, GradCallback&& cb); // A bucket replica represents [1..N] gradients to be reduced, // with the same dtype, on the same device. 
@@ -509,7 +509,7 @@ protected: ContextPtr context_ptr_holder; std::atomic context_ptr{nullptr}; - void set(ContextPtr && new_context_ptr); + void set(ContextPtr&& new_context_ptr); }; RpcContext rpc_context_; #endif -- Gitee From f8fefdb5529d7fc1893bc71a3cf1156a297233a8 Mon Sep 17 00:00:00 2001 From: xudaohong Date: Mon, 17 Mar 2025 01:40:06 +0000 Subject: [PATCH 172/358] !18946 [fix] segment_size 128M when enable hccl zero copy Merge pull request !18946 from xudaohong/v2.6.0 --- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index e454da1561..bf6a583e2a 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -89,6 +89,7 @@ constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 b constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks +constexpr size_t kLargeBufferForHccl = 134217728; // "large for hccl" allocations may be packed in 128 MiB blocks constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer constexpr size_t kRoundLarge = 2097152; // round up large allocs to 2 MiB constexpr size_t kAlignRoundLarge = 16384; // round up large allocs to 16 KB @@ -1786,7 +1787,9 @@ class DeviceCachingAllocator { return c; } } - auto segment_size = pool->is_small ? kSmallBuffer : kLargeBuffer; + auto segment_size = pool->is_small ? kSmallBuffer : ( + c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? 
kLargeBufferForHccl : kLargeBuffer + ); auto segment = new ExpandableSegment(device, stream, segment_size); if (hcclComm_) { segment->setHcclComm(hcclComm_); -- Gitee From a55778ef478511cb287652e7e0a4a30d319e6072 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Mon, 17 Mar 2025 03:54:00 +0000 Subject: [PATCH 173/358] !19034 clean code Merge pull request !19034 from huangyunlong/2.6cl --- torch_npu/csrc/InitNpuBindings.cpp | 6 +- torch_npu/csrc/flopcount/FlopCounter.cpp | 61 ++++++--- torch_npu/csrc/libs/init_npu.cpp | 109 +++++++-------- torch_npu/csrc/sanitizer/NPUTrace.h | 3 +- torch_npu/csrc/sanitizer/PyCallbackTrigger.h | 36 +++-- torch_npu/csrc/utils/LazyInit.cpp | 5 +- torch_npu/csrc/utils/TensorType.cpp | 131 +++++++++++-------- torch_npu/utils/serialization.py | 4 +- 8 files changed, 213 insertions(+), 142 deletions(-) diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 2f3960ee9b..899887e10b 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -146,7 +146,8 @@ static std::vector methods; extern "C" -PyObject* initModule() { +PyObject* initModule() +{ at::internal::lazy_init_num_threads(); AddPyMethodDefs(methods, TorchNpuMethods); @@ -192,6 +193,7 @@ PyObject* initModule() { return module; } -PyMODINIT_FUNC PyInit__C(void) { +PyMODINIT_FUNC PyInit__C(void) +{ return initModule(); } diff --git a/torch_npu/csrc/flopcount/FlopCounter.cpp b/torch_npu/csrc/flopcount/FlopCounter.cpp index 9298f6436b..398c8f5e60 100644 --- a/torch_npu/csrc/flopcount/FlopCounter.cpp +++ b/torch_npu/csrc/flopcount/FlopCounter.cpp @@ -121,9 +121,9 @@ int64_t FlopCounter::conv_backward_flop(const at::Tensor &grad_output, const at: if (output_mask[1]) { std::vector grad_weight_shape(gradeWeight.sizes().begin(), gradeWeight.sizes().end()); if (transposed) { - flop_count += conv_flop_count(t(grad_output_shape), t(input_shape), t(grad_weight_shape), transposed=false); + flop_count += conv_flop_count(t(grad_output_shape), t(input_shape), t(grad_weight_shape), false); } else { - flop_count += conv_flop_count(t(input_shape), t(grad_output_shape), t(grad_weight_shape), transposed=false); + flop_count += conv_flop_count(t(input_shape), t(grad_output_shape), t(grad_weight_shape), false); } } @@ -150,30 +150,30 @@ std::vector, std::vector, std::vector new_query_shape = {query[0], head_num, q_1, q_2/head_num}; - std::vector new_key_shape = {key[0], head_num, k_1, k_2/head_num}; - std::vector new_value_shape = {value[0], head_num, v_1, v_2/head_num}; + std::vector new_query_shape = {query[0], head_num, q_1, q_2 / head_num}; + std::vector new_key_shape = {key[0], head_num, k_1, k_2 / head_num}; + std::vector new_value_shape = {value[0], head_num, v_1, v_2 / head_num}; std::vector new_grad_out_shape; if (!grad_out.empty()) { new_grad_out_shape = new_query_shape; } result.emplace_back(new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape); } else if (input_layer_str == "SBH") { - std::vector new_query_shape = {q_1, head_num, query[0], q_2/head_num}; - std::vector new_key_shape = {k_1, head_num, key[0], k_2/head_num}; - std::vector new_value_shape = {v_1, head_num, value[0], v_2/head_num}; + std::vector new_query_shape = {q_1, head_num, query[0], q_2 / head_num}; + std::vector new_key_shape = {k_1, head_num, key[0], k_2 / head_num}; + std::vector new_value_shape = {v_1, head_num, value[0], v_2 / head_num}; std::vector new_grad_out_shape; if (!grad_out.empty()) { new_grad_out_shape = new_query_shape; @@ -203,9 +203,9 @@ std::vector, 
std::vector, std::vector(std::numeric_limits::max()), "cum_seq_q.size() is too large to be represented as an int64_t", OPS_ERROR(ErrCode::PARAM)); int64_t b = static_cast(sizeValue); TORCH_CHECK(b != 0, "Divisor b may be 0, please check it.") - std::vector new_query_shape = {b, q_1, query[0]/b, q_2}; - std::vector new_key_shape = {b, k_1, key[0]/b, k_2}; - std::vector new_value_shape = {b, v_1, value[0]/b, v_2}; + std::vector new_query_shape = {b, q_1, query[0] / b, q_2}; + std::vector new_key_shape = {b, k_1, key[0] / b, k_2}; + std::vector new_value_shape = {b, v_1, value[0] / b, v_2}; std::vector new_grad_out_shape; if (!grad_out.empty()) { new_grad_out_shape = new_query_shape; @@ -241,9 +241,18 @@ inline int64_t safe_sum(const std::initializer_list& values) int64_t sdpa_flop_count(const std::vector query_shape, const std::vector key_shape, const std::vector value_shape) { - int64_t b, h, s_q, d_q; - int64_t _b2, _h2, s_k, _d2; - int64_t _b3, _h3, _s3, d_v; + int64_t b; + int64_t h; + int64_t s_q; + int64_t d_q; + int64_t _b2; + int64_t _h2; + int64_t s_k; + int64_t _d2; + int64_t _b3; + int64_t _h3; + int64_t _s3; + int64_t d_v; b = query_shape[0]; h = query_shape[1]; @@ -275,10 +284,22 @@ int64_t sdpa_flop_count(const std::vector query_shape, const std::vecto int64_t sdpa_backward_flop_count(const std::vector query_shape, const std::vector key_shape, const std::vector value_shape, const std::vector grad_out_shape) { - int64_t b, h, s_q, d_q; - int64_t _b2, _h2, s_k, _d2; - int64_t _b3, _h3, _s3, d_v; - int64_t _b4, _h4, _s4, d_4; + int64_t b; + int64_t h; + int64_t s_q; + int64_t d_q; + int64_t _b2; + int64_t _h2; + int64_t s_k; + int64_t _d2; + int64_t _b3; + int64_t _h3; + int64_t _s3; + int64_t d_v; + int64_t _b4; + int64_t _h4; + int64_t _s4; + int64_t d_4; b = query_shape[0]; h = query_shape[1]; diff --git a/torch_npu/csrc/libs/init_npu.cpp b/torch_npu/csrc/libs/init_npu.cpp index 238f89420f..bc8ca3eee9 100644 --- a/torch_npu/csrc/libs/init_npu.cpp +++ b/torch_npu/csrc/libs/init_npu.cpp @@ -9,57 +9,62 @@ namespace torch_npu { -bool is_npu_device(const at::Device& device) { - return device.type() == c10::DeviceType::PrivateUse1; +bool is_npu_device(const at::Device& device) +{ + return device.type() == c10::DeviceType::PrivateUse1; } -void init_npu(const c10::DeviceIndex device_index) { - c10_npu::NpuSysCtrl::SysStatus status = - c10_npu::NpuSysCtrl::GetInstance().Initialize((int)device_index); - if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { - C10_NPU_SHOW_ERR_MSG(); - return; - } +void init_npu(const c10::DeviceIndex device_index) +{ + c10_npu::NpuSysCtrl::SysStatus status = + c10_npu::NpuSysCtrl::GetInstance().Initialize((int)device_index); + if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + C10_NPU_SHOW_ERR_MSG(); + return; + } } -void init_npu(const std::string& device_str) { - auto device = at::Device(device_str); - TORCH_CHECK(is_npu_device(device), "NPU device init fail, except got NPU device, but got ", device_str, - PTA_ERROR(ErrCode::PARAM)); - init_npu(device.index()); +void init_npu(const std::string& device_str) +{ + auto device = at::Device(device_str); + TORCH_CHECK(is_npu_device(device), "NPU device init fail, except got NPU device, but got ", device_str, + PTA_ERROR(ErrCode::PARAM)); + init_npu(device.index()); } -void init_npu(const at::Device& device) { - TORCH_CHECK(is_npu_device(device), "NPU device init fail, except got NPU device, but got ", str(device), - PTA_ERROR(ErrCode::PARAM)); - init_npu(device.index()); +void init_npu(const 
at::Device& device) +{ + TORCH_CHECK(is_npu_device(device), "NPU device init fail, except got NPU device, but got ", str(device), + PTA_ERROR(ErrCode::PARAM)); + init_npu(device.index()); } -void finalize_npu() { - if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { - try { - c10_npu::npuSynchronizeDevice(); - } catch (std::exception& e) { - TORCH_CHECK(false, "NPU SynchronizeDevice failed err=:%s", e.what(), PTA_ERROR(ErrCode::ACL)); - } - - at_npu::native::CachingHostAllocator_emptyCache(); - try { - c10_npu::NPUCachingAllocator::emptyCache(); - } catch (std::exception& e) { - TORCH_CHECK(false, "NPU CachingAllocator::emptyCache failed err=:%s", e.what(), PTA_ERROR(ErrCode::ACL)); +void finalize_npu() +{ + if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { + try { + c10_npu::npuSynchronizeDevice(); + } catch (std::exception& e) { + TORCH_CHECK(false, "NPU SynchronizeDevice failed err=:%s", e.what(), PTA_ERROR(ErrCode::ACL)); + } + + at_npu::native::CachingHostAllocator_emptyCache(); + try { + c10_npu::NPUCachingAllocator::emptyCache(); + } catch (std::exception& e) { + TORCH_CHECK(false, "NPU CachingAllocator::emptyCache failed err=:%s", e.what(), PTA_ERROR(ErrCode::ACL)); + } + + c10_npu::NpuSysCtrl::SysStatus status = c10_npu::NpuSysCtrl::GetInstance().Finalize(); + if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) { + TORCH_CHECK(false, "NPU sys finalize failed.\n", PTA_ERROR(ErrCode::ACL)); + } + } else { + TORCH_NPU_WARN("Please init npu device first!"); } - - c10_npu::NpuSysCtrl::SysStatus status = c10_npu::NpuSysCtrl::GetInstance().Finalize(); - if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) { - TORCH_CHECK(false, "NPU sys finalize failed.\n", PTA_ERROR(ErrCode::ACL)); - } - } else { - TORCH_NPU_WARN("Please init npu device first!"); - } } } // namespace torch_npu @@ -68,9 +73,10 @@ void finalize_npu() { namespace torch { namespace npu { -void synchronize(int64_t device_index) { - c10_npu::NPUGuard device_guard(at::Device(at::DeviceType::PrivateUse1, device_index)); - c10_npu::npuSynchronizeDevice(); +void synchronize(int64_t device_index) +{ + c10_npu::NPUGuard device_guard(at::Device(at::DeviceType::PrivateUse1, device_index)); + c10_npu::npuSynchronizeDevice(); } } // namespace npu @@ -80,15 +86,16 @@ void synchronize(int64_t device_index) { namespace c10 { namespace npu { -DeviceIndex current_device() { - if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { - int device; - c10_npu::GetDevice(&device); - return (c10::DeviceIndex)device; - } else { - TORCH_NPU_WARN("Please init npu device first!"); - return (c10::DeviceIndex)-1; - } +DeviceIndex current_device() +{ + if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { + int device; + c10_npu::GetDevice(&device); + return (c10::DeviceIndex)device; + } else { + TORCH_NPU_WARN("Please init npu device first!"); + return (c10::DeviceIndex)-1; + } } } // namespace npu diff --git a/torch_npu/csrc/sanitizer/NPUTrace.h b/torch_npu/csrc/sanitizer/NPUTrace.h index 429aaff560..ff62bbc0fb 100644 --- a/torch_npu/csrc/sanitizer/NPUTrace.h +++ b/torch_npu/csrc/sanitizer/NPUTrace.h @@ -15,8 +15,9 @@ struct NPUTrace { static void setTrace(const PyCallbackTrigger*); static const PyCallbackTrigger* getTrace() { - if (!have_state) + if (!have_state) { return nullptr; + } return npu_trace_state.load(std::memory_order_acquire); } }; diff --git a/torch_npu/csrc/sanitizer/PyCallbackTrigger.h b/torch_npu/csrc/sanitizer/PyCallbackTrigger.h index 030f0d7715..ee9109a99f 100644 --- 
a/torch_npu/csrc/sanitizer/PyCallbackTrigger.h +++ b/torch_npu/csrc/sanitizer/PyCallbackTrigger.h @@ -26,63 +26,75 @@ struct PyCallbackTrigger { PyCallbackTrigger(const int mode) : sanitizer_mode(static_cast(mode)){}; void traceNpuAclStartExecution(std::string acl_name) const { - if (sanitizer_mode == SanitizerMode::KERNEL) + if (sanitizer_mode == SanitizerMode::KERNEL) { CONCRETE_TRACE_NPU("NPUACLStartExecuteCallbacks", acl_name); + } } void traceNpuAclFinishExecution(std::string acl_name) const { - if (sanitizer_mode == SanitizerMode::KERNEL) + if (sanitizer_mode == SanitizerMode::KERNEL) { CONCRETE_TRACE_NPU("NPUACLFinishExecuteCallbacks", acl_name); + } } void traceNpuEventCreation(uintptr_t event) const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUEventCreationCallbacks", event); + } } void traceNpuEventDeletion(uintptr_t event) const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUEventDeletionCallbacks", event); + } } void traceNpuEventRecord(uintptr_t event, uintptr_t stream) const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUEventRecordCallbacks", event, stream); + } } void traceNpuEventWait(uintptr_t event, uintptr_t stream) const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUEventWaitCallbacks", event, stream); + } } void traceNpuMemoryAllocation(uintptr_t ptr) const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUMemoryAllocationCallbacks", ptr); + } } void traceNpuMemoryDeallocation(uintptr_t ptr) const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUMemoryDeallocationCallbacks", ptr); + } } void traceNpuStreamCreation(uintptr_t stream) const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUStreamCreationCallbacks", stream); + } } void traceNpuDeviceSynchronization() const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUDeviceSynchronizationCallbacks"); + } } void traceNpuStreamSynchronization(uintptr_t stream) const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUStreamSynchronizationCallbacks", stream); + } } void traceNpuEventSynchronization(uintptr_t event) const { - if (sanitizer_mode == SanitizerMode::STREAM) + if (sanitizer_mode == SanitizerMode::STREAM) { CONCRETE_TRACE_NPU("NPUEventSynchronizationCallbacks", event); + } } static PyCallbackTrigger* instance(const int mode) { diff --git a/torch_npu/csrc/utils/LazyInit.cpp b/torch_npu/csrc/utils/LazyInit.cpp index 7e3259b0d0..b3aa453d85 100644 --- a/torch_npu/csrc/utils/LazyInit.cpp +++ b/torch_npu/csrc/utils/LazyInit.cpp @@ -30,8 +30,9 @@ void npu_lazy_init() } } -void npu_set_run_yet_variable_to_false() { - npu_run_yet = false; +void npu_set_run_yet_variable_to_false() +{ + npu_run_yet = false; } } diff --git a/torch_npu/csrc/utils/TensorType.cpp b/torch_npu/csrc/utils/TensorType.cpp index cd7b0dc5dc..aeb6fd8b83 100644 --- a/torch_npu/csrc/utils/TensorType.cpp +++ b/torch_npu/csrc/utils/TensorType.cpp @@ -10,45 +10,49 @@ namespace utils { using namespace at; using 
namespace torch::autograd; -std::vector> all_declared_types_npu() { - std::vector> ret; - // can't easily iterate over enum classes, does not support BFloat16 now - std::vector backends = { c10::Backend::PrivateUse1 }; - std::vector scalar_types = { - ScalarType::Byte, ScalarType::Char, ScalarType::Double, ScalarType::Float, - ScalarType::Int, ScalarType::Long, ScalarType::Short, ScalarType::Half, - ScalarType::Bool, ScalarType::BFloat16 - }; - - for (auto& backend : backends) { - for (auto& scalar_type : scalar_types) { - ret.emplace_back(std::make_pair(backend, scalar_type)); +std::vector> all_declared_types_npu() +{ + std::vector> ret; + // can't easily iterate over enum classes, does not support BFloat16 now + std::vector backends = { c10::Backend::PrivateUse1 }; + std::vector scalar_types = { + ScalarType::Byte, ScalarType::Char, ScalarType::Double, ScalarType::Float, + ScalarType::Int, ScalarType::Long, ScalarType::Short, ScalarType::Half, + ScalarType::Bool, ScalarType::BFloat16 + }; + + for (auto& backend : backends) { + for (auto& scalar_type : scalar_types) { + ret.emplace_back(std::make_pair(backend, scalar_type)); + } } - } - return ret; + return ret; } struct PyTensorType { - PyTypeObject py_type; - THPDtype* dtype; - THPLayout* layout; - bool is_npu; - char name[64]; - int backend; - int scalar_type; - - Backend get_backend() const { - return static_cast(backend); - } + PyTypeObject py_type; + THPDtype* dtype; + THPLayout* layout; + bool is_npu; + char name[64]; + int backend; + int scalar_type; + + Backend get_backend() const + { + return static_cast(backend); + } - DispatchKey get_dispatch_key() const { - return backendToDispatchKey(static_cast(backend)); - } + DispatchKey get_dispatch_key() const + { + return backendToDispatchKey(static_cast(backend)); + } - ScalarType get_scalar_type() const { - return static_cast(scalar_type); - } + ScalarType get_scalar_type() const + { + return static_cast(scalar_type); + } }; static_assert(std::is_standard_layout::value, "PyTensorType must be standard layout"); @@ -78,7 +82,8 @@ static PyObject* Tensor_new(PyTypeObject *type, PyObject *args, PyObject *kwargs END_HANDLE_TH_ERRORS } -static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg) { +static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg) +{ HANDLE_TH_ERRORS auto self = (PyTensorType*)_self; if (THPVariable_Check(arg)) { @@ -93,15 +98,18 @@ static PyObject* Tensor_instancecheck(PyObject* _self, PyObject* arg) { END_HANDLE_TH_ERRORS } -PyObject* Tensor_dtype(PyTensorType* self, void *unused) { +PyObject* Tensor_dtype(PyTensorType* self, void *unused) +{ return torch::autograd::utils::wrap(self->dtype); } -PyObject* Tensor_layout(PyTensorType* self, void *unused) { +PyObject* Tensor_layout(PyTensorType* self, void *unused) +{ return torch::autograd::utils::wrap(self->layout); } -PyObject* Tensor_is_npu(PyTensorType* self, void *unused) { +PyObject* Tensor_is_npu(PyTensorType* self, void *unused) +{ if (self->is_npu) { Py_RETURN_TRUE; } else { @@ -109,7 +117,8 @@ PyObject* Tensor_is_npu(PyTensorType* self, void *unused) { } } -PyObject* Tensor_is_sparse(PyTensorType *self, void *unused) { +PyObject* Tensor_is_sparse(PyTensorType *self, void *unused) +{ if (self->layout->layout == at::Layout::Strided) { Py_RETURN_FALSE; } else { @@ -138,7 +147,8 @@ static PyTypeObject metaclass = { sizeof(PyTypeObject) /* tp_basicsize */ }; -static void py_initialize_metaclass(PyTypeObject& metaclass) { +static void py_initialize_metaclass(PyTypeObject& metaclass) +{ 
metaclass.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; metaclass.tp_methods = metaclass_methods; metaclass.tp_getset = metaclass_properties; @@ -154,7 +164,8 @@ static PyTypeObject tensor_type_prototype = { sizeof(PyTensorType) /* tp_basicsize */ }; -static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) { +static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyObject* tp_dict) +{ // NOTE: we don't use the typical static declaration of PyTypeObject because // we need to initialize as many types as there are VariableType instances. // We copy the basic object fields from a prototype definition and initialize @@ -173,7 +184,8 @@ static void py_initialize_tensor_type(PyTypeObject& type, const char* name, PyOb } } -static std::string get_module(Backend backend) { +static std::string get_module(Backend backend) +{ switch (backend) { case Backend::CPU: return "torch"; @@ -190,13 +202,15 @@ static std::string get_module(Backend backend) { } } -static std::string get_name(Backend backend, ScalarType scalarType) { +static std::string get_name(Backend backend, ScalarType scalarType) +{ std::ostringstream ss; ss << get_module(backend) << "." << toString(scalarType) << "Tensor"; return ss.str(); } -static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) { +static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarType) +{ // This field is lazily initialized from backend and scalar_type type_obj.backend = static_cast(backend); type_obj.scalar_type = static_cast(scalarType); @@ -205,13 +219,15 @@ static void set_type(PyTensorType& type_obj, Backend backend, ScalarType scalarT type_obj.is_npu = (backend == c10::Backend::PrivateUse1); } -static void set_name(PyTensorType& type_obj, const std::string& name) { +static void set_name(PyTensorType& type_obj, const std::string& name) +{ size_t n = sizeof(type_obj.name); strncpy(type_obj.name, name.c_str(), n); type_obj.name[n - 1] = '\0'; } -static THPObjectPtr get_tensor_dict() { +static THPObjectPtr get_tensor_dict() +{ auto torch = THPObjectPtr(PyImport_ImportModule("torch")); if (!torch) { throw python_error(); @@ -242,7 +258,8 @@ static THPObjectPtr get_tensor_dict() { static std::vector tensor_types; -static void initialize_npu_aten_types(std::vector& tensor_types) { +static void initialize_npu_aten_types(std::vector& tensor_types) +{ // only initialize npu types auto declared_types = all_declared_types_npu(); tensor_types.resize(declared_types.size()); @@ -256,7 +273,8 @@ static void initialize_npu_aten_types(std::vector& tensor_types) { } } -void _initialize_python_bindings() { +void _initialize_python_bindings() +{ // Initialize the at::Type* pointers, name, and properties of the PyTensorType // vector. After this call, the vector must not be resized. 
initialize_npu_aten_types(tensor_types); @@ -282,12 +300,17 @@ void _initialize_python_bindings() { py_bind_tensor_types(tensor_types); } -static void py_bind_tensor_types(const std::vector& tensor_types) { +static void py_bind_tensor_types(const std::vector& tensor_types) +{ auto torch_module = THPObjectPtr(PyImport_ImportModule("torch")); - if (!torch_module) throw python_error(); + if (!torch_module) { + throw python_error(); + } auto tensor_classes = THPObjectPtr(PyObject_GetAttrString(torch_module.get(), "_tensor_classes")); - if (!tensor_classes) throw python_error(); + if (!tensor_classes) { + throw python_error(); + } for (auto& tensor_type : tensor_types) { auto name = std::string(tensor_type.name); @@ -296,7 +319,9 @@ static void py_bind_tensor_types(const std::vector& tensor_types) auto module_name = name.substr(0, idx); auto module_obj = THPObjectPtr(PyImport_ImportModule(module_name.c_str())); - if (!module_obj) throw python_error(); + if (!module_obj) { + throw python_error(); + } PyObject* type_obj = (PyObject*)&tensor_type; Py_INCREF(type_obj); @@ -310,7 +335,8 @@ static void py_bind_tensor_types(const std::vector& tensor_types) } // Callback for python part. Used for additional initialization of python classes -static PyObject* THPModule_initExtension(PyObject *_unused, PyObject *noargs) { +static PyObject* THPModule_initExtension(PyObject *_unused, PyObject *noargs) +{ HANDLE_TH_ERRORS _initialize_python_bindings(); Py_RETURN_NONE; @@ -323,7 +349,8 @@ static PyMethodDef TorchNpuExtensionMethods[] = { {nullptr, nullptr, 0, nullptr} }; -PyMethodDef* npu_extension_functions() { +PyMethodDef* npu_extension_functions() +{ return TorchNpuExtensionMethods; } } diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index 1ae73a7c83..3eb1e40db2 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -3,8 +3,8 @@ import pickle from typing import Any, Optional import torch -from torch.serialization import _check_dill_version, _open_file_like, _is_zipfile,\ - _open_zipfile_reader, _is_torchscript_zip, _weights_only_unpickler,\ +from torch.serialization import _check_dill_version, _open_file_like, _is_zipfile, \ + _open_zipfile_reader, _is_torchscript_zip, _weights_only_unpickler, \ _legacy_load, _load, FILE_LIKE, MAP_LOCATION, DEFAULT_PROTOCOL import torch_npu -- Gitee From df246917378da593c456f6b4275c421e1c7feb49 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 17 Mar 2025 05:00:39 +0000 Subject: [PATCH 174/358] !19061 Update op_plugin commit id Merge pull request !19061 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 04d2dfa19c..5144275d68 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 04d2dfa19c51839857ead5bf6bc15d7c6fd1eee0 +Subproject commit 5144275d6827f02c729aed0305328ad380c3f5b4 -- Gitee From 54b16ff50b24d803fbfa05282a200d3462a38baf Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Mon, 17 Mar 2025 08:28:14 +0000 Subject: [PATCH 175/358] !19059 [4/N] cleancode (torch_npu/csrc/aten) Merge pull request !19059 from dilililiwhy/cleancode_aten_260_part4 --- torch_npu/csrc/aten/NPUGeneratorImpl.cpp | 30 ++----------------- torch_npu/csrc/aten/NPUGeneratorImpl.h | 1 - torch_npu/csrc/aten/PinMemory.cpp | 1 - torch_npu/csrc/aten/common/ChangeDataPtr.cpp | 16 +++++----- torch_npu/csrc/aten/common/CopyKernel.cpp | 5 ++-- .../csrc/aten/common/CopyMemoryKernel.cpp | 3 
-- .../csrc/aten/common/FormatCastHelper.cpp | 2 +- .../aten/common/PinnedMemoryAllocator.cpp | 2 -- torch_npu/csrc/aten/common/ResizeNpu.cpp | 2 +- torch_npu/csrc/aten/common/ResizeNpu.h | 3 +- torch_npu/csrc/aten/common/SetNpu.h | 4 +++ .../csrc/aten/common/TensorFactories.cpp | 2 +- torch_npu/csrc/aten/common/TensorShape.cpp | 2 +- torch_npu/csrc/aten/common/ToKernelNpu.cpp | 4 +-- torch_npu/csrc/aten/common/from_blob.cpp | 2 +- .../csrc/aten/mirror/NPUMemoryOverlap.cpp | 4 +-- .../aten/ops/HasCompatibleShallowCopyType.cpp | 2 +- 17 files changed, 28 insertions(+), 57 deletions(-) diff --git a/torch_npu/csrc/aten/NPUGeneratorImpl.cpp b/torch_npu/csrc/aten/NPUGeneratorImpl.cpp index 5695fb69e2..656315f56d 100644 --- a/torch_npu/csrc/aten/NPUGeneratorImpl.cpp +++ b/torch_npu/csrc/aten/NPUGeneratorImpl.cpp @@ -5,8 +5,8 @@ #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" -#include "torch_npu/csrc/aten/NPUGeneratorImpl.h" #include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" +#include "torch_npu/csrc/aten/NPUGeneratorImpl.h" namespace at_npu { namespace detail { @@ -103,8 +103,8 @@ at::Generator createNPUGenerator(c10::DeviceIndex device_index) * NPUGeneratorImpl class implementation */ NPUGeneratorImpl::NPUGeneratorImpl(c10::DeviceIndex device_index) - : c10::GeneratorImpl{c10::Device(c10::DeviceType::PrivateUse1, device_index), - c10::DispatchKeySet(c10::DispatchKey::PrivateUse1)} + : c10::GeneratorImpl{c10::Device(c10::DeviceType::PrivateUse1, device_index), + c10::DispatchKeySet(c10::DispatchKey::PrivateUse1)} { c10_npu::assertNotCapturing("Not support Generator while in capture mode"); } @@ -310,30 +310,6 @@ PhiloxNpuState NPUGeneratorImpl::philox_npu_state(uint64_t increment) c10_npu::assertNotCapturing("Not support Generator while in capture mode"); // rounds increment up to the nearest multiple of 4 increment = ((increment + 3) / 4) * 4; - /* - if (at::npu::currentStreamCaptureStatus() != at::npu::CaptureStatus::None) { - TORCH_CHECK(graph_expects_this_gen_, - "philox_npu_state for an unexpected NPU generator used during capture. " - CAPTURE_DEFAULT_GENS_MSG); - // see Note [Why enforce RNG offset % 4 == 0?] - TORCH_INTERNAL_ASSERT(this->offset_intragraph_ % 4 == 0); - uint32_t offset = this->offset_intragraph_; - TORCH_INTERNAL_ASSERT(this->offset_intragraph_ <= - std::numeric_limits::max() - increment); - this->offset_intragraph_ += increment; - return PhiloxNpuState(this->seed_, - this->offset_extragraph_, - offset); - } else { - TORCH_CHECK(!graph_expects_this_gen_, - "NPU generator expects graph capture to be underway, " - "but the current stream is not capturing."); - // see Note [Why enforce RNG offset % 4 == 0?] 
- TORCH_INTERNAL_ASSERT(this->philox_offset_per_thread_ % 4 == 0); - uint64_t offset = this->philox_offset_per_thread_; - this->philox_offset_per_thread_ += increment; - return PhiloxNpuState(this->seed_, offset); - } */ return PhiloxNpuState(this->seed_, 0); } diff --git a/torch_npu/csrc/aten/NPUGeneratorImpl.h b/torch_npu/csrc/aten/NPUGeneratorImpl.h index d75ac0da08..145b349875 100644 --- a/torch_npu/csrc/aten/NPUGeneratorImpl.h +++ b/torch_npu/csrc/aten/NPUGeneratorImpl.h @@ -4,7 +4,6 @@ #include #include #include -#include #include "torch_npu/csrc/core/npu/NPUMacros.h" diff --git a/torch_npu/csrc/aten/PinMemory.cpp b/torch_npu/csrc/aten/PinMemory.cpp index 80cd0db97a..3cfdea1b2f 100644 --- a/torch_npu/csrc/aten/PinMemory.cpp +++ b/torch_npu/csrc/aten/PinMemory.cpp @@ -1,5 +1,4 @@ #include -#include #include #include #include diff --git a/torch_npu/csrc/aten/common/ChangeDataPtr.cpp b/torch_npu/csrc/aten/common/ChangeDataPtr.cpp index 0f27a471cb..34bb79a5de 100644 --- a/torch_npu/csrc/aten/common/ChangeDataPtr.cpp +++ b/torch_npu/csrc/aten/common/ChangeDataPtr.cpp @@ -5,11 +5,11 @@ namespace at_npu { namespace native { -int64_t NPUNativeFunctions::npu_change_data_ptr(const at::Tensor& dst, const at::Tensor& src, int64_t offset) +int64_t NPUNativeFunctions::npu_change_data_ptr(const at::Tensor& dst, const at::Tensor& src, int64_t index) { TORCH_CHECK( - offset >= 0, - "Expect offset equal or greater than zero, got: ", offset, PTA_ERROR(ErrCode::VALUE)); + index >= 0, + "Expect offset(index) equal or greater than zero, got: ", index, PTA_ERROR(ErrCode::VALUE)); const auto& src_scalar_type = src.scalar_type(); const auto& dst_scalar_type = dst.scalar_type(); @@ -32,22 +32,22 @@ int64_t NPUNativeFunctions::npu_change_data_ptr(const at::Tensor& dst, const at: int64_t src_storage_size = c10::multiply_integers(src_sizes); TORCH_CHECK( - offset + dst_storage_size * dst.element_size() <= + index + dst_storage_size * dst.element_size() <= src_storage_size * src.element_size(), "Offsets overflow, got: ", - "offset ", offset, + "offset(index) ", index, ", dst storage size ", dst_storage_size, ", src storage size ", src_storage_size, PTA_ERROR(ErrCode::PARAM)); at::DataPtr aim_data_ptr; if (src_scalar_type == at::ScalarType::Float) { - float* data_ptr = static_cast(src.storage().data_ptr().get()) + offset; + float* data_ptr = static_cast(src.storage().data_ptr().get()) + index; aim_data_ptr = at::DataPtr(data_ptr, dst.storage().device()); } else if (src_scalar_type == at::ScalarType::BFloat16) { - at::BFloat16* data_ptr = static_cast(src.storage().data_ptr().get()) + offset; + at::BFloat16* data_ptr = static_cast(src.storage().data_ptr().get()) + index; aim_data_ptr = at::DataPtr(data_ptr, dst.storage().device()); } else { - at::Half* data_ptr = static_cast(src.storage().data_ptr().get()) + offset; + at::Half* data_ptr = static_cast(src.storage().data_ptr().get()) + index; aim_data_ptr = at::DataPtr(data_ptr, dst.storage().device()); } dst.storage().set_data_ptr(std::move(aim_data_ptr)); diff --git a/torch_npu/csrc/aten/common/CopyKernel.cpp b/torch_npu/csrc/aten/common/CopyKernel.cpp index 53d1b3c18e..feb0f9b887 100644 --- a/torch_npu/csrc/aten/common/CopyKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyKernel.cpp @@ -114,9 +114,8 @@ void copy_between_host_and_device( C10_NPU_SHOW_ERR_MSG(); if (c10_npu::option::OptionsManager::IsResumeModeEnable()) { TORCH_NPU_WARN("ACL stream synchronize failed, error code:", error, - ". 
But in checkpoint-resume mode will not throw exceptions."); - } - else { + ". But in checkpoint-resume mode will not throw exceptions."); + } else { AT_ERROR("ACL stream synchronize failed, error code:", error); } } diff --git a/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp b/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp index f7e1db3638..b7264a8600 100644 --- a/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp +++ b/torch_npu/csrc/aten/common/CopyMemoryKernel.cpp @@ -20,9 +20,6 @@ at::Tensor& NPUNativeFunctions::copy_memory_(at::Tensor& self, const at::Tensor& AT_ASSERT( src.dtype() == self.dtype(), "input tensors of copy_memory_ should have same dtype", OPS_ERROR(ErrCode::PARAM)); - // AT_ASSERT( - // src.is_contiguous() && self.is_contiguous(), - // "input tensors of copy_memory_ should be contiguous"); AT_ASSERT( src.device().index() == self.device().index(), "input tensors of copy_memory_ should have same device index", OPS_ERROR(ErrCode::PARAM)); diff --git a/torch_npu/csrc/aten/common/FormatCastHelper.cpp b/torch_npu/csrc/aten/common/FormatCastHelper.cpp index 6dd82d8ef5..2f61a7c782 100644 --- a/torch_npu/csrc/aten/common/FormatCastHelper.cpp +++ b/torch_npu/csrc/aten/common/FormatCastHelper.cpp @@ -1,9 +1,9 @@ #include "torch_npu/csrc/framework/FormatHelper.h" -#include "torch_npu/csrc/aten/common/FormatCastHelper.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/aten/common/FormatCastHelper.h" namespace at_npu { namespace native { diff --git a/torch_npu/csrc/aten/common/PinnedMemoryAllocator.cpp b/torch_npu/csrc/aten/common/PinnedMemoryAllocator.cpp index ae1b53cf9e..86cdda1638 100644 --- a/torch_npu/csrc/aten/common/PinnedMemoryAllocator.cpp +++ b/torch_npu/csrc/aten/common/PinnedMemoryAllocator.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -26,7 +25,6 @@ bool NPUNativeFunctions::is_pinned(const at::Tensor& self, c10::optional device) { - // TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!device.has_value() || device->is_npu()); auto allocator = getPinnedMemoryAllocator(); auto storage = c10::Storage( c10::Storage::use_byte_size_t(), diff --git a/torch_npu/csrc/aten/common/ResizeNpu.cpp b/torch_npu/csrc/aten/common/ResizeNpu.cpp index 510c97b2fa..af49fa1c33 100644 --- a/torch_npu/csrc/aten/common/ResizeNpu.cpp +++ b/torch_npu/csrc/aten/common/ResizeNpu.cpp @@ -1,9 +1,9 @@ #include #include -#include "torch_npu/csrc/aten/common/ResizeNpu.h" #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/aten/common/ResizeNpu.h" namespace at_npu { namespace native { diff --git a/torch_npu/csrc/aten/common/ResizeNpu.h b/torch_npu/csrc/aten/common/ResizeNpu.h index df7724c65c..c063534bc1 100644 --- a/torch_npu/csrc/aten/common/ResizeNpu.h +++ b/torch_npu/csrc/aten/common/ResizeNpu.h @@ -49,7 +49,7 @@ static void storage_resize_npu( return; } std::vector resize_shape = { - size/static_cast(itemsize) + size / static_cast(itemsize) }; // It is necessary to properly refresh the storage according to sizes and strides, // not just new sizes. 
@@ -151,7 +151,6 @@ static void resize_nd_npu( const int64_t* size, const int64_t* stride) { - // AT_CHECK(nDimension >= 0, "resizeNd nDimension must be non-negative"); c10::IntArrayRef sizes(size, nDimension); at::optional strides; if (stride != nullptr) { diff --git a/torch_npu/csrc/aten/common/SetNpu.h b/torch_npu/csrc/aten/common/SetNpu.h index a35a0f4114..b7ed357dc3 100644 --- a/torch_npu/csrc/aten/common/SetNpu.h +++ b/torch_npu/csrc/aten/common/SetNpu.h @@ -1,3 +1,6 @@ +#ifndef SETNPU_H_ +#define SETNPU_H_ + #include #include #include @@ -16,3 +19,4 @@ at::Tensor set_tensor_with_storage_format(c10::Storage src); } // namespace native } // namespace at_npu +#endif // SETNPU_H diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index c1b701f455..29ee30489c 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -72,7 +72,7 @@ size_t computeStorageNbytes( if (sizes[i] == 0) { return 0; } - size += strides[i]*(sizes[i]-1); + size += strides[i] * (sizes[i] - 1); } return size * itemsize_bytes; } diff --git a/torch_npu/csrc/aten/common/TensorShape.cpp b/torch_npu/csrc/aten/common/TensorShape.cpp index 6245d9e1f4..ddf6e5c4d1 100644 --- a/torch_npu/csrc/aten/common/TensorShape.cpp +++ b/torch_npu/csrc/aten/common/TensorShape.cpp @@ -9,11 +9,11 @@ #include #include #include -#include "torch_npu/csrc/core/npu/NPUException.h" #include #include #include +#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/framework/InferFormat.h" #include "torch_npu/csrc/aten/common/FormatCastHelper.h" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" diff --git a/torch_npu/csrc/aten/common/ToKernelNpu.cpp b/torch_npu/csrc/aten/common/ToKernelNpu.cpp index 59025bbf97..02fac3c9ef 100644 --- a/torch_npu/csrc/aten/common/ToKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/ToKernelNpu.cpp @@ -155,12 +155,12 @@ at::Tensor NPUNativeFunctions::to( if (self.dtype() == dtype) { return self; } - if (at::ScalarType::Double == dtype) { + if (dtype == at::ScalarType::Double) { TORCH_NPU_WARN_ONCE( "Warning: Device do not support double dtype now, " "dtype cast repalce with float."); } - dtype = (at::ScalarType::Double == dtype) ? at::ScalarType::Float : dtype; + dtype = (dtype == at::ScalarType::Double) ? 
at::ScalarType::Float : dtype; return custom_ops::npu_dtype_cast(self, dtype); } diff --git a/torch_npu/csrc/aten/common/from_blob.cpp b/torch_npu/csrc/aten/common/from_blob.cpp index fca5fca3de..08f2e63fd2 100644 --- a/torch_npu/csrc/aten/common/from_blob.cpp +++ b/torch_npu/csrc/aten/common/from_blob.cpp @@ -2,7 +2,6 @@ #include #include -#include "torch_npu/csrc/aten/common/from_blob.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" @@ -10,6 +9,7 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" #include "torch_npu/csrc/aten/common/TensorFactories.h" +#include "torch_npu/csrc/aten/common/from_blob.h" namespace at_npu { diff --git a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp index 2454731c80..9602ee74b9 100644 --- a/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp +++ b/torch_npu/csrc/aten/mirror/NPUMemoryOverlap.cpp @@ -1,6 +1,6 @@ -#include "NPUMemoryOverlap.h" -#include "torch_npu/csrc/core/npu/NPUException.h" #include +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "NPUMemoryOverlap.h" namespace at_npu { namespace native { diff --git a/torch_npu/csrc/aten/ops/HasCompatibleShallowCopyType.cpp b/torch_npu/csrc/aten/ops/HasCompatibleShallowCopyType.cpp index 59c1b3aedb..6fe1d80db2 100644 --- a/torch_npu/csrc/aten/ops/HasCompatibleShallowCopyType.cpp +++ b/torch_npu/csrc/aten/ops/HasCompatibleShallowCopyType.cpp @@ -1,6 +1,6 @@ +#include #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/framework/utils/OpAdapter.h" -#include namespace at_npu { namespace native { -- Gitee From 32d3475e89463fdd8ec41c57f957ed8df7be2471 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 17 Mar 2025 08:29:59 +0000 Subject: [PATCH 176/358] =?UTF-8?q?!19067=20SilentCheck:=20grad=20silent?= =?UTF-8?q?=20check=20fix=20Merge=20pull=20request=20!19067=20from=20?= =?UTF-8?q?=E7=8E=8B=E8=B6=85/v2.6.0=5Fsilentperf2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/_step.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index 9ebf364054..314e0b69c9 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -54,7 +54,10 @@ def input_hook(idx, asd_flag): loggerSilent.debug(f"input_hook: IS_IN_BACKWARD is {IS_IN_BACKWARD}, will change to False. 
idx is {idx}, flag is {asd_flag}") IS_IN_BACKWARD = False torch_npu._C._npu_set_call_state("forward") - _silent_fault_detector_v2.silent_fault_check(idx, asd_flag, grad) + if torch_npu._C._get_silent_check_version() == 3: + _silent_fault_detector_v3.silent_fault_check(idx, asd_flag, grad) + else: + _silent_fault_detector_v2.silent_fault_check(idx, asd_flag, grad) return return hook -- Gitee From 21211e56b2b942e6e46768ff133846d8c47f6590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Mon, 17 Mar 2025 12:01:57 +0000 Subject: [PATCH 177/358] =?UTF-8?q?!18684=20CleanCode=20fix=20G.FMT.05=20M?= =?UTF-8?q?erge=20pull=20request=20!18684=20from=20=E5=8F=B6=E5=AD=90?= =?UTF-8?q?=E5=87=A1/v2.6.0=5Fcleancode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/distributed/__init__.py | 6 +++--- torch_npu/distributed/distributed_c10d.py | 4 ++-- torch_npu/distributed/rendezvous.py | 4 ++-- torch_npu/distributed/rpc/__init__.py | 4 ++-- torch_npu/distributed/rpc/backend_registry.py | 4 ++-- torch_npu/distributed/rpc/options.py | 5 ++--- torch_npu/distributed/run.py | 4 ++-- 7 files changed, 15 insertions(+), 16 deletions(-) diff --git a/torch_npu/distributed/__init__.py b/torch_npu/distributed/__init__.py index 12b706cea7..7fb92f44e0 100644 --- a/torch_npu/distributed/__init__.py +++ b/torch_npu/distributed/__init__.py @@ -1,10 +1,10 @@ -import torch_npu -from torch_npu.utils._error_code import ErrCode, dist_error - __all__ = [ "is_hccl_available", "reinit_process_group", "reduce_scatter_tensor_uneven", "all_gather_into_tensor_uneven" ] +import torch_npu +from torch_npu.utils._error_code import ErrCode, dist_error + def is_available(): """ diff --git a/torch_npu/distributed/distributed_c10d.py b/torch_npu/distributed/distributed_c10d.py index af2bce03fd..ef66ea772b 100644 --- a/torch_npu/distributed/distributed_c10d.py +++ b/torch_npu/distributed/distributed_c10d.py @@ -1,3 +1,5 @@ +__all__ = ["is_hccl_available", "reinit_process_group"] + from datetime import timedelta from typing import Optional import warnings @@ -13,8 +15,6 @@ from torch.distributed.distributed_c10d import _get_default_group, get_group_ran from torch_npu.utils._error_code import ErrCode, dist_error -__all__ = ["is_hccl_available", "reinit_process_group"] - def _batch_isend_irecv(p2p_op_list): group = p2p_op_list[0].group diff --git a/torch_npu/distributed/rendezvous.py b/torch_npu/distributed/rendezvous.py index f20c858942..80b10f6dba 100644 --- a/torch_npu/distributed/rendezvous.py +++ b/torch_npu/distributed/rendezvous.py @@ -1,3 +1,5 @@ +__all__ = [] + try: from urllib.parse import urlparse, urlunparse except ImportError as e: @@ -22,8 +24,6 @@ log = logging.getLogger(__name__) _default_timeout_seconds = 600 -__all__ = [] - def _rendezvous_error(msg): return ValueError("Error initializing torch_npu.distributed using " + msg) diff --git a/torch_npu/distributed/rpc/__init__.py b/torch_npu/distributed/rpc/__init__.py index 3ea5708096..7a06791ec6 100644 --- a/torch_npu/distributed/rpc/__init__.py +++ b/torch_npu/distributed/rpc/__init__.py @@ -1,7 +1,7 @@ -import torch.distributed.rpc as rpc - __all__ = [] +import torch.distributed.rpc as rpc + if rpc.is_available(): from . 
import backend_registry diff --git a/torch_npu/distributed/rpc/backend_registry.py b/torch_npu/distributed/rpc/backend_registry.py index d0542d6321..749581c6b8 100644 --- a/torch_npu/distributed/rpc/backend_registry.py +++ b/torch_npu/distributed/rpc/backend_registry.py @@ -1,3 +1,5 @@ +__all__ = [] + import torch import torch.distributed as dist import torch.distributed.rpc as rpc @@ -9,8 +11,6 @@ from torch.distributed.rpc import constants as rpc_constants import torch_npu._C from torch_npu.utils._error_code import ErrCode, dist_error -__all__ = [] - def _get_device_count_info(): # Function used to replace torch.cuda.device_count in torch_npu diff --git a/torch_npu/distributed/rpc/options.py b/torch_npu/distributed/rpc/options.py index 3b21453237..8821dfda05 100644 --- a/torch_npu/distributed/rpc/options.py +++ b/torch_npu/distributed/rpc/options.py @@ -1,3 +1,5 @@ +__all__ = ["NPUTensorPipeRpcBackendOptions"] + from typing import Dict, List, Optional, Union import torch @@ -10,9 +12,6 @@ from torch_npu.utils._error_code import ErrCode, dist_error DeviceType = Union[int, str, torch.device] -__all__ = ["NPUTensorPipeRpcBackendOptions"] - - def _to_device(device: DeviceType) -> torch.device: device = torch.device(device) if device.type != _get_privateuse1_backend_name(): diff --git a/torch_npu/distributed/run.py b/torch_npu/distributed/run.py index 6cb3be42f7..3fb124bd14 100644 --- a/torch_npu/distributed/run.py +++ b/torch_npu/distributed/run.py @@ -1,11 +1,11 @@ +__all__ = ["parse_args"] + from torch.distributed import run as torch_run from torch.distributed.argparse_util import check_env, env from torch.distributed.run import get_args_parser from torch.distributed.elastic.multiprocessing.errors import record import torch_npu -__all__ = ["parse_args"] - def parse_args(args): parser = get_args_parser() -- Gitee From 18a24e925e4663b727686915631f538000a6ff8a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 17 Mar 2025 13:45:39 +0000 Subject: [PATCH 178/358] !19105 Update op_plugin commit id Merge pull request !19105 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5144275d68..484d181d99 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5144275d6827f02c729aed0305328ad380c3f5b4 +Subproject commit 484d181d99dfe54a2964874850856e4c1038c046 -- Gitee From ad48d8f0e74d3018459ef46e0ca2de651e8fdabb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Tue, 18 Mar 2025 09:31:44 +0000 Subject: [PATCH 179/358] =?UTF-8?q?!19100=20fix=20std::out=5Fof=5Frange=20?= =?UTF-8?q?error=20in=20serialization=20Merge=20pull=20request=20!19100=20?= =?UTF-8?q?from=20=E9=97=AB=E9=B9=8F=E5=85=A8/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/framework/StorageDescHelper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/StorageDescHelper.cpp b/torch_npu/csrc/framework/StorageDescHelper.cpp index ea47f6620e..6a23a5e4b9 100644 --- a/torch_npu/csrc/framework/StorageDescHelper.cpp +++ b/torch_npu/csrc/framework/StorageDescHelper.cpp @@ -172,7 +172,7 @@ void StorageDescHelper::SetDescForSerialization(const at::Tensor &tensor, if (str[end] != '/') { end++; } else { - vec.emplace_back(std::stoi(str.substr(start, end - start))); + vec.emplace_back(std::stoll(str.substr(start, end - start))); end++; start = end; } -- Gitee From 
53dde395c5f1a7eee0fd698861bd181f52117726 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 18 Mar 2025 10:00:39 +0000 Subject: [PATCH 180/358] !19126 Update op_plugin commit id Merge pull request !19126 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 484d181d99..ed9f48edcd 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 484d181d99dfe54a2964874850856e4c1038c046 +Subproject commit ed9f48edcdf2ae747d550cf6d4fd6dbe6ebccb09 -- Gitee From c0bd0334878697e28efa34adb4d5ab3927becf8b Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Tue, 18 Mar 2025 12:11:46 +0000 Subject: [PATCH 181/358] !19049 Add APIs for CANNVersion. Merge pull request !19049 from yuhaiyan/v2.6.0-dev2 --- test/npu/test_cann_version.py | 36 +++++++ .../csrc/core/npu/interface/AclInterface.cpp | 15 +++ .../csrc/core/npu/interface/AclInterface.h | 3 + torch_npu/csrc/npu/GetCANNInfo.cpp | 96 +++++++++++++++++++ torch_npu/csrc/npu/GetCANNInfo.h | 14 +++ torch_npu/csrc/npu/Module.cpp | 29 ++++++ torch_npu/npu/utils.py | 27 +++++- 7 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 test/npu/test_cann_version.py create mode 100644 torch_npu/csrc/npu/GetCANNInfo.cpp create mode 100644 torch_npu/csrc/npu/GetCANNInfo.h diff --git a/test/npu/test_cann_version.py b/test/npu/test_cann_version.py new file mode 100644 index 0000000000..72f69e638a --- /dev/null +++ b/test/npu/test_cann_version.py @@ -0,0 +1,36 @@ +import re + +from torch.testing._internal.common_utils import TestCase, run_tests +import torch +import torch_npu +from torch_npu.utils.collect_env import get_cann_version as get_cann_version_from_env +from torch_npu.npu.utils import get_cann_version, _is_gte_cann_version + + +class TestCANNversion(TestCase): + def test_get_cann_version(self): + version_env = get_cann_version_from_env() + version = get_cann_version(module="CANN") + if not version_env.startswith("CANN") and version_env >= "8.1.RC1": + is_match = (re.match("([0-9]+).([0-9]+).RC([0-9]+)", version) + or re.match("([0-9]+).([0-9]+).([0-9]+)", version) + or re.match("([0-9]+).([0-9]+).T([0-9]+)", version) + or re.match("([0-9]+).([0-9]+).RC([0-9]+).alpha([0-9]+)", version)) + self.assertTrue(is_match, f"The env version is {version_env}. 
The format of cann version {version} is invalid.") + else: + self.assertEqual(version, "") + + + def test_compare_cann_version(self): + version_env = get_cann_version_from_env() + if not version_env.startswith("CANN") and version_env >= "8.1.RC1": + result = _is_gte_cann_version("8.1.RC1", module="CANN") + self.assertTrue(result, f"The env version is {version_env}, the result from _is_gte_cann_version is False") + else: + with self.assertRaisesRegex(RuntimeError, + "When the version is less than \"8.1.RC1\", this function is not supported"): + _is_gte_cann_version("7.0.0", "CANN") + + +if __name__ == "__main__": + run_tests() \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index dc6c7381bd..0cca8cf753 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -75,6 +75,7 @@ LOAD_FUNCTION(aclmdlCaptureEnd) LOAD_FUNCTION(aclmdlDebugPrint) LOAD_FUNCTION(aclmdlExecuteAsync) LOAD_FUNCTION(aclmdlUnload) +LOAD_FUNCTION(aclsysGetCANNVersion) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -771,6 +772,20 @@ aclError AclmdlExecuteAsync(uint32_t modelId, aclrtStream stream) return func(modelId, inputs, outputs, stream); } +aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *version) +{ + using aclsysGetCANNVersionFunc = aclError(*)(aclCANNPackageName, aclCANNPackageVersion *); + static aclsysGetCANNVersionFunc func = nullptr; + if (func == nullptr) { + func = (aclsysGetCANNVersionFunc)GET_FUNC(aclsysGetCANNVersion); + if (func == nullptr) { + return ACL_ERROR_RT_FEATURE_NOT_SUPPORT; + } + } + + return func(name, version); +} + aclError AclmdlUnload(uint32_t modelId) { typedef aclError (*AclmdlUnload)(uint32_t); diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 96a080eb0c..245ad09584 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -7,6 +7,7 @@ #include "third_party/acl/inc/acl/acl_mdl.h" #include "third_party/acl/inc/acl/acl_prof.h" #include "torch_npu/csrc/core/npu/interface/HcclInterface.h" +#include "third_party/acl/inc/acl/acl.h" namespace c10_npu { @@ -192,6 +193,8 @@ aclError AclmdlDebugPrint(uint32_t modelId); aclError AclmdlExecuteAsync(uint32_t modelId, aclrtStream stream); +aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *version); + aclError AclmdlUnload(uint32_t modelId); bool IsCaptureSupported(); diff --git a/torch_npu/csrc/npu/GetCANNInfo.cpp b/torch_npu/csrc/npu/GetCANNInfo.cpp new file mode 100644 index 0000000000..ed9255c4a9 --- /dev/null +++ b/torch_npu/csrc/npu/GetCANNInfo.cpp @@ -0,0 +1,96 @@ +#include +#include "torch_npu/csrc/npu/GetCANNInfo.h" +#include "torch_npu/csrc/core/npu/register/FunctionLoader.h" +#include "torch_npu/csrc/core/npu/NPUException.h" +#include "torch_npu/csrc/core/npu/interface/AclInterface.h" +#include "third_party/acl/inc/acl/acl.h" + + +std::unordered_map packageNameMap = { + {"CANN", ACL_PKG_NAME_CANN}, + {"RUNTIME", ACL_PKG_NAME_RUNTIME}, + {"COMPILER", ACL_PKG_NAME_COMPILER}, + {"HCCL", ACL_PKG_NAME_HCCL}, + {"TOOLKIT", ACL_PKG_NAME_TOOLKIT}, + {"OPP", ACL_PKG_NAME_OPP}, + {"OPP_KERNEL", ACL_PKG_NAME_OPP_KERNEL}, + {"DRIVER", ACL_PKG_NAME_DRIVER} +}; + +double VersionToNum(std::string versionStr) +{ + std::smatch results; + int major = -1; + int minor = -1; + int 
release = -1; + int RCVersion = -51; + int TVersion = -1; + int alphaVersion = 0; + if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+)"))) { + major = stoi(results[1]); + minor = stoi(results[2]); + RCVersion = stoi(results[3]); + } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).([0-9]+)"))) { + major = stoi(results[1]); + minor = stoi(results[2]); + release = stoi(results[3]); + } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).T([0-9]+)"))) { + major = stoi(results[1]); + minor = stoi(results[2]); + TVersion = stoi(results[3]); + } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+).alpha([0-9]+)"))) { + major = stoi(results[1]); + minor = stoi(results[2]); + RCVersion = stoi(results[3]); + alphaVersion = stoi(results[4]); + } else { + TORCH_NPU_WARN_ONCE("Version: " + versionStr + " is invalid."); + return 0.0; + } + + double num = ((major + 1) * 100000000) + ((minor + 1) * 1000000) + ((release + 1) * 10000) + ((RCVersion + 1) * 100 + 5000) + ((TVersion + 1) * 100) - (100 - alphaVersion); + return num; +} + +std::unordered_map CANNVersionCache; + +std::string GetCANNVersion(const std::string& module) +{ + auto it = CANNVersionCache.find(module); + if (it != CANNVersionCache.end()) { + return it->second; + } + auto find_module = packageNameMap.find(module); + if (find_module == packageNameMap.end()) { + TORCH_NPU_WARN_ONCE("module " + module + "is invalid."); + CANNVersionCache[module] = ""; + return ""; + } + aclCANNPackageName name = find_module->second; + aclCANNPackageVersion version; + aclError ret = c10_npu::acl::AclsysGetCANNVersion(name, &version); + if (ret == ACL_ERROR_RT_FEATURE_NOT_SUPPORT) { + TORCH_NPU_WARN_ONCE("Failed to find function aclsysGetCANNVersion"); + CANNVersionCache[module] = ""; + return ""; + } + std::string module_version = version.version; + CANNVersionCache[module] = module_version; + return module_version; +} + +bool IsGteCANNVersion(const std::string version, const std::string module) +{ + static std::string baseVersion = "8.1.RC1"; + if (version.compare(baseVersion) < 0) { + TORCH_CHECK(false, "When the version is less than \"8.1.RC1\", this function is not supported.", PTA_ERROR(ErrCode::VALUE)); + } + std::string currentVersion = GetCANNVersion(module); + double current_num = VersionToNum(currentVersion); + double boundary_num = VersionToNum(version); + if (current_num >= boundary_num) { + return true; + } else { + return false; + } +} \ No newline at end of file diff --git a/torch_npu/csrc/npu/GetCANNInfo.h b/torch_npu/csrc/npu/GetCANNInfo.h new file mode 100644 index 0000000000..8c3aa86c6b --- /dev/null +++ b/torch_npu/csrc/npu/GetCANNInfo.h @@ -0,0 +1,14 @@ +#ifndef THNP_GETCANNINFO_INC +#define THNP_GETCANNINFO_INC +#include "torch_npu/csrc/core/npu/NPUMacros.h" + + +TORCH_NPU_API std::string GetCANNVersion(const std::string& module = "CANN"); + +/* +support version format: a.b.c, a.b.RCd, a.b.Tg, a.b.RCd.alphaf +formula: ((a+1) * 100000000) + ((b+1) * 1000000) + ((c+1) * 10000) + ((d+1) * 100 + 5000) + ((g+1) * 100) - (100 - f) +*/ +bool IsGteCANNVersion(const std::string version, const std::string module = "CANN"); + +#endif \ No newline at end of file diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 03aecd7adb..3a37cd73a1 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -43,6 +43,7 @@ #include "torch_npu/csrc/profiler/msprof_tx.h" #include 
"torch_npu/csrc/npu/memory_snapshot.h" #include "torch_npu/csrc/core/npu/interface/OpInterface.h" +#include "torch_npu/csrc/npu/GetCANNInfo.h" #include "op_plugin/utils/custom_functions/opapi/FFTCommonOpApi.h" struct NPUDeviceProp { @@ -1306,6 +1307,32 @@ PyObject* THNPModule_npu_clear_fft_plan_cache(PyObject* self, PyObject* noargs) END_HANDLE_TH_ERRORS } +static PyObject* THNPModule_get_cann_version(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + TORCH_CHECK(THPUtils_checkString(args), "invalid value of module, module must be string", PTA_ERROR(ErrCode::PARAM)); + std::string module = THPUtils_unpackString(args); + std::string version = GetCANNVersion(module); + return THPUtils_packString(version); + END_HANDLE_TH_ERRORS +} + +static PyObject* THNPModule_is_gte_cann_version(PyObject* self, PyObject *args) +{ + HANDLE_TH_ERRORS + static torch::PythonArgParser parser( + {"_is_gte_cann_version(std::string version, std::string module)", }, + false); + torch::ParsedArgs<2> parsed_args; + auto _r = parser.parse(args, nullptr, parsed_args); + string version = _r.string(0); + string module = _r.string(1); + + bool compareResult = IsGteCANNVersion(version, module); + return Py_BuildValue("i", int(compareResult)); + END_HANDLE_TH_ERRORS +} + static struct PyMethodDef THNPModule_methods[] = { {"_npu_init", (PyCFunction)THNPModule_initExtension, METH_NOARGS, nullptr}, {"_npu_set_run_yet_variable_to_false", (PyCFunction)THNPModule_set_run_yet_variable_to_false_wrap, METH_NOARGS, nullptr}, @@ -1362,6 +1389,8 @@ static struct PyMethodDef THNPModule_methods[] = { {"_npu_get_fft_plan_cache_max_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_max_size, METH_NOARGS, nullptr}, {"_npu_get_fft_plan_cache_size", (PyCFunction)THNPModule_npu_get_fft_plan_cache_size, METH_NOARGS, nullptr}, {"_npu_clear_fft_plan_cache", (PyCFunction)THNPModule_npu_clear_fft_plan_cache, METH_NOARGS, nullptr}, + {"_get_cann_version", (PyCFunction)THNPModule_get_cann_version, METH_O, nullptr}, + {"_is_gte_cann_version", (PyCFunction)THNPModule_is_gte_cann_version, METH_VARARGS, nullptr}, {nullptr}}; TORCH_NPU_API PyMethodDef* THNPModule_get_methods() diff --git a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index 8e2b7b9222..59a70856f1 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -19,7 +19,32 @@ __all__ = ["synchronize", "device_count", "can_device_access_peer", "set_device" "stream", "set_stream", "current_stream", "default_stream", "set_sync_debug_mode", "get_sync_debug_mode", "init_dump", "set_dump", "finalize_dump", "is_support_inf_nan", "is_bf16_supported", "get_npu_overflow_flag", "npu_check_overflow", "clear_npu_overflow_flag", "current_blas_handle", - "check_uce_in_memory", "stress_detect"] + "check_uce_in_memory", "stress_detect", "get_cann_version"] + + +def get_cann_version(module="CANN"): + r""" + Args: + module: can be selected from [\"CANN\", \"RUNTIME\", \"COMPILER\", \"HCCL\", \"TOOLKIT\", \"OPP\", \"OPP_KERNEL\", \"DRIVER\"] + + Returns: current version. + + """ + return torch_npu._C._get_cann_version(module) + + +def _is_gte_cann_version(version, module="CANN"): + r""" + compare current cann_version and version. + Args: + version: the features are supported or not from which cann version. + module: can be selected from [\"CANN\", \"RUNTIME\", \"COMPILER\", \"HCCL\", \"TOOLKIT\", \"OPP\", \"OPP_KERNEL\", \"DRIVER\"] + + Returns: If current_version >= version, return True, else return False. 
+ + """ + result = torch_npu._C._is_gte_cann_version(version, module) + return True if result else False def synchronize(device=None): -- Gitee From 9459e37d4ca8fb7cefa05ea04ab38babfdf390ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=82=B2=E6=89=8D?= Date: Tue, 18 Mar 2025 12:19:18 +0000 Subject: [PATCH 182/358] =?UTF-8?q?!19043=20modify=20copy=5Ffrom=5Fand=5Fr?= =?UTF-8?q?esize=20Merge=20pull=20request=20!19043=20from=20=E7=8E=8B?= =?UTF-8?q?=E8=82=B2=E6=89=8D/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_fault_mode.py | 17 ++++++++++++----- .../aten/ops/CopyFromAndResizeKernelNpu.cpp | 10 ++++++++-- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/test/npu/test_fault_mode.py b/test/npu/test_fault_mode.py index 3ac8ee9638..d71f7e6fd7 100644 --- a/test/npu/test_fault_mode.py +++ b/test/npu/test_fault_mode.py @@ -102,11 +102,18 @@ class TestMode(TestCase): torch.Generator(device="cuda") def test_not_supported_ops(self): - with self.assertRaisesRegex(RuntimeError, "_copy_from_and_resize now only support copy with same size!"): - with self.assertRaisesRegex(Warning, "CAUTION: The operator 'aten::linalg_lstsq.out' is not currently " - "supported on the NPU backend and will fall back to run on the CPU. " - "This may have performance implications. (function npu_cpu_fallback)"): - torch.linalg.lstsq(torch.randn(1, 3, 3).npu(), torch.randn(2, 3, 3).npu()) + command = ['python', '-c', 'import torch; import torch_npu; torch.rand(1, 3, 3).npu().logit()'] + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + message = process.stderr.read() + process.stderr.close() + process.stdout.close() + process.terminate() + process.wait() + self.assertIn( + "CAUTION: The operator 'aten::logit' is not currently supported on the NPU backend and will fall back " + "to run on the CPU. This may have performance implications. (function npu_cpu_fallback)", + message + ) def test_param_verification(self): with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device. 
" diff --git a/torch_npu/csrc/aten/ops/CopyFromAndResizeKernelNpu.cpp b/torch_npu/csrc/aten/ops/CopyFromAndResizeKernelNpu.cpp index 6f1abc8911..afe0b6df26 100644 --- a/torch_npu/csrc/aten/ops/CopyFromAndResizeKernelNpu.cpp +++ b/torch_npu/csrc/aten/ops/CopyFromAndResizeKernelNpu.cpp @@ -6,14 +6,20 @@ namespace native { at::Tensor NPUNativeFunctions::_copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) { + TORCH_CHECK(dst.defined(), "dst is undefined", OPS_ERROR(ErrCode::NOT_SUPPORT)); + TORCH_CHECK(self.defined(), "self is undefined", OPS_ERROR(ErrCode::NOT_SUPPORT)); + + if (dst.numel() == 0) { + dst.resize_as_(self); + } TORCH_CHECK(self.sizes() == dst.sizes(), - "_copy_from_and_resize now only support copy with same size!", OPS_ERROR(ErrCode::NOT_SUPPORT)); + "_copy_from_and_resize now only support copy with same size, or dst.numel() == 0!", + OPS_ERROR(ErrCode::NOT_SUPPORT)); TORCH_CHECK(self.is_cpu() && dst.device().is_privateuseone(), "_copy_from_and_resize now only support copy from cpu tensor to npu tensor, but got src tensor device is ", self.device(), " and dst device is ", dst.device(), OPS_ERROR(ErrCode::NOT_SUPPORT)); dst.copy_(self); return dst; } - } } -- Gitee From 04f0544153622018e0ffb63b91076f7d0b3b2645 Mon Sep 17 00:00:00 2001 From: wangjie Date: Tue, 18 Mar 2025 12:37:21 +0000 Subject: [PATCH 183/358] !19085 [PROF] Dynamic profiler config path check Merge pull request !19085 from wangjie/dynamic_prof_path_check_260 --- .../_dynamic_profiler/_dynamic_profiler_monitor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py index 2a23115df1..510bd875dc 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_monitor.py @@ -2,9 +2,10 @@ import os import mmap import stat import time -import json import struct import multiprocessing +from ...utils._path_manager import PathManager +from ..analysis.prof_common_func._file_manager import FileManager from ._dynamic_profiler_config_context import ConfigContext from ._dynamic_profiler_utils import DynamicProfilerUtils from ._dynamic_profiler_monitor_shm import DynamicProfilerShareMemory @@ -144,8 +145,9 @@ def worker_func(params_dict): if not last_file_t or last_file_t != file_t: last_file_t = file_t try: - with open(cfg_path, 'r') as f: - data = json.load(f) + PathManager.check_input_file_path(cfg_path) + PathManager.check_directory_path_readable(cfg_path) + data = FileManager.read_json_file(cfg_path) # convert json to bytes data['is_valid'] = True DynamicProfilerUtils.out_log("Dynamic profiler process load json success", -- Gitee From d40adc4835d5c8f5abae4aed646904e72edc2a29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B5=E9=9D=9E=E5=87=A1?= Date: Tue, 18 Mar 2025 12:40:21 +0000 Subject: [PATCH 184/358] =?UTF-8?q?!19132=20Remove=20the=20unnecessary=20a?= =?UTF-8?q?daptation=20for=20scalar=5Ftensor=20Merge=20pull=20request=20!1?= =?UTF-8?q?9132=20from=20=E9=82=B5=E9=9D=9E=E5=87=A1/get=5Fdevice26?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/aten/common/TensorFactories.cpp | 18 ------------------ torch_npu/csrc/aten/npu_native_functions.yaml | 1 - 2 files changed, 19 deletions(-) diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index 29ee30489c..53bbb70ba2 
100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -79,24 +79,6 @@ size_t computeStorageNbytes( } // namespace -at::Tensor NPUNativeFunctions::scalar_tensor( - const c10::Scalar& s, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) -{ - at::tracer::impl::NoTracerDispatchMode tracer_guard; - at::AutoDispatchBelowAutograd mode; - auto result = at::native::empty_cpu({}, dtype, layout, c10::make_optional(c10::Device(at::kCPU)), pin_memory); - at::native::fill_(result, s); - if (device.has_value()) { - AT_ASSERT(device.value().type() == c10::DeviceType::PrivateUse1, OPS_ERROR(ErrCode::TYPE)); - return result.to(device.value()); - } - return result.to(at::device(c10::DeviceType::PrivateUse1)); -} - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ empty ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ at::Tensor NPUNativeFunctions::empty( c10::IntArrayRef size, diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index 144157eeb5..edbd46750b 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -39,7 +39,6 @@ supported: - resize_ - func: resize_as_ device_check: NoCheck - - scalar_tensor - set_ - set_.source_Storage - set_.source_Storage_storage_offset -- Gitee From 26d13ab1b1fd5bdb0285783044d3543aa371a97f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 18 Mar 2025 14:15:39 +0000 Subject: [PATCH 185/358] !19137 Update op_plugin commit id Merge pull request !19137 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ed9f48edcd..ae206e739a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ed9f48edcdf2ae747d550cf6d4fd6dbe6ebccb09 +Subproject commit ae206e739a541d716d53245bc166729cd8eaed6e -- Gitee From 526eb427b6a3292e66629cd9c461f0bc3475f9b8 Mon Sep 17 00:00:00 2001 From: wgb Date: Wed, 19 Mar 2025 06:16:53 +0000 Subject: [PATCH 186/358] !19152 exposed allocate_workspace api Merge pull request !19152 from wgb/2.6_copy --- torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp | 13 +++++++++++++ torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 753089c393..1c1b34a6da 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -10,6 +10,7 @@ #include "torch_npu/csrc/core/npu/register/OptionsManager.h" #include "torch_npu/csrc/core/npu/NPUFunctions.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" +#include "torch_npu/csrc/framework/utils/OpPreparation.h" #include "torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h" #ifndef BUILD_LIBTORCH @@ -17,6 +18,18 @@ #include "torch_npu/csrc/sanitizer/NPUTrace.h" #endif +namespace at_npu { +namespace native { + +at::Tensor allocate_workspace(uint64_t workspace_size, aclrtStream stream) +{ + return at_npu::native::OpPreparation::unsafe_empty_workspace(workspace_size, stream); +} + +} // namespace native +} // namespace at_npu + + namespace c10_npu { namespace NPUWorkspaceAllocator { diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h index 5f4d8abdf6..867a25c703 100644 --- 
a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.h @@ -13,3 +13,11 @@ void emptyCache(int device, bool need_empty_queue, bool check_error = true); } // namespace NPUWorkspaceAllocator } // namespace c10_npu + +namespace at_npu { +namespace native { + +TORCH_NPU_API at::Tensor allocate_workspace(uint64_t workspace_size, aclrtStream stream); + +} // namespace native +} // namespace at_npu -- Gitee From d2975904c859dfc2d76fb784c145d31a8368f2a9 Mon Sep 17 00:00:00 2001 From: wangjie Date: Wed, 19 Mar 2025 06:47:20 +0000 Subject: [PATCH 187/358] !19073 [PROF] Profiler mstx param check Merge pull request !19073 from wangjie/profiler_mstx_fix_260 --- test/torch_npu_schema.json | 4 ++-- torch_npu/npu/mstx.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index f664647d21..ad0e788829 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1125,7 +1125,7 @@ "signature": "()" }, "torch_npu.npu.mstx.mark": { - "signature": "(message: str = '')" + "signature": "(message='')" }, "torch_npu.npu.mstx.range_start": { "signature": "(message: str, stream=None) -> int" @@ -1413,7 +1413,7 @@ "signature": "()" }, "torch_npu.npu.mstx.mstx.mark": { - "signature": "(message: str = '')" + "signature": "(message='')" }, "torch_npu.npu.mstx.mstx.range_start": { "signature": "(message: str, stream=None) -> int" diff --git a/torch_npu/npu/mstx.py b/torch_npu/npu/mstx.py index 518ca45226..8ae78319ae 100644 --- a/torch_npu/npu/mstx.py +++ b/torch_npu/npu/mstx.py @@ -37,13 +37,16 @@ def _no_exception_func(default_ret=None): class mstx: @staticmethod @_no_exception_func() - def mark(message:str = ""): + def mark(message=""): + if not message or not isinstance(message, str): + warnings.warn("Invalid message for mstx.mark func. Please input valid message string.") + return torch_npu._C._mark(message) @staticmethod @_no_exception_func() def range_start(message: str, stream=None) -> int: - if not message: + if not message or not isinstance(message, str): warnings.warn("Invalid message for mstx.range_start func. 
Please input valid message string.") return 0 if stream: -- Gitee From 1452b864b50aef570ffee829bb69607228ad82bf Mon Sep 17 00:00:00 2001 From: Gallium Date: Wed, 19 Mar 2025 06:53:21 +0000 Subject: [PATCH 188/358] !19087 lazy init dynolog Merge pull request !19087 from Gallium/bug_fix_2.6.0 --- .../_dynamic_profiler/_dynamic_monitor_proxy.py | 11 ++++++----- .../_dynamic_profiler_config_context.py | 5 +++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_monitor_proxy.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_monitor_proxy.py index 18e0e51a5d..d86de245ce 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_monitor_proxy.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_monitor_proxy.py @@ -6,16 +6,17 @@ from ._dynamic_profiler_utils import DynamicProfilerUtils class PyDynamicMonitorProxySingleton(): def __init__(self): self._proxy = None - self._load_proxy() + self._load_success = True def _load_proxy(self): - if not self._proxy: + if not self._proxy and self._load_success: try: from IPCMonitor import PyDynamicMonitorProxy - self._proxy = PyDynamicMonitorProxy() except Exception as e: - dynamic_profiler_utils.stdout_log(f"Import IPCMonitro module failed :{e}!", - dynamic_profiler_utils.LoggerLevelEnum.WARNING) + self._load_success = False + return + self._proxy = PyDynamicMonitorProxy() def get_proxy(self): + self._load_proxy() return self._proxy \ No newline at end of file diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index 27060c4958..627f220393 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -185,7 +185,8 @@ class ConfigContext: if not self._is_dyno: self._analyse = json_data.get("analyse", False) else: - self._analyse = json_data.get("PROFILE_ANALYSE", False) + self._analyse = json_data.get("PROFILE_ANALYSE", 'false') + self._analyse = self.BOOL_MAP.get(self._analyse.lower(), False) def _parse_dyno_exp_cfg(self, json_data: dict): profiler_level = json_data.get('PROFILE_PROFILER_LEVEL', 'Level0') @@ -197,7 +198,7 @@ class ConfigContext: op_attr = json_data.get('PROFILE_OP_ATTR', 'false') op_attr = self.BOOL_MAP.get(op_attr.lower(), False) gc_detect_threshold = json_data.get('PROFILE_GC_DETECT_THRESHOLD', None) - if gc_detect_threshold is not None: + if isinstance(gc_detect_threshold, str) and gc_detect_threshold != "None": gc_detect_threshold = float(gc_detect_threshold) data_simplification = json_data.get('PROFILE_DATA_SIMPLIFICATION', 'true') data_simplification = self.BOOL_MAP.get(data_simplification.lower(), True) -- Gitee From 9dbac05f55d63a4fb4788a5818bfd8079e1cc781 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 19 Mar 2025 10:00:41 +0000 Subject: [PATCH 189/358] !19173 Update op_plugin commit id Merge pull request !19173 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ae206e739a..ed50babe55 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ae206e739a541d716d53245bc166729cd8eaed6e +Subproject commit ed50babe55fde1fa1038a0b4536de9f6daedaeb8 -- Gitee From 35022ef2c1ee18a6a086ad352a4dd7d18174e753 Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Wed, 19 Mar 2025 11:16:36 +0000 Subject: [PATCH 190/358] !19178 
clean code Merge pull request !19178 from huangyunlong/2.6cl --- torch_npu/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/__init__.py b/torch_npu/__init__.py index 81db1cd90b..a346cbf673 100644 --- a/torch_npu/__init__.py +++ b/torch_npu/__init__.py @@ -85,8 +85,8 @@ from torch_npu.utils._error_code import ErrCode, pta_error, _except_handler from torch_npu.asd.asd import _asd_patch from torch_npu._C._distributed_c10d import ParallelStore from torch_npu.op_plugin.meta import _meta_registrations -from .version import __version__ as __version__ -from . import _op_plugin_docs +from torch_npu.version import __version__ as __version__ +from torch_npu import _op_plugin_docs del _op_plugin_docs _cann_package_check() -- Gitee From c4b897522320375f1b9d5cde6b37805fd230ebeb Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 19 Mar 2025 13:00:41 +0000 Subject: [PATCH 191/358] !19207 Update op_plugin commit id Merge pull request !19207 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ed50babe55..e73467134b 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ed50babe55fde1fa1038a0b4536de9f6daedaeb8 +Subproject commit e73467134b8d43d52a8b8d0a08ffde2ed8cb14b0 -- Gitee From e616d32ed1fb5bdfdbf1ced95dd703891c72f76e Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Wed, 19 Mar 2025 14:27:20 +0000 Subject: [PATCH 192/358] !19202 Move GetCANNInfo.h/GetCANNInfo.cpp to torch_npu/csrc/core/npu/ Merge pull request !19202 from yuhaiyan/v2.6.0-dev2 --- torch_npu/csrc/{ => core}/npu/GetCANNInfo.cpp | 2 +- torch_npu/csrc/{ => core}/npu/GetCANNInfo.h | 0 torch_npu/csrc/npu/Module.cpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename torch_npu/csrc/{ => core}/npu/GetCANNInfo.cpp (98%) rename torch_npu/csrc/{ => core}/npu/GetCANNInfo.h (100%) diff --git a/torch_npu/csrc/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp similarity index 98% rename from torch_npu/csrc/npu/GetCANNInfo.cpp rename to torch_npu/csrc/core/npu/GetCANNInfo.cpp index ed9255c4a9..84597850ed 100644 --- a/torch_npu/csrc/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -1,5 +1,5 @@ #include -#include "torch_npu/csrc/npu/GetCANNInfo.h" +#include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/interface/AclInterface.h" diff --git a/torch_npu/csrc/npu/GetCANNInfo.h b/torch_npu/csrc/core/npu/GetCANNInfo.h similarity index 100% rename from torch_npu/csrc/npu/GetCANNInfo.h rename to torch_npu/csrc/core/npu/GetCANNInfo.h diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 3a37cd73a1..39f831647f 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -43,7 +43,7 @@ #include "torch_npu/csrc/profiler/msprof_tx.h" #include "torch_npu/csrc/npu/memory_snapshot.h" #include "torch_npu/csrc/core/npu/interface/OpInterface.h" -#include "torch_npu/csrc/npu/GetCANNInfo.h" +#include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "op_plugin/utils/custom_functions/opapi/FFTCommonOpApi.h" struct NPUDeviceProp { -- Gitee From f9107c74b7fcfd8572ad0865cfa034db42e018ab Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 19 Mar 2025 16:30:46 +0000 Subject: [PATCH 193/358] !19221 Update op_plugin commit id Merge pull request !19221 from 
pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index e73467134b..645f5b3f93 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit e73467134b8d43d52a8b8d0a08ffde2ed8cb14b0 +Subproject commit 645f5b3f939fa9cbe8de0654ec0223fbd1ae8339 -- Gitee From ad25fb99ce9aa542d252c099fa40ceab864c2e8d Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 19 Mar 2025 23:18:12 +0000 Subject: [PATCH 194/358] !19212 Update torchair commit id Merge pull request !19212 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 7d6785c75a..a013adee64 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 7d6785c75a31e0d7802d17e4119f3e8d519facfe +Subproject commit a013adee642fa13d97607f0323bb59f828f29649 -- Gitee From 0d4a03c0767085ebc6190514dca46057455d26a6 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Thu, 20 Mar 2025 03:32:11 +0000 Subject: [PATCH 195/358] !19168 Fixed some failed testcases Merge pull request !19168 from yuhaiyan/v2.6.0-dev1 --- test/contrib/test_bbox_coder.py | 15 ++ test/contrib/test_deform_conv.py | 8 +- test/contrib/test_multiclass_nms.py | 4 + test/get_failed_ut_from_log.py | 9 +- test/nn/test_module_hooks.py | 194 ++++++++++++++++-- test/onnx/test_wrapper_onnx_ops.py | 82 +------- .../.pytorch-disabled-tests.json | 3 + 7 files changed, 218 insertions(+), 97 deletions(-) diff --git a/test/contrib/test_bbox_coder.py b/test/contrib/test_bbox_coder.py index 1ac0abf9e2..9e0b9f919d 100644 --- a/test/contrib/test_bbox_coder.py +++ b/test/contrib/test_bbox_coder.py @@ -2,12 +2,27 @@ import unittest import numpy as np import torch import torch_npu +from torch_npu.testing.common_utils import SupportedDevices from torch_npu.testing.testcase import TestCase, run_tests from torch_npu.contrib.function import npu_bbox_coder_encode_yolo, \ npu_bbox_coder_encode_xyxy2xywh, npu_bbox_coder_decode_xywh2xyxy class TestBboxCoder(TestCase): + @SupportedDevices(["Ascend910B"]) + def test_npu_bbox_coder_encode_xyxy2xywh_A2(self): + bboxes = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]], dtype=torch.float32).to("npu") + gt_bboxes = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]], dtype=torch.float32).to("npu") + npuout_1 = npu_bbox_coder_encode_xyxy2xywh(bboxes, gt_bboxes) + npuout_2 = npu_bbox_coder_encode_xyxy2xywh(bboxes / 512., gt_bboxes / 512., is_normalized=True, + normalized_scale=512.) 
+ expect_cpu = torch.tensor([[1.3330, 1.3330, 0.0000, 0.0000], + [1.3330, 0.6665, 0.0000, np.nan]], dtype=torch.float32) + + self.assertRtolEqual(expect_cpu.numpy(), npuout_1.cpu().numpy()) + self.assertRtolEqual(expect_cpu.numpy(), npuout_2.cpu().numpy()) + + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_npu_bbox_coder_encode_xyxy2xywh(self): np.random.seed(123) data1 = np.random.randint(low=0, high=512, size=(6, 4)) diff --git a/test/contrib/test_deform_conv.py b/test/contrib/test_deform_conv.py index 28956eccab..a164926dab 100644 --- a/test/contrib/test_deform_conv.py +++ b/test/contrib/test_deform_conv.py @@ -47,8 +47,8 @@ class TestDeformConv(TestCase): [[-0.1422, -0.2028, -0.1422], [-0.0641, 0.2660, -0.0641], [-0.1422, -0.2028, -0.1422]]]], dtype=torch.float32) - self.assertRtolEqual(expect_cpu_output, output.detach().cpu()) - self.assertRtolEqual(expect_cpu_xgrad, x.grad.cpu()) + self.assertRtolEqual(expect_cpu_output, output.detach().cpu(), prec=1.e-3) + self.assertRtolEqual(expect_cpu_xgrad, x.grad.cpu(), prec=1.e-3) def test_npu_deform_conv_2(self): np.random.seed(546) @@ -100,8 +100,8 @@ class TestDeformConv(TestCase): [-0.1422, -0.2028, -0.1422, -0.2028, -0.1422], [-0.0641, 0.2660, -0.0641, 0.2660, -0.0641], [-0.1422, -0.2028, -0.1422, -0.2028, -0.1422]]]], dtype=torch.float32) - self.assertRtolEqual(expect_cpu_output, output.detach().cpu()) - self.assertRtolEqual(expect_cpu_xgrad, x.grad.cpu()) + self.assertRtolEqual(expect_cpu_output, output.detach().cpu(), prec=1.e-3) + self.assertRtolEqual(expect_cpu_xgrad, x.grad.cpu(), prec=1.e-3) if __name__ == "__main__": diff --git a/test/contrib/test_multiclass_nms.py b/test/contrib/test_multiclass_nms.py index 1b8685c85c..15533da1a8 100644 --- a/test/contrib/test_multiclass_nms.py +++ b/test/contrib/test_multiclass_nms.py @@ -2,12 +2,14 @@ import unittest import numpy as np import torch import torch_npu +from torch_npu.testing.common_utils import SupportedDevices from torch_npu.testing.testcase import TestCase, run_tests from torch_npu.contrib.function import npu_multiclass_nms, \ npu_batched_multiclass_nms class TestMultiClassNms(TestCase): + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_npu_multiclass_nms_1(self): np.random.seed(123) data1 = np.random.randint(low=1, high=255, size=(1000, 4)) @@ -24,6 +26,7 @@ class TestMultiClassNms(TestCase): self.assertRtolEqual(expect_det_bboxes, det_bboxes.cpu()) self.assertRtolEqual(expect_det_labels, det_labels.cpu()) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_npu_multiclass_nms_2(self): np.random.seed(123) data1 = np.random.randn(1000, 4) @@ -40,6 +43,7 @@ class TestMultiClassNms(TestCase): self.assertRtolEqual(expect_det_bboxes, det_bboxes.cpu()) self.assertRtolEqual(expect_det_labels, det_labels.cpu()) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_npu_batched_multiclass_nms_1(self): np.random.seed(339) data1 = np.random.randint(low=1, high=255, size=(4, 200, 80, 4)) diff --git a/test/get_failed_ut_from_log.py b/test/get_failed_ut_from_log.py index 2bde722a51..a2ef794234 100644 --- a/test/get_failed_ut_from_log.py +++ b/test/get_failed_ut_from_log.py @@ -18,19 +18,22 @@ def get_error_or_fail_ut(file): def write_to_json(ut_list=None): - file1 = ".pytorch-disabled-tests.json" + file1 = "unsupported_test_cases/.pytorch-disabled-tests.json" fr = open(file1) content = json.load(fr) + if not ut_list: + return for line in ut_list: content[line] = ["", [""]] with open("./pytorch-disabled-tests.json", mode="w") as fp: fp.write("{\n") length = 
len(content.keys()) - 1 for i, (key, (value1, value2)) in enumerate(content.items()): + value2_str = "\"" + "\",\"".join(value2) + "\"" if i < length: - fp.write(f" \"{key}\": [\"{value1}\", [\"\"]]" + ",\n") + fp.write(f" \"{key}\": [\"{value1}\", [{value2_str}]]" + ",\n") else: - fp.write(f" \"{key}\": [\"{value1}\", [\"\"]]" + "\n") + fp.write(f" \"{key}\": [\"{value1}\", [{value2_str}]]" + "\n") fp.write("}\n") fr.close() diff --git a/test/nn/test_module_hooks.py b/test/nn/test_module_hooks.py index aed673ac84..4071e088b1 100644 --- a/test/nn/test_module_hooks.py +++ b/test/nn/test_module_hooks.py @@ -21,6 +21,7 @@ from torch.testing._internal.common_utils import ( skipIfTorchDynamo, IS_WINDOWS, parametrize as parametrize_test, + swap, instantiate_parametrized_tests ) from torch.testing._internal.common_nn import NNTestCase, _create_basic_net @@ -587,16 +588,22 @@ class TestStateDictHooks(TestCase): self.assertEqual(1, hook_called) hook_called = 0 - m_load._register_load_state_dict_pre_hook(hook_with_module, True) + m_load.register_load_state_dict_pre_hook(hook_with_module) m_load.load_state_dict(m_state_dict) self.assertEqual(2, hook_called) + # Test private API with with_module=True + hook_called = 0 + m_load._register_load_state_dict_pre_hook(hook_with_module, True) + m_load.load_state_dict(m_state_dict) + self.assertEqual(3, hook_called) + def test_no_extra_ref_to_module(self): try: gc.disable() m = nn.Linear(10, 10) - m._register_load_state_dict_pre_hook(_hook_to_pickle, True) + m.register_load_state_dict_pre_hook(_hook_to_pickle) weak_m = weakref.ref(m) del m @@ -606,9 +613,10 @@ class TestStateDictHooks(TestCase): def test_pickled_hook(self): m = nn.Linear(10, 10) - m._register_load_state_dict_pre_hook(_hook_to_pickle, True) + m.register_load_state_dict_pre_hook(_hook_to_pickle) pickle.loads(pickle.dumps(m)) + @swap([True, False]) def test_load_state_dict_module_pre_hook(self): hook_called = 0 @@ -669,12 +677,11 @@ class TestStateDictHooks(TestCase): self.assertEqual(1, hook_called) hook_called = 0 - mod._register_load_state_dict_pre_hook( - mod.my_pre_load_hook_with_module, True - ) + mod.register_load_state_dict_pre_hook(mod.my_pre_load_hook_with_module) m.load_state_dict(state_dict) self.assertEqual(2, hook_called) + @swap([True, False]) def test_load_state_dict_post_hook(self): hook_called = 0 @@ -732,6 +739,7 @@ class TestStateDictHooks(TestCase): self.assertEqual([], ret.unexpected_keys) @unittest.skipIf(IS_WINDOWS, "Tempfile permission issue on windows") + @swap([True, False]) def test_load_state_dict_post_hook_backward_compatibility(self): def my_post_load_hook(mod, _): nonlocal called @@ -747,7 +755,7 @@ class TestStateDictHooks(TestCase): # Note that torch.save / torch.load is not recommended to save/load # modules. torch.save(m, f.name) - m = torch.load(f.name) + m = torch.load(f.name, weights_only=False) m.load_state_dict(sd) self.assertFalse(called) @@ -756,6 +764,146 @@ class TestStateDictHooks(TestCase): m.load_state_dict(sd) self.assertTrue(called) + def _test_register_state_dict_pre_hook(self, model, submodule): + _state_dict_prefix = "foo." + state_dict_pre_hook_count = 0 + keep_var_setting = False + + def my_state_dict_pre_hook(module, prefix, keep_vars): + self.assertEqual(keep_vars, keep_var_setting) + nonlocal state_dict_pre_hook_count + state_dict_pre_hook_count += 1 + self.assertTrue(prefix.startswith(_state_dict_prefix)) + + model.register_state_dict_pre_hook(my_state_dict_pre_hook) + # Test to ensure submodules run the hook as well. 
+ submodule.register_state_dict_pre_hook(my_state_dict_pre_hook) + + def check_results(model): + nonlocal state_dict_pre_hook_count, keep_var_setting + for keep_var_setting in [True, False]: + _ = model.state_dict( + prefix=_state_dict_prefix, keep_vars=keep_var_setting + ) + self.assertEqual(2, state_dict_pre_hook_count) + state_dict_pre_hook_count = 0 + + # Test state dict works as expected after model construction + check_results(model) + # Test state dict works as expected after forward + model(torch.ones(10, 3)) + check_results(model) + + def test_register_state_dict_pre_hook(self): + class MyModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.a = nn.Sequential( + nn.Linear(3, 3), nn.Linear(3, 3), nn.Linear(3, 3) + ) + + def forward(self, x): + return self.a(x) + + mod = MyModule() + self._test_register_state_dict_pre_hook(mod, mod.a) + + def test_register_state_dict_pre_hook_lazy_module(self): + class MyLazyModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.layer1 = nn.LazyLinear(8) + self.layer2 = nn.LazyLinear(5) + + def forward(self, x): + return self.layer2(self.layer1(x)) + + mod = MyLazyModule() + self._test_register_state_dict_pre_hook(mod, mod.layer1) + + @unittest.skipIf(IS_WINDOWS, "Tempfile permission issue on windows") + def test_register_state_dict_pre_hook_backward_compat(self): + called = False + + def my_state_dict_pre_hook(*args, **kwargs): + nonlocal called + called = True + + m = nn.Linear(1, 1) + self.assertTrue(hasattr(m, "_state_dict_pre_hooks")) + delattr(m, "_state_dict_pre_hooks") + # Save and load, ensure we can still call state_dict + # without running into issues. + with NamedTemporaryFile() as f: + # Note that torch.save / torch.load is not recommended + # to save / load modules. 
+ torch.save(m, f.name) + # weights_only=False as this is legacy code that saves the model + m = torch.load(f.name, weights_only=False) + + # Ensure we can run state_dict without issues + _ = m.state_dict() + self.assertFalse(called) + m.register_state_dict_pre_hook(my_state_dict_pre_hook) + _ = m.state_dict() + self.assertTrue(called) + + @parametrize_test("private", [True, False]) + def test_register_state_dict_post_hook(self, private): + m = nn.Transformer( + d_model=4, nhead=2, num_encoder_layers=2, num_decoder_layers=2 + ) + + def linear_state_dict_post_hook(module, state_dict, prefix, local_metadata): + for name, param in module.named_parameters(recurse=False): + state_dict[prefix + name] = torch.nn.Parameter( + state_dict[prefix + name] + ) + + def register_linear_hook(module): + if isinstance(module, nn.Linear): + hook_registration_fn = ( + module._register_state_dict_hook + if private + else module.register_state_dict_post_hook + ) + hook_registration_fn(linear_state_dict_post_hook) + + def _check_sd(state_dict): + for k, v in m.state_dict().items(): + if "linear" in k or "out_proj" in k: + self.assertTrue(isinstance(v, torch.nn.Parameter)) + else: + self.assertFalse(isinstance(v, torch.nn.Parameter)) + + # verify that return type of hook registered on child submodules has no effect + # regardless of whether using public or private API + m.apply(register_linear_hook) + _check_sd(m.state_dict()) + + # verify that return type of hook registered root module has no effect + # for public API but has effect for private API + hook_registration_fn = ( + m._register_state_dict_hook if private else m.register_state_dict_post_hook + ) + + def fn(m, s, p, l): + return OrderedDict() + + handle = hook_registration_fn(fn) + if private: + self.assertFalse(hasattr(fn, "_from_public_api")) + self.assertTrue(len(m.state_dict()) == 0) + else: + self.assertTrue(hasattr(fn, "_from_public_api")) + with self.assertRaisesRegex( + RuntimeError, "state_dict post-hook must return None" + ): + sd = m.state_dict() + with self.assertRaisesRegex( + RuntimeError, "previously registered via register_state_dict_post_hook" + ): + m._register_state_dict_hook(fn) class TestModuleGlobalHooks(TestCase): @@ -1039,6 +1187,27 @@ class TestModuleGlobalHooks(TestCase): output.backward(torch.ones(5, 5), retain_graph=True) self.assertTrue(local_backward_called and global_backward_called) + @skipIfTorchDynamo("TorchDynamo does not work well with hooks") + def test_module_global_hooks_with_kwargs(self): + def kwarg_global_forward_hook( + module: nn.Module, + args: Tuple[torch.Tensor], + kwargs: Dict[str, Any], + out: torch.Tensor, + ) -> Any: + out = out + kwargs["bias"] + return out + + model = KwargModel() + nn.modules.module.register_module_forward_hook( + kwarg_global_forward_hook, + with_kwargs=True, + ) + x: torch.Tensor = torch.randn(10, 20) + bias: torch.Tensor = torch.randn(10, 20) + out = model(x, bias=bias) + self.assertEqual(out, x + 2 * bias, rtol=0, atol=1e-5) + class TestModuleHookNN(NNTestCase): _do_cuda_memory_leak_check = True @@ -1266,7 +1435,7 @@ class TestModuleHookNN(NNTestCase): try: mod(inp.detach(), inp) except Exception as ex: - self.fail("Unexpected exception: %s" % ex) + self.fail(f"Unexpected exception: {ex}") def test_hook_extra_input(self): class MyModule(nn.Module): @@ -1349,7 +1518,7 @@ class TestModuleHookNN(NNTestCase): m = MyModule() m.register_backward_hook(noop) - with self.assertWarnsRegex(UserWarning, "does not take as input a single Tensor or a tuple of Tensors"): + with 
self.assertWarnsRegex(FutureWarning, "does not take as input a single Tensor or a tuple of Tensors"): m([a, b]) # Check invalid output container @@ -1360,7 +1529,7 @@ class TestModuleHookNN(NNTestCase): m = MyModule() m.register_backward_hook(noop) - with self.assertWarnsRegex(UserWarning, "does not return a single Tensor or a tuple of Tensors"): + with self.assertWarnsRegex(FutureWarning, "does not return a single Tensor or a tuple of Tensors"): m(a, b) # Check invalid output from different Nodes @@ -1371,7 +1540,7 @@ class TestModuleHookNN(NNTestCase): m = MyModule() m.register_backward_hook(noop) - with self.assertWarnsRegex(UserWarning, "outputs are generated by different autograd Nodes"): + with self.assertWarnsRegex(FutureWarning, "outputs are generated by different autograd Nodes"): m(a, b) # Check invalid forward with multiple Nodes @@ -1382,7 +1551,7 @@ class TestModuleHookNN(NNTestCase): m = MyModule() m.register_backward_hook(noop) - with self.assertWarnsRegex(UserWarning, "the forward contains multiple autograd Nodes"): + with self.assertWarnsRegex(FutureWarning, "the forward contains multiple autograd Nodes"): m(a) def test_hook_backward_size(self): @@ -1499,6 +1668,7 @@ class TestModuleHookNN(NNTestCase): instantiate_parametrized_tests(TestModuleHooks) +instantiate_parametrized_tests(TestStateDictHooks) if __name__ == "__main__": run_tests() diff --git a/test/onnx/test_wrapper_onnx_ops.py b/test/onnx/test_wrapper_onnx_ops.py index b73d04f74d..ee9c5868bc 100644 --- a/test/onnx/test_wrapper_onnx_ops.py +++ b/test/onnx/test_wrapper_onnx_ops.py @@ -138,6 +138,7 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_wrapper_npu_batch_nms(self): class Model(torch.nn.Module): def __init__(self): @@ -202,34 +203,6 @@ class TestOnnxOps(TestCase): assert(os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - - def test_wrapper_npu_fused_attention_score(self): - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - - def forward(self, query_layer, key_layer, value_layer, attention_mask): - scale = 0.125 - keep_prob = 1 - return torch_npu.npu_fused_attention_score(query_layer, key_layer, - value_layer, attention_mask, scale, keep_prob) - - def export_onnx(onnx_model_name): - q = torch.rand(24, 16, 512, 64).uniform_(-3, 3).npu().half() - k = torch.rand(24, 16, 512, 64).uniform_(-3, 3).npu().half() - v = torch.rand(24, 16, 512, 64).uniform_(-3, 3).npu().half() - mask = torch.ones(512) * -10000. - mask[:6] = -0. 
- mask = mask.expand(24, 1, 512, 512).npu().half() - model = Model().to("npu") - model(q, k, v, mask) - self.onnx_export(model, (q, k, v, mask), onnx_model_name, ["q", "k", "v", "mask"]) - - onnx_model_name = "model_npu_fused_attention_score.onnx" - export_onnx(onnx_model_name) - assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, - onnx_model_name))) - def test_wrapper_npu_multi_head_attention(self): class Model(torch.nn.Module): def __init__(self): @@ -509,6 +482,7 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_wrapper_npu_ifmr(self): class Model(torch.nn.Module): def __init__(self): @@ -538,32 +512,6 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - def test_wrapper_npu_fused_attention_score_fwd(self): - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - - def forward(self, q, k, v, mask): - return torch_npu.npu_fused_attention_score_fwd(q, k, v, mask, 0.125, 1) - - def export_onnx(onnx_model_name): - q = torch.rand(24, 16, 512, 64).uniform_(-3, 3).half().npu() - k = torch.rand(24, 16, 512, 64).uniform_(-3, 3).half().npu() - v = torch.rand(24, 16, 512, 64).uniform_(-3, 3).half().npu() - mask = torch.ones(512) * -10000. - mask[:6] = -0. - mask = mask.expand(24, 1, 512, 512).half().npu() - - model = Model().to("npu") - model(q, k, v, mask) - self.onnx_export(model, (q, k, v, mask), onnx_model_name, - ["q", "k", "v", "mask"], ["out1", "out2", "out3"]) - - onnx_model_name = "model_npu_fused_attention_score_fwd.onnx" - export_onnx(onnx_model_name) - assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, - onnx_model_name))) - def test_wrapper_npu_sign_bits_unpack(self): class Model(torch.nn.Module): def __init__(self): @@ -957,28 +905,6 @@ class TestOnnxOps(TestCase): assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, onnx_model_name))) - def test_wrapper_npu_scatter(self): - class Model(torch.nn.Module): - def __init__(self): - super(Model, self).__init__() - - def forward(self, input_, indices, updates): - return torch_npu.npu_scatter(input_, indices, updates, 0) - - def export_onnx(onnx_model_name): - input_ = torch.tensor([[1.6279, 0.1226], [0.9041, 1.0980]]).npu() - indices = torch.tensor([0, 1], dtype=torch.int32).npu() - updates = torch.tensor([-1.1993, -1.5247]).npu() - model = Model().to("npu") - model(input_, indices, updates) - self.onnx_export(model, (input_, indices, updates), - onnx_model_name, ["input_", "indices", "updates"]) - - onnx_model_name = "model_npu_scatter.onnx" - export_onnx(onnx_model_name) - assert (os.path.isfile(os.path.join(TestOnnxOps.test_onnx_path, - onnx_model_name))) - def test_wrapper_npu_lstm_cell(self): class Model(torch.nn.Module): def __init__(self): @@ -1261,7 +1187,7 @@ class TestOnnxOps(TestCase): def export_onnx(onnx_model_name): x = torch.rand(10, 1024).uniform_(-3, 3).npu().half() - gamma = torch.rand(10).uniform_(-3, 3).npu().half() + gamma = torch.rand(1024).uniform_(-3, 3).npu().half() model = Model().to("npu") model(x, gamma) self.onnx_export(model, (x, gamma), onnx_model_name) @@ -1423,7 +1349,7 @@ class TestOnnxOps(TestCase): super().__init__() def forward(self, x, weight, antiquant_scale, antiquant_offset, quant_scale, quant_offset, bias, antiquant_group_size): - return torch_npu.npu_weight_quant_batchmatmul(x, weight, antiquant_scale, antiquant_offset, quant_scale, quant_offset, 
bias, antiquant_group_size) + return torch_npu.npu_weight_quant_batchmatmul(x, weight, antiquant_scale, antiquant_offset, quant_scale, quant_offset, bias, 0) def export_onnx(onnx_model_name): x = torch.randn((8192, 320), dtype=torch.bfloat16).npu() diff --git a/test/unsupported_test_cases/.pytorch-disabled-tests.json b/test/unsupported_test_cases/.pytorch-disabled-tests.json index 66c1d7e75a..27f2c9a5fa 100644 --- a/test/unsupported_test_cases/.pytorch-disabled-tests.json +++ b/test/unsupported_test_cases/.pytorch-disabled-tests.json @@ -16,6 +16,9 @@ "test_nn_module_tests (__main__.TestComplexity)": ["", [""]], "test_not_import_sympy (main.TestImports)": ["", [""]], "test_no_warning_on_import (main.TestImports)": ["", [""]], + "test_output_match_nn_functional_batch_norm_cpu_float32 (__main__.TestOnnxModelOutputConsistency_opset18CPU)": ["", [""]], + "test_dispatch_overload_fall_back_default_raise_diagnostic_warning (__main__.TestFxToOnnx)": ["", [""]], + "test_fake_autocast_index_add_npu_float32 (__main__.TestFakeTensorPRIVATEUSE1)": ["", [""]], "test_no_mutate_global_logging_on_import_path_torch (main.TestImports)": ["", [""]], "test_no_mutate_global_logging_on_import_path_functorch (main.TestImports)": ["", [""]], "test_lazy_imports_are_lazy (main.TestImports)": ["", [""]], -- Gitee From 99b2c11c6a073dfb32f2f83a9ba124b664cb14d4 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 20 Mar 2025 05:30:45 +0000 Subject: [PATCH 196/358] !19240 Update op_plugin commit id Merge pull request !19240 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 645f5b3f93..06101671fe 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 645f5b3f939fa9cbe8de0654ec0223fbd1ae8339 +Subproject commit 06101671fe5556f55c06e466a2f101a6adc4bcf6 -- Gitee From c894a620a6ef5017b81b33b46ec6186a87ebbc43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=A4=8F=E5=A4=8F?= Date: Thu, 20 Mar 2025 07:09:05 +0000 Subject: [PATCH 197/358] =?UTF-8?q?!19230=20Skip=20test=5Fgrad=5Fscaling?= =?UTF-8?q?=5Funscale.=20Merge=20pull=20request=20!19230=20from=20?= =?UTF-8?q?=E7=8E=8B=E5=A4=8F=E5=A4=8F/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test_amp_foreach_non_finite_check_and_unscale.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/custom_ops/test_amp_foreach_non_finite_check_and_unscale.py b/test/custom_ops/test_amp_foreach_non_finite_check_and_unscale.py index dc2511a369..9ee8542dbd 100644 --- a/test/custom_ops/test_amp_foreach_non_finite_check_and_unscale.py +++ b/test/custom_ops/test_amp_foreach_non_finite_check_and_unscale.py @@ -67,4 +67,6 @@ class TestAmpForeachNonFiniteCheckAndUnscale(TestCase): if __name__ == "__main__": - run_tests() + device_name = torch_npu.npu.get_device_name(0)[:10] + if device_name in ["Ascend910A"]: + run_tests() -- Gitee From 41adce36541d3829a301595e8604d34c459ee5e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Thu, 20 Mar 2025 07:15:37 +0000 Subject: [PATCH 198/358] =?UTF-8?q?!19190=20SilentCheckV3:=20Modify=20to?= =?UTF-8?q?=20use=20inf-norm=20method=20Merge=20pull=20request=20!19190=20?= =?UTF-8?q?from=20=E7=8E=8B=E8=B6=85/v2.6.0=5Fsilentperf3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/asd/asd.py | 2 +- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/asd/asd.py b/torch_npu/asd/asd.py index 2892be1140..bae88bf708 100644 --- a/torch_npu/asd/asd.py +++ b/torch_npu/asd/asd.py @@ -141,7 +141,7 @@ class _SilentFaultDetectorV3: if grad.dtype != torch.bfloat16 and grad.dtype != torch.float32: return - val = grad.pow(2).max().view(-1) + val = torch.norm(grad, float('inf')).pow(2).view(-1) if idx not in self.silent_data_dict: self.silent_data_dict[idx] = SilentFaultDataV3() diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index de31e9444a..d07b3ea3b0 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2234,7 +2234,7 @@ void ProcessGroupHCCL::silenceCheck(at::Tensor &input, c10d::OpType opType) } } if (c10_npu::opapi::IsExistAclnnSilentCheckV2()) { - at::Tensor val = input.detach().pow(2).max().view(-1); + at::Tensor val = at::norm(input, std::numeric_limits::infinity()).pow(2).view(-1); at::Tensor max; if (silenceCheckCache_.find(opType) == silenceCheckCache_.end()) { at::Tensor stepTensor = at::zeros({1}, input.options().dtype(at::kLong)); -- Gitee From c344339f3db58b1f9a26c097db22b565b8cca382 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Thu, 20 Mar 2025 07:47:15 +0000 Subject: [PATCH 199/358] !19234 [5/N] cleancode (torch_npu/csrc/aten) Merge pull request !19234 from dilililiwhy/cleancode_aten_260_part5 --- torch_npu/csrc/aten/NPUGeneratorImpl.h | 2 +- .../csrc/aten/common/FormatCastKernelNpu.cpp | 48 +++++++++---------- .../csrc/aten/common/TensorFactories.cpp | 4 +- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/torch_npu/csrc/aten/NPUGeneratorImpl.h b/torch_npu/csrc/aten/NPUGeneratorImpl.h index 145b349875..8e07ff845e 100644 --- a/torch_npu/csrc/aten/NPUGeneratorImpl.h +++ b/torch_npu/csrc/aten/NPUGeneratorImpl.h @@ -120,7 +120,7 @@ struct PhiloxNpuState { struct TORCH_NPU_API NPUGeneratorImpl : public c10::GeneratorImpl { // Constructors NPUGeneratorImpl(c10::DeviceIndex device_index = -1); - ~NPUGeneratorImpl() = default; + ~NPUGeneratorImpl() override = default; // NPUGeneratorImpl methods std::shared_ptr clone() const; diff --git a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp index b5838f5cbb..0c4000e524 100644 --- a/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp +++ b/torch_npu/csrc/aten/common/FormatCastKernelNpu.cpp @@ -35,22 +35,22 @@ at::Tensor format_cast_impl_out_npu(at::Tensor& dst, const at::Tensor& src) return dst; } -// convert src from src_format to dst_format, write the result into dst -at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& dst, const at::Tensor& src) +// convert src from src_format to dst_format, write the result into dst(self) +at::Tensor& NPUNativeFunctions::npu_format_cast_(at::Tensor& self, const at::Tensor& src) { - torch_npu::utils::torch_check_npu(dst); + torch_npu::utils::torch_check_npu(self); torch_npu::utils::torch_check_npu(src); auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; - auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_; + auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; if (src_desc.npu_format_ == dst_desc.npu_format_) { - dst.copy_(src); - return dst; + self.copy_(src); + return self; } // calculate the output result of the NPU - format_cast_impl_out_npu(dst, src); + format_cast_impl_out_npu(self, src); - return dst; + return self; } // 
conver self to acl_format, write the result into new result tensor @@ -83,48 +83,48 @@ at::Tensor npu_format_cast_impl( // conver self to dst'format, write the result into new result tensor at::Tensor NPUNativeFunctions::npu_format_cast( - const at::Tensor& src, + const at::Tensor& self, const at::Tensor& dst) { torch_npu::utils::torch_check_npu(dst); auto dst_desc = torch_npu::NPUBridge::GetNpuStorageImpl(dst)->npu_desc_; int64_t dst_format = dst_desc.npu_format_; - return custom_ops::npu_format_cast(src, dst_format); + return custom_ops::npu_format_cast(self, dst_format); } // conver self to acl_format, write the result into self at::Tensor& NPUNativeFunctions::npu_format_cast_( - at::Tensor& src, + at::Tensor& self, int64_t acl_format) { - torch_npu::utils::torch_check_npu(src); - auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; + torch_npu::utils::torch_check_npu(self); + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; if (src_desc.npu_format_ == acl_format) { - return src; + return self; } - if (FormatHelper::IsBaseFormatType(src) && + if (FormatHelper::IsBaseFormatType(self) && FormatHelper::IsBaseFormatType(static_cast(acl_format))) { - FormatCastHelper::format_cast_as_base_format(src, static_cast(acl_format)); - return src; + FormatCastHelper::format_cast_as_base_format(self, static_cast(acl_format)); + return self; } at::Tensor dst = OpPreparation::ApplyTensorWithFormat( - src_desc.base_sizes_, src.options(), acl_format); + src_desc.base_sizes_, self.options(), acl_format); // calculate the output result of the NPU - format_cast_impl_out_npu(dst, src); + format_cast_impl_out_npu(dst, self); // format cast only change physical layout of base tensor and view tensor's // metadata remain unchanged - src.set_(dst.storage(), src.storage_offset(), src.sizes(), src.strides()); + self.set_(dst.storage(), self.storage_offset(), self.sizes(), self.strides()); - return src; + return self; } -int64_t NPUNativeFunctions::get_npu_format(const at::Tensor& src) +int64_t NPUNativeFunctions::get_npu_format(const at::Tensor& self) { - torch_npu::utils::torch_check_npu(src); - auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(src)->npu_desc_; + torch_npu::utils::torch_check_npu(self); + auto src_desc = torch_npu::NPUBridge::GetNpuStorageImpl(self)->npu_desc_; return src_desc.npu_format_; } diff --git a/torch_npu/csrc/aten/common/TensorFactories.cpp b/torch_npu/csrc/aten/common/TensorFactories.cpp index 53bbb70ba2..0c37c0d3c4 100644 --- a/torch_npu/csrc/aten/common/TensorFactories.cpp +++ b/torch_npu/csrc/aten/common/TensorFactories.cpp @@ -281,7 +281,7 @@ at::Tensor NPUNativeFunctions::empty_with_format( c10_npu::NPUGuard guard_(device_); c10::Allocator *allocator = c10_npu::NPUCachingAllocator::get(); // when the shape and format are not match, fix format here. - aclFormat format = InferFormat::GuessStorageFormat(size, (aclFormat)dst_format); + aclFormat format = InferFormat::GuessStorageFormat(size, static_cast(dst_format)); auto dtype = c10::scalarTypeToTypeMeta(dtype_or_default(dtype_opt)); int64_t nelements = StorageDescHelper::GetMemorySize(size, format, dtype); int64_t size_bytes = nelements * dtype.itemsize(); @@ -472,7 +472,7 @@ at::Tensor NPUNativeFunctions::bartlett_window( window_length += 1; } auto window = at::arange(window_length, options).mul_(2. 
/ static_cast(window_length - 1)); - const int64_t first_half_size = ((uint64_t)(window_length - 1) >> 1) + 1; + const int64_t first_half_size = (static_cast(window_length - 1) >> 1) + 1; window.narrow(0, first_half_size, window_length - first_half_size).mul_(-1).add_(2); return periodic ? window.narrow(0, 0, window_length - 1) : window; } -- Gitee From 046170c0fc67746625de3ecfac1ad0dd9154b0f1 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Thu, 20 Mar 2025 07:48:39 +0000 Subject: [PATCH 200/358] !17965 TORCH MAIN SYNC : Change default value of weight_only(torch.load) to True Merge pull request !17965 from dilililiwhy/weights_only_issue --- test/npu/test_serialization.py | 12 ++--- test/torch_npu_schema.json | 2 +- torch_npu/utils/serialization.py | 91 ++++++++++++++++++++++++++------ 3 files changed, 83 insertions(+), 22 deletions(-) diff --git a/test/npu/test_serialization.py b/test/npu/test_serialization.py index a5d8afea5d..af2a43f9ae 100644 --- a/test/npu/test_serialization.py +++ b/test/npu/test_serialization.py @@ -88,10 +88,10 @@ class TestSerialization(TestCase): with tempfile.TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, 'data.pt') torch.serialization.save(x, path, _use_new_zipfile_serialization=False) - x_loaded = torch.load(path, map_location="npu:0") + x_loaded = torch.load(path, map_location="npu:0", weights_only=False) self.assertExpectedInline(f'{x_loaded.device.type}:{x_loaded.device.index}', 'npu:0') self.assertRtolEqual(x, x_loaded.cpu()) - x_loaded = torch.load(path, map_location=torch.device("npu:0")) + x_loaded = torch.load(path, map_location=torch.device("npu:0"), weights_only=False) self.assertExpectedInline(f'{x_loaded.device.type}:{x_loaded.device.index}', 'npu:0') self.assertRtolEqual(x, x_loaded.cpu()) @@ -181,7 +181,7 @@ class TestSerialization(TestCase): with tempfile.TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, 'data.pt') torch.save((x, model, number), path) - x_loaded, model_loaded, number_loaded = torch.load(path) + x_loaded, model_loaded, number_loaded = torch.load(path, weights_only=False) self.assertRtolEqual(x.cpu(), x_loaded.cpu()) self.assertExpectedInline(str(model), str(model_loaded)) self.assertTrue(number, number_loaded) @@ -205,7 +205,7 @@ class TestSerialization(TestCase): with tempfile.TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, 'data.pt') torch.save(args, path) - args_loaded = torch.load(path) + args_loaded = torch.load(path, weights_only=False) self.assertTrue(args, args_loaded) def test_serialization_model(self): @@ -213,7 +213,7 @@ class TestSerialization(TestCase): path = os.path.join(tmpdir, 'data.pt') model = NpuMNIST().npu() torch.save(model, path) - loaded_model = torch.load(path) + loaded_model = torch.load(path, weights_only=False) self.assertExpectedInline(str(model), str(loaded_model)) def test_serialization_weight_norm(self): @@ -221,7 +221,7 @@ class TestSerialization(TestCase): path = os.path.join(tmpdir, 'data.pt') model = WN(2, 4).npu() torch.save(model, path) - loaded_model = torch.load(path) + loaded_model = torch.load(path, weights_only=False) self.assertExpectedInline(str(model), str(loaded_model)) def test_model_storage_ptr(self): diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index ad0e788829..b15838b525 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2415,7 +2415,7 @@ "signature": "(cls)" }, "torch_npu.utils.serialization.load": { - "signature": "(f: Union[str, os.PathLike, BinaryIO, IO[bytes]], map_location: 
Union[Callable[[torch.types.Storage, str], torch.types.Storage], torch.device, str, Dict[str, str], NoneType] = None, pickle_module: Any = None, *, weights_only: bool = False, mmap: Optional[bool] = None, **pickle_load_args: Any) -> Any" + "signature": "(f: Union[str, os.PathLike, BinaryIO, IO[bytes]], map_location: Union[Callable[[torch.types.Storage, str], torch.types.Storage], torch.device, str, Dict[str, str], NoneType] = None, pickle_module: Any = None, *, weights_only: Optional[bool] = None, mmap: Optional[bool] = None, **pickle_load_args: Any) -> Any" }, "torch_npu.utils.serialization.save": { "signature": "(obj: object, f: Union[str, os.PathLike, BinaryIO, IO[bytes]], pickle_module: Any = , pickle_protocol: int = 2, _use_new_zipfile_serialization: bool = True, _disable_byteorder_record: bool = False) -> None" diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index 3eb1e40db2..2a2c9a38c7 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -1,11 +1,13 @@ import os import pickle +import re from typing import Any, Optional import torch from torch.serialization import _check_dill_version, _open_file_like, _is_zipfile, \ _open_zipfile_reader, _is_torchscript_zip, _weights_only_unpickler, \ _legacy_load, _load, FILE_LIKE, MAP_LOCATION, DEFAULT_PROTOCOL +from torch.serialization import _default_to_weights_only, UNSAFE_MESSAGE import torch_npu from torch_npu.utils._error_code import ErrCode, pta_error @@ -130,20 +132,72 @@ def load( map_location: MAP_LOCATION = None, pickle_module: Any = None, *, - weights_only: bool = False, + weights_only: Optional[bool] = None, mmap: Optional[bool] = None, **pickle_load_args: Any ) -> Any: _update_cpu_remap_info(map_location) torch._C._log_api_usage_once("torch.load") - UNSAFE_MESSAGE = ( - "Weights only load failed. Re-running `torch.load` with `weights_only` set to `False`" - " will likely succeed, but it can result in arbitrary code execution." - "Do it only if you get the file from a trusted source. WeightsUnpickler error: " + DOCS_MESSAGE = ( + "\n\nCheck the documentation of torch.load to learn more about types accepted by default with weights_only." ) - # Add ability to force safe only weight loads via environment variable - if os.getenv("TORCH_FORCE_WEIGHTS_ONLY_LOAD", "0").lower() in ['1', 'y', 'yes', 'true']: + + def _get_wo_message(message: str) -> str: + unsafe_global_pattern = r"GLOBAL (\S+) was not an allowed global by default." + has_unsafe_global = re.search(unsafe_global_pattern, message) is not None + blocklist_pattern = r"whose module (\S+) is blocked" + has_blocklist = re.search(blocklist_pattern, message) is not None + import_pattern = r"(\S+) must be (\S+) to load" + has_import = re.search(import_pattern, message) is not None + if has_unsafe_global: + updated_message = ( + "Weights only load failed. This file can still be loaded, to do so you have two options, " + "\033[1mdo those steps only if you trust the source of the checkpoint\033[0m. " + f"\n\t(1) {UNSAFE_MESSAGE}\n\t(2) Alternatively, to load with `weights_only=True` please check " + "the recommended steps in the following error message.\n\tWeightsUnpickler error: " + + message + ) + else: + if has_import: + return f"Weights only load failed. {message}\n {UNSAFE_MESSAGE}\n" + else: + updated_message = f"Weights only load failed. 
{UNSAFE_MESSAGE}\n" + if not has_blocklist: + updated_message += ( + "Please file an issue with the following so that we can make " + "`weights_only=True` compatible with your use case: WeightsUnpickler error: " + ) + updated_message += message + return updated_message + DOCS_MESSAGE + + weights_only_not_set = weights_only is None + if weights_only_not_set: + weights_only = _default_to_weights_only(pickle_module) + + true_values = ["1", "y", "yes", "true"] + # Add ability to force safe only or non-safe weight loads via environment variables + force_weights_only_load = ( + os.getenv("TORCH_FORCE_WEIGHTS_ONLY_LOAD", "0") in true_values + ) + force_no_weights_only_load = ( + os.getenv("TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD", "0") in true_values + ) + + if force_weights_only_load and force_no_weights_only_load: + raise RuntimeError( + "Only one of `TORCH_FORCE_WEIGHTS_ONLY_LOAD` or `TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD` " + "should be set, but both were set." + pta_error(ErrCode.PARAM) + ) + elif force_weights_only_load: weights_only = True + elif force_no_weights_only_load: + # TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD can only override if callsite did not explicitly set weights_only + if weights_only_not_set: + print( + "Environment variable TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD detected, since the" + "`weights_only` argument was not explicitly passed to `torch.load`, forcing weights_only=False." + ) + weights_only = False if weights_only: if pickle_module is not None: @@ -169,6 +223,11 @@ def load( if _is_torchscript_zip(opened_zipfile): print(f"Warning: 'torch.load' received a zip file that looks like a TorchScript archive" " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to silence this warning)") + if weights_only: + raise RuntimeError( + "Cannot use ``weights_only=True`` with TorchScript archives passed to " + "``torch.load``. " + UNSAFE_MESSAGE + pta_error(ErrCode.PARAM) + ) opened_file.seek(orig_position) return torch.jit.load(opened_file, map_location=map_location) if mmap: @@ -182,24 +241,26 @@ def load( return _load(opened_zipfile, map_location, _weights_only_unpickler, overall_storage=overall_storage, **pickle_load_args) except RuntimeError as e: - raise pickle.UnpicklingError(UNSAFE_MESSAGE + str(e) + pta_error(ErrCode.SYSCALL)) from None + raise pickle.UnpicklingError(_get_wo_message(str(e)) + pta_error(ErrCode.SYSCALL)) from None return _load(opened_zipfile, map_location, pickle_module, overall_storage=overall_storage, **pickle_load_args) else: if mmap: - raise RuntimeError("mmap can only be used with files saved with `torch.save(_use_new_zipfile_serialization=True), ", - "please torch.save your checkpoint with this option in order to use mmap." + - pta_error(ErrCode.PARAM)) + raise RuntimeError("mmap can only be used with files saved with " + "`torch.save(_use_new_zipfile_serialization=True), " + "please torch.save your checkpoint with this option in order to use mmap." 
+ + pta_error(ErrCode.PARAM)) if weights_only: try: return _legacy_load(opened_file, map_location, _weights_only_unpickler, **pickle_load_args) except RuntimeError as e: - raise pickle.UnpicklingError(UNSAFE_MESSAGE + str(e) + pta_error(ErrCode.SYSCALL)) from None + raise pickle.UnpicklingError(_get_wo_message(str(e)) + pta_error(ErrCode.SYSCALL)) from None warn_massage = ( - "Warning: since the loaded file is not a zipfile, only \"torch.device\" and \"str\" type parameters are currently supported for parameter types of map_location" - "If parameter types of map_location is \"Callable[[torch.Tensor, str], torch.Tensor]\" or \"Dict[str, str]\", which is only support for zipfile," - "all tensors are currently loaded onto the CPU, which may introduce problems" + "Warning: since the loaded file is not a zipfile, only \"torch.device\" and \"str\" type parameters " + "are currently supported for parameter types of map_location. If parameter types of map_location is " + "\"Callable[[torch.Tensor, str], torch.Tensor]\" or \"Dict[str, str]\", which is only support for " + "zipfile, all tensors are currently loaded onto the CPU, which may introduce problems." ) _warn_legacy_serialization(warn_massage, "load") -- Gitee From a480893f2b9218a37ae44b2830e0a7459bb5c9f9 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 20 Mar 2025 09:15:43 +0000 Subject: [PATCH 201/358] !19248 Update op_plugin commit id Merge pull request !19248 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 06101671fe..e5151bc5a7 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 06101671fe5556f55c06e466a2f101a6adc4bcf6 +Subproject commit e5151bc5a70ad2dd6b9af15b54170a77c468b656 -- Gitee From 6f4796ffb9aeabcb02fc3f8340803e87ba89f426 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 20 Mar 2025 11:00:43 +0000 Subject: [PATCH 202/358] !19258 Update op_plugin commit id Merge pull request !19258 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index e5151bc5a7..04d5a5e0a4 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit e5151bc5a70ad2dd6b9af15b54170a77c468b656 +Subproject commit 04d5a5e0a459f6bb33937c4090b5c83d35c816a6 -- Gitee From ef114472b91adde296a1be74445697753b6c69a3 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Thu, 20 Mar 2025 12:24:19 +0000 Subject: [PATCH 203/358] !19165 [cleancode]npu Merge pull request !19165 from SCh-zx/v2.6.0 --- torch_npu/csrc/npu/Graph.cpp | 2 +- torch_npu/csrc/npu/Module.cpp | 8 ++++---- torch_npu/csrc/npu/Stream.h | 3 ++- torch_npu/csrc/npu/memory_snapshot.cpp | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/torch_npu/csrc/npu/Graph.cpp b/torch_npu/csrc/npu/Graph.cpp index b5d24efcfa..27e9174740 100644 --- a/torch_npu/csrc/npu/Graph.cpp +++ b/torch_npu/csrc/npu/Graph.cpp @@ -27,7 +27,7 @@ void TORCH_NPU_API THNPGraph_init(PyObject* module) { std::string capture_error_mode) { aclmdlCaptureMode capture_mode; c10_npu::MempoolId_t pool = pool_opt.has_value() - ? pool_opt.value() :c10_npu::MempoolId_t{0, 0}; + ? 
pool_opt.value() : c10_npu::MempoolId_t{0, 0}; if (capture_error_mode == "global") { capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_GLOBAL; } else if (capture_error_mode == "thread_local") { diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 39f831647f..0ca1415ad2 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -480,7 +480,7 @@ PyObject* THNPModule_getDeviceUtilizationRate_wrap(PyObject* self, PyObject* dev } else if (cube != DEVICE_UTILIZATION_NOT_SUPPORT && vector != DEVICE_UTILIZATION_NOT_SUPPORT) { util_rate = (cube + vector) / 2; } - TORCH_CHECK(util_rate <=100 && util_rate >= 0, "invalid result to util_rate", PTA_ERROR(ErrCode::VALUE)); + TORCH_CHECK(util_rate <= 100 && util_rate >= 0, "invalid result to util_rate", PTA_ERROR(ErrCode::VALUE)); return PyLong_FromLong(util_rate); END_HANDLE_TH_ERRORS } @@ -852,7 +852,7 @@ PyObject* THNPModule_memorySnapshot(PyObject* _unused, PyObject* noargs) to_gather_dest.emplace_back(trace_entry); } trace_entry[action_s] = action_to_str(te.action_); - trace_entry[TraceEntry::OOM == te.action_ ? device_free_s : addr_s] = te.addr_; + trace_entry[te.action_ == TraceEntry::OOM ? device_free_s : addr_s] = te.addr_; trace_entry[size_s] = te.size_; trace_entry[stream_s] = int64_t(te.stream_); trace.append(trace_entry); @@ -1171,8 +1171,8 @@ PyObject* THNPModule_tensor_construct_from_storage(PyObject* self, PyObject* arg HANDLE_TH_ERRORS static torch::PythonArgParser parser( {"set_storage_with_format_(Storage source)", }, - /* traceable= */ false - ); + /* traceable= */ + false); torch::ParsedArgs<1> parsed_args; auto _r = parser.parse(args, nullptr, parsed_args); diff --git a/torch_npu/csrc/npu/Stream.h b/torch_npu/csrc/npu/Stream.h index da7a419214..f51479f2b0 100644 --- a/torch_npu/csrc/npu/Stream.h +++ b/torch_npu/csrc/npu/Stream.h @@ -14,7 +14,8 @@ extern PyObject *THNPStreamClass; TORCH_NPU_API void THNPStream_init(PyObject *module); -inline bool THNPStream_Check(PyObject* obj) { +inline bool THNPStream_Check(PyObject* obj) +{ return THNPStreamClass && PyObject_IsInstance(obj, THNPStreamClass); } diff --git a/torch_npu/csrc/npu/memory_snapshot.cpp b/torch_npu/csrc/npu/memory_snapshot.cpp index 6abb0f33a3..320239a3c1 100644 --- a/torch_npu/csrc/npu/memory_snapshot.cpp +++ b/torch_npu/csrc/npu/memory_snapshot.cpp @@ -276,7 +276,7 @@ std::string _memory_snapshot_pickled() for (const auto& te : traceInfo) { auto trace_entry = new_dict(); trace_entry.insert(action_s, action_to_str(te.action_)); - trace_entry.insert(TraceEntry::OOM == te.action_ ? device_free_s + trace_entry.insert(te.action_ == TraceEntry::OOM ? 
device_free_s : addr_s, te.addr_); trace_entry.insert(size_s, (int64_t)te.size_); -- Gitee From 3a5e7bf43a034b73a81e45755afb7f4bfc307d51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=98=89=E8=AF=9A?= <12731429+wang-pierre-jiacheng@user.noreply.gitee.com> Date: Thu, 20 Mar 2025 12:36:34 +0000 Subject: [PATCH 204/358] =?UTF-8?q?!19251=201g=20memory=20Merge=20pull=20r?= =?UTF-8?q?equest=20!19251=20from=20=E7=8E=8B=E5=98=89=E8=AF=9A/v2.6.0=5F1?= =?UTF-8?q?g=5F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/npu/test_allocator_envs.py | 29 +++++++ third_party/acl/inc/acl/acl_rt.h | 4 + .../csrc/core/npu/NPUCachingAllocator.cpp | 83 ++++++++++++++++++- torch_npu/csrc/core/npu/NPUCachingAllocator.h | 2 + .../csrc/core/npu/interface/AclInterface.cpp | 41 ++++++++- 5 files changed, 151 insertions(+), 8 deletions(-) create mode 100644 test/npu/test_allocator_envs.py diff --git a/test/npu/test_allocator_envs.py b/test/npu/test_allocator_envs.py new file mode 100644 index 0000000000..ef1f465444 --- /dev/null +++ b/test/npu/test_allocator_envs.py @@ -0,0 +1,29 @@ +import os +import shutil +import unittest +import torch +import torch_npu + +from torch_npu.testing.testcase import TestCase, run_tests +from torch_npu.testing.common_utils import SupportedDevices + + +class TestAllocator(TestCase): + @SupportedDevices(['Ascend910B']) + def test_huge_memory_alloc_20M(self): + prev = torch_npu.npu.memory_allocated() + a = torch.rand(1024 * 1024 * 40, dtype=torch.float32).npu() + torch.npu.synchronize() + # actually allocates 1 GB of memory + self.assertEqual(torch_npu.npu.memory_allocated(), prev + ((4 * 40 * 1024 * 1024 + 32) // 512 + 1) * 512) + + @SupportedDevices(['Ascend910B']) + def test_huge_memory_alloc_512B(self): + prev = torch_npu.npu.memory_allocated() + a = torch.rand(8 * 8 * 16, dtype=torch.float32).npu() # 512B + torch.npu.synchronize() + # actually allocates 1 MB of memory + self.assertEqual(torch_npu.npu.memory_allocated(), prev + ((8 * 8 * 16 * 4 + 32) // 512 + 1) * 512) + +if __name__ == '__main__': + run_tests() diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index 30d796f464..be607a8281 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -86,6 +86,8 @@ typedef enum aclrtMemMallocPolicy { ACL_MEM_MALLOC_HUGE_FIRST_P2P, ACL_MEM_MALLOC_HUGE_ONLY_P2P, ACL_MEM_MALLOC_NORMAL_ONLY_P2P, + ACL_MEM_MALLOC_HUGE1G_ONLY, + ACL_MEM_MALLOC_HUGE1G_ONLY_P2P, ACL_MEM_TYPE_LOW_BAND_WIDTH = 0x0100, ACL_MEM_TYPE_HIGH_BAND_WIDTH = 0x1000, } aclrtMemMallocPolicy; @@ -101,6 +103,8 @@ typedef enum aclrtMemAttr { ACL_DDR_MEM_P2P_NORMAL, ACL_HBM_MEM_P2P_HUGE, ACL_HBM_MEM_P2P_NORMAL, + ACL_HBM_MEM_HUGE1G, + ACL_HBM_MEM_P2P_HUGE1G, } aclrtMemAttr; typedef enum aclrtGroupAttr { diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index bf6a583e2a..b03a26c6c8 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -21,6 +21,7 @@ #include "torch_npu/csrc/core/npu/NPURecovery.h" #include "torch_npu/csrc/core/npu/NPUGuard.h" #include "NPUBlockHandle.h" +#include "torch_npu/csrc/core/npu/GetCANNInfo.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/NPUEvent.h" #include "torch_npu/csrc/profiler/npu_profiler.h" @@ -89,12 +90,17 @@ constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 b
MiB constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks +constexpr size_t kExtraLargeBuffer = 1073741824; // "extra large" allocations may be packed in 1 GB blocks constexpr size_t kLargeBufferForHccl = 134217728; // "large for hccl" allocations may be packed in 128 MiB blocks constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer constexpr size_t kRoundLarge = 2097152; // round up large allocs to 2 MiB constexpr size_t kAlignRoundLarge = 16384; // round up large allocs to 16 KB constexpr size_t kSmallPoolVirAddrSize = 2147483648; // 2 GB constexpr size_t kLargePoolVirAddrSize = 10737418240; // 10 GB +const std::string kMinCannVersion = "8.1.RC1"; // minimum cann version which supports 1g mem 8.1.RC1 +const std::string kMinDriverVersion = "25.0.RC1"; // minimum driver version which supports 1g mem 25.0.RC1 +const std::string kCannModule = "CANN"; // cann module name +const std::string kDriverModule = "DRIVER"; // driver module name using StatTypes = std::array(StatType::NUM_TYPES)>; @@ -137,6 +143,36 @@ void update_stat_array( }); } +bool IsMallocPage1GMem(bool is_small_pool) +{ + static bool is_support_page_size_1g = []() { + if (!c10_npu::NPUCachingAllocator::isConfig1GPageSizeEnable()) { + return false; + } + + if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { + TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + "Using the HUGE_MEM memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "but the current driver version does not support this feature. " + "Please upgrade the CANN package version."); + return false; + } + + if (!IsGteCANNVersion(kMinDriverVersion, kDriverModule)) { + TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " + "Using the HUGE_MEM memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "but the current driver version does not support this feature. " + "Please upgrade the CANN package version 1-2."); + return false; + } + return true; + }(); + + return !is_small_pool && is_support_page_size_1g; +} + struct Block; struct PrivatePool; using Comparison = bool (*)(const Block*, const Block*); @@ -373,7 +409,7 @@ struct ExpandableSegment { aclrtPhysicalMemProp prop = {}; prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; - prop.memAttr = ACL_HBM_MEM_HUGE; + prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? 
ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; prop.location.id = device_; prop.reserve = 0; @@ -675,6 +711,11 @@ class CachingAllocatorConfig { return instance().m_base_addr_aligned_size; } + static bool page_size_1g_enable() + { + return instance().m_page_size_1g; + } + static CachingAllocatorConfig &instance() { static CachingAllocatorConfig *s_instance = ([]() { auto inst = new CachingAllocatorConfig(); @@ -694,6 +735,7 @@ class CachingAllocatorConfig { bool m_expandable_segments; bool set_expandable_segments_flag = false; size_t m_base_addr_aligned_size = kAlignRoundLarge; + bool m_page_size_1g = false; // 新增1G页配置标志 CachingAllocatorConfig() : m_max_split_size(std::numeric_limits::max()), @@ -718,6 +760,9 @@ class CachingAllocatorConfig { size_t parseAddrAlignSize( const std::vector& config, size_t i); + size_t parsePageSize( + const std::vector& config, + size_t i); }; void CachingAllocatorConfig::lexArgs( @@ -837,6 +882,19 @@ size_t CachingAllocatorConfig::parseAddrAlignSize( return i; } +size_t CachingAllocatorConfig::parsePageSize(const std::vector& config, size_t i) +{ + TORCH_CHECK(i + 2 < config.size(), "page_size requires format 'page_size:1g'", OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(config[i+1] == ":", "Expected ':' after page_size", OPS_ERROR(ErrCode::VALUE)); + + if (config[i+2] == "1g") { + m_page_size_1g = true; + } else { + TORCH_CHECK(false, "Unsupported page_size value: ", config[i+2], OPS_ERROR(ErrCode::VALUE)); + } + return i + 2; // 返回最后处理的索引位置 +} + void CachingAllocatorConfig::parseArgs(const char* env) { // If empty, set the default values m_max_split_size = std::numeric_limits::max(); @@ -859,6 +917,8 @@ void CachingAllocatorConfig::parseArgs(const char* env) { i = parseExpandableSegments(config, i); } else if (config[i] == "base_addr_aligned_kb") { i = parseAddrAlignSize(config, i); + } else if (config[i] == "page_size") { + i = parsePageSize(config, i); } else { TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i], PTA_ERROR(ErrCode::PARAM)); } @@ -887,6 +947,11 @@ bool checkConfigExpandableSegments() return CachingAllocatorConfig::expandable_segments(); } +bool isConfig1GPageSizeEnable() +{ + return CachingAllocatorConfig::page_size_1g_enable(); +} + class DeviceCachingAllocator { private: @@ -1090,7 +1155,10 @@ class DeviceCachingAllocator { auto size = round_size(orig_size); auto& pool = get_pool(size, stream); - const size_t alloc_size = get_allocation_size(size); + // 开环境变量 大池子放1G内存块 + const size_t alloc_size = IsMallocPage1GMem(pool.is_small) + ? kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) + : get_allocation_size(size); AllocParams params(device, size, stream, &pool, alloc_size, stats); params.stat_types = get_stat_types_for_pool(pool); @@ -1790,6 +1858,10 @@ class DeviceCachingAllocator { auto segment_size = pool->is_small ? kSmallBuffer : ( c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? kLargeBufferForHccl : kLargeBuffer ); + // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 + if (IsMallocPage1GMem(pool->is_small)) { + segment_size = kExtraLargeBuffer; + } auto segment = new ExpandableSegment(device, stream, segment_size); if (hcclComm_) { segment->setHcclComm(hcclComm_); @@ -2240,8 +2312,11 @@ class DeviceCachingAllocator { ptr = active_pool->allocator()->raw_alloc(size); p.err = ptr ? 
ACL_ERROR_NONE : ACL_ERROR_RT_MEMORY_ALLOCATION; } else { - p.err = c10_npu::acl::AclrtMallocAlign32( - &ptr, size, aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST); + auto policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST; + if (IsMallocPage1GMem(p.pool->is_small)) { + policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; + } + p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy); } if (p.err != ACL_ERROR_NONE) { return false; diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index 35eeb0d341..40d5231fd0 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -387,6 +387,8 @@ inline void buildServerMemMapForHccl(int device, std::shared_ptrmemAttr == ACL_HBM_MEM_HUGE1G)) { + TORCH_NPU_WARN_ONCE("The malloc 1G large-page physical memory failed, so malloc 2M page memory." + "Using the 2M memory page may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration " + "is enabled, but the pre-allocated number of 1G large pages is insufficient " + "or 1G large-page memory pre-allocation is not enabled."); + aclrtPhysicalMemProp prop_update = {prop->handleType, + prop->allocationType, + ACL_HBM_MEM_HUGE, + {prop->location.id, + prop->location.type}, + prop->reserve}; + ret = func(handle, size, &prop_update, flags); + } + return ret; } aclError AclrtFreePhysical(aclrtDrvMemHandle handle) { -- Gitee From 6b29ba314bafd49846269d951559594b2b2432e2 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Thu, 20 Mar 2025 14:04:40 +0000 Subject: [PATCH 205/358] !18784 Customize workername add PID Merge pull request !18784 from hhz886/v2.6.0 --- torch_npu/profiler/_profiler_path_creator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch_npu/profiler/_profiler_path_creator.py b/torch_npu/profiler/_profiler_path_creator.py index e90aded0ca..9c07ac2eb2 100644 --- a/torch_npu/profiler/_profiler_path_creator.py +++ b/torch_npu/profiler/_profiler_path_creator.py @@ -65,10 +65,10 @@ class ProfPathCreator: dir_path = self._dir_path if os.path.exists(dir_path): PathManager.check_directory_path_writeable(dir_path) - if not self._worker_name: - worker_name = "{}_{}".format(socket.gethostname(), str(os.getpid())) - else: - worker_name = self._worker_name + worker_name = "{}_{}".format( + self._worker_name or socket.gethostname(), + str(os.getpid()) + ) span_name = "{}_{}_ascend_pt".format(worker_name, datetime.utcnow().strftime("%Y%m%d%H%M%S%f")[:-3]) self._prof_path = os.path.join(dir_path, span_name) PathManager.check_input_directory_path(self._prof_path) -- Gitee From 6c142a8d183d92df700237e84b5e38a78f8da496 Mon Sep 17 00:00:00 2001 From: hhz886 Date: Thu, 20 Mar 2025 14:06:43 +0000 Subject: [PATCH 206/358] !19185 memory bugfix Merge pull request !19185 from hhz886/v2.6.0 --- .../profiler/analysis/prof_parse/_event_tree_parser.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_parse/_event_tree_parser.py b/torch_npu/profiler/analysis/prof_parse/_event_tree_parser.py index 5371f91ec0..cf7d3c356d 100644 --- a/torch_npu/profiler/analysis/prof_parse/_event_tree_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_event_tree_parser.py @@ -608,11 +608,6 @@ class EventTree: def get_root_nodes(self) -> List[_ProfilerEvent]: events: List[_ProfilerEvent] = [] for ev in self.sorted_events: - if ev.tag == _EventType.Allocation: - device_type = 
ev.extra_fields.device_type
-            else:
-                device_type = _DeviceType.CPU.value
-
-            if ev.parent is None and device_type == _DeviceType.CPU.value:
+            if ev.parent is None:
                 events.append(ev)
-        return events
\ No newline at end of file
-- Gitee
From f0b9e692a836cd68eb3269018e27a552f0cfe20b Mon Sep 17 00:00:00 2001
From: zhangqiongwen
Date: Thu, 20 Mar 2025 14:14:25 +0000
Subject: [PATCH 207/358] !19263 add npu_mrope
Merge pull request !19263 from zhangqiongwen/v2.6.0_mrope
---
 test/allowlist_for_publicAPI.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json
index 7d498ce029..eb7e1dff1c 100644
--- a/test/allowlist_for_publicAPI.json
+++ b/test/allowlist_for_publicAPI.json
@@ -2858,7 +2858,8 @@
     "npu_moe_compute_expert_tokens",
     "npu_moe_gating_top_k_softmax",
     "npu_moe_init_routing",
-    "npu_group_norm_swish"
+    "npu_group_norm_swish",
+    "npu_mrope"
   ],
   "torch_npu.contrib": [
     "npu_fused_attention_with_layernorm",
-- Gitee
From dc2b5c4a80cf2b0fbbf793920bb6352642ea47ca Mon Sep 17 00:00:00 2001
From: pta-robot
Date: Thu, 20 Mar 2025 16:15:43 +0000
Subject: [PATCH 208/358] !19282 Update op_plugin commit id
Merge pull request !19282 from pta-robot/v2.6.0
---
 third_party/op-plugin | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/third_party/op-plugin b/third_party/op-plugin
index 04d5a5e0a4..fed7770d9e 160000
--- a/third_party/op-plugin
+++ b/third_party/op-plugin
@@ -1 +1 @@
-Subproject commit 04d5a5e0a459f6bb33937c4090b5c83d35c816a6
+Subproject commit fed7770d9ef630a8a0bbdce1b74069d82c085a54
-- Gitee
From 1610991fd79c88ae1bc894a3ed9bd63965e1cecd Mon Sep 17 00:00:00 2001
From: pta-robot
Date: Thu, 20 Mar 2025 16:15:44 +0000
Subject: [PATCH 209/358] !19282 Update op_plugin commit id
Merge pull request !19282 from pta-robot/v2.6.0
-- Gitee
From b76d62e509925e1c1885e9de1039a46233755c75 Mon Sep 17 00:00:00 2001
From: torchair_robot
Date: Thu, 20 Mar 2025 23:13:11 +0000
Subject: [PATCH 210/358] !19273 Update torchair commit id
Merge pull request !19273 from torchair_robot/v2.6.0
---
 third_party/torchair/torchair | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair
index a013adee64..1761456f1b 160000
--- a/third_party/torchair/torchair
+++ b/third_party/torchair/torchair
@@ -1 +1 @@
-Subproject commit a013adee642fa13d97607f0323bb59f828f29649
+Subproject commit 1761456f1b79e535937acb332b4d522fa2cf60f9
-- Gitee
From cfcacb752e12b9b4f48907e49830cdad0cfb2b53 Mon Sep 17 00:00:00 2001
From: yuhaiyan
Date: Fri, 21 Mar 2025 08:28:55 +0000
Subject: [PATCH 211/358] !19256 Fixed failed testcases.
Merge pull request !19256 from yuhaiyan/v2.6.0-dev1 --- test/contrib/test_linear_quant.py | 7 +- test/contrib/test_matmul_transpose.py | 3 +- test/contrib/test_roll.py | 1 + test/onnx/test_combined_onnx_ops.py | 3 + test/onnx/test_wrapper_onnx_ops.py | 2 +- test/test_fx_reinplace_pass.py | 366 ------------------ test/testfiles_synchronized.txt | 1 + .../test_single_slice_copy_to_contiguous.py | 38 +- .../.pytorch-disabled-tests.json | 11 +- 9 files changed, 46 insertions(+), 386 deletions(-) delete mode 100644 test/test_fx_reinplace_pass.py diff --git a/test/contrib/test_linear_quant.py b/test/contrib/test_linear_quant.py index 0fc09d453f..2d6921b069 100644 --- a/test/contrib/test_linear_quant.py +++ b/test/contrib/test_linear_quant.py @@ -9,8 +9,9 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] class TestLinearQuant(TestCase): - def npu_linear_quant(self, in_features, out_features, x1, x2, scale): - model = LinearQuant(in_features, out_features, bias=False, pertoken_scale=False, offset=False) + def npu_linear_quant(self, in_features, out_features, x1, x2, scale, output_dtype=torch.float16): + model = LinearQuant(in_features, out_features, bias=False, pertoken_scale=False, offset=False, + output_dtype=output_dtype) model = model.npu() model.weight.data = x2 model.scale.data = scale @@ -23,7 +24,7 @@ class TestLinearQuant(TestCase): x1 = torch.randint(-1, 1, (1, 2), dtype=torch.int32).npu() x2 = torch.randint(-1, 1, (128, 2), dtype=torch.int32).npu() scale = torch.randn(1, dtype=torch.float32).npu() - supported_output = torch_npu.npu_quant_matmul(x1, x2.t(), scale) + supported_output = torch_npu.npu_quant_matmul(x1, x2.t(), scale, output_dtype=torch.float16) in_features = 2 out_features = 128 npu_out = self.npu_linear_quant(in_features, out_features, x1, x2, scale) diff --git a/test/contrib/test_matmul_transpose.py b/test/contrib/test_matmul_transpose.py index ccad829a9b..b60d6934cc 100644 --- a/test/contrib/test_matmul_transpose.py +++ b/test/contrib/test_matmul_transpose.py @@ -4,7 +4,7 @@ import torch import torch_npu from torch_npu.testing.testcase import TestCase, run_tests -from torch_npu.testing.common_utils import create_common_tensor +from torch_npu.testing.common_utils import create_common_tensor, SupportedDevices from torch_npu.contrib.function import matmul_transpose @@ -45,6 +45,7 @@ class TestMatmulTranspose(TestCase): return output, fast_time + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_matmul_transpose_shape_format(self): shape_format = [ [[np.float16, 2, [50, 25, 7, 100]], [np.float16, 2, [50, 25, 10, 100]]], diff --git a/test/contrib/test_roll.py b/test/contrib/test_roll.py index 6b753578db..1a3eb70d92 100644 --- a/test/contrib/test_roll.py +++ b/test/contrib/test_roll.py @@ -37,6 +37,7 @@ class TestRoll(TestCase): return output.to("cpu").numpy(), fast_time + @SupportedDevices(['Ascend910A', 'Ascend910P']) def test_roll_shape_format(self): dtype_list = [np.float16, np.float32, np.uint8, np.int32] format_list = [-1, 2] diff --git a/test/onnx/test_combined_onnx_ops.py b/test/onnx/test_combined_onnx_ops.py index f31ef73ae4..43e9972edd 100644 --- a/test/onnx/test_combined_onnx_ops.py +++ b/test/onnx/test_combined_onnx_ops.py @@ -213,6 +213,9 @@ class TestOnnxOps(TestCase): return torch_npu.npu_conv3d(input_, self.weight, self.bias, stride, paddings, dilation, groups) + torch.npu.config.allow_internal_format = True + torch.npu.set_compile_mode(jit_compile=True) + def export_onnx(onnx_model_name): input_ = torch.rand([1, 128, 4, 14, 14]).npu() model = 
Model().to("npu") diff --git a/test/onnx/test_wrapper_onnx_ops.py b/test/onnx/test_wrapper_onnx_ops.py index ee9c5868bc..caad95d717 100644 --- a/test/onnx/test_wrapper_onnx_ops.py +++ b/test/onnx/test_wrapper_onnx_ops.py @@ -1353,7 +1353,7 @@ class TestOnnxOps(TestCase): def export_onnx(onnx_model_name): x = torch.randn((8192, 320), dtype=torch.bfloat16).npu() - weight = torch.randn((320, 256), dtype=torch.int8).npu() + weight = torch.randn((320, 256), dtype=torch.int8, device="npu") antiquantscale = torch.randn((1, 256), dtype=torch.bfloat16).npu() antiquantoffset = torch.randn((1, 256), dtype=torch.bfloat16).npu() model = Model().to("npu") diff --git a/test/test_fx_reinplace_pass.py b/test/test_fx_reinplace_pass.py deleted file mode 100644 index 4c1188ff35..0000000000 --- a/test/test_fx_reinplace_pass.py +++ /dev/null @@ -1,366 +0,0 @@ -# Owner(s): ["module: functionalization"] -import torch -import torch_npu -import torch_npu.testing -from torch.testing._internal.common_utils import TestCase, run_tests -from torch.fx.passes.reinplace import reinplace -from torch.fx.experimental.proxy_tensor import make_fx - -try: - from functorch.experimental import functionalize - HAS_FUNCTIONALIZATION = True -except Exception as e: - HAS_FUNCTIONALIZATION = False - - -class TestReinplacePass(TestCase): - - def test_reinplace_basic(self): - # Basic test: the out-of-place add() call should be converted - # into add_() - def f(x): - a = x.clone() - b = a.add(1) - return b - - inpt = torch.ones(2) - f2 = reinplace(make_fx(f)(inpt), inpt) - expected_out = f(inpt) - actual_out = f2(inpt) - self.assertEqual(actual_out, expected_out) - self.assertExpectedInline(f2.code, """\ - - - -def forward(self, x_1): - clone = torch.ops.aten.clone.default(x_1); x_1 = None - add = torch.ops.aten.add_.Tensor(clone, 1) - return clone - """) - - - def test_reinplace_with_view(self): - def f(x): - a = x.clone() - a_view = a.view(-1) - # We shouldn't re-inplace the first add(), because an alias of a is re-used later in the program - b = a.add(1) - # Second add() is fine to re-inplace - c = a_view.add(1) - return c - - inpt = torch.ones(2) - f2 = reinplace(make_fx(f)(inpt), inpt) - expected_out = f(inpt) - actual_out = f2(inpt) - self.assertEqual(actual_out, expected_out) - self.assertExpectedInline(f2.code, """\ - - - -def forward(self, x_1): - clone = torch.ops.aten.clone.default(x_1); x_1 = None - view = torch.ops.aten.view.default(clone, [-1]) - add = torch.ops.aten.add.Tensor(clone, 1); clone = None - add_1 = torch.ops.aten.add_.Tensor(view, 1) - return view - """) - - def test_reinplace_different_metadata(self): - def f(a_): - a = a_.clone() - b = a + 1 - # Naively, we shouldn't try to inplace the .ge() call, - # because that would require resizing "b" (from a float to a bool tensor). - c = torch.ge(b, a) - return c - inpt = torch.ones(4) - f2 = reinplace(make_fx(f)(inpt), inpt) - expected_out = f(inpt) - actual_out = f2(inpt) - self.assertEqual(actual_out, expected_out) - # The .ge() should not be reinplaced. - self.assertExpectedInline(f2.code, """\ - - - -def forward(self, a__1): - clone = torch.ops.aten.clone.default(a__1); a__1 = None - add = torch.ops.aten.add.Tensor(clone, 1) - ge = torch.ops.aten.ge.Tensor(add, clone); add = clone = None - return ge - """) - - def test_reinplace_overlapping_memory(self): - def f(a_): - a = a_.clone() - b = a.expand(4, 4) - # Can't reinplace because b has overlapping memory. 
- c = b.add(1) - return c - inpt = torch.ones(1) - f2 = reinplace(make_fx(f)(inpt), inpt) - expected_out = f(inpt) - actual_out = f2(inpt) - self.assertEqual(actual_out, expected_out) - self.assertExpectedInline(f2.code, """\ - - - -def forward(self, a__1): - clone = torch.ops.aten.clone.default(a__1); a__1 = None - expand = torch.ops.aten.expand.default(clone, [4, 4]); clone = None - add = torch.ops.aten.add.Tensor(expand, 1); expand = None - return add - """) - - # This test won't actually run in CI, because it requires functionalize() from functorch. - # I'm planning on testing more comprehensively with torchbench models, - # but we can make this testing better once functorch moves into pytorch/pytorch. - def test_reinplace_scatter_op(self): - def f(a_): - # for now, don't test mutations to inputs - a = a_.clone() - e = a.view(-1) - b = a.view(-1) - c = b[0] - d = c.view(-1) - d.add_(1) - return a + e - - if not HAS_FUNCTIONALIZATION: - return - inpt = torch.ones(4) - f2 = reinplace(make_fx(functionalize(f))(inpt), inpt) - expected_out = f(inpt) - actual_out = f2(inpt) - self.assertEqual(actual_out, expected_out) - # NOTE: one slight pessimization here is the fact that - # there are a bunch of redundant views in the graph. - # Technically, half of these views are duplicates that we could de-dup. - # This shouldn't really hurt performance though, since creating an extra view - # is effectively just moving some metadata around (and allocating a new TensorImpl). - # We can/should update the pass in the future to clean this up. - self.assertExpectedInline(f2.code, """\ - - - -def forward(self, a__1): - clone = torch.ops.aten.clone.default(a__1); a__1 = None - view = torch.ops.aten.view.default(clone, [-1]) - view_1 = torch.ops.aten.view.default(clone, [-1]) - select = torch.ops.aten.select.int(view_1, 0, 0); view_1 = None - view_2 = torch.ops.aten.view.default(select, [-1]); select = None - add = torch.ops.aten.add_.Tensor(view_2, 1) - view_3 = torch.ops.aten.view.default(clone, [-1]); clone = None - select_1 = torch.ops.aten.select.int(view_3, 0, 0) - view_4 = torch.ops.aten.view.default(view_2, []); view_2 = None - view_5 = torch.ops.aten.view.default(view_3, [4]); view_3 = None - view_6 = torch.ops.aten.view.default(view_5, [-1]) - select_2 = torch.ops.aten.select.int(view_6, 0, 0); view_6 = None - view_7 = torch.ops.aten.view.default(select_2, [-1]); select_2 = None - view_8 = torch.ops.aten.view.default(view_5, [-1]) - add_1 = torch.ops.aten.add_.Tensor(view_5, view_8); view_8 = None - return view_5 - """) - - def test_reinplace_scatter_twice(self): - def f(a_): - # for now, don't test mutations to inputs - a = a_.clone() - b = a[:, 1] - c = b[1] - c.add_(1) - return a - - if not HAS_FUNCTIONALIZATION: - return - - inpt = torch.ones(4, 4) - f2 = reinplace(make_fx(functionalize(f))(inpt), inpt) - expected_out = f(inpt) - actual_out = f2(inpt) - self.assertEqual(actual_out, expected_out) - self.assertExpectedInline(f2.code, """\ - - - -def forward(self, a__1): - clone = torch.ops.aten.clone.default(a__1); a__1 = None - slice_1 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807) - select = torch.ops.aten.select.int(slice_1, 1, 1); slice_1 = None - select_1 = torch.ops.aten.select.int(select, 0, 1); select = None - add = torch.ops.aten.add_.Tensor(select_1, 1); select_1 = None - slice_2 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807) - select_2 = torch.ops.aten.select.int(slice_2, 1, 1); slice_2 = None - slice_3 = torch.ops.aten.slice.Tensor(clone, 0, 0, 
9223372036854775807) - select_3 = torch.ops.aten.select.int(slice_3, 1, 1); slice_3 = None - select_4 = torch.ops.aten.select.int(select_3, 0, 1); select_3 = None - return clone - """) - - def test_reinplace_scatter_twice_with_different_view_op_valid(self): - def f(a_): - a = a_.clone() - b = a[:, 1] - c = b[1] - c_updated = c.add(1) - good_mirror_of_b = a.as_strided((4,), (4,), 1) - # good_mirror_of_b points to the same region of memory as b. - # and this scatter op below tries to scatter c_updated into the same region - # that c currently takes up. - # reinplacing logic checks this by confirming that: - # c_updated - # good_mirror_of_b.select(0, 1) - # have the same size/stride/storage_offset. - b_updated = torch.select_scatter(good_mirror_of_b, c_updated, 0, 1) - return b_updated - - inpt = torch.ones(4, 4) - f2 = reinplace(make_fx(f)(inpt), inpt) - expected_out = f(inpt) - actual_out = f2(inpt) - self.assertEqual(actual_out, expected_out) - self.assertExpectedInline(f2.code, """\ - - - -def forward(self, a__1): - clone = torch.ops.aten.clone.default(a__1); a__1 = None - slice_1 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807) - select = torch.ops.aten.select.int(slice_1, 1, 1); slice_1 = None - select_1 = torch.ops.aten.select.int(select, 0, 1); select = None - add = torch.ops.aten.add_.Tensor(select_1, 1); select_1 = None - as_strided = torch.ops.aten.as_strided.default(clone, [4], [4], 1); clone = None - return as_strided - """) - - # Test example where we have a scatter op, where the base tensor - # has the same size/stride/storage offset (even though it is a different view), - # making it valid to re-inplace - def test_reinplace_scatter_twice_with_different_view_op_invalid(self): - def f(a_): - a = a_.clone() - b = a[:, 1] - c = b[1] - c_updated = c.add(1) - good_mirror_of_b = a.as_strided((4,), (4,), 1) - # The first arg to select_scatter is an equivalent view to b. - # However, the select_scatter call below tries to put c_updated - # into a different slice of "b" than what "c" currently occupies. - # - b_updated = torch.select_scatter(good_mirror_of_b, c_updated, 0, 0) - return b_updated - - inpt = torch.ones(4, 4) - f2 = reinplace(make_fx(f)(inpt), inpt) - expected_out = f(inpt) - actual_out = f2(inpt) - self.assertEqual(actual_out, expected_out) - self.assertExpectedInline(f2.code, """\ - - - -def forward(self, a__1): - clone = torch.ops.aten.clone.default(a__1); a__1 = None - slice_1 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807) - select = torch.ops.aten.select.int(slice_1, 1, 1); slice_1 = None - select_1 = torch.ops.aten.select.int(select, 0, 1); select = None - add = torch.ops.aten.add.Tensor(select_1, 1); select_1 = None - as_strided = torch.ops.aten.as_strided.default(clone, [4], [4], 1); clone = None - select_int = torch.ops.aten.select.int(as_strided, 0, 0) - copy__default = torch.ops.aten.copy_.default(select_int, add); select_int = add = None - return as_strided - """) # noqa: B950 - - def test_reinplace_scatter_twice_with_different_view_op_invalid2(self): - def f(a_): - a = a_.clone() - b = a[:, 1] - c = b[1] - c_updated = c.add(1) - bad_mirror_of_b = a.as_strided((4,), (4,), 0) - # The first arg to select_scatter points to a different than c's base. - # This makes it invalid to re-inplace. 
- b_updated = torch.select_scatter(bad_mirror_of_b, c_updated, 0, 1) - return b_updated - - inpt = torch.ones(4, 4) - f2 = reinplace(make_fx(f)(inpt), inpt) - expected_out = f(inpt) - actual_out = f2(inpt) - # self.assertEqual(actual_out, expected_out) - self.assertExpectedInline(f2.code, """\ - - - -def forward(self, a__1): - clone = torch.ops.aten.clone.default(a__1); a__1 = None - slice_1 = torch.ops.aten.slice.Tensor(clone, 0, 0, 9223372036854775807) - select = torch.ops.aten.select.int(slice_1, 1, 1); slice_1 = None - select_1 = torch.ops.aten.select.int(select, 0, 1); select = None - add = torch.ops.aten.add.Tensor(select_1, 1); select_1 = None - as_strided = torch.ops.aten.as_strided.default(clone, [4], [4], 0); clone = None - select_int = torch.ops.aten.select.int(as_strided, 0, 1) - copy__default = torch.ops.aten.copy_.default(select_int, add); select_int = add = None - return as_strided - """) # noqa: B950 - - - def test_out_node_updated(self): - def f(): - x = torch.zeros(2, 2) - y = x.diagonal() - y_updated = y.add(1) - z = torch.diagonal_scatter(x, y_updated) - # reinplace needs to know to replace output [z] with [x] - return [z] - - if not HAS_FUNCTIONALIZATION: - return - f2 = reinplace(make_fx(functionalize(f))()) - expected_out = f() - actual_out = f2() - self.assertEqual(actual_out, expected_out) - self.assertExpectedInline(f2.code, """\ - - - -def forward(self): - zeros = torch.ops.aten.zeros.default([2, 2], device = device(type='cpu'), pin_memory = False) - diagonal = torch.ops.aten.diagonal.default(zeros) - add = torch.ops.aten.add_.Tensor(diagonal, 1); diagonal = None - return [zeros] - """) - - def test_reinplace_index_mutation(self): - def f(): - a = torch.zeros(4, 4, 4) - a[:, 2:] = torch.ones(4, 2, 4) - return a - - if not HAS_FUNCTIONALIZATION: - return - f2 = reinplace(make_fx(functionalize(f))()) - expected_out = f() - actual_out = f2() - self.assertEqual(actual_out, expected_out) - self.assertExpectedInline(f2.code, """\ - - - -def forward(self): - zeros = torch.ops.aten.zeros.default([4, 4, 4], device = device(type='cpu'), pin_memory = False) - ones = torch.ops.aten.ones.default([4, 2, 4], device = device(type='cpu'), pin_memory = False) - slice_1 = torch.ops.aten.slice.Tensor(zeros, 0, 0, 9223372036854775807) - slice_2 = torch.ops.aten.slice.Tensor(slice_1, 1, 2, 9223372036854775807); slice_1 = None - copy = torch.ops.aten.copy_.default(slice_2, ones); slice_2 = ones = None - slice_3 = torch.ops.aten.slice.Tensor(zeros, 0, 0, 9223372036854775807) - slice_4 = torch.ops.aten.slice.Tensor(zeros, 0, 0, 9223372036854775807) - slice_5 = torch.ops.aten.slice.Tensor(slice_4, 1, 2, 9223372036854775807); slice_4 = None - return zeros - """) - - -if __name__ == '__main__': - run_tests() diff --git a/test/testfiles_synchronized.txt b/test/testfiles_synchronized.txt index bb7ae12f7f..826ab6a834 100644 --- a/test/testfiles_synchronized.txt +++ b/test/testfiles_synchronized.txt @@ -119,6 +119,7 @@ test/test_jit_profiling.py test/test_jit_llga_fuser.py test/test_type_hints.py test/test_typing.py +test/test_fx_reinplace_pass.py mypy.ini test/profiler/profiler_utils_mock_events.json test/profiler/test_execution_trace.py diff --git a/test/trans_contiguous/test_single_slice_copy_to_contiguous.py b/test/trans_contiguous/test_single_slice_copy_to_contiguous.py index e46c141f14..45357bd768 100644 --- a/test/trans_contiguous/test_single_slice_copy_to_contiguous.py +++ b/test/trans_contiguous/test_single_slice_copy_to_contiguous.py @@ -33,16 +33,19 @@ class 
SingleViewCopyToContiguous(TestCase): # for narrow with step=1, if narrow at the first axis, it will generate a contiguous tensor with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[:, :16, :, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "contiguous_d_Slice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_Slice or aclnnInplaceCopy is not called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, :, 1:16, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "contiguous_d_Slice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_Slice or aclnnInplaceCopy is not called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input[:, :, :, 2:16].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "contiguous_d_Slice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_Slice or aclnnInplaceCopy is not called!") cpu_out1 = cpu_input[:, :16, :, :].contiguous() cpu_out2 = cpu_input[:, :, 1:16, :].contiguous() @@ -70,24 +73,29 @@ class SingleViewCopyToContiguous(TestCase): if cpu_input.dim() == 4: with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[::2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 1:17:4].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input[:, :, 2:16:5].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators called!") with torch.autograd.profiler.profile(use_device='npu') as prof: # stridedSlice do not support slice at last dim npu_out4 = npu_input[:, :, :, 3:9:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out5 = npu_input[::2, 1:17:4, 2:16:5, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), True, "Error operators called!") cpu_out1 = cpu_input[::2].contiguous() @@ -120,8 +128,9 @@ class 
SingleViewCopyToContiguous(TestCase): for dim in range(1, len(item[2])): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out = npu_input.select(dim, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof), - True, "contiguous_d_StridedSlice is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_StridedSlice or aclnnInplaceCopy is not called!") cpu_out = cpu_input.select(dim, 1).contiguous() self.assertRtolEqual(npu_out.to("cpu").numpy(), cpu_out.numpy()) @@ -139,7 +148,8 @@ class SingleViewCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out = torch.as_strided(npu_input, shape_list[1][0], shape_list[1][1], shape_list[1][2]).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_d_StridedSlice']), + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_d_StridedSlice']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_d_StridedSlice']), True, "Error operators called!") cpu_out = torch.as_strided(cpu_input, shape_list[1][0], shape_list[1][1], shape_list[1][2]).contiguous() diff --git a/test/unsupported_test_cases/.pytorch-disabled-tests.json b/test/unsupported_test_cases/.pytorch-disabled-tests.json index 27f2c9a5fa..e281b46a61 100644 --- a/test/unsupported_test_cases/.pytorch-disabled-tests.json +++ b/test/unsupported_test_cases/.pytorch-disabled-tests.json @@ -31238,5 +31238,14 @@ "test_conj_view__refs_unbind_copy_npu_complex64 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]], "test_conj_view_unbind_copy_npu_complex64 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]], "test_neg_conj_view__refs_unbind_copy_npu_complex128 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]], - "test_neg_conj_view_unbind_copy_npu_complex128 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]] + "test_neg_conj_view_unbind_copy_npu_complex128 (__main__.TestMathBitsPRIVATEUSE1)": ["", [""]], + "test_reinplace_scatter_twice_with_different_view_op_invalid2 (__main__.TestReinplacePass)": ["", [""]], + "test_reinplace_scatter_twice_with_different_view_op_invalid (__main__.TestReinplacePass)": ["", [""]], + "test_reinplace_scatter_twice (__main__.TestReinplacePass)": ["", [""]], + "test_reinplace_scatter_twice_with_different_view_op_valid (__main__.TestReinplacePass)": ["", [""]], + "test_out_node_updated (__main__.TestReinplacePass)": ["", [""]], + "test_reinplace_with_view (__main__.TestReinplacePass)": ["", [""]], + "test_reinplace_basic (__main__.TestReinplacePass)": ["", [""]], + "test_reinplace_index_mutation (__main__.TestReinplacePass)": ["", [""]], + "test_reinplace_scatter_op (__main__.TestReinplacePass)": ["", [""]] } -- Gitee From 6829fb9635c1c5f6622153f39dc8b4fe251b61e1 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 21 Mar 2025 08:56:17 +0000 Subject: [PATCH 212/358] !19300 Update op_plugin commit id Merge pull request !19300 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index fed7770d9e..1e02b842ae 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit fed7770d9ef630a8a0bbdce1b74069d82c085a54 +Subproject commit 1e02b842aed973fdc70122fc9acaeb84ebcfa282 -- Gitee From ec31c415bf43c246c37b6ec4abd6db4ccc72c912 Mon Sep 17 00:00:00 2001 From: 
pta-robot Date: Fri, 21 Mar 2025 10:41:14 +0000 Subject: [PATCH 213/358] !19319 Update op_plugin commit id Merge pull request !19319 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1e02b842ae..f81415746a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1e02b842aed973fdc70122fc9acaeb84ebcfa282 +Subproject commit f81415746a3c4759795ff76a8ed657b3ca84908f -- Gitee From 44e86e173c830b51060c2addc01ffc80120a756e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 21 Mar 2025 10:41:15 +0000 Subject: [PATCH 214/358] !19319 Update op_plugin commit id Merge pull request !19319 from pta-robot/v2.6.0 -- Gitee From 318256bbd0f77dca1f1cff94fa61ae9db6fac12b Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 21 Mar 2025 13:56:18 +0000 Subject: [PATCH 215/358] !19327 Update op_plugin commit id Merge pull request !19327 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f81415746a..c1cd60c7df 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f81415746a3c4759795ff76a8ed657b3ca84908f +Subproject commit c1cd60c7df6583ce56ba415464eac88a829f94b8 -- Gitee From 4c11bdd2605e75415b6cd0ea04f3e34b12acbb1c Mon Sep 17 00:00:00 2001 From: czy1255959842 Date: Sat, 22 Mar 2025 03:23:31 +0000 Subject: [PATCH 216/358] !19121 Decoupling createDomain function Merge pull request !19121 from czy1255959842/v2.6.0 --- .../csrc/core/npu/NPUCachingAllocator.cpp | 37 +++++++------------ .../csrc/core/npu/NPUWorkspaceAllocator.cpp | 20 ++++++---- torch_npu/csrc/profiler/mstx_mgr.cpp | 20 +++++++--- torch_npu/csrc/profiler/mstx_mgr.h | 6 +-- torch_npu/csrc/profiler/npu_profiler.h | 2 +- 5 files changed, 45 insertions(+), 40 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index b03a26c6c8..999f3d46cf 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -1398,9 +1398,13 @@ class DeviceCachingAllocator { stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{block->device, block->ptr, block->size}; - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t heapDesc{block->device, block->ptr, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current}; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &heapDesc); + mstxMemVirtualRangeDesc_t regionDesc{block->device, block->ptr, block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, ®ionDesc); + } torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), block->device, @@ -1468,8 +1472,12 @@ class DeviceCachingAllocator { stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); 
#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{block->device, orig_block_ptr, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current}; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); + } torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), block->device, @@ -1924,11 +1932,6 @@ class DeviceCachingAllocator { for_each_selected_stat_type(stat_types, [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], mapped_range.size); }); -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{to_map->device, mapped_range.ptr, mapped_range.size}; - torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); -#endif record_trace( TraceEntry::SEGMENT_MAP, int64_t(mapped_range.ptr), @@ -2343,11 +2346,7 @@ class DeviceCachingAllocator { // p.block came from new, not npuMalloc. It should not be nullptr here. TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{p.block->device, p.block->ptr, p.block->size}; - torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); -#endif + record_trace( TraceEntry::SEGMENT_ALLOC, int64_t(p.block->ptr), @@ -2485,10 +2484,6 @@ class DeviceCachingAllocator { if (block->size >= CachingAllocatorConfig::max_split_size()) update_stat(stats.oversize_segments, -1); -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); -#endif ASCEND_LOGD("pta_memory acl_free: free_size = %zu", block->size); pool->blocks.erase(block); @@ -2554,10 +2549,6 @@ class DeviceCachingAllocator { block->pool->owner_PrivatePool->npuMalloc_count--; } -#ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memHeapUnregister(msleaksDomain, block->ptr); -#endif record_trace( TraceEntry::SEGMENT_UNMAP, int64_t(unmapped.ptr), diff --git a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp index 1c1b34a6da..54782044cf 100644 --- a/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUWorkspaceAllocator.cpp @@ -66,8 +66,10 @@ public: NPU_CHECK_ERROR(c10_npu::acl::AclrtSynchronizeDeviceWithTimeout()); NPU_CHECK_ERROR(aclrtFree(block->data_ptr)); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = 
torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block->data_ptr); + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block->data_ptr); + } record_mem_size_decrement(block->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { @@ -104,9 +106,11 @@ public: ASCEND_LOGD("NPUWorkspaceAllocator malloc by AclrtMallocAlign32: size=%zu", block->size); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{device, block->data_ptr, block->size}; - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{device, block->data_ptr, block->size}; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, &desc); + } record_mem_size_increment(block->size); torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), @@ -152,8 +156,10 @@ public: ASCEND_LOGI("NPUWorkspaceAllocator free by aclrtFree: size=%zu", block_pair.second->size); NPU_CHECK_ERROR(aclrtFree(block_pair.second->data_ptr)); #ifndef BUILD_LIBTORCH - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block_pair.second->data_ptr); + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, block_pair.second->data_ptr); + } record_mem_size_decrement(block_pair.second->size); const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { diff --git a/torch_npu/csrc/profiler/mstx_mgr.cpp b/torch_npu/csrc/profiler/mstx_mgr.cpp index 671908257b..c628502803 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.cpp +++ b/torch_npu/csrc/profiler/mstx_mgr.cpp @@ -84,9 +84,17 @@ int MstxMgr::getRangeId() return ptRangeId_++; } -mstxDomainHandle_t MstxMgr::createDomain(const char* name) +mstxDomainHandle_t MstxMgr::createProfDomain(const char *name) { - if (!isMsleaksEnable() && !isMstxEnable()) { + if (!isMstxEnable()) { + return nullptr; + } + return at_npu::native::MstxDomainCreateA(name); +} + +mstxDomainHandle_t MstxMgr::createLeaksDomain(const char* name) +{ + if (!at_npu::native::IsSupportMstxFunc()) { return nullptr; } return at_npu::native::MstxDomainCreateA(name); @@ -163,7 +171,7 @@ void MstxMgr::domainRangeEnd(mstxDomainHandle_t domain, int ptRangeId) mstxMemHeapHandle_t MstxMgr::memHeapRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) { - if 
(!isMsleaksEnable() || desc==nullptr) { + if (!at_npu::native::IsSupportMstxFunc() || desc == nullptr) { return nullptr; } mstxMemHeapDesc_t heapDesc; @@ -173,7 +181,7 @@ mstxMemHeapHandle_t MstxMgr::memHeapRegister(mstxDomainHandle_t domain, mstxMemV void MstxMgr::memHeapUnregister(mstxDomainHandle_t domain, void* ptr) { - if (!isMsleaksEnable() || ptr == nullptr) { + if (!at_npu::native::IsSupportMstxFunc() || ptr == nullptr) { return; } at_npu::native::MstxMemHeapUnregister(domain, reinterpret_cast(ptr)); @@ -181,7 +189,7 @@ void MstxMgr::memHeapUnregister(mstxDomainHandle_t domain, void* ptr) void MstxMgr::memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeDesc_t* desc) { - if (!isMsleaksEnable() || desc == nullptr) { + if (!at_npu::native::IsSupportMstxFunc() || desc == nullptr) { return; } mstxMemRegionsRegisterBatch_t batch; @@ -192,7 +200,7 @@ void MstxMgr::memRegionsRegister(mstxDomainHandle_t domain, mstxMemVirtualRangeD void MstxMgr::memRegionsUnregister(mstxDomainHandle_t domain, void* ptr) { - if (!isMsleaksEnable() || ptr == nullptr) { + if (!at_npu::native::IsSupportMstxFunc() || ptr == nullptr) { return; } mstxMemRegionsUnregisterBatch_t unregisterBatch; diff --git a/torch_npu/csrc/profiler/mstx_mgr.h b/torch_npu/csrc/profiler/mstx_mgr.h index bea6f59bea..156a220a67 100644 --- a/torch_npu/csrc/profiler/mstx_mgr.h +++ b/torch_npu/csrc/profiler/mstx_mgr.h @@ -20,10 +20,11 @@ public: void mark(const char* message, const aclrtStream stream); int rangeStart(const char* message, const aclrtStream stream); void rangeEnd(int ptRangeId); + bool isMsleaksEnable(); bool isMstxEnable(); int getRangeId(); - - mstxDomainHandle_t createDomain(const char* name); + mstxDomainHandle_t createProfDomain(const char* name); + mstxDomainHandle_t createLeaksDomain(const char* name); void destroyDomain(mstxDomainHandle_t domain); void domainMark(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); int domainRangeStart(mstxDomainHandle_t domain, const char* message, const aclrtStream stream); @@ -40,7 +41,6 @@ private: explicit MstxMgr(MstxMgr &&obj) = delete; MstxMgr& operator=(MstxMgr &&obj) = delete; - bool isMsleaksEnable(); bool isMsleaksEnableImpl(); bool isProfTxEnable(); bool isMsptiTxEnable(); diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 2a6f44a318..8d3222fa2c 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -139,7 +139,7 @@ struct MstxRange { } rangeId = MstxMgr::GetInstance()->getRangeId(); if (at_npu::native::IsSupportMstxDomainFunc()) { - domainHandle = MstxMgr::GetInstance()->createDomain(domainName.c_str()); + domainHandle = MstxMgr::GetInstance()->createProfDomain(domainName.c_str()); at_npu::native::MstxDomainRangeStartA(domainHandle, message.c_str(), stream, rangeId); } else { at_npu::native::MstxRangeStartA(message.c_str(), stream, rangeId); -- Gitee From 30a1047baccdc6655f7f30113c5931c1c10c482c Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Sat, 22 Mar 2025 07:26:04 +0000 Subject: [PATCH 217/358] !19333 cleancode Merge pull request !19333 from jiangpengfei/v2.6.0 --- torch_npu/csrc/core/npu/DeviceUtils.h | 92 ++++++++++-------- torch_npu/csrc/core/npu/NPUCachingAllocator.h | 96 +++++++++---------- torch_npu/csrc/core/npu/NPUEventManager.h | 32 +++---- 3 files changed, 115 insertions(+), 105 deletions(-) diff --git a/torch_npu/csrc/core/npu/DeviceUtils.h b/torch_npu/csrc/core/npu/DeviceUtils.h index 4cbbb51126..2e253f1fca 100644 --- 
a/torch_npu/csrc/core/npu/DeviceUtils.h +++ b/torch_npu/csrc/core/npu/DeviceUtils.h @@ -11,72 +11,82 @@ namespace torch_npu { namespace utils { -inline bool is_npu(const at::Tensor& tensor) { +inline bool is_npu(const at::Tensor& tensor) +{ return tensor.is_privateuseone(); } -inline bool is_npu(const at::TensorOptions& options) { +inline bool is_npu(const at::TensorOptions& options) +{ return options.device().is_privateuseone(); } -inline bool is_npu(const at::Device& device) { +inline bool is_npu(const at::Device& device) +{ return device.is_privateuseone(); } -inline void torch_check_npu(const at::Tensor& tensor) { - TORCH_CHECK(is_npu(tensor), - "Expected NPU tensor, please check whether the input tensor device is correct.", - PTA_ERROR(ErrCode::PARAM)); +inline void torch_check_npu(const at::Tensor& tensor) +{ + TORCH_CHECK(is_npu(tensor), + "Expected NPU tensor, please check whether the input tensor device is correct.", + PTA_ERROR(ErrCode::PARAM)); } -inline void torch_check_npu(const at::TensorOptions& options) { - TORCH_CHECK(is_npu(options), - "Expected NPU tensor, please check whether the input tensor device is correct.", - PTA_ERROR(ErrCode::PARAM)); +inline void torch_check_npu(const at::TensorOptions& options) +{ + TORCH_CHECK(is_npu(options), + "Expected NPU tensor, please check whether the input tensor device is correct.", + PTA_ERROR(ErrCode::PARAM)); } -inline void torch_check_npu(const at::Device& device) { - TORCH_CHECK(is_npu(device), - "Expected NPU tensor, please check whether the input tensor device is correct.", - PTA_ERROR(ErrCode::PARAM)); +inline void torch_check_npu(const at::Device& device) +{ + TORCH_CHECK(is_npu(device), + "Expected NPU tensor, please check whether the input tensor device is correct.", + PTA_ERROR(ErrCode::PARAM)); } -inline c10::DeviceType get_npu_device_type() { - return c10::DeviceType::PrivateUse1; +inline c10::DeviceType get_npu_device_type() +{ + return c10::DeviceType::PrivateUse1; } -inline void maybe_initialize_npu(const at::TensorOptions& options) { - if (torch_npu::utils::is_npu(options)) { - c10_npu::NpuSysCtrl::SysStatus status = - c10_npu::NpuSysCtrl::GetInstance().Initialize(options.device().index()); - if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { - TORCH_CHECK(false, "npu device ", options.device().index(), " init failed.", - PTA_ERROR(ErrCode::INTERNAL)); - } +inline void maybe_initialize_npu(const at::TensorOptions& options) +{ + if (torch_npu::utils::is_npu(options)) { + c10_npu::NpuSysCtrl::SysStatus status = + c10_npu::NpuSysCtrl::GetInstance().Initialize(options.device().index()); + if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + TORCH_CHECK(false, "npu device ", options.device().index(), " init failed.", + PTA_ERROR(ErrCode::INTERNAL)); + } #ifndef BUILD_LIBTORCH - torch_npu::utils::npu_lazy_init(); + torch_npu::utils::npu_lazy_init(); #endif - } + } } -inline void maybe_initialize_npu(const at::Device& device) { - if (torch_npu::utils::is_npu(device)) { - c10_npu::NpuSysCtrl::SysStatus status = - c10_npu::NpuSysCtrl::GetInstance().Initialize(device.index()); - if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { - TORCH_CHECK(false, "npu device ", device.index(), " init failed.", - PTA_ERROR(ErrCode::INTERNAL)); - } +inline void maybe_initialize_npu(const at::Device& device) +{ + if (torch_npu::utils::is_npu(device)) { + c10_npu::NpuSysCtrl::SysStatus status = + c10_npu::NpuSysCtrl::GetInstance().Initialize(device.index()); + if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + 
TORCH_CHECK(false, "npu device ", device.index(), " init failed.", + PTA_ERROR(ErrCode::INTERNAL)); + } #ifndef BUILD_LIBTORCH - torch_npu::utils::npu_lazy_init(); + torch_npu::utils::npu_lazy_init(); #endif - } + } } -inline void maybe_initialize_npu(const c10::optional& device) { - if (device) { - maybe_initialize_npu(*device); - } +inline void maybe_initialize_npu(const c10::optional& device) +{ + if (device) { + maybe_initialize_npu(*device); + } } } diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index 40d5231fd0..12104d59a9 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -45,68 +45,68 @@ C10_DECLARE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); // not counted as a word boundary, so you would otherwise have to list each // of these functions. struct Stat { - int64_t current = 0; - int64_t peak = 0; - int64_t allocated = 0; - int64_t freed = 0; + int64_t current = 0; + int64_t peak = 0; + int64_t allocated = 0; + int64_t freed = 0; }; enum struct StatType : uint64_t { - AGGREGATE = 0, - SMALL_POOL = 1, - LARGE_POOL = 2, - NUM_TYPES = 3 // remember to update this whenever a new stat type is added + AGGREGATE = 0, + SMALL_POOL = 1, + LARGE_POOL = 2, + NUM_TYPES = 3 // remember to update this whenever a new stat type is added }; typedef std::array(StatType::NUM_TYPES)> StatArray; // Struct containing memory allocator summary statistics for a device. struct DeviceStats { - // COUNT: allocations requested by client code - StatArray allocation; - // COUNT: number of allocated segments from npuMalloc(). - StatArray segment; - // COUNT: number of active memory blocks (allocated or used by stream) - StatArray active; - // COUNT: number of inactive, split memory blocks (unallocated but can't be released via npuFree) - StatArray inactive_split; - - // SUM: bytes requested by client code - StatArray allocated_bytes; - // SUM: bytes reserved by this memory allocator (both free and used) - StatArray reserved_bytes; - // SUM: bytes within active memory blocks - StatArray active_bytes; - // SUM: bytes within inactive, split memory blocks - StatArray inactive_split_bytes; - // SUM: bytes requested by client code - StatArray requested_bytes; - - // COUNT: total number of failed calls to NPU malloc necessitating cache flushes. - int64_t num_alloc_retries = 0; - - // COUNT: total number of OOMs (i.e. failed calls to NPU after cache flush) - int64_t num_ooms = 0; - - // COUNT: total number of oversize blocks allocated from pool - Stat oversize_allocations; - - // COUNT: total number of oversize blocks requiring malloc - Stat oversize_segments; - - // SIZE: maximum block size that is allowed to be split. - int64_t max_split_size = 0; + // COUNT: allocations requested by client code + StatArray allocation; + // COUNT: number of allocated segments from npuMalloc(). 
+ StatArray segment; + // COUNT: number of active memory blocks (allocated or used by stream) + StatArray active; + // COUNT: number of inactive, split memory blocks (unallocated but can't be released via npuFree) + StatArray inactive_split; + + // SUM: bytes requested by client code + StatArray allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + StatArray reserved_bytes; + // SUM: bytes within active memory blocks + StatArray active_bytes; + // SUM: bytes within inactive, split memory blocks + StatArray inactive_split_bytes; + // SUM: bytes requested by client code + StatArray requested_bytes; + + // COUNT: total number of failed calls to NPU malloc necessitating cache flushes. + int64_t num_alloc_retries = 0; + + // COUNT: total number of OOMs (i.e. failed calls to NPU after cache flush) + int64_t num_ooms = 0; + + // COUNT: total number of oversize blocks allocated from pool + Stat oversize_allocations; + + // COUNT: total number of oversize blocks requiring malloc + Stat oversize_segments; + + // SIZE: maximum block size that is allowed to be split. + int64_t max_split_size = 0; }; typedef std::shared_ptr (*CreateContextFn)(void); // Struct containing info of an allocation block (i.e. a fractional part of a cudaMalloc).. struct BlockInfo { - int64_t size = 0; - int64_t requested_size = 0; - int32_t gc_counter = 0; - bool allocated = false; - bool active = false; - std::shared_ptr context_when_allocated; + int64_t size = 0; + int64_t requested_size = 0; + int32_t gc_counter = 0; + bool allocated = false; + bool active = false; + std::shared_ptr context_when_allocated; }; // Struct containing info of a memory segment (i.e. one contiguous cudaMalloc). diff --git a/torch_npu/csrc/core/npu/NPUEventManager.h b/torch_npu/csrc/core/npu/NPUEventManager.h index 431ddc9f00..c01491aa03 100644 --- a/torch_npu/csrc/core/npu/NPUEventManager.h +++ b/torch_npu/csrc/core/npu/NPUEventManager.h @@ -14,27 +14,27 @@ namespace c10_npu { class NPUEventManager { public: - static NPUEventManager& GetInstance(); - aclError QueryAndDestroyEvent(); - aclError LazyDestroy(aclrtEvent npu_event); - void ClearEvent(); - void IncreaseUnrecordedCount(aclrtEvent event); - void DecreaseUnrecordedCount(aclrtEvent event); - bool IsEventRecorded(aclrtEvent event); - void ClearUnrecordedCount(); - ~NPUEventManager() {} + static NPUEventManager& GetInstance(); + aclError QueryAndDestroyEvent(); + aclError LazyDestroy(aclrtEvent npu_event); + void ClearEvent(); + void IncreaseUnrecordedCount(aclrtEvent event); + void DecreaseUnrecordedCount(aclrtEvent event); + bool IsEventRecorded(aclrtEvent event); + void ClearUnrecordedCount(); + ~NPUEventManager() {} private: - void run(aclrtEvent event); + void run(aclrtEvent event); private: - std::mutex event_queue_mutex_; - NPUEventManager(); - std::deque npu_events_; - std::shared_ptr thread_pool_; + std::mutex event_queue_mutex_; + NPUEventManager(); + std::deque npu_events_; + std::shared_ptr thread_pool_; - std::mutex event_unrecorded_count_mutex_; - ska::flat_hash_map event_unrecorded_count_; + std::mutex event_unrecorded_count_mutex_; + ska::flat_hash_map event_unrecorded_count_; }; } // namespace c10_npu \ No newline at end of file -- Gitee From 9f4d761d9796fa22ec06d0cbca70b050702999dd Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 22 Mar 2025 10:26:16 +0000 Subject: [PATCH 218/358] !19354 Update op_plugin commit id Merge pull request !19354 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index c1cd60c7df..3ab1002fa6 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit c1cd60c7df6583ce56ba415464eac88a829f94b8 +Subproject commit 3ab1002fa6d7527f73aeb78e3c1cb9a5f4882423 -- Gitee From cbd8c0b515afe99e533c83a6254df781ae04af2a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 23 Mar 2025 02:41:16 +0000 Subject: [PATCH 219/358] !19368 Update op_plugin commit id Merge pull request !19368 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 3ab1002fa6..09401f51fb 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 3ab1002fa6d7527f73aeb78e3c1cb9a5f4882423 +Subproject commit 09401f51fbed2091b6810f33320b8ec8e3267436 -- Gitee From 03e9d23d07e667c0acbdb821796d1f7c93779eb7 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Mon, 24 Mar 2025 02:24:47 +0000 Subject: [PATCH 220/358] !19345 update README.md Merge pull request !19345 from jiangpengfei/v2.6.0 --- README.md | 4 ++++ README.zh.md | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/README.md b/README.md index 7fab4df860..123fa00043 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,10 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | |-----------------------|---------------------------|-----------------------------|-------------------| +| CANN 8.1.RC1 | 2.5.1 | 2.5.0 | v2.5.1-7.0.0 | +| | 2.4.0 | 2.4.0.post3 | v2.4.0-7.0.0 | +| | 2.3.1 | 2.3.1.post5 | v2.3.1-7.0.0 | +| | 2.1.0 | 2.1.0.post11 | v2.1.0-7.0.0 | | CANN 8.0.0.beta1 | 2.6.0 | 2.6.0rc1 | v2.6.0 | | CANN 8.0.0.alpha001 | 2.5.1 | 2.5.1rc1 | v2.5.1 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | diff --git a/README.zh.md b/README.zh.md index 04689e4b71..7b189b792d 100644 --- a/README.zh.md +++ b/README.zh.md @@ -158,6 +158,10 @@ print(z) | CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | |-----------------------|--------------|------------------|-------------------| +| CANN 8.1.RC1 | 2.5.1 | 2.5.0 | v2.5.1-7.0.0 | +| | 2.4.0 | 2.4.0.post3 | v2.4.0-7.0.0 | +| | 2.3.1 | 2.3.1.post5 | v2.3.1-7.0.0 | +| | 2.1.0 | 2.1.0.post11 | v2.1.0-7.0.0 | | CANN 8.0.0.beta1 | 2.6.0 | 2.6.0rc1 | v2.6.0 | | CANN 8.0.0.alpha001 | 2.5.1 | 2.5.1rc1 | v2.5.1 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | -- Gitee From a7a21293b84561b3bcc3231279b8bad00bf95036 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 24 Mar 2025 05:11:18 +0000 Subject: [PATCH 221/358] !19387 Update op_plugin commit id Merge pull request !19387 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 09401f51fb..895c63ea5a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 09401f51fbed2091b6810f33320b8ec8e3267436 +Subproject commit 895c63ea5a4a338dfb9606762a588195a2212bda -- Gitee From 54d15c533a7ad8d40189e075c26911420d669f38 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 24 Mar 2025 05:11:18 +0000 Subject: [PATCH 222/358] !19387 Update op_plugin commit id Merge pull request !19387 from pta-robot/v2.6.0 -- Gitee From 0a9b1d16831bda2d98ed14f6b05933b5571f8713 Mon Sep 17 00:00:00 2001 From: wangzixuan <617225691@qq.com> Date: Mon, 24 Mar 2025 08:02:48 +0000 
Subject: [PATCH 223/358] !19374 [bugfix] [profiler] communication function error Merge pull request !19374 from wangzixuan/dev-2.6.0 --- .../prof_view/prof_db_parse/_communication_db_parser.py | 6 +++--- .../prof_view/prof_db_parse/_fwk_api_db_parser.py | 8 ++++---- .../analysis/prof_view/prof_db_parse/_memory_db_parser.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py index d51533c477..af72fdcdf2 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_communication_db_parser.py @@ -94,9 +94,9 @@ class CommunicationDbParser(CommunicationParser): }] def generate_view(self) -> None: - self.generate_communication_db(self._output_path) + self.generate_communication_db() - def generate_communication_db(self, output_path: str): + def generate_communication_db(self): db_files = CANNFileParser(self._profiler_path).get_file_list_by_type(CANNDataEnum.ANALYSIS_DB) if not db_files: return @@ -104,7 +104,7 @@ class CommunicationDbParser(CommunicationParser): band_width_data, matrix_data, time_data = \ self.set_step_and_type_info_for_db_data(band_width_data, matrix_data, time_data) matrix_data = self.reformat_matrix_db_data(matrix_data) - self.save_communication_db_data(band_width_data, matrix_data, time_data, output_path) + self.save_communication_db_data(band_width_data, matrix_data, time_data) def get_communication_db_data(self, db_path: str): # 在处理原analysis.db里的数据 diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py index c9a7d5aff1..2ae2ac6474 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_fwk_api_db_parser.py @@ -158,16 +158,16 @@ class FwkApiDbParser(BaseParser): sql = "select startNs, endNs, globalTid, connectionId from {} " \ "where name = {} and type = 10000 order by startNs" \ .format(DbConstant.TABLE_CANN_API, node_launch_str_id) # 10000 : node level - node_lauch_apis = TorchDb().fetch_all_data(sql) - if not node_lauch_apis: + node_launch_apis = TorchDb().fetch_all_data(sql) + if not node_launch_apis: raise RuntimeWarning("Failed to get node launch apis") torch_op_apis.sort(key=lambda x: x[TorchOpDataOri.START_NS.value]) torch_op_len = len(torch_op_apis) if task_enqueues and task_dequeues: self.get_torch_op_connection_ids_with_task_queue(task_enqueues, task_dequeues, torch_op_apis, torch_op_len, - node_lauch_apis) + node_launch_apis) else: - self.get_torch_op_connection_ids_without_task_queue(torch_op_apis, torch_op_len, node_lauch_apis) + self.get_torch_op_connection_ids_without_task_queue(torch_op_apis, torch_op_len, node_launch_apis) def get_torch_op_connection_ids_with_task_queue(self, task_enqueues: list, task_dequeues: list, torch_op_apis: list, torch_op_len: int, node_lauch_apis: list): enqueue_corr_ids = {task_enqueue[TaskQueueDataOri.CORRELATION_ID.value] for task_enqueue in task_enqueues} diff --git a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py index 6afff95c70..64de6315f2 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py +++ 
b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_memory_db_parser.py @@ -175,7 +175,7 @@ class MemoryDbParser(BaseParser): TorchDb().create_table_with_headers(DbConstant.TABLE_OPERATOR_MEMORY, TableColumnsManager.TableColumns.get(DbConstant.TABLE_OPERATOR_MEMORY)) TorchDb().insert_data_into_table(DbConstant.TABLE_OPERATOR_MEMORY, self._pta_op_memory_data + self._ge_op_memory_data) - def get_pta_memort_record_list(self): + def get_pta_memory_record_list(self): if not self._pta_memory_bean_list: return for memory_bean in self._pta_memory_bean_list: @@ -218,7 +218,7 @@ class MemoryDbParser(BaseParser): pta_ptr += 1 def save_memory_record_data_to_db(self): - self.get_pta_memort_record_list() + self.get_pta_memory_record_list() self.get_pta_ge_record_list() if not self._record_list: return -- Gitee From 8869f8e34a654734082da81eb3945469bea25b8e Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Mon, 24 Mar 2025 12:13:34 +0000 Subject: [PATCH 224/358] !19407 Update Release Info (7.0.0) Merge pull request !19407 from dilililiwhy/260_update_release_info_700 --- README.md | 38 ++++++++++++++++++-------------------- README.zh.md | 36 +++++++++++++++++------------------- 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 123fa00043..67e0ea5ad6 100644 --- a/README.md +++ b/README.md @@ -39,12 +39,12 @@ pip3 install setuptools If the installation fails, use the download link or visit the [PyTorch official website](https://pytorch.org/) to download the installation package of the corresponding version. -| OS arch | Python version | link | -| ------- | -------------- | ----------------------------------------------------- | -| x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=b68274aeb4047ba8c73e903f0621e2a4adb54ad5282b0845689c3e1dcd2e2546) | -| x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=35a9e78b7e4096968b54c1a198687b981569c50ae93e661aa430f9fd208da102) | -| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5b6ae523bfb67088a17ca7734d131548a2e60346c622621e4248ed09dd0790cc) | -| aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-manylinux_2_28_aarch64.whl#sha256=2ab9c6b3d6eea506bda9b82a0155e974d8ef8e38b417589d144568b4fa59afe1) | +| OS arch | Python version | link | +|---------|----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| x86 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=b68274aeb4047ba8c73e903f0621e2a4adb54ad5282b0845689c3e1dcd2e2546) | +| x86 | Python3.10 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=35a9e78b7e4096968b54c1a198687b981569c50ae93e661aa430f9fd208da102) | +| x86 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5b6ae523bfb67088a17ca7734d131548a2e60346c622621e4248ed09dd0790cc) | +| aarch64 | Python3.9 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-manylinux_2_28_aarch64.whl#sha256=2ab9c6b3d6eea506bda9b82a0155e974d8ef8e38b417589d144568b4fa59afe1) | | aarch64 | Python3.10 | 
[link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9) | | aarch64 | Python3.11 | [link](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl#sha256=d3dab9fb0294f268aec28e8aaba834e9d006b90a50db5bc2fe2191a9d48c6084) | @@ -154,7 +154,6 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | | 2.3.1 | 2.3.1.post5 | v2.3.1-7.0.0 | | | 2.1.0 | 2.1.0.post11 | v2.1.0-7.0.0 | | CANN 8.0.0.beta1 | 2.6.0 | 2.6.0rc1 | v2.6.0 | -| CANN 8.0.0.alpha001 | 2.5.1 | 2.5.1rc1 | v2.5.1 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | | | 2.1.0 | 2.1.0.post10 | v2.1.0-6.0.0 | @@ -165,7 +164,6 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | | | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | | | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | -| CANN 8.0.RC2.alpha002 | 2.3.1 | 2.3.1rc1 | v2.3.1 | | CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | | | 2.1.0 | 2.1.0.post4 | v2.1.0-6.0.rc1 | | | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | @@ -241,18 +239,18 @@ The version branches of AscendPyTorch have the following maintenance phases: ## PyTorch Maintenance Policies -| **PyTorch** | **Maintenance Policies** | **Status** | **Launch Date** | **Subsequent Status** | **EOL Date** | -|-------------|--------------------------|-------------|-----------------|---------------------------------------------------------------------|--------------| -| 2.6.0 | Regular Release | Development | 2025/02/20 | Expected to enter maintenance status from July 20, 2025 | | -| 2.5.1 | Regular Release | Development | 2024/11/08 | Expected to enter maintenance status from April 8, 2025 | | -| 2.4.0 | Regular Release | Development | 2024/10/15 | Expected to enter maintenance status from June 15, 2025 | | -| 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from June 7, 2025 | | -| 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10th, 2025 | | -| 2.1.0 | Long Term Support | Development | 2023/10/15 | Expected to enter maintenance status from September 15, 2025 | | -| 2.0.1 | Regular Release | EOL | 2023/7/19 | | 2024/3/14 | -| 1.11.0 | Long Term Support | Maintained | 2023/4/19 | Expected to enter maintenance free status from September 10th, 2025 | | -| 1.8.1 | Long Term Support | EOL | 2022/4/10 | | 2023/4/10 | -| 1.5.0 | Long Term Support | EOL | 2021/7/29 | | 2022/7/29 | +| **PyTorch** | **Maintenance Policies** | **Status** | **Launch Date** | **Subsequent Status** | **EOL Date** | +|-------------|--------------------------|-------------|-----------------|-------------------------------------------------------------------|--------------| +| 2.6.0 | Regular Release | Development | 2025/03/31 | Expected to enter maintenance status from August 31, 2025 | | +| 2.5.1 | Regular Release | Development | 2024/11/08 | Expected to enter maintenance status from April 8, 2025 | | +| 2.4.0 | Regular Release | Development | 2024/10/15 | Expected to enter maintenance status from June 15, 2025 | | +| 2.3.1 | Regular Release | Development | 2024/06/06 | Expected to enter maintenance status from June 7, 2025 | | +| 2.2.0 | Regular Release | Maintained | 2024/04/01 | Expected to enter maintenance free status from September 10, 2025 | | +| 2.1.0 | Long Term Support 
| Development | 2023/10/15 | Expected to enter maintenance status from September 15, 2025 | | +| 2.0.1 | Regular Release | EOL | 2023/7/19 | | 2024/3/14 | +| 1.11.0 | Long Term Support | Maintained | 2023/4/19 | Expected to enter maintenance free status from September 10, 2025 | | +| 1.8.1 | Long Term Support | EOL | 2022/4/10 | | 2023/4/10 | +| 1.5.0 | Long Term Support | EOL | 2021/7/29 | | 2022/7/29 | ## Reference Documents diff --git a/README.zh.md b/README.zh.md index 7b189b792d..beb722d8aa 100644 --- a/README.zh.md +++ b/README.zh.md @@ -30,12 +30,12 @@ pip3 install torch==2.6.0+cpu --index-url https://download.pytorch.org/whl/cpu 若使用pip命令安装失败,请使用下载链接或进入[PyTorch官方网站](https://pytorch.org/)进行查询下载对应版本。 -| 架构 | Python版本 | 下载链接 | -| ------- | ---------- | ----------------------------------------------------- | -| x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=b68274aeb4047ba8c73e903f0621e2a4adb54ad5282b0845689c3e1dcd2e2546) | -| x86 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=35a9e78b7e4096968b54c1a198687b981569c50ae93e661aa430f9fd208da102) | -| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5b6ae523bfb67088a17ca7734d131548a2e60346c622621e4248ed09dd0790cc) | -| aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-manylinux_2_28_aarch64.whl#sha256=2ab9c6b3d6eea506bda9b82a0155e974d8ef8e38b417589d144568b4fa59afe1) | +| 架构 | Python版本 | 下载链接 | +|---------|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| x86 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-linux_x86_64.whl#sha256=b68274aeb4047ba8c73e903f0621e2a4adb54ad5282b0845689c3e1dcd2e2546) | +| x86 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-linux_x86_64.whl#sha256=35a9e78b7e4096968b54c1a198687b981569c50ae93e661aa430f9fd208da102) | +| x86 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=5b6ae523bfb67088a17ca7734d131548a2e60346c622621e4248ed09dd0790cc) | +| aarch64 | Python3.9 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp39-cp39-manylinux_2_28_aarch64.whl#sha256=2ab9c6b3d6eea506bda9b82a0155e974d8ef8e38b417589d144568b4fa59afe1) | | aarch64 | Python3.10 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl#sha256=90832f4d118c566b8652a2196ac695fc1f14cf420db27b5a1b41c7eaaf2141e9) | | aarch64 | Python3.11 | [下载链接](https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl#sha256=d3dab9fb0294f268aec28e8aaba834e9d006b90a50db5bc2fe2191a9d48c6084) | @@ -163,7 +163,6 @@ print(z) | | 2.3.1 | 2.3.1.post5 | v2.3.1-7.0.0 | | | 2.1.0 | 2.1.0.post11 | v2.1.0-7.0.0 | | CANN 8.0.0.beta1 | 2.6.0 | 2.6.0rc1 | v2.6.0 | -| CANN 8.0.0.alpha001 | 2.5.1 | 2.5.1rc1 | v2.5.1 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | | | 2.1.0 | 2.1.0.post10 | v2.1.0-6.0.0 | @@ -173,8 +172,7 @@ print(z) | CANN 8.0.RC2 | 2.3.1 | 2.3.1 | v2.3.1-6.0.rc2 | | | 2.2.0 | 2.2.0.post2 | v2.2.0-6.0.rc2 | | | 2.1.0 | 2.1.0.post6 | v2.1.0-6.0.rc2 | -| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | -| CANN 
8.0.RC2.alpha002 | 2.3.1 | 2.3.1rc1 | v2.3.1 | +| | 1.11.0 | 1.11.0.post14 | v1.11.0-6.0.rc2 | | CANN 8.0.RC1 | 2.2.0 | 2.2.0 | v2.2.0-6.0.rc1 | | | 2.1.0 | 2.1.0.post4 | v2.1.0-6.0.rc1 | | | 1.11.0 | 1.11.0.post11 | v1.11.0-6.0.rc1 | @@ -243,18 +241,18 @@ AscendPyTorch版本分支的维护阶段如下: ## PyTorch版本维护策略 -| **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | -|---------------|----------|----------|------------|---------------------|-----------| -| 2.6.0 | 常规分支 | 开发 | 2025/02/20 | 预计2025/07/20起进入维护状态 | - | -| 2.5.1 | 常规分支 | 开发 | 2024/11/08 | 预计2025/04/08起进入维护状态 | - | -| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/06/15起进入维护状态 | - | -| 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2025/06/07起进入维护状态 | | +| **PyTorch版本** | **维护策略** | **当前状态** | **发布时间** | **后续状态** | **EOL日期** | +|---------------|----------|----------|------------|----------------------|-----------| +| 2.6.0 | 常规分支 | 开发 | 2025/03/31 | 预计2025/08/31起进入维护状态 | - | +| 2.5.1 | 常规分支 | 开发 | 2024/11/08 | 预计2025/04/08起进入维护状态 | - | +| 2.4.0 | 常规分支 | 开发 | 2024/10/15 | 预计2025/06/15起进入维护状态 | - | +| 2.3.1 | 常规分支 | 开发 | 2024/06/06 | 预计2025/06/07起进入维护状态 | | | 2.2.0 | 常规分支 | 维护 | 2024/04/01 | 预计2025/09/10起进入无维护状态 | | -| 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/09/15起进入维护状态 | | -| 2.0.1 | 常规分支 | EOL | 2023/7/19 | | 2024/3/14 | +| 2.1.0 | 长期支持 | 开发 | 2023/10/15 | 预计2025/09/15起进入维护状态 | | +| 2.0.1 | 常规分支 | EOL | 2023/7/19 | | 2024/3/14 | | 1.11.0 | 长期支持 | 维护 | 2023/4/19 | 预计2025/09/10起进入无维护状态 | | -| 1.8.1 | 长期支持 | EOL | 2022/4/10 | | 2023/4/10 | -| 1.5.0 | 长期支持 | EOL | 2021/7/29 | | 2022/7/29 | +| 1.8.1 | 长期支持 | EOL | 2022/4/10 | | 2023/4/10 | +| 1.5.0 | 长期支持 | EOL | 2021/7/29 | | 2022/7/29 | ## 安全声明 -- Gitee From 5db7f9ca1eca845d898cacb6de673d9b0cc5b8ba Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 24 Mar 2025 13:56:25 +0000 Subject: [PATCH 225/358] !19434 Update op_plugin commit id Merge pull request !19434 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 895c63ea5a..ce8f4d5527 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 895c63ea5a4a338dfb9606762a588195a2212bda +Subproject commit ce8f4d5527a15adac3759ac820f9d9c6d166e5ed -- Gitee From d636c599303e92271f63914f24ae7ad108ae91b0 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Mon, 24 Mar 2025 23:16:07 +0000 Subject: [PATCH 226/358] !19449 Update torchair commit id Merge pull request !19449 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 1761456f1b..6fffca1d06 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 1761456f1b79e535937acb332b4d522fa2cf60f9 +Subproject commit 6fffca1d0661b0f7946d4c2454957cbe989a4a05 -- Gitee From 7c6226eeb5085059fdef8b8556c3dea83d0916c6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 25 Mar 2025 02:56:21 +0000 Subject: [PATCH 227/358] !19452 Update op_plugin commit id Merge pull request !19452 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ce8f4d5527..1e0f7956aa 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ce8f4d5527a15adac3759ac820f9d9c6d166e5ed +Subproject commit 1e0f7956aa64cd347fcd483d2cfb38033c66325c -- Gitee From 
8f93bb14b6aa839cfb7f807b403a893f3190ad8f Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Tue, 25 Mar 2025 06:24:51 +0000 Subject: [PATCH 228/358] !19311 [cleancode]npu&profiler Merge pull request !19311 from SCh-zx/v2.6.0 --- torch_npu/csrc/npu/Event.cpp | 55 ++-- torch_npu/csrc/npu/Event.h | 2 +- torch_npu/csrc/npu/Module.cpp | 4 +- torch_npu/csrc/npu/Stream.cpp | 48 +-- torch_npu/csrc/npu/memory_snapshot.cpp | 2 +- torch_npu/csrc/profiler/init.cpp | 6 +- torch_npu/csrc/profiler/npu_profiler.h | 78 ++--- torch_npu/csrc/profiler/profiler_mgr.cpp | 3 +- torch_npu/csrc/profiler/profiler_mgr.h | 14 +- torch_npu/csrc/profiler/utils.cpp | 299 +++++++++--------- torch_npu/csrc/profiler/utils.h | 34 +- .../toolkit/profiler/common/ring_buffer.h | 149 ++++----- .../csrc/toolkit/profiler/common/singleton.h | 21 +- .../csrc/toolkit/profiler/common/thread.h | 82 ++--- .../csrc/toolkit/profiler/common/utils.h | 241 +++++++------- .../csrc/toolkit/profiler/inc/data_dumper.h | 40 +-- .../csrc/toolkit/profiler/inc/data_reporter.h | 274 ++++++++-------- .../csrc/toolkit/profiler/src/data_dumper.cpp | 68 ++-- 18 files changed, 716 insertions(+), 704 deletions(-) diff --git a/torch_npu/csrc/npu/Event.cpp b/torch_npu/csrc/npu/Event.cpp index 653b468494..3c92a33539 100644 --- a/torch_npu/csrc/npu/Event.cpp +++ b/torch_npu/csrc/npu/Event.cpp @@ -1,11 +1,12 @@ +#include "torch_npu/csrc/npu/Event.h" + #include #include #include #include -#include "torch_npu/csrc/core/npu/NPUGuard.h" #include +#include "torch_npu/csrc/core/npu/NPUGuard.h" -#include "torch_npu/csrc/npu/Event.h" #include "torch_npu/csrc/npu/Stream.h" PyObject *THNPEventClass = nullptr; @@ -143,37 +144,37 @@ PyTypeObject THNPEventType = { 0, /* tp_itemsize */ (destructor)THNPEvent_dealloc, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ nullptr, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ THNPEvent_methods, /* tp_methods */ - 0, /* tp_members */ + nullptr, /* tp_members */ THNPEvent_properties, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ THNPEvent_pynew, /* tp_new */ }; diff --git a/torch_npu/csrc/npu/Event.h b/torch_npu/csrc/npu/Event.h index 674a2ef29b..b5b8074a16 100644 --- a/torch_npu/csrc/npu/Event.h +++ b/torch_npu/csrc/npu/Event.h @@ -1,9 +1,9 @@ #ifndef THNP_EVENT_INC #define THNP_EVENT_INC +#include #include 
"torch_npu/csrc/core/npu/NPUMacros.h" #include "torch_npu/csrc/core/npu/NPUEvent.h" -#include struct THNPEvent { PyObject_HEAD diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 0ca1415ad2..f4d7a341f8 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -31,9 +31,9 @@ #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" #include "torch_npu/csrc/core/OverflowUtils.h" +#include "torch_npu/csrc/npu/Module.h" #include "torch_npu/csrc/framework/StorageDescHelper.h" #include "torch_npu/csrc/npu/DataParallelComm.h" -#include "torch_npu/csrc/npu/Module.h" #include "torch_npu/csrc/npu/NPUPluggableAllocator.h" #include "torch_npu/csrc/npu/Stream.h" #include "torch_npu/csrc/npu/Stress_detect.h" @@ -263,7 +263,7 @@ PyObject* THNPModule_msTxMark(PyObject* self, PyObject* args) HANDLE_TH_ERRORS const char *input_string; if (!PyArg_ParseTuple(args, "s", &input_string)) { - return NULL; + return nullptr; } torch_npu::profiler::Mark(input_string); diff --git a/torch_npu/csrc/npu/Stream.cpp b/torch_npu/csrc/npu/Stream.cpp index 53e84e4e10..8059cf3447 100644 --- a/torch_npu/csrc/npu/Stream.cpp +++ b/torch_npu/csrc/npu/Stream.cpp @@ -174,37 +174,37 @@ PyTypeObject THNPStreamType = { 0, /* tp_itemsize */ (destructor)THNPStream_dealloc, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_reserved */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ + nullptr, /* tp_getattr */ + nullptr, /* tp_setattr */ + nullptr, /* tp_reserved */ + nullptr, /* tp_repr */ + nullptr, /* tp_as_number */ + nullptr, /* tp_as_sequence */ + nullptr, /* tp_as_mapping */ + nullptr, /* tp_hash */ + nullptr, /* tp_call */ + nullptr, /* tp_str */ + nullptr, /* tp_getattro */ + nullptr, /* tp_setattro */ + nullptr, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ nullptr, /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ + nullptr, /* tp_traverse */ + nullptr, /* tp_clear */ + nullptr, /* tp_richcompare */ 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ + nullptr, /* tp_iter */ + nullptr, /* tp_iternext */ THNPStream_methods, /* tp_methods */ THNPStream_members, /* tp_members */ THNPStream_properties, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ + nullptr, /* tp_base */ + nullptr, /* tp_dict */ + nullptr, /* tp_descr_get */ + nullptr, /* tp_descr_set */ 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ + nullptr, /* tp_init */ + nullptr, /* tp_alloc */ THNPStream_pynew, /* tp_new */ }; diff --git a/torch_npu/csrc/npu/memory_snapshot.cpp b/torch_npu/csrc/npu/memory_snapshot.cpp index 320239a3c1..eeaf20ddbc 100644 --- a/torch_npu/csrc/npu/memory_snapshot.cpp +++ b/torch_npu/csrc/npu/memory_snapshot.cpp @@ -2,9 +2,9 @@ #include #include +#include "torch_npu/csrc/utils/LazyInit.h" #include "torch_npu/csrc/core/npu/NPUCachingAllocator.h" #include "torch_npu/csrc/npu/memory_snapshot.h" -#include "torch_npu/csrc/utils/LazyInit.h" using torch::jit::Pickler; using c10_npu::NPUCachingAllocator::BlockInfo; diff --git a/torch_npu/csrc/profiler/init.cpp b/torch_npu/csrc/profiler/init.cpp index a367c250e3..b871f2d991 100644 --- a/torch_npu/csrc/profiler/init.cpp +++ 
b/torch_npu/csrc/profiler/init.cpp @@ -22,7 +22,8 @@ namespace torch_npu { namespace profiler { -PyObject* profiler_initExtension(PyObject* _unused, PyObject *unused) { +PyObject* profiler_initExtension(PyObject* _unused, PyObject *unused) +{ auto torch_npu_C_module = THPObjectPtr(PyImport_ImportModule("torch_npu._C")); if (!torch_npu_C_module) { return nullptr; @@ -97,7 +98,8 @@ static PyMethodDef TorchProfilerMethods[] = { // NOLINT }; -PyMethodDef* profiler_functions() { +PyMethodDef* profiler_functions() +{ return TorchProfilerMethods; } diff --git a/torch_npu/csrc/profiler/npu_profiler.h b/torch_npu/csrc/profiler/npu_profiler.h index 8d3222fa2c..679e63be09 100644 --- a/torch_npu/csrc/profiler/npu_profiler.h +++ b/torch_npu/csrc/profiler/npu_profiler.h @@ -20,9 +20,9 @@ void registerFunctions(CallFn call); } // python_tracer enum class NpuActivityType { - NONE = 0, - CPU, - NPU, + NONE = 0, + CPU, + NPU, }; enum class MemoryDataType { @@ -39,28 +39,28 @@ enum class MemoryAllocatorType { }; struct MemoryUsage { - int8_t device_type{0}; - int8_t device_index{0}; - uint8_t data_type{static_cast(MemoryDataType::MEMORY_INVALID)}; - uint8_t allocator_type{static_cast(MemoryAllocatorType::ALLOCATOR_INVALID)}; - int64_t ptr{0}; - int64_t alloc_size{0}; - int64_t total_allocated{0}; - int64_t total_reserved{0}; - int64_t total_active{0}; - int64_t stream_ptr{0}; + int8_t device_type{ 0 }; + int8_t device_index{ 0 }; + uint8_t data_type{ static_cast(MemoryDataType::MEMORY_INVALID) }; + uint8_t allocator_type{ static_cast(MemoryAllocatorType::ALLOCATOR_INVALID) }; + int64_t ptr{ 0 }; + int64_t alloc_size{ 0 }; + int64_t total_allocated{ 0 }; + int64_t total_reserved{ 0 }; + int64_t total_active{ 0 }; + int64_t stream_ptr{ 0 }; }; struct ExperimentalConfig { - ExperimentalConfig(std::string level = "Level0", std::string metrics = "ACL_AICORE_NONE", - bool l2_cache = false, bool record_op_args = false, bool msprof_tx = false, - bool op_attr = false) + ExperimentalConfig(std::string level = "Level0", std::string metrics = "ACL_AICORE_NONE", bool l2_cache = false, + bool record_op_args = false, bool msprof_tx = false, bool op_attr = false) : trace_level(level), metrics(metrics), l2_cache(l2_cache), record_op_args(record_op_args), msprof_tx(msprof_tx), - op_attr(op_attr) {} + op_attr(op_attr) + {} ~ExperimentalConfig() = default; std::string trace_level; @@ -72,21 +72,22 @@ struct ExperimentalConfig { }; struct NpuProfilerConfig { - explicit NpuProfilerConfig( - std::string path, - bool record_shapes = false, - bool profile_memory = false, - bool with_stack = false, - bool with_flops = false, - bool with_modules = false, - ExperimentalConfig experimental_config = ExperimentalConfig()) - : path(path), - record_shapes(record_shapes), - profile_memory(profile_memory), - with_stack(with_stack), - with_flops(with_flops), - with_modules(with_modules), - experimental_config(experimental_config) {} + explicit NpuProfilerConfig( + std::string path, + bool record_shapes = false, + bool profile_memory = false, + bool with_stack = false, + bool with_flops = false, + bool with_modules = false, + ExperimentalConfig experimental_config = ExperimentalConfig()) + : path(path), + record_shapes(record_shapes), + profile_memory(profile_memory), + with_stack(with_stack), + with_flops(with_flops), + with_modules(with_modules), + experimental_config(experimental_config) + {} ~NpuProfilerConfig() = default; std::string path; @@ -98,13 +99,14 @@ struct NpuProfilerConfig { ExperimentalConfig experimental_config; }; 
-std::atomic& profDataReportEnable(); +std::atomic &profDataReportEnable(); void initNpuProfiler(const std::string &path, const std::set &activities); void warmupNpuProfiler(const NpuProfilerConfig &config, const std::set &activities); -void startNpuProfiler(const NpuProfilerConfig &config, const std::set &activities, const std::unordered_set &scops = {}); +void startNpuProfiler(const NpuProfilerConfig &config, const std::set &activities, + const std::unordered_set &scops = {}); void stopNpuProfiler(); @@ -112,9 +114,9 @@ void finalizeNpuProfiler(); void reportMarkDataToNpuProfiler(uint32_t category, const std::string &msg, uint64_t correlation_id); -void reportMemoryDataToNpuProfiler(const MemoryUsage& data); +void reportMemoryDataToNpuProfiler(const MemoryUsage &data); -inline int mstxRangeStart(const char* message, const aclrtStream stream) +inline int mstxRangeStart(const char *message, const aclrtStream stream) { return MstxMgr::GetInstance()->rangeStart(message, stream); } @@ -130,8 +132,8 @@ inline bool mstxEnable() } struct MstxRange { - int rangeId{0}; - mstxDomainHandle_t domainHandle{nullptr}; + int rangeId{ 0 }; + mstxDomainHandle_t domainHandle{ nullptr }; MstxRange(const std::string &message, aclrtStream stream, const std::string &domainName = "default") { if (!mstxEnable()) { diff --git a/torch_npu/csrc/profiler/profiler_mgr.cpp b/torch_npu/csrc/profiler/profiler_mgr.cpp index b773e2cf4f..0cd764a83f 100644 --- a/torch_npu/csrc/profiler/profiler_mgr.cpp +++ b/torch_npu/csrc/profiler/profiler_mgr.cpp @@ -89,7 +89,8 @@ void ProfilerMgr::WarmupMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, a } } -void ProfilerMgr::EnableMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig) { +void ProfilerMgr::EnableMsProfiler(uint32_t *deviceIdList, uint32_t deviceNum, aclprofAicoreMetrics aicMetrics, uint64_t dataTypeConfig) +{ // Avoid duplicate config creation in scenarios where warmup is turned on if (profConfig_ == nullptr) { profConfig_ = at_npu::native::AclProfilingCreateConfig(deviceIdList, deviceNum, aicMetrics, nullptr, dataTypeConfig); diff --git a/torch_npu/csrc/profiler/profiler_mgr.h b/torch_npu/csrc/profiler/profiler_mgr.h index cd4400555c..8f212957e6 100644 --- a/torch_npu/csrc/profiler/profiler_mgr.h +++ b/torch_npu/csrc/profiler/profiler_mgr.h @@ -17,13 +17,13 @@ constexpr uint64_t Level1 = ACL_PROF_TASK_TIME | ACL_PROF_ACL_API | ACL_PROF_HCC constexpr uint64_t Level2 = Level1 | ACL_PROF_RUNTIME_API | ACL_PROF_AICPU; struct NpuTraceConfig { - std::string trace_level; - std::string metrics; - bool npu_memory; - bool l2_cache; - bool record_op_args; - bool msprof_tx; - bool op_attr; + std::string trace_level; + std::string metrics; + bool npu_memory; + bool l2_cache; + bool record_op_args; + bool msprof_tx; + bool op_attr; }; C10_NPU_API int8_t GetTraceLevel(); diff --git a/torch_npu/csrc/profiler/utils.cpp b/torch_npu/csrc/profiler/utils.cpp index 7e267193c8..bd2f1b1b21 100644 --- a/torch_npu/csrc/profiler/utils.cpp +++ b/torch_npu/csrc/profiler/utils.cpp @@ -27,22 +27,22 @@ static constexpr auto kMat2Size = "mat2_size"; bool NPURecordFunction::use_npu_simple = false; static bool validateInput( - const std::string& op_name, + const std::string &op_name, size_t min_size, c10::ArrayRef inputs, - const c10::ArrayRef& should_be_tensor) { - std::stringstream ss; + const c10::ArrayRef &should_be_tensor) +{ + std::stringstream ss; if (inputs.size() < min_size) { - ss << "Failed to save extra arguments for flops 
compuation of op " - << op_name << ", min size: " << min_size - << ", actual size: " << inputs.size(); + ss << "Failed to save extra arguments for flops compuation of op " << op_name << ", min size: " << min_size << + ", actual size: " << inputs.size(); TORCH_NPU_WARN(ss.str()); return false; } for (auto index : should_be_tensor) { if (!inputs[index].isTensor()) { - ss << "Failed to save extra arguments for flops compuation of op " - << op_name << ", input[" << index << "] must be a tensor."; + ss << "Failed to save extra arguments for flops compuation of op " << op_name << ", input[" << index << + "] must be a tensor."; TORCH_NPU_WARN(ss.str()); return false; } @@ -50,7 +50,8 @@ static bool validateInput( return true; } -std::unordered_map saveExtraArgs(const at::RecordFunction& fn) { +std::unordered_map saveExtraArgs(const at::RecordFunction &fn) +{ // for specific types of fn, return the saved extra args for computing flops std::unordered_map map; auto inputs = fn.inputs(); @@ -62,7 +63,7 @@ std::unordered_map saveExtraArgs(const at::RecordFunct } if (fname == kConv2dOp) { - std::vector tensors{0, 1}; + std::vector tensors{ 0, 1 }; bool check = validateInput(fname, kConv2dGroups + 1, inputs, tensors); if (!check) { return map; @@ -81,8 +82,8 @@ std::unordered_map saveExtraArgs(const at::RecordFunct map[kDilation] = inputs[kConv2dDilation]; map[kGroups] = inputs[kConv2dGroups]; } else if (fname == kGemmOp) { - std::vector tensors{0, 1}; - bool check = validateInput(fname, 2, inputs, tensors); + std::vector tensors{ 0, 1 }; + bool check = validateInput(fname, 2, inputs, tensors); if (!check) { return map; } @@ -92,7 +93,7 @@ std::unordered_map saveExtraArgs(const at::RecordFunct map[kMat1Size] = at::IValue(left.sizes()); map[kMat2Size] = at::IValue(right.sizes()); } else if (fname == kMulOp) { - std::vector tensors{0}; + std::vector tensors{ 0 }; bool check = validateInput(fname, 1, inputs, tensors); if (!check) { return map; @@ -101,7 +102,7 @@ std::unordered_map saveExtraArgs(const at::RecordFunct at::Tensor mat = inputs[0].toTensor(); map[kMatSize] = at::IValue(mat.sizes()); } else if (fname == kAddOp) { - std::vector tensors{0}; + std::vector tensors{ 0 }; bool check = validateInput(fname, 1, inputs, tensors); if (!check) { return map; @@ -114,145 +115,147 @@ std::unordered_map saveExtraArgs(const at::RecordFunct return map; } -uint64_t computeFlops(const std::string &op_name, const std::unordered_map &extra_args) { - if (op_name == kConv2dOp) { - if (extra_args.find(kInputSize) == extra_args.end() - || extra_args.find(kWeightSize) == extra_args.end() - || extra_args.find(kGroups) == extra_args.end() - || extra_args.find(kPadding) == extra_args.end() - || extra_args.find(kStride) == extra_args.end() - || extra_args.find(kDilation) == extra_args.end()) { - TORCH_NPU_WARN("Calculating flops for aten::conv2d requires groups, padding, stride, dilation, input_size, and weight_size in saved arguments."); - return 0; - } - auto input_sizes_ref = extra_args.at(kInputSize); - auto kernel_sizes_ref = extra_args.at(kWeightSize); - auto groups_ref = extra_args.at(kGroups); - auto padding_ref = extra_args.at(kPadding); - auto stride_ref = extra_args.at(kStride); - auto dilation_ref = extra_args.at(kDilation); - if (!input_sizes_ref.isIntList() || !kernel_sizes_ref.isIntList()) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because it requires input and weight tensor sizes."); - return 0; - } - if (!padding_ref.isIntList() || !stride_ref.isIntList() || !dilation_ref.isIntList()) { 
- TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because it requires padding, stride, and dilation values."); - return 0; - } +uint64_t computeFlops(const std::string &op_name, const std::unordered_map &extra_args) +{ + if (op_name == kConv2dOp) { + if (extra_args.find(kInputSize) == extra_args.end() || extra_args.find(kWeightSize) == extra_args.end() || + extra_args.find(kGroups) == extra_args.end() || extra_args.find(kPadding) == extra_args.end() || + extra_args.find(kStride) == extra_args.end() || extra_args.find(kDilation) == extra_args.end()) { + TORCH_NPU_WARN("Calculating flops for aten::conv2d requires groups, padding, stride, dilation, input_size, " + "and weight_size in saved arguments."); + return 0; + } + auto input_sizes_ref = extra_args.at(kInputSize); + auto kernel_sizes_ref = extra_args.at(kWeightSize); + auto groups_ref = extra_args.at(kGroups); + auto padding_ref = extra_args.at(kPadding); + auto stride_ref = extra_args.at(kStride); + auto dilation_ref = extra_args.at(kDilation); + if (!input_sizes_ref.isIntList() || !kernel_sizes_ref.isIntList()) { + TORCH_NPU_WARN( + "Failed to compute flops for op aten::conv2d because it requires input and weight tensor sizes."); + return 0; + } + if (!padding_ref.isIntList() || !stride_ref.isIntList() || !dilation_ref.isIntList()) { + TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because it requires padding, stride, and " + "dilation values."); + return 0; + } - const std::vector input_sizes = input_sizes_ref.toIntVector(); - const std::vector kernel_sizes = kernel_sizes_ref.toIntVector(); - const uint64_t groups = (uint64_t)groups_ref.toInt(); - const std::vector padding = padding_ref.toIntVector(); - const std::vector stride = stride_ref.toIntVector(); - const std::vector dilation = dilation_ref.toIntVector(); - if (input_sizes.size() != 4 || kernel_sizes.size() != 4) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because both input and weight must be size 4."); - return 0; - } - if (!groups) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because group size must not be 0."); - return 0; - } - if (padding.size() != 2 || dilation.size() != 2) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because both padding and dilation must be size 2."); - return 0; - } - if (stride.size() != 2 || (stride[0] * stride[1] == 0)) { - TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because stride must be size 2 and cannot be 0."); - return 0; - } - // format of the input is defined in torch.nn.quantized.functional.conv2d() - uint64_t minibatch = 0; - uint64_t in_channels = 0; - uint64_t input_h = 0; - uint64_t input_w = 0; - uint64_t out_channels = 0; - uint64_t kernel_h = 0; - uint64_t kernel_w = 0; - const uint64_t conv2d_multiply_factor = 2; - std::tie(minibatch, in_channels, input_h, input_w) = std::make_tuple(input_sizes[0], input_sizes[1], - input_sizes[2], input_sizes[3]); - std::tie(out_channels, std::ignore, kernel_h, kernel_w) = std::make_tuple(kernel_sizes[0], kernel_sizes[1], - kernel_sizes[2], kernel_sizes[3]); - uint64_t output_h = (input_h + 2 * padding[0] - dilation[0] * (kernel_h - 1) - 1) / stride[0] + 1; - uint64_t output_w = (input_w + 2 * padding[1] - dilation[1] * (kernel_w - 1) - 1) / stride[1] + 1; - if (groups == 0) { - TORCH_CHECK(false, "groups can not be 0.", PTA_ERROR(ErrCode::VALUE)); - } - return conv2d_multiply_factor * minibatch * output_h * output_w * - kernel_h * kernel_w * in_channels * out_channels / groups; - } else if 
(op_name == kGemmOp) { - if (extra_args.find(kMat1Size) == extra_args.end() - || extra_args.find(kMat2Size) == extra_args.end()) { - TORCH_NPU_WARN("Calculating flops for aten::mm requires mat1_size and mat2_size in saved arguments."); - return 0; - } - auto mat1_sizes_ref = extra_args.at(kMat1Size); - auto mat2_sizes_ref = extra_args.at(kMat2Size); - if (!mat1_sizes_ref.isIntList() || !mat2_sizes_ref.isIntList()) { - TORCH_NPU_WARN("Failed to compute flops for op aten::mm because it requires mat1_size and mat2_size to be IntList."); - return 0; - } + const std::vector input_sizes = input_sizes_ref.toIntVector(); + const std::vector kernel_sizes = kernel_sizes_ref.toIntVector(); + const uint64_t groups = (uint64_t)groups_ref.toInt(); + const std::vector padding = padding_ref.toIntVector(); + const std::vector stride = stride_ref.toIntVector(); + const std::vector dilation = dilation_ref.toIntVector(); + if (input_sizes.size() != 4 || kernel_sizes.size() != 4) { + TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because both input and weight must be size 4."); + return 0; + } + if (!groups) { + TORCH_NPU_WARN("Failed to compute flops for op aten::conv2d because group size must not be 0."); + return 0; + } + if (padding.size() != 2 || dilation.size() != 2) { + TORCH_NPU_WARN( + "Failed to compute flops for op aten::conv2d because both padding and dilation must be size 2."); + return 0; + } + if (stride.size() != 2 || (stride[0] * stride[1] == 0)) { + TORCH_NPU_WARN( + "Failed to compute flops for op aten::conv2d because stride must be size 2 and cannot be 0."); + return 0; + } + // format of the input is defined in torch.nn.quantized.functional.conv2d() + uint64_t minibatch = 0; + uint64_t in_channels = 0; + uint64_t input_h = 0; + uint64_t input_w = 0; + uint64_t out_channels = 0; + uint64_t kernel_h = 0; + uint64_t kernel_w = 0; + const uint64_t conv2d_multiply_factor = 2; + std::tie(minibatch, in_channels, input_h, input_w) = + std::make_tuple(input_sizes[0], input_sizes[1], input_sizes[2], input_sizes[3]); + std::tie(out_channels, std::ignore, kernel_h, kernel_w) = + std::make_tuple(kernel_sizes[0], kernel_sizes[1], kernel_sizes[2], kernel_sizes[3]); + uint64_t output_h = (input_h + 2 * padding[0] - dilation[0] * (kernel_h - 1) - 1) / stride[0] + 1; + uint64_t output_w = (input_w + 2 * padding[1] - dilation[1] * (kernel_w - 1) - 1) / stride[1] + 1; + if (groups == 0) { + TORCH_CHECK(false, "groups can not be 0.", PTA_ERROR(ErrCode::VALUE)); + } + return conv2d_multiply_factor * minibatch * output_h * output_w * kernel_h * kernel_w * in_channels * + out_channels / groups; + } else if (op_name == kGemmOp) { + if (extra_args.find(kMat1Size) == extra_args.end() || extra_args.find(kMat2Size) == extra_args.end()) { + TORCH_NPU_WARN("Calculating flops for aten::mm requires mat1_size and mat2_size in saved arguments."); + return 0; + } + auto mat1_sizes_ref = extra_args.at(kMat1Size); + auto mat2_sizes_ref = extra_args.at(kMat2Size); + if (!mat1_sizes_ref.isIntList() || !mat2_sizes_ref.isIntList()) { + TORCH_NPU_WARN( + "Failed to compute flops for op aten::mm because it requires mat1_size and mat2_size to be IntList."); + return 0; + } - std::vector mat1_size = mat1_sizes_ref.toIntVector(); - std::vector mat2_size = mat2_sizes_ref.toIntVector(); - if (mat1_size.size() == 0) { - return 0; - } else { - int64_t overlap_dim = mat1_size.back(); - const uint64_t gemm_multiply_factor = 2; - uint64_t flops = 1; - for (int64_t dim : mat1_size) { - flops *= (uint64_t)dim; - } - if (overlap_dim 
== 0) { - TORCH_CHECK(false, "overlap_dim can not be 0.", PTA_ERROR(ErrCode::VALUE)); - } - flops /= (uint64_t)overlap_dim; - for (int64_t dim : mat2_size) { - flops *= (uint64_t)dim; - } - flops *= gemm_multiply_factor; - return flops; - } - } else if (op_name == kMulOp) { - if (extra_args.find(kMatSize) == extra_args.end()) { - TORCH_NPU_WARN("Calculating flops for aten::mul.Tensor requires mat_size in saved arguments."); - return 0; - } - auto mat_sizes = extra_args.at(kMatSize); - if (!mat_sizes.isIntList()) { - TORCH_NPU_WARN("Failed to compute flops for op aten::mul because it requires mat_size to be IntList."); - return 0; - } + std::vector mat1_size = mat1_sizes_ref.toIntVector(); + std::vector mat2_size = mat2_sizes_ref.toIntVector(); + if (mat1_size.size() == 0) { + return 0; + } else { + int64_t overlap_dim = mat1_size.back(); + const uint64_t gemm_multiply_factor = 2; + uint64_t flops = 1; + for (int64_t dim : mat1_size) { + flops *= (uint64_t)dim; + } + if (overlap_dim == 0) { + TORCH_CHECK(false, "overlap_dim can not be 0.", PTA_ERROR(ErrCode::VALUE)); + } + flops /= (uint64_t)overlap_dim; + for (int64_t dim : mat2_size) { + flops *= (uint64_t)dim; + } + flops *= gemm_multiply_factor; + return flops; + } + } else if (op_name == kMulOp) { + if (extra_args.find(kMatSize) == extra_args.end()) { + TORCH_NPU_WARN("Calculating flops for aten::mul.Tensor requires mat_size in saved arguments."); + return 0; + } + auto mat_sizes = extra_args.at(kMatSize); + if (!mat_sizes.isIntList()) { + TORCH_NPU_WARN("Failed to compute flops for op aten::mul because it requires mat_size to be IntList."); + return 0; + } - std::vector mat_size = mat_sizes.toIntVector(); - uint64_t flops = 1; - for (int64_t dim : mat_size) { - flops *= (uint64_t)dim; - } - return flops; - } else if (op_name == kAddOp) { - if (extra_args.find(kMatSize) == extra_args.end()) { - TORCH_NPU_WARN("Calculating flops for aten::add.Tensor requires mat_size in saved arguments."); - return 0; - } - auto mat_sizes = extra_args.at(kMatSize); - if (!mat_sizes.isIntList()) { - TORCH_NPU_WARN("Failed to compute flops for op aten::add because it requires mat_size to be IntList."); - return 0; - } + std::vector mat_size = mat_sizes.toIntVector(); + uint64_t flops = 1; + for (int64_t dim : mat_size) { + flops *= (uint64_t)dim; + } + return flops; + } else if (op_name == kAddOp) { + if (extra_args.find(kMatSize) == extra_args.end()) { + TORCH_NPU_WARN("Calculating flops for aten::add.Tensor requires mat_size in saved arguments."); + return 0; + } + auto mat_sizes = extra_args.at(kMatSize); + if (!mat_sizes.isIntList()) { + TORCH_NPU_WARN("Failed to compute flops for op aten::add because it requires mat_size to be IntList."); + return 0; + } - std::vector mat_size = mat_sizes.toIntVector(); - uint64_t flops = 1; - for (int64_t dim : mat_size) { - flops *= (uint64_t)dim; + std::vector mat_size = mat_sizes.toIntVector(); + uint64_t flops = 1; + for (int64_t dim : mat_size) { + flops *= (uint64_t)dim; + } + return flops; } - return flops; - } - return 0; + return 0; } - } } \ No newline at end of file diff --git a/torch_npu/csrc/profiler/utils.h b/torch_npu/csrc/profiler/utils.h index 4aa80d9efb..e753d86121 100644 --- a/torch_npu/csrc/profiler/utils.h +++ b/torch_npu/csrc/profiler/utils.h @@ -12,42 +12,42 @@ namespace torch_npu { namespace profiler { +std::unordered_map saveExtraArgs(const at::RecordFunction &fn); -std::unordered_map saveExtraArgs(const at::RecordFunction& fn); - -uint64_t computeFlops(const std::string &op_name, - const 
std::unordered_map &extra_args); +uint64_t computeFlops(const std::string &op_name, const std::unordered_map &extra_args); class NPURecordFunction { public: - NPURecordFunction(bool enable_ = false) : enable(enable_) { - if (NPURecordFunction::use_npu_simple) { - at::enableRecordFunction(enable); + NPURecordFunction(bool enable_ = false) : enable(enable_) + { + if (NPURecordFunction::use_npu_simple) { + at::enableRecordFunction(enable); + } } - } - ~NPURecordFunction() { - if (NPURecordFunction::use_npu_simple) { - at::enableRecordFunction(!enable); + ~NPURecordFunction() + { + if (NPURecordFunction::use_npu_simple) { + at::enableRecordFunction(!enable); + } } - } - bool enable = false; - static bool use_npu_simple; + bool enable = false; + static bool use_npu_simple; }; -inline THPCodeObjectPtr PyFrame_GetCode_NPU(PyFrameObject* frame) +inline THPCodeObjectPtr PyFrame_GetCode_NPU(PyFrameObject *frame) { return THPCodeObjectPtr(PyFrame_GetCode(frame)); } -inline PyFrameObject* PyEval_GetFrame_NPU() +inline PyFrameObject *PyEval_GetFrame_NPU() { auto frame = PyEval_GetFrame(); Py_XINCREF(frame); return frame; } -inline THPObjectPtr PyFrame_GetLocals_NPU(PyFrameObject* frame) +inline THPObjectPtr PyFrame_GetLocals_NPU(PyFrameObject *frame) { return THPObjectPtr(PyFrame_GetLocals(frame)); } diff --git a/torch_npu/csrc/toolkit/profiler/common/ring_buffer.h b/torch_npu/csrc/toolkit/profiler/common/ring_buffer.h index 1bbc2f33e3..9ca4f2ae70 100644 --- a/torch_npu/csrc/toolkit/profiler/common/ring_buffer.h +++ b/torch_npu/csrc/toolkit/profiler/common/ring_buffer.h @@ -12,28 +12,31 @@ namespace profiler { template class RingBuffer { public: - RingBuffer() - : is_inited_(false), - is_quit_(false), - read_index_(0), - write_index_(0), - idle_write_index_(0), - capacity_(0), - mask_(0), - cycles_exceed_cnt_(0), - full_cnt_(0) {} + RingBuffer() + : is_inited_(false), + is_quit_(false), + read_index_(0), + write_index_(0), + idle_write_index_(0), + capacity_(0), + mask_(0), + cycles_exceed_cnt_(0), + full_cnt_(0) + {} - ~RingBuffer() { - UnInit(); - } + ~RingBuffer() + { + UnInit(); + } - void Init(size_t capacity) { - capacity_ = capacity; - mask_ = capacity_ - 1; - data_queue_.resize(capacity); - is_inited_ = true; - is_quit_ = false; - } + void Init(size_t capacity) + { + capacity_ = capacity; + mask_ = capacity_ - 1; + data_queue_.resize(capacity); + is_inited_ = true; + is_quit_ = false; + } void UnInit() { @@ -59,49 +62,51 @@ public: } } - bool Push(T data) { - size_t curr_read_index = 0; - size_t curr_write_index = 0; - size_t next_write_index = 0; - size_t cycles = 0; - static const size_t cycle_limit = 1024; - do { - if (!is_inited_ || is_quit_) { - return false; - } - cycles++; - if (cycles >= cycle_limit) { - cycles_exceed_cnt_.fetch_add(1, std::memory_order_relaxed); - return false; - } - curr_read_index = read_index_.load(std::memory_order_relaxed); - curr_write_index = idle_write_index_.load(std::memory_order_relaxed); - next_write_index = curr_write_index + 1; - if ((next_write_index & mask_) == (curr_read_index & mask_)) { - full_cnt_.fetch_add(1, std::memory_order_relaxed); - return false; - } - } while (!idle_write_index_.compare_exchange_weak(curr_write_index, next_write_index)); - size_t index = curr_write_index & mask_; - data_queue_[index] = std::move(data); - write_index_++; - return true; - } - - bool Pop(T &data) { - if (!is_inited_) { - return false; + bool Push(T data) + { + size_t curr_read_index = 0; + size_t curr_write_index = 0; + size_t next_write_index = 0; + size_t 
cycles = 0; + static const size_t cycle_limit = 1024; + do { + if (!is_inited_ || is_quit_) { + return false; + } + cycles++; + if (cycles >= cycle_limit) { + cycles_exceed_cnt_.fetch_add(1, std::memory_order_relaxed); + return false; + } + curr_read_index = read_index_.load(std::memory_order_relaxed); + curr_write_index = idle_write_index_.load(std::memory_order_relaxed); + next_write_index = curr_write_index + 1; + if ((next_write_index & mask_) == (curr_read_index & mask_)) { + full_cnt_.fetch_add(1, std::memory_order_relaxed); + return false; + } + } while (!idle_write_index_.compare_exchange_weak(curr_write_index, next_write_index)); + size_t index = curr_write_index & mask_; + data_queue_[index] = std::move(data); + write_index_++; + return true; } - size_t curr_read_index = read_index_.load(std::memory_order_relaxed); - size_t curr_write_index = write_index_.load(std::memory_order_relaxed); - if ((curr_read_index & mask_) == (curr_write_index & mask_) && !is_quit_) { - return false; + + bool Pop(T &data) + { + if (!is_inited_) { + return false; + } + size_t curr_read_index = read_index_.load(std::memory_order_relaxed); + size_t curr_write_index = write_index_.load(std::memory_order_relaxed); + if ((curr_read_index & mask_) == (curr_write_index & mask_) && !is_quit_) { + return false; + } + size_t index = curr_read_index & mask_; + data = std::move(data_queue_[index]); + read_index_++; + return true; } - size_t index = curr_read_index & mask_; - data = std::move(data_queue_[index]); - read_index_++; - return true; - } size_t Size() { @@ -114,18 +119,18 @@ public: } private: - bool is_inited_; - volatile bool is_quit_; - std::atomic read_index_; - std::atomic write_index_; - std::atomic idle_write_index_; - size_t capacity_; - size_t mask_; - std::vector data_queue_; + bool is_inited_; + volatile bool is_quit_; + std::atomic read_index_; + std::atomic write_index_; + std::atomic idle_write_index_; + size_t capacity_; + size_t mask_; + std::vector data_queue_; - // Ringbuffer push failed info - std::atomic cycles_exceed_cnt_; - std::atomic full_cnt_; + // Ringbuffer push failed info + std::atomic cycles_exceed_cnt_; + std::atomic full_cnt_; }; } // profiler } // toolkit diff --git a/torch_npu/csrc/toolkit/profiler/common/singleton.h b/torch_npu/csrc/toolkit/profiler/common/singleton.h index 4997e534f0..c613f30bad 100644 --- a/torch_npu/csrc/toolkit/profiler/common/singleton.h +++ b/torch_npu/csrc/toolkit/profiler/common/singleton.h @@ -8,21 +8,22 @@ namespace profiler { template class Singleton { public: - static T *GetInstance() noexcept(std::is_nothrow_constructible::value) { - static T instance; - return &instance; - } + static T *GetInstance() noexcept(std::is_nothrow_constructible::value) + { + static T instance; + return &instance; + } - virtual ~Singleton() = default; + virtual ~Singleton() = default; protected: - explicit Singleton() = default; + explicit Singleton() = default; private: - explicit Singleton(const Singleton &obj) = delete; - Singleton& operator=(const Singleton &obj) = delete; - explicit Singleton(Singleton &&obj) = delete; - Singleton& operator=(Singleton &&obj) = delete; + explicit Singleton(const Singleton &obj) = delete; + Singleton &operator = (const Singleton &obj) = delete; + explicit Singleton(Singleton &&obj) = delete; + Singleton &operator = (Singleton &&obj) = delete; }; } // profiler } // toolkit diff --git a/torch_npu/csrc/toolkit/profiler/common/thread.h b/torch_npu/csrc/toolkit/profiler/common/thread.h index 6db1b09d0a..b53e9d8fe8 100644 --- 
a/torch_npu/csrc/toolkit/profiler/common/thread.h +++ b/torch_npu/csrc/toolkit/profiler/common/thread.h @@ -9,57 +9,61 @@ namespace toolkit { namespace profiler { class Thread { public: - Thread() - : is_alive_(false), - pid_(0), - thread_name_("NPUProfiler") {}; + Thread() : is_alive_(false), pid_(0), thread_name_("NPUProfiler"){}; - ~Thread() { - if (is_alive_) { - (void)pthread_cancel(pid_); - (void)pthread_join(pid_, nullptr); + ~Thread() + { + if (is_alive_) { + (void)pthread_cancel(pid_); + (void)pthread_join(pid_, nullptr); + } } - } - void SetThreadName(const std::string &name) { - if (!name.empty()) { - thread_name_ = name; + void SetThreadName(const std::string &name) + { + if (!name.empty()) { + thread_name_ = name; + } } - } - std::string GetThreadName() { - return thread_name_; - } + std::string GetThreadName() + { + return thread_name_; + } - int Start() { - int ret = pthread_create(&pid_, nullptr, Execute, (void*)this); - is_alive_ = (ret == 0) ? true : false; - return ret; - } + int Start() + { + int ret = pthread_create(&pid_, nullptr, Execute, (void *)this); + is_alive_ = (ret == 0) ? true : false; + return ret; + } - int Stop() { - return Join(); - } + int Stop() + { + return Join(); + } - int Join() { - int ret = pthread_join(pid_, nullptr); - is_alive_ = (ret == 0) ? false : true; - return ret; - } + int Join() + { + int ret = pthread_join(pid_, nullptr); + is_alive_ = (ret == 0) ? false : true; + return ret; + } private: - static void* Execute(void *args) { - Thread *thr = (Thread *)args; - prctl(PR_SET_NAME, (unsigned long)thr->GetThreadName().data()); - thr->Run(); - return nullptr; - } - virtual void Run() = 0; + static void *Execute(void *args) + { + Thread *thr = (Thread *)args; + prctl(PR_SET_NAME, (unsigned long)thr->GetThreadName().data()); + thr->Run(); + return nullptr; + } + virtual void Run() = 0; private: - bool is_alive_; - pthread_t pid_; - std::string thread_name_; + bool is_alive_; + pthread_t pid_; + std::string thread_name_; }; } // profiler } // toolkit diff --git a/torch_npu/csrc/toolkit/profiler/common/utils.h b/torch_npu/csrc/toolkit/profiler/common/utils.h index f13d1c51c0..454d66d69e 100644 --- a/torch_npu/csrc/toolkit/profiler/common/utils.h +++ b/torch_npu/csrc/toolkit/profiler/common/utils.h @@ -17,145 +17,158 @@ namespace toolkit { namespace profiler { class Utils { public: - static bool IsFileExist(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return false; + static bool IsFileExist(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return false; + } + return (access(path.c_str(), F_OK) == 0) ? true : false; } - return (access(path.c_str(), F_OK) == 0) ? true : false; - } - static bool IsFileWritable(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return false; + static bool IsFileWritable(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return false; + } + return (access(path.c_str(), W_OK) == 0) ? true : false; } - return (access(path.c_str(), W_OK) == 0) ? 
true : false; - } - static bool IsDir(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return false; - } - struct stat st = {0}; - int ret = lstat(path.c_str(), &st); - if (ret != 0) { - return false; + static bool IsDir(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return false; + } + struct stat st = { 0 }; + int ret = lstat(path.c_str(), &st); + if (ret != 0) { + return false; + } + return S_ISDIR(st.st_mode) ? true : false; } - return S_ISDIR(st.st_mode) ? true : false; - } - static bool CreateDir(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return false; - } - if (IsFileExist(path)) { - return IsDir(path) ? true : false; - } - size_t pos = 0; - while ((pos = path.find_first_of('/', pos)) != std::string::npos) { - std::string base_dir = path.substr(0, ++pos); - if (IsFileExist(base_dir)) { - if (IsDir(base_dir)) { - continue; - } else { - return false; + static bool CreateDir(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return false; + } + if (IsFileExist(path)) { + return IsDir(path) ? true : false; + } + size_t pos = 0; + while ((pos = path.find_first_of('/', pos)) != std::string::npos) { + std::string base_dir = path.substr(0, ++pos); + if (IsFileExist(base_dir)) { + if (IsDir(base_dir)) { + continue; + } else { + return false; + } + } + if (mkdir(base_dir.c_str(), 0750) != 0) { + return false; + } } - } - if (mkdir(base_dir.c_str(), 0750) != 0) { - return false; - } + return (mkdir(path.c_str(), 0750) == 0) ? true : false; } - return (mkdir(path.c_str(), 0750) == 0) ? true : false; - } - static std::string RealPath(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return ""; - } - char realPath[PATH_MAX] = {0}; - if (realpath(path.c_str(), realPath) == nullptr) { - return ""; + static std::string RealPath(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return ""; + } + char realPath[PATH_MAX] = {0}; + if (realpath(path.c_str(), realPath) == nullptr) { + return ""; + } + return std::string(realPath); } - return std::string(realPath); - } - static std::string RelativeToAbsPath(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX) { - return ""; + static std::string RelativeToAbsPath(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX) { + return ""; + } + if (path[0] != '/') { + char pwd_path[PATH_MAX] = {0}; + if (getcwd(pwd_path, PATH_MAX) != nullptr) { + return std::string(pwd_path) + "/" + path; + } + return ""; + } + return std::string(path); } - if (path[0] != '/') { - char pwd_path[PATH_MAX] = {0}; - if (getcwd(pwd_path, PATH_MAX) != nullptr) { - return std::string(pwd_path) + "/" + path; - } - return ""; + + static std::string DirName(const std::string &path) + { + if (path.empty()) { + return ""; + } + std::string temp_path = std::string(path.begin(), path.end()); + char *path_c = dirname(const_cast(temp_path.data())); + return path_c ? std::string(path_c) : ""; } - return std::string(path); - } - static std::string DirName(const std::string &path) { - if (path.empty()) { - return ""; + static uint64_t GetClockMonotonicRawNs() + { + struct timespec ts = { 0 }; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return static_cast(ts.tv_sec) * 1000000000 + + static_cast(ts.tv_nsec); // 1000000000为秒转换为纳秒的倍数 } - std::string temp_path = std::string(path.begin(), path.end()); - char *path_c = dirname(const_cast(temp_path.data())); - return path_c ? 
std::string(path_c) : ""; - } - - static uint64_t GetClockMonotonicRawNs() { - struct timespec ts = {0}; - clock_gettime(CLOCK_MONOTONIC_RAW, &ts); - return static_cast(ts.tv_sec) * 1000000000 + static_cast(ts.tv_nsec); // 1000000000为秒转换为纳秒的倍数 - } - - static uint64_t getClockSyscnt() { - uint64_t cycles; + + static uint64_t getClockSyscnt() + { + uint64_t cycles; #if defined(__aarch64__) - asm volatile("mrs %0, cntvct_el0" : "=r"(cycles)); + asm volatile("mrs %0, cntvct_el0" : "=r"(cycles)); #elif defined(__x86_64__) - constexpr uint32_t uint32Bits = 32U; - uint32_t hi = 0; - uint32_t lo = 0; - __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); - cycles = (static_cast(lo)) | ((static_cast(hi)) << uint32Bits); + constexpr uint32_t uint32Bits = 32U; + uint32_t hi = 0; + uint32_t lo = 0; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + cycles = (static_cast(lo)) | ((static_cast(hi)) << uint32Bits); #elif defined(__arm__) - const uint32_t uint32Bits = 32U; - uint32_t hi = 0; - uint32_t lo = 0; - asm volatile("mrrc p15, 1, %0, %1, c14" : "=r"(lo), "=r"(hi)); - cycles = (static_cast(lo)) | ((static_cast(hi)) << uint32Bits); + const uint32_t uint32Bits = 32U; + uint32_t hi = 0; + uint32_t lo = 0; + asm volatile("mrrc p15, 1, %0, %1, c14" : "=r"(lo), "=r"(hi)); + cycles = (static_cast(lo)) | ((static_cast(hi)) << uint32Bits); #else - cycles = 0; + cycles = 0; #endif - return cycles; - } - - static uint64_t GetClockTime() { - static const bool isSupportSysCnt = at_npu::native::isSyscntEnable(); - if (isSupportSysCnt) { - return getClockSyscnt(); - } else { - return GetClockMonotonicRawNs(); + return cycles; } - } - static bool CreateFile(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX || !CreateDir(DirName(path))) { - return false; + static uint64_t GetClockTime() + { + static const bool isSupportSysCnt = at_npu::native::isSyscntEnable(); + if (isSupportSysCnt) { + return getClockSyscnt(); + } else { + return GetClockMonotonicRawNs(); + } } - int fd = creat(path.c_str(), S_IRUSR | S_IWUSR | S_IRGRP); - return (fd < 0 || close(fd) != 0) ? false : true; - } - static bool IsSoftLink(const std::string &path) { - if (path.empty() || path.size() > PATH_MAX || !IsFileExist(path)) { - return false; + static bool CreateFile(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX || !CreateDir(DirName(path))) { + return false; + } + int fd = creat(path.c_str(), S_IRUSR | S_IWUSR | S_IRGRP); + return (fd < 0 || close(fd) != 0) ? 
false : true; } - struct stat st{}; - if (lstat(path.c_str(), &st) != 0) { - return false; + + static bool IsSoftLink(const std::string &path) + { + if (path.empty() || path.size() > PATH_MAX || !IsFileExist(path)) { + return false; + } + struct stat st {}; + if (lstat(path.c_str(), &st) != 0) { + return false; + } + return S_ISLNK(st.st_mode); } - return S_ISLNK(st.st_mode); - } static uint64_t GetTid() { @@ -171,7 +184,7 @@ public: static uint64_t GetHostUid(); - static int safe_strcpy_s(char* dest, const char* src, size_t destSize); + static int safe_strcpy_s(char *dest, const char *src, size_t destSize); }; } // profiler } // toolkit diff --git a/torch_npu/csrc/toolkit/profiler/inc/data_dumper.h b/torch_npu/csrc/toolkit/profiler/inc/data_dumper.h index 46b0141aed..6880980426 100644 --- a/torch_npu/csrc/toolkit/profiler/inc/data_dumper.h +++ b/torch_npu/csrc/toolkit/profiler/inc/data_dumper.h @@ -19,26 +19,26 @@ constexpr uint32_t kNotifyInterval = 256; class DataDumper : public Thread { public: - explicit DataDumper(); - virtual ~DataDumper(); - void Init(const std::string &path, size_t capacity); - void UnInit(); - void Report(std::unique_ptr data); - void Start(); - void Stop(); + explicit DataDumper(); + virtual ~DataDumper(); + void Init(const std::string &path, size_t capacity); + void UnInit(); + void Report(std::unique_ptr data); + void Start(); + void Stop(); private: - void Flush(); - void Dump(const std::map> &dataMap); - void Run(); - void GatherAndDumpData(); + void Flush(); + void Dump(const std::map> &dataMap); + void Run(); + void GatherAndDumpData(); private: - std::string path_; - std::atomic start_; - std::atomic init_; - RingBuffer> data_chunk_buf_; - std::map fd_map_; + std::string path_; + std::atomic start_; + std::atomic init_; + RingBuffer> data_chunk_buf_; + std::map fd_map_; }; class TraceDataDumper : public Thread { @@ -58,17 +58,17 @@ private: void FlushTraceData(); void FlushHashData(); void FlushParamData(); - void Dump(const std::string& file_name, const std::vector& encode_data); + void Dump(const std::string &file_name, const std::vector &encode_data); void Run(); private: std::string path_; std::atomic start_; std::atomic init_; - std::unique_ptr trace_hash_data_{nullptr}; - std::unique_ptr param_data_{nullptr}; + std::unique_ptr trace_hash_data_{ nullptr }; + std::unique_ptr param_data_{ nullptr }; RingBuffer> trace_data_buf_; - std::map fd_map_; + std::map fd_map_; }; } // profiler } // toolkit diff --git a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h index bbdd15abfd..bd59433a30 100644 --- a/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h +++ b/torch_npu/csrc/toolkit/profiler/inc/data_reporter.h @@ -18,81 +18,88 @@ namespace toolkit { namespace profiler { template -std::string to_string(T value) { - std::ostringstream oss; - oss << value; - return oss.str(); +std::string to_string(T value) +{ + std::ostringstream oss; + oss << value; + return oss.str(); } template -void encodeFixedData(const std::vector &datas, std::vector &result) { - for (auto data : datas) { - for (size_t i = 0; i < sizeof(T); ++i) { - result.push_back((static_cast(data) >> (i * 8)) & 0xff); +void encodeFixedData(const std::vector &datas, std::vector &result) +{ + for (auto data : datas) { + for (size_t i = 0; i < sizeof(T); ++i) { + result.push_back((static_cast(data) >> (i * 8)) & 0xff); + } } - } } -inline void encodeStrData(uint16_t type, const std::string &data, std::vector &result) { - for (size_t i = 0; i < 
sizeof(uint16_t); ++i) { - result.push_back((type >> (i * 8)) & 0xff); - } - uint32_t length = data.size(); - for (size_t i = 0; i < sizeof(uint32_t); ++i) { - result.push_back((length >> (i * 8)) & 0xff); - } - for (const auto &c : data) { - result.push_back(c); - } +inline void encodeStrData(uint16_t type, const std::string &data, std::vector &result) +{ + for (size_t i = 0; i < sizeof(uint16_t); ++i) { + result.push_back((type >> (i * 8)) & 0xff); + } + uint32_t length = data.size(); + for (size_t i = 0; i < sizeof(uint32_t); ++i) { + result.push_back((length >> (i * 8)) & 0xff); + } + for (const auto &c : data) { + result.push_back(c); + } } -inline void encodeStrArrayData(uint16_t type, const std::vector &datas, std::vector &result) { - std::string rst; - for (auto str : datas) { - rst += (str + ";"); - } - if (!rst.empty()) { - rst.pop_back(); - } - encodeStrData(type, rst, result); +inline void encodeStrArrayData(uint16_t type, const std::vector &datas, std::vector &result) +{ + std::string rst; + for (auto str : datas) { + rst += (str + ";"); + } + if (!rst.empty()) { + rst.pop_back(); + } + encodeStrData(type, rst, result); } -inline void encodeMapData(uint16_t type, const std::unordered_map &datas, std::vector &result) { - std::string rst; - for (auto &entry : datas) { - rst += entry.first + ":" + to_string(entry.second) + ";"; - } - if (!rst.empty()) { - rst.pop_back(); - } - encodeStrData(type, rst, result); +inline void encodeMapData(uint16_t type, const std::unordered_map &datas, + std::vector &result) +{ + std::string rst; + for (auto &entry : datas) { + rst += entry.first + ":" + to_string(entry.second) + ";"; + } + if (!rst.empty()) { + rst.pop_back(); + } + encodeStrData(type, rst, result); } template -void encode2DIntegerMatrixDatas(uint16_t type, std::vector> &datas, std::vector &result) { - std::string rst; - for (auto tensor : datas) { - std::stringstream ss; - copy(tensor.begin(), tensor.end(), std::ostream_iterator(ss, ",")); - std::string str = ss.str(); - if (!str.empty()) { - str.pop_back(); +void encode2DIntegerMatrixDatas(uint16_t type, std::vector> &datas, std::vector &result) +{ + std::string rst; + for (auto tensor : datas) { + std::stringstream ss; + copy(tensor.begin(), tensor.end(), std::ostream_iterator(ss, ",")); + std::string str = ss.str(); + if (!str.empty()) { + str.pop_back(); + } + rst += (str + ";"); + } + if (!rst.empty()) { + rst.pop_back(); } - rst += (str + ";"); - } - if (!rst.empty()) { - rst.pop_back(); - } - encodeStrData(type, rst, result); + encodeStrData(type, rst, result); } class WeakTensor { public: - explicit WeakTensor(const at::Tensor& t) : weak_self_(t.getIntrusivePtr()) {} + explicit WeakTensor(const at::Tensor &t) : weak_self_(t.getIntrusivePtr()) {} auto get() const { - return (c10::TensorImpl*)(weak_self_._unsafe_get_target()); + return (c10::TensorImpl *)(weak_self_._unsafe_get_target()); } private: @@ -103,14 +110,14 @@ struct TensorMetadata { TensorMetadata() = default; explicit TensorMetadata(const at::Tensor &t); - c10::TensorImpl* impl_; - const void* ptr_; + c10::TensorImpl *impl_; + const void *ptr_; std::string dtype_; - uint64_t dtype_size_{0}; + uint64_t dtype_size_{ 0 }; std::vector sizes_; std::vector strides_; - int device_type_{0}; - int device_index_{-1}; + int device_type_{ 0 }; + int device_index_{ -1 }; }; struct ModuleParam { @@ -172,7 +179,7 @@ inline void encodeTensor(const TensorMetadata &t, std::ostringstream &oss) inline void encodeTensors(uint16_t type, std::vector tensors, std::vector &result) { 
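    // Serializes each tensor through encodeTensor() into one string, using ')'
    // as the per-tensor terminator, then emits the result as a single record via
    // encodeStrData(). All records written by the helpers above share one layout:
    // a 2-byte little-endian type tag, a 4-byte little-endian payload length, and
    // the raw payload bytes; list-like payloads (string arrays, key:value maps,
    // integer matrices) are joined with ';' separators before being written.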
std::ostringstream oss; - for (auto& tensor: tensors) { + for (auto &tensor : tensors) { encodeTensor(tensor, oss); oss << ")"; } @@ -180,11 +187,12 @@ inline void encodeTensors(uint16_t type, std::vector tensors, st encodeStrData(type, str, result); } -inline void encodeTensorLists(uint16_t type, std::vector> tensorlists, std::vector &result) +inline void encodeTensorLists(uint16_t type, std::vector> tensorlists, + std::vector &result) { std::ostringstream oss; - for (auto& tensorlist: tensorlists) { - for (auto& tensor: tensorlist) { + for (auto &tensorlist : tensorlists) { + for (auto &tensor : tensorlist) { encodeTensor(tensor, oss); oss << ")"; } @@ -197,7 +205,7 @@ inline void encodeTensorLists(uint16_t type, std::vector params, std::vector &result) { std::ostringstream oss; - for (auto& param: params) { + for (auto ¶m : params) { appendWithDelimiter(oss, param.name_, ')'); encodeTensor(param.metadata_, oss); oss << ")"; @@ -213,7 +221,7 @@ inline void encodeModuleParams(uint16_t type, std::vector params, s inline void encodeOptimizerParams(uint16_t type, std::vector params, std::vector &result) { std::ostringstream oss; - for (auto& param: params) { + for (auto ¶m : params) { encodeTensor(param.metadata_, oss); oss << ")"; if (param.grad_.has_value()) { @@ -232,10 +240,9 @@ inline void encodeOptimizerParams(uint16_t type, std::vector par } struct BaseReportData { - int32_t device_id{0}; + int32_t device_id{ 0 }; std::string tag; - BaseReportData(int32_t device_id, std::string tag) - : device_id(device_id), tag(std::move(tag)) {} + BaseReportData(int32_t device_id, std::string tag) : device_id(device_id), tag(std::move(tag)) {} virtual ~BaseReportData() = default; virtual std::vector encode() = 0; }; @@ -262,17 +269,17 @@ enum class OpRangeDataType { RESERVED = 30, }; -struct OpRangeData : BaseReportData{ - int64_t start_ns{0}; - int64_t end_ns{0}; - int64_t sequence_number{0}; - uint64_t process_id{0}; - uint64_t start_thread_id{0}; - uint64_t end_thread_id{0}; - uint64_t forward_thread_id{0}; - bool is_async{false}; +struct OpRangeData : BaseReportData { + int64_t start_ns{ 0 }; + int64_t end_ns{ 0 }; + int64_t sequence_number{ 0 }; + uint64_t process_id{ 0 }; + uint64_t start_thread_id{ 0 }; + uint64_t end_thread_id{ 0 }; + uint64_t forward_thread_id{ 0 }; + bool is_async{ false }; std::string name; - uint8_t scope{0}; + uint8_t scope{ 0 }; std::vector input_dtypes; std::vector> input_shapes; std::vector input_tensors; @@ -281,15 +288,8 @@ struct OpRangeData : BaseReportData{ std::vector stack; std::vector module_hierarchy; std::unordered_map extra_args; - OpRangeData(int64_t start_ns, - int64_t end_ns, - int64_t sequence_number, - uint64_t process_id, - uint64_t start_thread_id, - uint64_t end_thread_id, - uint64_t forward_thread_id, - bool is_async, - std::string name) + OpRangeData(int64_t start_ns, int64_t end_ns, int64_t sequence_number, uint64_t process_id, + uint64_t start_thread_id, uint64_t end_thread_id, uint64_t forward_thread_id, bool is_async, std::string name) : BaseReportData(0, "torch.op_range"), start_ns(start_ns), end_ns(end_ns), @@ -299,7 +299,8 @@ struct OpRangeData : BaseReportData{ end_thread_id(end_thread_id), forward_thread_id(forward_thread_id), is_async(is_async), - name(std::move(name)) {} + name(std::move(name)) + {} std::vector encode(); }; @@ -308,18 +309,13 @@ enum class OpMarkDataType { }; struct OpMarkData : BaseReportData { - int64_t time_ns{0}; - uint64_t category{0}; - uint64_t correlation_id{0}; - uint64_t thread_id{0}; - uint64_t 
process_id{0}; + int64_t time_ns{ 0 }; + uint64_t category{ 0 }; + uint64_t correlation_id{ 0 }; + uint64_t thread_id{ 0 }; + uint64_t process_id{ 0 }; std::string name; - OpMarkData( - int64_t time_ns, - uint64_t category, - uint64_t correlation_id, - uint64_t thread_id, - uint64_t process_id, + OpMarkData(int64_t time_ns, uint64_t category, uint64_t correlation_id, uint64_t thread_id, uint64_t process_id, const std::string &name) : BaseReportData(0, "torch.op_mark"), time_ns(time_ns), @@ -327,38 +323,28 @@ struct OpMarkData : BaseReportData { correlation_id(correlation_id), thread_id(thread_id), process_id(process_id), - name(name) {} + name(name) + {} std::vector encode(); }; struct MemoryData : BaseReportData { - int64_t ptr{0}; - int64_t time_ns{0}; - int64_t alloc_size{0}; - int64_t total_allocated{0}; - int64_t total_reserved{0}; - int64_t total_active{0}; - int64_t stream_ptr{0}; - int8_t device_type{0}; - int8_t device_index{0}; - uint8_t data_type{0}; - uint8_t allocator_type{0}; - uint64_t thread_id{0}; - uint64_t process_id{0}; - MemoryData( - int64_t ptr, - int64_t time_ns, - int64_t alloc_size, - int64_t total_allocated, - int64_t total_reserved, - int64_t total_active, - int64_t stream_ptr, - int8_t device_type, - int8_t device_index, - uint8_t data_type, - uint8_t allocator_type, - uint64_t thread_id, - uint64_t process_id) + int64_t ptr{ 0 }; + int64_t time_ns{ 0 }; + int64_t alloc_size{ 0 }; + int64_t total_allocated{ 0 }; + int64_t total_reserved{ 0 }; + int64_t total_active{ 0 }; + int64_t stream_ptr{ 0 }; + int8_t device_type{ 0 }; + int8_t device_index{ 0 }; + uint8_t data_type{ 0 }; + uint8_t allocator_type{ 0 }; + uint64_t thread_id{ 0 }; + uint64_t process_id{ 0 }; + MemoryData(int64_t ptr, int64_t time_ns, int64_t alloc_size, int64_t total_allocated, int64_t total_reserved, + int64_t total_active, int64_t stream_ptr, int8_t device_type, int8_t device_index, uint8_t data_type, + uint8_t allocator_type, uint64_t thread_id, uint64_t process_id) : BaseReportData(0, "torch.memory_usage"), ptr(ptr), time_ns(time_ns), @@ -372,48 +358,42 @@ struct MemoryData : BaseReportData { data_type(data_type), allocator_type(allocator_type), thread_id(thread_id), - process_id(process_id) {} + process_id(process_id) + {} std::vector encode(); }; struct PythonTracerFuncData : BaseReportData { - uint64_t process_id{0}; + uint64_t process_id{ 0 }; torch_npu::profiler::AppendOnlyList events; - PythonTracerFuncData( - uint64_t process_id, - torch_npu::profiler::AppendOnlyList&& events) - : BaseReportData(0, "torch.python_tracer_func"), - process_id(process_id), - events(std::move(events)) {} + PythonTracerFuncData(uint64_t process_id, + torch_npu::profiler::AppendOnlyList &&events) + : BaseReportData(0, "torch.python_tracer_func"), process_id(process_id), events(std::move(events)) + {} std::vector encode(); }; -enum class PythonTracerHashDataType { - VALUE = 1 -}; +enum class PythonTracerHashDataType { VALUE = 1 }; struct PythonTracerHashData : BaseReportData { std::vector> hash_data; PythonTracerHashData(std::vector> hash_data) - : BaseReportData(0, "torch.python_tracer_hash"), - hash_data(std::move(hash_data)) {} + : BaseReportData(0, "torch.python_tracer_hash"), hash_data(std::move(hash_data)) + {} std::vector encode(); }; -enum class ParamTensorDataType { - MODULE_PARAM = 1, - OPTIMIZER_PARAM = 2 -}; +enum class ParamTensorDataType { MODULE_PARAM = 1, OPTIMIZER_PARAM = 2 }; struct ParamTensorData : BaseReportData { std::vector>> module_param_data; std::vector>> optimizer_param_data; - 
ParamTensorData( - std::vector>> module_param_data, + ParamTensorData(std::vector>> module_param_data, std::vector>> optimizer_param_data) : BaseReportData(0, "torch.param_tensor_info"), module_param_data(std::move(module_param_data)), - optimizer_param_data(std::move(optimizer_param_data)) {} + optimizer_param_data(std::move(optimizer_param_data)) + {} std::vector encode(); }; } // profiler diff --git a/torch_npu/csrc/toolkit/profiler/src/data_dumper.cpp b/torch_npu/csrc/toolkit/profiler/src/data_dumper.cpp index def8b02653..0b6882c0bf 100644 --- a/torch_npu/csrc/toolkit/profiler/src/data_dumper.cpp +++ b/torch_npu/csrc/toolkit/profiler/src/data_dumper.cpp @@ -13,19 +13,18 @@ namespace torch_npu { namespace toolkit { namespace profiler { -DataDumper::DataDumper() - : path_(""), - start_(false), - init_(false) {} +DataDumper::DataDumper() : path_(""), start_(false), init_(false) {} -DataDumper::~DataDumper() { - UnInit(); +DataDumper::~DataDumper() +{ + UnInit(); } -void DataDumper::Init(const std::string &path, size_t capacity = kDefaultRingBuffer) { - path_ = path; - data_chunk_buf_.Init(capacity); - init_.store(true); +void DataDumper::Init(const std::string &path, size_t capacity = kDefaultRingBuffer) +{ + path_ = path; + data_chunk_buf_.Init(capacity); + init_.store(true); } void DataDumper::UnInit() @@ -44,22 +43,25 @@ void DataDumper::UnInit() } } -void DataDumper::Start() { +void DataDumper::Start() +{ if (!init_.load() || Thread::Start() != 0) { return; } start_.store(true); } -void DataDumper::Stop() { - if (start_.load() == true) { - start_.store(false); - Thread::Stop(); - } - Flush(); +void DataDumper::Stop() +{ + if (start_.load() == true) { + start_.store(false); + Thread::Stop(); + } + Flush(); } -void DataDumper::GatherAndDumpData() { +void DataDumper::GatherAndDumpData() +{ std::map> dataMap; uint64_t batchSize = 0; while (batchSize < kBatchMaxLen) { @@ -72,7 +74,7 @@ void DataDumper::GatherAndDumpData() { const std::string &key = data->tag; auto iter = dataMap.find(key); if (iter == dataMap.end()) { - dataMap.insert({key, encodeData}); + dataMap.insert({ key, encodeData }); } else { iter->second.insert(iter->second.end(), encodeData.cbegin(), encodeData.cend()); } @@ -86,7 +88,8 @@ void DataDumper::GatherAndDumpData() { } } -void DataDumper::Run() { +void DataDumper::Run() +{ for (;;) { if (!start_.load()) { break; @@ -99,10 +102,11 @@ void DataDumper::Run() { } } -void DataDumper::Flush() { - while (data_chunk_buf_.Size() != 0) { - GatherAndDumpData(); - } +void DataDumper::Flush() +{ + while (data_chunk_buf_.Size() != 0) { + GatherAndDumpData(); + } } void DataDumper::Report(std::unique_ptr data) @@ -129,19 +133,15 @@ void DataDumper::Dump(const std::map> &dataMap ASCEND_LOGE("DataDumper open file failed: %s", dump_file.c_str()); continue; } - fd_map_.insert({dump_file, fd}); + fd_map_.insert({ dump_file, fd }); } else { fd = iter->second; } - fwrite(reinterpret_cast(data.second.data()), sizeof(char), data.second.size(), fd); + fwrite(reinterpret_cast(data.second.data()), sizeof(char), data.second.size(), fd); } } -TraceDataDumper::TraceDataDumper() - : path_(""), - start_(false), - init_(false), - trace_hash_data_(nullptr) {} +TraceDataDumper::TraceDataDumper() : path_(""), start_(false), init_(false), trace_hash_data_(nullptr) {} TraceDataDumper::~TraceDataDumper() { @@ -277,7 +277,7 @@ void TraceDataDumper::FlushParamData() param_data_ = nullptr; } -void TraceDataDumper::Dump(const std::string& file_name, const std::vector& encode_data) +void 
TraceDataDumper::Dump(const std::string &file_name, const std::vector &encode_data) { FILE *fd = nullptr; const std::string dump_file = path_ + "/" + file_name; @@ -292,11 +292,11 @@ void TraceDataDumper::Dump(const std::string& file_name, const std::vectorsecond; } - fwrite(reinterpret_cast(encode_data.data()), sizeof(char), encode_data.size(), fd); + fwrite(reinterpret_cast(encode_data.data()), sizeof(char), encode_data.size(), fd); } } // profiler } // toolkit -- Gitee From 051c3ad2539a9d0d2c919307f8e6ba7f5bf457ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Tue, 25 Mar 2025 09:06:26 +0000 Subject: [PATCH 229/358] =?UTF-8?q?!19467=20fix=20lccl=20func=20regist=20a?= =?UTF-8?q?nd=20get=20error=20Merge=20pull=20request=20!19467=20from=20?= =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/interface/LcclInterface.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/interface/LcclInterface.cpp b/torch_npu/csrc/core/npu/interface/LcclInterface.cpp index dccfe342ce..326bd5b01d 100644 --- a/torch_npu/csrc/core/npu/interface/LcclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/LcclInterface.cpp @@ -30,7 +30,7 @@ int LcclCommInitRankLocal(int rankSize, int rank, LcclComm *comms) typedef int(*lcalCommInitRankLocal)(int, int, LcclComm *); static lcalCommInitRankLocal func = nullptr; if (func == nullptr) { - func = (lcalCommInitRankLocal)GET_FUNC(LcclCommInitRankLocal); + func = (lcalCommInitRankLocal)GET_FUNC(LcalCommInitRankLocal); if (func == nullptr) { TORCH_CHECK(func, "Failed to find function ", "lcalCommInitRankLocal", PTA_ERROR(ErrCode::NOT_FOUND)); return -1; @@ -44,7 +44,7 @@ int LcclCommInit(int rank, int rankSize, LcclComm *comms) typedef int(*lcalCommInit)(int, int, LcclComm *); static lcalCommInit func = nullptr; if (func == nullptr) { - func = (lcalCommInit)GET_FUNC(LcclCommInit); + func = (lcalCommInit)GET_FUNC(LcalCommInit); if (func == nullptr) { TORCH_CHECK(func, "Failed to find function ", "lcalCommInit", PTA_ERROR(ErrCode::NOT_FOUND)); return -1; -- Gitee From 4cf9da72e5d3f2ba2a668c61331917d623f4e944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Tue, 25 Mar 2025 11:35:28 +0000 Subject: [PATCH 230/358] =?UTF-8?q?!19510=20Check=20ACL=5FOP=5FINIT=5FMODE?= =?UTF-8?q?=20value=20Merge=20pull=20request=20!19510=20from=20=E5=A7=9C?= =?UTF-8?q?=E6=80=A1=E6=96=87/v2.6.0=5Flz?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 22abad0d3d..9dd23d0d17 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -483,6 +483,10 @@ uint32_t OptionsManager::GetAclOpInitMode() char* buf_val = std::getenv("ACL_OP_INIT_MODE"); // Default 0 int64_t acl_op_init_mode = (buf_val != nullptr) ? 
strtol(buf_val, nullptr, 10) : 0; + std::unordered_map aclOpInitMode = getAclOpInitMode(); + if (aclOpInitMode.find(acl_op_init_mode) == aclOpInitMode.end()) { + TORCH_CHECK(false, "ACL_OP_INIT_MODE should be 0, 1 or 2", PTA_ERROR(ErrCode::VALUE)); + } return static_cast(acl_op_init_mode); }(); return acl_op_init_mode; -- Gitee From 4a38dfac298fe9e0f806a2fb4da8fe6cfba0c26a Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 25 Mar 2025 11:41:19 +0000 Subject: [PATCH 231/358] !19514 Update op_plugin commit id Merge pull request !19514 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 1e0f7956aa..2a0dc12795 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 1e0f7956aa64cd347fcd483d2cfb38033c66325c +Subproject commit 2a0dc1279515725e1d49a3e14df88e8748d4f17b -- Gitee From 2c941bb4b5614691e870eed5f3b1fecbdd4e6fb1 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Tue, 25 Mar 2025 12:26:22 +0000 Subject: [PATCH 232/358] !19491 acl changename Merge pull request !19491 from jiangpengfei/v2.6.0 --- third_party/acl/inc/acl/acl_mdl.h | 64 ++++++-------- torch_npu/csrc/core/npu/NPUGraph.cpp | 36 ++++---- torch_npu/csrc/core/npu/NPUGraph.h | 4 +- torch_npu/csrc/core/npu/NPUGraphsUtils.h | 32 +++---- .../csrc/core/npu/interface/AclInterface.cpp | 88 +++++++++---------- .../csrc/core/npu/interface/AclInterface.h | 12 +-- torch_npu/csrc/npu/Graph.cpp | 8 +- torch_npu/npu/graphs.py | 8 +- 8 files changed, 120 insertions(+), 132 deletions(-) diff --git a/third_party/acl/inc/acl/acl_mdl.h b/third_party/acl/inc/acl/acl_mdl.h index 78dcabb8f1..45a36898ef 100755 --- a/third_party/acl/inc/acl/acl_mdl.h +++ b/third_party/acl/inc/acl/acl_mdl.h @@ -50,6 +50,7 @@ typedef struct aclmdlAIPP aclmdlAIPP; typedef struct aclAippExtendInfo aclAippExtendInfo; typedef struct aclmdlConfigHandle aclmdlConfigHandle; typedef struct aclmdlExecConfigHandle aclmdlExecConfigHandle; +typedef void *aclmdlRI; typedef enum { ACL_YUV420SP_U8 = 1, @@ -215,16 +216,16 @@ typedef struct aclmdlExeOMDesc { } aclmdlExeOMDesc; typedef enum { - ACL_MODEL_CAPTURE_MODE_GLOBAL = 0, - ACL_MODEL_CAPTURE_MODE_THREAD_LOCAL, - ACL_MODEL_CAPTURE_MODE_RELAXED, -} aclmdlCaptureMode; + ACL_MODEL_RI_CAPTURE_MODE_GLOBAL = 0, + ACL_MODEL_RI_CAPTURE_MODE_THREAD_LOCAL, + ACL_MODEL_RI_CAPTURE_MODE_RELAXED, +} aclmdlRICaptureMode; typedef enum { - ACL_MODEL_CAPTURE_STATUS_NONE = 0, - ACL_MODEL_CAPTURE_STATUS_ACTIVE, - ACL_MODEL_CAPTURE_STATUS_INVALIDATED, -} aclmdlCaptureStatus; + ACL_MODEL_RI_CAPTURE_STATUS_NONE = 0, + ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE, + ACL_MODEL_RI_CAPTURE_STATUS_INVALIDATED, +} aclmdlRICaptureStatus; /** * @ingroup AscendCL @@ -663,45 +664,34 @@ ACL_FUNC_VISIBILITY aclError aclmdlExecuteV2(uint32_t modelId, const aclmdlDatas * @ingroup AscendCL * @brief Execute model asynchronous inference until the inference result is returned * - * @param modelId [IN] ID of the model to perform inference - * @param input [IN] Input data for model inference - * @param output [OUT] Output data for model inference - * @param stream [IN] stream - * @param handle [IN] config of model execute + * @param modelRI [IN] runtime instance of the model to perform inference + * @param stream [IN] stream * * @retval ACL_SUCCESS The function is successfully executed. 
* @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlExecuteAsyncV2(uint32_t modelId, const aclmdlDataset *input, aclmdlDataset *output, - aclrtStream stream, const aclmdlExecConfigHandle *handle); +ACL_FUNC_VISIBILITY aclError aclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream); + /** * @ingroup AscendCL - * @brief Execute model asynchronous inference until the inference result is returned - * - * @param modelId [IN] ID of the model to perform inference - * @param input [IN] Input data for model inference - * @param output [OUT] Output data for model inference - * @param stream [IN] stream + * @brief unload model with model id * - * @retval ACL_SUCCESS The function is successfully executed. + * @param modelId [IN] model id to be unloaded + * @retval ACL_ERROR_NONE The function is successfully executed. * @retval OtherValues Failure - * - * @see aclmdlLoadFromFile | aclmdlLoadFromMem | aclmdlLoadFromFileWithMem | - * aclmdlLoadFromMemWithMem */ -ACL_FUNC_VISIBILITY aclError aclmdlExecuteAsync(uint32_t modelId, const aclmdlDataset *input, - aclmdlDataset *output, aclrtStream stream); +ACL_FUNC_VISIBILITY aclError aclmdlUnload(uint32_t modelId); /** * @ingroup AscendCL - * @brief unload model with model id + * @brief destroy the model * - * @param modelId [IN] model id to be unloaded + * @param modelRI [IN] runtime instance of the model to be destroyed * * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlUnload(uint32_t modelId); +ACL_FUNC_VISIBILITY aclError aclmdlRIDestroy(aclmdlRI modelRI); /** * @ingroup AscendCL @@ -1523,37 +1513,37 @@ ACL_FUNC_VISIBILITY const char *aclmdlGetTensorRealName(const aclmdlDesc *modelD * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlCaptureBegin(aclrtStream stream, aclmdlCaptureMode mode); +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureBegin(aclrtStream stream, aclmdlRICaptureMode mode); /** * @ingroup AscendCL * @brief obtain the capture information of a stream * @param stream [IN] stream to be queried * @param status [OUT] return the stream status - * @param modelId [OUT] return the model id + * @param modelRI [OUT] return the model runtime instance * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlCaptureGetInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus *status, aclmdlRI *modelRI); /** * @ingroup AscendCL * @brief end the stream capture and obtain the corresponding model * @param stream [IN] stream to be ended - * @param modelId [OUT] return the model id + * @param modelRI [OUT] return the model runtime instance * @retval ACL_SUCCESS The function is successfully executed. * @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlCaptureEnd(aclrtStream stream, uint32_t *modelId); +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureEnd(aclrtStream stream, aclmdlRI *modelRI); /** * @ingroup AscendCL * @brief print model information - * @param modelId [IN] model information needs to be printed + * @param modelRI [IN] model runtime instance * @retval ACL_SUCCESS The function is successfully executed. 
* @retval OtherValues Failure */ -ACL_FUNC_VISIBILITY aclError aclmdlDebugPrint(uint32_t modelId); +ACL_FUNC_VISIBILITY aclError aclmdlRIDebugPrint(aclmdlRI modelRI); #ifdef __cplusplus } diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index be5f367cdd..60351139d9 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -20,7 +20,7 @@ constexpr int kSynchronizeBusyWaitMillis = 10; MempoolId_t graph_pool_handle() { // Sets just the second value, to distinguish it from MempoolId_ts created from - // aclmdlCaptureGetInfo id_s in capture_begin. + // aclmdlRICaptureGetInfo id_s in capture_begin. auto new_pool = c10_npu::MemPool(); return new_pool.id(); } @@ -73,7 +73,7 @@ NPUGraph::NPUGraph() : capture_stream_(c10_npu::getCurrentNPUStream()) { } -void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) +void NPUGraph::capture_begin(MempoolId_t pool, aclmdlRICaptureMode capture_mode) { TORCH_CHECK(!has_graph_exec_, "This NPUGraph instance already owns a captured graph. " @@ -108,10 +108,10 @@ void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) // autograd thread's free() call triggering an invalid cudaEventRecord in the caching allocator // due to the capture status being updated _after_ a capture had already started. c10_npu::NPUCachingAllocator::beginAllocateToPool(capture_dev_, mempool_id_, [this](aclrtStream stream) { - aclmdlCaptureStatus status; - uint32_t model_id; - NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureGetInfo(stream, &status, &model_id)); - return status == aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE && model_id == model_id_; + aclmdlRICaptureStatus status; + aclmdlRI model_ri; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureGetInfo(stream, &status, &model_ri)); + return status == aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE && model_ri == model_ri_; }); // At this point, any NCCL watchdogs should be aware that we are in capture mode @@ -125,13 +125,13 @@ void NPUGraph::capture_begin(MempoolId_t pool, aclmdlCaptureMode capture_mode) // cudaStreamCaptureModeGlobal is the most conservative option to // prevent potentially unsafe CUDA API calls during capture. 
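    // Begin capture on the dedicated capture stream with the requested mode,
    // flag that a stream capture is in flight, then query AclmdlRICaptureGetInfo
    // to record the model runtime instance (model_ri_) that capture_end() and
    // replay() will operate on later.
    //
    // Typical call sequence for NPUGraph (a sketch inferred from this class):
    // construct the graph, call capture_begin() on a side stream, launch the
    // work to be captured, call capture_end(), then call replay() any number of
    // times; reset() releases the captured model runtime instance and its pool.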
- NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureBegin(capture_stream_, capture_mode)); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureBegin(capture_stream_, capture_mode)); c10_npu::is_stream_capturing.store(true); - aclmdlCaptureStatus status; - NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureGetInfo(stream, &status, &model_id_)); - TORCH_INTERNAL_ASSERT(status == aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE); + aclmdlRICaptureStatus status; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureGetInfo(stream, &status, &model_ri_)); + TORCH_INTERNAL_ASSERT(status == aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE); } void NPUGraph::capture_end() @@ -141,14 +141,14 @@ void NPUGraph::capture_end() TORCH_CHECK(stream == capture_stream_, "Capture must end on the same stream it began on."); - uint32_t model_id; - NPU_CHECK_ERROR(c10_npu::acl::AclmdlCaptureEnd(capture_stream_, &model_id)); + aclmdlRI model_ri; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureEnd(capture_stream_, &model_ri)); c10_npu::is_stream_capturing.store(false); c10_npu::NPUCachingAllocator::endAllocateToPool(capture_dev_, mempool_id_); - TORCH_CHECK(model_id == model_id_, "Invalid end capture model id: ", model_id); + TORCH_CHECK(model_ri == model_ri_, "Invalid end capture model id: ", model_ri); // In typical graph usage some tensors (e.g. the tensors used for graph IO) are not freed // between replays. @@ -171,8 +171,8 @@ void NPUGraph::replay() c10::OptionalDeviceGuard device_guard{capture_stream_.device()}; - // model_id_ may be replayed in any stream. - NPU_CHECK_ERROR(c10_npu::acl::AclmdlExecuteAsync(model_id_, c10_npu::getCurrentNPUStream())); + // model_ri_ may be replayed in any stream. + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRIExecuteAsync(model_ri_, c10_npu::getCurrentNPUStream())); } void NPUGraph::enable_debug_mode() @@ -184,8 +184,8 @@ void NPUGraph::debug_dump() { if (_npu_graphs_debug) { if (has_graph_exec_) { - TORCH_WARN("DEBUG: calling NPUGraph::debug_dump() for model id ", model_id_); - NPU_CHECK_ERROR(c10_npu::acl::AclmdlDebugPrint(model_id_)); + TORCH_WARN("DEBUG: calling NPUGraph::debug_dump() for model id ", model_ri_); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRIDebugPrint(model_ri_)); } } else { TORCH_WARN("NPU Graphs debug not enabled, set with NPUGraph::enable_debug_mode()."); @@ -216,7 +216,7 @@ void NPUGraph::reset() if (has_graph_exec_) { // notifyCaptureDestroy may throw. How should we handle this? 
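        // Release the graph's private memory pool back to the caching allocator,
        // then destroy the captured model runtime instance with aclmdlRIDestroy,
        // which replaces the old per-model-id aclmdlUnload call on this path.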
c10_npu::NPUCachingAllocator::releasePool(capture_dev_, mempool_id_); - NPU_CHECK_ERROR(c10_npu::acl::AclmdlUnload(model_id_)); + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRIDestroy(model_ri_)); has_graph_exec_ = false; } } diff --git a/torch_npu/csrc/core/npu/NPUGraph.h b/torch_npu/csrc/core/npu/NPUGraph.h index b2833744c1..ccb8c29067 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.h +++ b/torch_npu/csrc/core/npu/NPUGraph.h @@ -24,7 +24,7 @@ struct TORCH_NPU_API NPUGraph { void capture_begin( MempoolId_t pool = {0, 0}, - aclmdlCaptureMode capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_GLOBAL); + aclmdlRICaptureMode capture_mode = aclmdlRICaptureMode::ACL_MODEL_RI_CAPTURE_MODE_GLOBAL); void capture_end(); void replay(); void reset(); @@ -33,7 +33,7 @@ struct TORCH_NPU_API NPUGraph { void debug_dump(); protected: - uint32_t model_id_ = -1; + aclmdlRI model_ri_ = nullptr; static std::atomic pending_event_queries; diff --git a/torch_npu/csrc/core/npu/NPUGraphsUtils.h b/torch_npu/csrc/core/npu/NPUGraphsUtils.h index 395f27a049..254dc42599 100644 --- a/torch_npu/csrc/core/npu/NPUGraphsUtils.h +++ b/torch_npu/csrc/core/npu/NPUGraphsUtils.h @@ -18,33 +18,33 @@ using CaptureId_t = unsigned long long; // second is set if the instance is created by at::cuda::graph_pool_handle. using MempoolId_t = std::pair; -// RAII guard for "aclmdlCaptureMode", a thread-local value +// RAII guard for "aclmdlRICaptureMode", a thread-local value // that controls the error-checking strictness of a capture. struct C10_NPU_API NPUStreamCaptureModeGuard{ - NPUStreamCaptureModeGuard(aclmdlCaptureMode desired) + NPUStreamCaptureModeGuard(aclmdlRICaptureMode desired) : strictness_(desired) {} ~NPUStreamCaptureModeGuard() {} private: - aclmdlCaptureMode strictness_; + aclmdlRICaptureMode strictness_; }; -// Protects against enum aclmdlCaptureStatus implementation changes. +// Protects against enum aclmdlRICaptureStatus implementation changes. // Some compilers seem not to like static_assert without the messages. 
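// The asserts below pin the numeric values of the renamed driver enum
// (NONE = 0, ACTIVE = 1, INVALIDATED = 2) so that the CaptureStatus enum
// defined further down can be produced by a direct integer cast from the
// status returned by AclmdlRICaptureGetInfo.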
static_assert( - int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_NONE) == 0, - "unexpected int(ACL_MODEL_CAPTURE_STATUS_NONE) value"); + int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_NONE) == 0, + "unexpected int(ACL_MODEL_RI_CAPTURE_STATUS_NONE) value"); static_assert( - int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE) == 1, - "unexpected int(ACL_MODEL_CAPTURE_STATUS_ACTIVE) value"); + int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE) == 1, + "unexpected int(ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE) value"); static_assert( - int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_INVALIDATED) == 2, - "unexpected int(ACL_MODEL_CAPTURE_STATUS_INVALIDATED) value"); + int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_INVALIDATED) == 2, + "unexpected int(ACL_MODEL_RI_CAPTURE_STATUS_INVALIDATED) value"); enum class CaptureStatus : int { - None = int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_NONE), - Active = int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_ACTIVE), - Invalidated = int(aclmdlCaptureStatus::ACL_MODEL_CAPTURE_STATUS_INVALIDATED) + None = int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_NONE), + Active = int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE), + Invalidated = int(aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_INVALIDATED) }; inline std::ostream &operator<<(std::ostream &os, CaptureStatus status) @@ -73,10 +73,10 @@ inline CaptureStatus currentStreamCaptureStatusMayInitCtx() return CaptureStatus::None; } - aclmdlCaptureStatus is_capturing{ACL_MODEL_CAPTURE_STATUS_NONE}; - uint32_t modelId; + aclmdlRICaptureStatus is_capturing{ACL_MODEL_RI_CAPTURE_STATUS_NONE}; + aclmdlRI model_ri; NPU_CHECK_ERROR( - c10_npu::acl::AclmdlCaptureGetInfo(c10_npu::getCurrentNPUStream(), &is_capturing, &modelId)); + c10_npu::acl::AclmdlRICaptureGetInfo(c10_npu::getCurrentNPUStream(), &is_capturing, &model_ri)); return CaptureStatus(is_capturing); } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 6d4c0f6b9d..78737ed91a 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -69,12 +69,12 @@ LOAD_FUNCTION(aclrtPeekAtLastError) LOAD_FUNCTION(aclrtSynchronizeDevice) LOAD_FUNCTION(aclrtSynchronizeDeviceWithTimeout) LOAD_FUNCTION(aclrtEventGetTimestamp) -LOAD_FUNCTION(aclmdlCaptureBegin) -LOAD_FUNCTION(aclmdlCaptureGetInfo) -LOAD_FUNCTION(aclmdlCaptureEnd) -LOAD_FUNCTION(aclmdlDebugPrint) -LOAD_FUNCTION(aclmdlExecuteAsync) -LOAD_FUNCTION(aclmdlUnload) +LOAD_FUNCTION(aclmdlRICaptureBegin) +LOAD_FUNCTION(aclmdlRICaptureGetInfo) +LOAD_FUNCTION(aclmdlRICaptureEnd) +LOAD_FUNCTION(aclmdlRIDebugPrint) +LOAD_FUNCTION(aclmdlRIExecuteAsync) +LOAD_FUNCTION(aclmdlRIDestroy) LOAD_FUNCTION(aclsysGetCANNVersion) aclprofStepInfoPtr init_stepinfo() { @@ -742,67 +742,65 @@ aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp) return func(event, timestamp); } -aclError AclmdlCaptureBegin(aclrtStream stream, aclmdlCaptureMode mode) +aclError AclmdlRICaptureBegin(aclrtStream stream, aclmdlRICaptureMode mode) { - typedef aclError (*AclmdlCaptureBegin)(aclrtStream, aclmdlCaptureMode); - static AclmdlCaptureBegin func = nullptr; + typedef aclError (*AclmdlRICaptureBegin)(aclrtStream, aclmdlRICaptureMode); + static AclmdlRICaptureBegin func = nullptr; if (func == nullptr) { - func = (AclmdlCaptureBegin) GET_FUNC(aclmdlCaptureBegin); + func = (AclmdlRICaptureBegin) GET_FUNC(aclmdlRICaptureBegin); } - 
TORCH_CHECK(func, "Failed to find function aclmdlCaptureBegin", PTA_ERROR(ErrCode::NOT_FOUND)); + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureBegin", PTA_ERROR(ErrCode::NOT_FOUND)); return func(stream, mode); } -aclError AclmdlCaptureGetInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId) +aclError AclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus *status, aclmdlRI *modelRI) { - typedef aclError (*AclmdlCaptureGetInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); - static AclmdlCaptureGetInfo func = nullptr; + typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); + static AclmdlRICaptureGetInfo func = nullptr; if (func == nullptr) { - func = (AclmdlCaptureGetInfo) GET_FUNC(aclmdlCaptureGetInfo); + func = (AclmdlRICaptureGetInfo) GET_FUNC(aclmdlRICaptureGetInfo); } - TORCH_CHECK(func, "Failed to find function aclmdlCaptureGetInfo", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(stream, status, modelId); + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureGetInfo", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, status, modelRI); } -aclError AclmdlCaptureEnd(aclrtStream stream, uint32_t *modelId) +aclError AclmdlRICaptureEnd(aclrtStream stream, aclmdlRI *modelRI) { - typedef aclError (*AclmdlCaptureEnd)(aclrtStream, uint32_t *); - static AclmdlCaptureEnd func = nullptr; + typedef aclError (*AclmdlRICaptureEnd)(aclrtStream, aclmdlRI *); + static AclmdlRICaptureEnd func = nullptr; if (func == nullptr) { - func = (AclmdlCaptureEnd) GET_FUNC(aclmdlCaptureEnd); + func = (AclmdlRICaptureEnd) GET_FUNC(aclmdlRICaptureEnd); } - TORCH_CHECK(func, "Failed to find function aclmdlCaptureEnd", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(stream, modelId); + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureEnd", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, modelRI); } -aclError AclmdlDebugPrint(uint32_t modelId) +aclError AclmdlRIDebugPrint(aclmdlRI modelRI) { - typedef aclError (*AclmdlDebugPrint)(uint32_t); - static AclmdlDebugPrint func = nullptr; + typedef aclError (*AclmdlRIDebugPrint)(aclmdlRI); + static AclmdlRIDebugPrint func = nullptr; if (func == nullptr) { - func = (AclmdlDebugPrint) GET_FUNC(aclmdlDebugPrint); + func = (AclmdlRIDebugPrint) GET_FUNC(aclmdlRIDebugPrint); } - TORCH_CHECK(func, "Failed to find function aclmdlDebugPrint", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(modelId); + TORCH_CHECK(func, "Failed to find function aclmdlRIDebugPrint", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(modelRI); } -aclError AclmdlExecuteAsync(uint32_t modelId, aclrtStream stream) +aclError AclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream) { - typedef aclError (*AclmdlExecuteAsync)(uint32_t, const aclmdlDataset *, aclmdlDataset *, aclrtStream); - static AclmdlExecuteAsync func = nullptr; + typedef aclError (*AclmdlRIExecuteAsync)(aclmdlRI, aclrtStream); + static AclmdlRIExecuteAsync func = nullptr; if (func == nullptr) { - func = (AclmdlExecuteAsync) GET_FUNC(aclmdlExecuteAsync); + func = (AclmdlRIExecuteAsync) GET_FUNC(aclmdlRIExecuteAsync); } - TORCH_CHECK(func, "Failed to find function aclmdlExecuteAsync", PTA_ERROR(ErrCode::NOT_FOUND)); + TORCH_CHECK(func, "Failed to find function aclmdlRIExecuteAsync", PTA_ERROR(ErrCode::NOT_FOUND)); - static aclmdlDataset *inputs = aclmdlCreateDataset(); - static aclmdlDataset *outputs = aclmdlCreateDataset(); - return func(modelId, inputs, outputs, stream); + return func(modelRI, stream); } aclError 
AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *version) @@ -819,16 +817,16 @@ aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *ve return func(name, version); } -aclError AclmdlUnload(uint32_t modelId) +aclError AclmdlRIDestroy(aclmdlRI modelRI) { - typedef aclError (*AclmdlUnload)(uint32_t); - static AclmdlUnload func = nullptr; + typedef aclError (*AclmdlRIDestroy)(aclmdlRI); + static AclmdlRIDestroy func = nullptr; if (func == nullptr) { - func = (AclmdlUnload) GET_FUNC(aclmdlUnload); + func = (AclmdlRIDestroy) GET_FUNC(aclmdlRIDestroy); } - TORCH_CHECK(func, "Failed to find function aclmdlUnload", PTA_ERROR(ErrCode::NOT_FOUND)); - return func(modelId); + TORCH_CHECK(func, "Failed to find function aclmdlRIDestroy", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(modelRI); } bool IsCaptureSupported() @@ -840,8 +838,8 @@ bool IsCaptureSupported() (GetSocVersion() >= SocVersion::Ascend910_9391); if (default_support_capture && !have_load_func) { have_load_func = true; - typedef aclError (*AclmdlCaptureGetInfo)(aclrtStream, aclmdlCaptureStatus *, uint32_t *); - static AclmdlCaptureGetInfo func = (AclmdlCaptureGetInfo) GET_FUNC(aclmdlCaptureGetInfo); + typedef aclError (*AclmdlRICaptureGetInfo)(aclrtStream, aclmdlRICaptureStatus *, aclmdlRI *); + static AclmdlRICaptureGetInfo func = (AclmdlRICaptureGetInfo) GET_FUNC(aclmdlRICaptureGetInfo); is_support = (func != nullptr); } diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index 245ad09584..ca5c03d30e 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -183,19 +183,19 @@ aclError AclrtSynchronizeDeviceWithTimeout(void); aclError AclrtEventGetTimestamp(aclrtEvent event, uint64_t *timestamp); -aclError AclmdlCaptureBegin(aclrtStream stream, aclmdlCaptureMode mode); +aclError AclmdlRICaptureBegin(aclrtStream stream, aclmdlRICaptureMode mode); -aclError AclmdlCaptureGetInfo(aclrtStream stream, aclmdlCaptureStatus *status, uint32_t *modelId); +aclError AclmdlRICaptureGetInfo(aclrtStream stream, aclmdlRICaptureStatus *status, aclmdlRI *modelRI); -aclError AclmdlCaptureEnd(aclrtStream stream, uint32_t *modelId); +aclError AclmdlRICaptureEnd(aclrtStream stream, aclmdlRI *modelRI); -aclError AclmdlDebugPrint(uint32_t modelId); +aclError AclmdlRIDebugPrint(aclmdlRI modelRI); -aclError AclmdlExecuteAsync(uint32_t modelId, aclrtStream stream); +aclError AclmdlRIExecuteAsync(aclmdlRI modelRI, aclrtStream stream); aclError AclsysGetCANNVersion(aclCANNPackageName name, aclCANNPackageVersion *version); -aclError AclmdlUnload(uint32_t modelId); +aclError AclmdlRIDestroy(aclmdlRI modelRI); bool IsCaptureSupported(); diff --git a/torch_npu/csrc/npu/Graph.cpp b/torch_npu/csrc/npu/Graph.cpp index 27e9174740..3a471cb2aa 100644 --- a/torch_npu/csrc/npu/Graph.cpp +++ b/torch_npu/csrc/npu/Graph.cpp @@ -25,15 +25,15 @@ void TORCH_NPU_API THNPGraph_init(PyObject* module) { [](c10_npu::NPUGraph& self, std::optional pool_opt, std::string capture_error_mode) { - aclmdlCaptureMode capture_mode; + aclmdlRICaptureMode capture_mode; c10_npu::MempoolId_t pool = pool_opt.has_value() ? 
pool_opt.value() : c10_npu::MempoolId_t{0, 0}; if (capture_error_mode == "global") { - capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_GLOBAL; + capture_mode = aclmdlRICaptureMode::ACL_MODEL_RI_CAPTURE_MODE_GLOBAL; } else if (capture_error_mode == "thread_local") { - capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_THREAD_LOCAL; + capture_mode = aclmdlRICaptureMode::ACL_MODEL_RI_CAPTURE_MODE_THREAD_LOCAL; } else if (capture_error_mode == "relaxed") { - capture_mode = aclmdlCaptureMode::ACL_MODEL_CAPTURE_MODE_RELAXED; + capture_mode = aclmdlRICaptureMode::ACL_MODEL_RI_CAPTURE_MODE_RELAXED; } else { TORCH_CHECK( false, diff --git a/torch_npu/npu/graphs.py b/torch_npu/npu/graphs.py index e78f60f365..c9608906fd 100644 --- a/torch_npu/npu/graphs.py +++ b/torch_npu/npu/graphs.py @@ -63,11 +63,11 @@ class NPUGraph(torch_npu._C._NPUGraph): pool (optional): Token (returned by :func:`~torch.npu.graph_pool_handle` or :meth:`other_Graph_instance.pool()`) that hints this graph may share memory with the indicated pool. See :ref:`Graph memory management`. - capture_error_mode (str, optional): specifies the aclmdlCaptureMode for the graph capture stream. + capture_error_mode (str, optional): specifies the aclmdlRICaptureMode for the graph capture stream. Can be "global", "thread_local" or "relaxed". During npu graph capture, some actions, such as npuMalloc, may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting - unless you're familiar with `aclmdlCaptureMode`_ + unless you're familiar with `aclmdlRICaptureMode`_ """ # noqa: B950 super().capture_begin(pool=pool, capture_error_mode=capture_error_mode) @@ -112,11 +112,11 @@ class graph: may share memory from the specified pool. See :ref:`Graph memory management`. stream (torch.npu.Stream, optional): If supplied, will be set as the current stream in the context. If not supplied, ``graph`` sets its own internal side stream as the current stream in the context. - capture_error_mode (str, optional): specifies the aclmdlCaptureMode for the graph capture stream. + capture_error_mode (str, optional): specifies the aclmdlRICaptureMode for the graph capture stream. Can be "global", "thread_local" or "relaxed". During npu graph capture, some actions, such as npuMalloc, may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting - unless you're familiar with `aclmdlCaptureMode`_ + unless you're familiar with `aclmdlRICaptureMode`_ .. 
note:: For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture -- Gitee From 8494f49f9324710090528cde116fa37428b02179 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Tue, 25 Mar 2025 13:00:03 +0000 Subject: [PATCH 233/358] !19480 Add ascend310p5 and ascend310p7 Merge pull request !19480 from yuhaiyan/v2.6.0-dev2 --- torch_npu/csrc/core/npu/NpuVariables.cpp | 2 ++ torch_npu/csrc/core/npu/NpuVariables.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/torch_npu/csrc/core/npu/NpuVariables.cpp b/torch_npu/csrc/core/npu/NpuVariables.cpp index 40efb7fc25..4e0fce02fb 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.cpp +++ b/torch_npu/csrc/core/npu/NpuVariables.cpp @@ -20,6 +20,8 @@ static std::map socVersionMap = { {"Ascend310P2", SocVersion::Ascend310P2}, {"Ascend310P3", SocVersion::Ascend310P3}, {"Ascend310P4", SocVersion::Ascend310P4}, + {"Ascend310P5", SocVersion::Ascend310P5}, + {"Ascend310P7", SocVersion::Ascend310P7}, {"Ascend910B1", SocVersion::Ascend910B1}, {"Ascend910B2", SocVersion::Ascend910B2}, {"Ascend910B2C", SocVersion::Ascend910B2C}, diff --git a/torch_npu/csrc/core/npu/NpuVariables.h b/torch_npu/csrc/core/npu/NpuVariables.h index f2575ee8cf..3119a64515 100644 --- a/torch_npu/csrc/core/npu/NpuVariables.h +++ b/torch_npu/csrc/core/npu/NpuVariables.h @@ -13,6 +13,8 @@ enum class SocVersion { Ascend310P2, Ascend310P3, Ascend310P4, + Ascend310P5, + Ascend310P7, Ascend910B1 = 220, Ascend910B2, Ascend910B2C, -- Gitee From 44a452988e1214d75aa388115c6066e4efeeae0c Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 25 Mar 2025 23:26:24 +0000 Subject: [PATCH 234/358] !19540 Update torchair commit id Merge pull request !19540 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 6fffca1d06..f6d3ebd3b5 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 6fffca1d0661b0f7946d4c2454957cbe989a4a05 +Subproject commit f6d3ebd3b50835fe706805c79bdb2fb3d5bd6ba6 -- Gitee From 17f89b9e3dbce6bb9bb48b621bcab03f95064e77 Mon Sep 17 00:00:00 2001 From: SCh-zx <1325467101@qq.com> Date: Wed, 26 Mar 2025 01:42:32 +0000 Subject: [PATCH 235/358] !19526 [cleancode]core Merge pull request !19526 from SCh-zx/v2.6.0 --- torch_npu/csrc/core/NPUBridge.cpp | 24 +- torch_npu/csrc/core/NPUSerialization.cpp | 78 +- torch_npu/csrc/core/NPUTensorImpl.cpp | 57 +- torch_npu/csrc/core/OverflowUtils.cpp | 41 +- torch_npu/csrc/core/OverflowUtils.h | 23 +- .../csrc/core/npu/NPUAffinityController.cpp | 3 +- .../csrc/core/npu/NPUCachingAllocator.cpp | 4829 ++++++++--------- torch_npu/csrc/core/npu/NPUCachingAllocator.h | 2 +- torch_npu/csrc/core/npu/NPUEvent.cpp | 4 +- torch_npu/csrc/core/npu/NPUGuard.h | 526 +- torch_npu/csrc/core/npu/NPUQueue.cpp | 197 +- .../npu/interface/AsyncTaskQueueInterface.cpp | 149 +- .../csrc/core/npu/register/FunctionLoader.cpp | 75 +- .../csrc/core/npu/register/OptionRegister.cpp | 42 +- 14 files changed, 2945 insertions(+), 3105 deletions(-) diff --git a/torch_npu/csrc/core/NPUBridge.cpp b/torch_npu/csrc/core/NPUBridge.cpp index af03768ab9..5a7a60ebe7 100644 --- a/torch_npu/csrc/core/NPUBridge.cpp +++ b/torch_npu/csrc/core/NPUBridge.cpp @@ -2,27 +2,29 @@ namespace torch_npu { - -NPUStorageImpl* NPUBridge::GetNpuStorageImpl(c10::StorageImpl* storageImpl) { - return static_cast(storageImpl); +NPUStorageImpl 
*NPUBridge::GetNpuStorageImpl(c10::StorageImpl *storageImpl) +{ + return static_cast(storageImpl); } -NPUStorageImpl* NPUBridge::GetNpuStorageImpl(c10::Storage&& storage) { - return static_cast(storage.unsafeGetStorageImpl()); +NPUStorageImpl *NPUBridge::GetNpuStorageImpl(c10::Storage &&storage) +{ + return static_cast(storage.unsafeGetStorageImpl()); } -NPUStorageImpl* NPUBridge::GetNpuStorageImpl(const at::Tensor &tensor) { - return static_cast(tensor.storage().unsafeGetStorageImpl()); +NPUStorageImpl *NPUBridge::GetNpuStorageImpl(const at::Tensor &tensor) +{ + return static_cast(tensor.storage().unsafeGetStorageImpl()); } -NPUStorageDesc& NPUBridge::GetNpuStorageImplDesc(const at::Tensor &tensor) { - return static_cast(tensor.storage().unsafeGetStorageImpl())->npu_desc_; +NPUStorageDesc &NPUBridge::GetNpuStorageImplDesc(const at::Tensor &tensor) +{ + return static_cast(tensor.storage().unsafeGetStorageImpl())->npu_desc_; } NPUTensorImpl *NPUBridge::GetNpuTensorImpl(const at::Tensor &tensor) { - return static_cast(tensor.unsafeGetTensorImpl()); + return static_cast(tensor.unsafeGetTensorImpl()); } - } \ No newline at end of file diff --git a/torch_npu/csrc/core/NPUSerialization.cpp b/torch_npu/csrc/core/NPUSerialization.cpp index af80b2b655..1ae122f342 100644 --- a/torch_npu/csrc/core/NPUSerialization.cpp +++ b/torch_npu/csrc/core/NPUSerialization.cpp @@ -6,51 +6,53 @@ #include "torch_npu/csrc/framework/StorageDescHelper.h" namespace torch_npu { - std::unordered_map FORMAT_INFO = { - {"NC1HWC0", ACL_FORMAT_NC1HWC0}, - {"ND", ACL_FORMAT_ND}, - {"NCHW", ACL_FORMAT_NCHW}, - {"NHWC", ACL_FORMAT_NHWC}, - {"FRACTAL_NZ", ACL_FORMAT_FRACTAL_NZ}, - {"FRACTAL_Z", ACL_FORMAT_FRACTAL_Z}, - {"NDHWC", ACL_FORMAT_NDHWC}, - {"NCDHW", ACL_FORMAT_NCDHW}, - {"NDC1HWC0", ACL_FORMAT_NDC1HWC0}, - {"FRACTAL_Z_3D", ACL_FRACTAL_Z_3D}, + { "NC1HWC0", ACL_FORMAT_NC1HWC0 }, + { "ND", ACL_FORMAT_ND }, + { "NCHW", ACL_FORMAT_NCHW }, + { "NHWC", ACL_FORMAT_NHWC }, + { "FRACTAL_NZ", ACL_FORMAT_FRACTAL_NZ }, + { "FRACTAL_Z", ACL_FORMAT_FRACTAL_Z }, + { "NDHWC", ACL_FORMAT_NDHWC }, + { "NCDHW", ACL_FORMAT_NCDHW }, + { "NDC1HWC0", ACL_FORMAT_NDC1HWC0 }, + { "FRACTAL_Z_3D", ACL_FRACTAL_Z_3D }, }; -void npu_info_serialization(const at::Tensor& t, std::unordered_map& map) { - at_npu::native::StorageDescHelper::GetDescForSerialization(t, map); +void npu_info_serialization(const at::Tensor &t, std::unordered_map &map) +{ + at_npu::native::StorageDescHelper::GetDescForSerialization(t, map); } -void npu_info_deserialization(const at::Tensor& t, std::unordered_map& map) { - // Set the true stroage description - at_npu::native::StorageDescHelper::SetDescForSerialization(t, map); +void npu_info_deserialization(const at::Tensor &t, std::unordered_map &map) +{ + // Set the true stroage description + at_npu::native::StorageDescHelper::SetDescForSerialization(t, map); - auto str_to_aclFormat = [](std::string str) -> aclFormat { - int start = 0; - while (str[start++] != '/'); - return FORMAT_INFO[str.substr(start, str.size() - start)]; - }; + auto str_to_aclFormat = [](std::string str) -> aclFormat { + int start = 0; + while (str[start++] != '/') { + ; + } + return FORMAT_INFO[str.substr(start, str.size() - start)]; + }; - for (auto &m : map) { - if (m.first.find("npu_format_") != std::string::npos) { - aclFormat format = str_to_aclFormat(m.first); - // The format cast is an operator, - // so special handling is required for scenarios - // where the leaf node tensor requires grad at the same time - bool revert_flag = false; - if 
(t.is_leaf() && t.requires_grad()) { - revert_flag = true; - t.set_requires_grad(false); - } - at_npu::native::NPUNativeFunctions::npu_format_cast_(const_cast(t), format); - if (revert_flag) { - t.set_requires_grad(true); - } + for (auto &m : map) { + if (m.first.find("npu_format_") != std::string::npos) { + aclFormat format = str_to_aclFormat(m.first); + // The format cast is an operator, + // so special handling is required for scenarios + // where the leaf node tensor requires grad at the same time + bool revert_flag = false; + if (t.is_leaf() && t.requires_grad()) { + revert_flag = true; + t.set_requires_grad(false); + } + at_npu::native::NPUNativeFunctions::npu_format_cast_(const_cast(t), format); + if (revert_flag) { + t.set_requires_grad(true); + } + } } - } } - } diff --git a/torch_npu/csrc/core/NPUTensorImpl.cpp b/torch_npu/csrc/core/NPUTensorImpl.cpp index a2e57e7700..78b15a3978 100644 --- a/torch_npu/csrc/core/NPUTensorImpl.cpp +++ b/torch_npu/csrc/core/NPUTensorImpl.cpp @@ -8,56 +8,39 @@ #include "third_party/acl/inc/acl/acl_rt.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" -namespace torch_npu +namespace torch_npu { +NPUTensorImpl::NPUTensorImpl(c10::Storage &&storage, const caffe2::TypeMeta &data_type) + : c10::TensorImpl(std::move(storage), + c10::DispatchKeySet{ c10::DispatchKey::PrivateUse1, c10::DispatchKey::AutogradPrivateUse1 }, data_type) { - NPUTensorImpl::NPUTensorImpl(c10::Storage &&storage, const caffe2::TypeMeta &data_type) - : c10::TensorImpl(std::move(storage), - c10::DispatchKeySet{c10::DispatchKey::PrivateUse1, - c10::DispatchKey::AutogradPrivateUse1}, - data_type) - { is_non_overlapping_and_dense_ = false; - } +} - void NPUTensorImpl::shallow_copy_from(const c10::intrusive_ptr &impl) - { - copy_tensor_metadata( - impl.get(), - this, - version_counter(), - allow_tensor_metadata_change()); +void NPUTensorImpl::shallow_copy_from(const c10::intrusive_ptr &impl) +{ + copy_tensor_metadata(impl.get(), this, version_counter(), allow_tensor_metadata_change()); refresh_numel(); refresh_contiguous(); - } +} - c10::intrusive_ptr NPUTensorImpl::shallow_copy_and_detach( - const c10::VariableVersion &version_counter, - bool allow_tensor_metadata_change) const - { +c10::intrusive_ptr NPUTensorImpl::shallow_copy_and_detach(const c10::VariableVersion &version_counter, + bool allow_tensor_metadata_change) const +{ auto impl = c10::make_intrusive(c10::Storage(this->storage()), this->data_type_); - copy_tensor_metadata( - this, - impl.get(), - version_counter, - allow_tensor_metadata_change); + copy_tensor_metadata(this, impl.get(), version_counter, allow_tensor_metadata_change); impl->refresh_numel(); impl->refresh_contiguous(); return impl; - } +} - c10::intrusive_ptr NPUTensorImpl::shallow_copy_and_detach( - c10::VariableVersion &&version_counter, - bool allow_tensor_metadata_change) const - { +c10::intrusive_ptr NPUTensorImpl::shallow_copy_and_detach(c10::VariableVersion &&version_counter, + bool allow_tensor_metadata_change) const +{ auto impl = c10::make_intrusive(c10::Storage(this->storage()), this->data_type_); - copy_tensor_metadata( - this, - impl.get(), - std::move(version_counter), - allow_tensor_metadata_change); + copy_tensor_metadata(this, impl.get(), std::move(version_counter), allow_tensor_metadata_change); impl->refresh_numel(); impl->refresh_contiguous(); return impl; - } - NPUTensorImpl::~NPUTensorImpl() {} +} +NPUTensorImpl::~NPUTensorImpl() {} } diff --git a/torch_npu/csrc/core/OverflowUtils.cpp b/torch_npu/csrc/core/OverflowUtils.cpp index 
f77b3db4e7..42994b8387 100644 --- a/torch_npu/csrc/core/OverflowUtils.cpp +++ b/torch_npu/csrc/core/OverflowUtils.cpp @@ -6,34 +6,35 @@ namespace torch_npu { namespace utils { - OverflowUtil::OverflowUtil() {} OverflowUtil::~OverflowUtil() {} -void OverflowUtil::EnableOverflowNpu() { - auto result = c10_npu::NpuSysCtrl::GetInstance().OverflowSwitchEnable(); - return; +void OverflowUtil::EnableOverflowNpu() +{ + auto result = c10_npu::NpuSysCtrl::GetInstance().OverflowSwitchEnable(); + return; } -bool OverflowUtil::CheckOverflowNpu() { - auto options = at::TensorOptions(c10::DeviceType::PrivateUse1).dtype(at::kFloat); - at::Tensor tmp = at::empty({8}, options); - auto floatStatus = op_plugin::npu_alloc_float_status(tmp); - auto result = op_plugin::npu_get_float_status(floatStatus); - if (result.cpu()[0].item().toInt() != 0) { - return true; - } - return false; +bool OverflowUtil::CheckOverflowNpu() +{ + auto options = at::TensorOptions(c10::DeviceType::PrivateUse1).dtype(at::kFloat); + at::Tensor tmp = at::empty({ 8 }, options); + auto floatStatus = op_plugin::npu_alloc_float_status(tmp); + auto result = op_plugin::npu_get_float_status(floatStatus); + if (result.cpu()[0].item().toInt() != 0) { + return true; + } + return false; } -void OverflowUtil::ClearOverflowNpu() { - auto options = at::TensorOptions(c10::DeviceType::PrivateUse1).dtype(at::kFloat); - at::Tensor tmp = at::empty({8}, options); - auto floatStatus = op_plugin::npu_alloc_float_status(tmp); - auto result = op_plugin::npu_clear_float_status(floatStatus); - return; +void OverflowUtil::ClearOverflowNpu() +{ + auto options = at::TensorOptions(c10::DeviceType::PrivateUse1).dtype(at::kFloat); + at::Tensor tmp = at::empty({ 8 }, options); + auto floatStatus = op_plugin::npu_alloc_float_status(tmp); + auto result = op_plugin::npu_clear_float_status(floatStatus); + return; } - } } diff --git a/torch_npu/csrc/core/OverflowUtils.h b/torch_npu/csrc/core/OverflowUtils.h index 8c4c9607c5..7267ecbf16 100644 --- a/torch_npu/csrc/core/OverflowUtils.h +++ b/torch_npu/csrc/core/OverflowUtils.h @@ -4,24 +4,23 @@ namespace torch_npu { namespace utils { - class OverflowUtil { public: - ~OverflowUtil(); + ~OverflowUtil(); - static OverflowUtil *GetInstance() { - static OverflowUtil instance; - return &instance; - } + static OverflowUtil *GetInstance() + { + static OverflowUtil instance; + return &instance; + } - void EnableOverflowNpu(); - bool CheckOverflowNpu(); - void ClearOverflowNpu(); + void EnableOverflowNpu(); + bool CheckOverflowNpu(); + void ClearOverflowNpu(); private: - OverflowUtil(); - bool hasOverflow = false; + OverflowUtil(); + bool hasOverflow = false; }; - } } diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index 2ffe862c52..ddc190e597 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -261,8 +261,9 @@ namespace c10_npu { } else if (bind_conf == 2) { auto thread_core_map = GetCpuAffinityMap(device_id); // Bind the main thread only when the dispatch phase begins (i.e., when ThreadType::backwardThread is set) - if (current_thread_type == ThreadType::backwardThread) + if (current_thread_type == ThreadType::backwardThread) { SetThreadAffinity(thread_core_map.at(ThreadType::mainThread), mainthread_tid); + } return SetThreadAffinity(thread_core_map.at(current_thread_type), pthread_self()); } else { ASCEND_LOGD("Thread affinity setting is disabled."); diff --git 
a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 999f3d46cf..7afdec6037 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -51,7 +51,6 @@ std::string format_size(uint64_t size) namespace c10_npu { namespace NPUCachingAllocator { - C10_DEFINE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); // @@ -86,61 +85,60 @@ C10_DEFINE_REGISTRY(FreeNPUMemoryCallbacksRegistry, FreeMemoryCallback); namespace { using stream_set = ska::flat_hash_set; -constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 bytes -constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB -constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks -constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks -constexpr size_t kExtraLargeBuffer = 1073741824; // "extra large" allocations may be packed in 1 GB blocks -constexpr size_t kLargeBufferForHccl = 134217728; // "large for hccl" allocations may be packed in 128 MiB blocks -constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer -constexpr size_t kRoundLarge = 2097152; // round up large allocs to 2 MiB -constexpr size_t kAlignRoundLarge = 16384; // round up large allocs to 16 KB -constexpr size_t kSmallPoolVirAddrSize = 2147483648; // 2 GB +constexpr size_t kMinBlockSize = 512; // all sizes are rounded to at least 512 bytes +constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB +constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks +constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks +constexpr size_t kExtraLargeBuffer = 1073741824; // "extra large" allocations may be packed in 1 GB blocks +constexpr size_t kLargeBufferForHccl = 134217728; // "large for hccl" allocations may be packed in 128 MiB blocks +constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer +constexpr size_t kRoundLarge = 2097152; // round up large allocs to 2 MiB +constexpr size_t kAlignRoundLarge = 16384; // round up large allocs to 16 KB +constexpr size_t kSmallPoolVirAddrSize = 2147483648; // 2 GB constexpr size_t kLargePoolVirAddrSize = 10737418240; // 10 GB -const std::string kMinCannVersion = "8.1.RC1"; // minimum cann version which supports 1g mem 8.1.RC1 -const std::string kMinDriverVersion = "25.0.RC1"; // minimum driver version which supports 1g mem 25.0.RC1 -const std::string kCannModule = "CANN"; // cann module name -const std::string kDriverModule = "DRIVER"; // driver module name +const std::string kMinCannVersion = "8.1.RC1"; // minimum cann version which supports 1g mem 8.1.RC1 +const std::string kMinDriverVersion = "25.0.RC1"; // minimum driver version which supports 1g mem 25.0.RC1 +const std::string kCannModule = "CANN"; // cann module name +const std::string kDriverModule = "DRIVER"; // driver module name using StatTypes = std::array(StatType::NUM_TYPES)>; -void update_stat(Stat& stat, int64_t amount) { - stat.current += amount; - stat.peak = std::max(stat.current, stat.peak); - if (amount > 0) { - stat.allocated += amount; - } - if (amount < 0) { - stat.freed += -amount; - } +void update_stat(Stat &stat, int64_t amount) +{ + stat.current += amount; + stat.peak = std::max(stat.current, stat.peak); + if (amount > 0) { + stat.allocated += amount; + } 
+ if (amount < 0) { + stat.freed += -amount; + } } -void reset_accumulated_stat(Stat& stat) { - stat.allocated = 0; - stat.freed = 0; +void reset_accumulated_stat(Stat &stat) +{ + stat.allocated = 0; + stat.freed = 0; } -void reset_peak_stat(Stat& stat) { - stat.peak = stat.current; +void reset_peak_stat(Stat &stat) +{ + stat.peak = stat.current; } -template -void for_each_selected_stat_type(const StatTypes& stat_types, Func f) { - for (const auto stat_type : c10::irange(stat_types.size())) { - if (stat_types[stat_type]) { - f(stat_type); +template void for_each_selected_stat_type(const StatTypes &stat_types, Func f) +{ + for (const auto stat_type : c10::irange(stat_types.size())) { + if (stat_types[stat_type]) { + f(stat_type); + } } - } } -void update_stat_array( - StatArray& stat_array, - int64_t amount, - const StatTypes& stat_types) { - for_each_selected_stat_type( - stat_types, [&stat_array, amount](size_t stat_type) { - update_stat(stat_array[stat_type], amount); - }); +void update_stat_array(StatArray &stat_array, int64_t amount, const StatTypes &stat_types) +{ + for_each_selected_stat_type(stat_types, + [&stat_array, amount](size_t stat_type) { update_stat(stat_array[stat_type], amount); }); } bool IsMallocPage1GMem(bool is_small_pool) @@ -152,19 +150,19 @@ bool IsMallocPage1GMem(bool is_small_pool) if (!IsGteCANNVersion(kMinCannVersion, kCannModule)) { TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " - "Using the HUGE_MEM memory page allocation method may result in performance degradation. " - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " - "but the current driver version does not support this feature. " - "Please upgrade the CANN package version."); + "Using the HUGE_MEM memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "but the current driver version does not support this feature. " + "Please upgrade the CANN package version."); return false; } if (!IsGteCANNVersion(kMinDriverVersion, kDriverModule)) { TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " - "Using the HUGE_MEM memory page allocation method may result in performance degradation. " - "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " - "but the current driver version does not support this feature. " - "Please upgrade the CANN package version 1-2."); + "Using the HUGE_MEM memory page allocation method may result in performance degradation. " + "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " + "but the current driver version does not support this feature. 
" + "Please upgrade the CANN package version 1-2."); return false; } return true; @@ -175,46 +173,47 @@ bool IsMallocPage1GMem(bool is_small_pool) struct Block; struct PrivatePool; -using Comparison = bool (*)(const Block*, const Block*); -static bool BlockComparatorSize(const Block* a, const Block* b); -static bool BlockComparatorAddress(const Block* a, const Block* b); +using Comparison = bool (*)(const Block *, const Block *); +static bool BlockComparatorSize(const Block *a, const Block *b); +static bool BlockComparatorAddress(const Block *a, const Block *b); -struct BlockPool{ - std::set blocks; - std::set unmapped; +struct BlockPool { + std::set blocks; + std::set unmapped; const bool is_small; - PrivatePool* owner_PrivatePool; + PrivatePool *owner_PrivatePool; - BlockPool(bool small, PrivatePool* private_pool = nullptr) + BlockPool(bool small, PrivatePool *private_pool = nullptr) : blocks(BlockComparatorSize), unmapped(BlockComparatorAddress), is_small(small), - owner_PrivatePool(private_pool) {} + owner_PrivatePool(private_pool) + {} }; struct ExpandableSegment; struct Block { - int device; // npu - aclrtStream stream; // allocation stream + int device; // npu + aclrtStream stream; // allocation stream stream_set stream_uses; // streams on which the block was used - size_t size; // block size in bytes - size_t requested_size; // memory originally requested - BlockPool* pool; // owning memory pool - void* ptr; // memory address - bool allocated; // in-use flag - bool mapped{true}; // is the virtual address range this Block references - // backed by physical pages. Always true when - // expandable_segment_ is null. When false - // This Block will be aligned to the segment size - // of its expandable_segment_. - Block* prev; // prev block if split from a larger allocation - Block* next; // next block if split from a larger allocation - int event_count; // number of outstanding NPU events - int gc_count{0}; // counter for prioritizing older / less useful blocks for - // garbage collection - ExpandableSegment* expandable_segment_{nullptr}; - bool is_safe{true}; + size_t size; // block size in bytes + size_t requested_size; // memory originally requested + BlockPool *pool; // owning memory pool + void *ptr; // memory address + bool allocated; // in-use flag + bool mapped{ true }; // is the virtual address range this Block references + // backed by physical pages. Always true when + // expandable_segment_ is null. When false + // This Block will be aligned to the segment size + // of its expandable_segment_. + Block *prev; // prev block if split from a larger allocation + Block *next; // next block if split from a larger allocation + int event_count; // number of outstanding NPU events + int gc_count{ 0 }; // counter for prioritizing older / less useful blocks for + // garbage collection + ExpandableSegment *expandable_segment_{ nullptr }; + bool is_safe{ true }; std::shared_ptr context_when_allocated; // only set for the first block in the segment (when prev == null) // this records the frame information when cudaMalloc was called @@ -222,7 +221,7 @@ struct Block { // memory out from our cache. 
std::shared_ptr context_when_segment_allocated; - Block(int device, aclrtStream stream, size_t size, BlockPool* pool, void* ptr) + Block(int device, aclrtStream stream, size_t size, BlockPool *pool, void *ptr) : device(device), stream(stream), stream_uses(), @@ -234,7 +233,8 @@ struct Block { prev(nullptr), next(nullptr), event_count(0), - gc_count(0) {} + gc_count(0) + {} // constructor for search key Block(int device, aclrtStream stream, size_t size) @@ -249,14 +249,15 @@ struct Block { prev(nullptr), next(nullptr), event_count(0), - gc_count(0) {} + gc_count(0) + {} bool is_split() const { return (prev != nullptr) || (next != nullptr); } - void splice(Block* before, Block* after) + void splice(Block *before, Block *after) { if (before) { TORCH_INTERNAL_ASSERT(before->next == after, PTA_ERROR(ErrCode::PTR)); @@ -272,9 +273,9 @@ struct Block { }; struct SegmentRange { - char* ptr; - size_t size; - SegmentRange(void* p, size_t s) : ptr(static_cast(p)), size(s) {} + char *ptr; + size_t size; + SegmentRange(void *p, size_t s) : ptr(static_cast(p)), size(s) {} }; @@ -355,199 +356,194 @@ bevhavior for allocator tensors that need to be used cross-process. */ struct ExpandableSegment { - ExpandableSegment( - int device, - aclrtStream stream, - size_t size) - : device_(device), - stream_(stream), - max_handles_(0), - // 2MB for small pool, 20MB for large pool - segment_size_(size) { - size_t device_free; - size_t device_total; - NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); - // we allocate enough address space for 1 1/8 the total memory on the NPU. - // This allows for some cases where we have to unmap pages earlier in the - // segment to put them at the end. - max_handles_ = numSegments(device_total + device_total / 8); - if (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable()) { - // prevent HCCL reserve virtual address out of memory - // small pool reserve 2G - // non-default stream large pool 10G - auto default_stream = c10_npu::getDefaultNPUStream().stream(false); - if (kSmallBuffer == segment_size_) { - max_handles_ = numSegments(kSmallPoolVirAddrSize); - } else if (default_stream != stream) { - max_handles_ = numSegments(kLargePoolVirAddrSize); - } - } - - NPU_CHECK_ERROR(c10_npu::acl::AclrtReserveMemAddress( - &ptr_, segment_size_ * max_handles_, 0, NULL, 1, getHcclComm())); - ASCEND_LOGD( - "NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, segment_size=%zu", - segment_size_ * max_handles_, segment_size_); - } - // begin must be aligned to segment_size_. - // returns the actual range mapped, which may be - // greater than requested if size is not aligned to segment_size_. - // return size of 0 indicates OOM - SegmentRange map(SegmentRange range) { - auto begin = segmentLeft(range.ptr); - auto end = segmentRight(range.ptr + range.size); - TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr, PTA_ERROR(ErrCode::PTR)); - if (begin == end) { - return rangeFromHandles(begin, end); - } - while (end > handles_.size()) { - handles_.emplace_back(c10::nullopt); - } - for (auto i : c10::irange(begin, end)) { - TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::VALUE)); - aclrtDrvMemHandle handle = nullptr; - aclrtPhysicalMemProp prop = {}; - prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; - prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; - prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? 
ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; - prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device_; - prop.reserve = 0; - auto status = - c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0); - if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { - for (auto j : c10::irange(begin, i)) { - auto h = handles_.at(j).value(); - handles_.at(j) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + ExpandableSegment(int device, aclrtStream stream, size_t size) + : device_(device), + stream_(stream), + max_handles_(0), + // 2MB for small pool, 20MB for large pool + segment_size_(size) + { + size_t device_free; + size_t device_total; + NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); + // we allocate enough address space for 1 1/8 the total memory on the NPU. + // This allows for some cases where we have to unmap pages earlier in the + // segment to put them at the end. + max_handles_ = numSegments(device_total + device_total / 8); + if (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable()) { + // prevent HCCL reserve virtual address out of memory + // small pool reserve 2G + // non-default stream large pool 10G + auto default_stream = c10_npu::getDefaultNPUStream().stream(false); + if (kSmallBuffer == segment_size_) { + max_handles_ = numSegments(kSmallPoolVirAddrSize); + } else if (default_stream != stream) { + max_handles_ = numSegments(kLargePoolVirAddrSize); + } + } + + NPU_CHECK_ERROR( + c10_npu::acl::AclrtReserveMemAddress(&ptr_, segment_size_ * max_handles_, 0, nullptr, 1, getHcclComm())); + ASCEND_LOGD("NPUCachingAllocator malloc by AclrtReserveMemAddress: size=%zu, segment_size=%zu", + segment_size_ * max_handles_, segment_size_); + } + // begin must be aligned to segment_size_. + // returns the actual range mapped, which may be + // greater than requested if size is not aligned to segment_size_. + // return size of 0 indicates OOM + SegmentRange map(SegmentRange range) + { + auto begin = segmentLeft(range.ptr); + auto end = segmentRight(range.ptr + range.size); + TORCH_INTERNAL_ASSERT(ptr() + begin * segment_size_ == range.ptr, PTA_ERROR(ErrCode::PTR)); + if (begin == end) { + return rangeFromHandles(begin, end); + } + while (end > handles_.size()) { + handles_.emplace_back(c10::nullopt); + } + for (auto i : c10::irange(begin, end)) { + TORCH_INTERNAL_ASSERT(!handles_.at(i), PTA_ERROR(ErrCode::VALUE)); + aclrtDrvMemHandle handle = nullptr; + aclrtPhysicalMemProp prop = {}; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? 
ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device_; + prop.reserve = 0; + auto status = c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0); + if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { + for (auto j : c10::irange(begin, i)) { + auto h = handles_.at(j).value(); + handles_.at(j) = c10::nullopt; + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + } + trimHandles(); + return rangeFromHandles(begin, begin); } - trimHandles(); - return rangeFromHandles(begin, begin); - } - NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); - handles_.at(i) = handle; - } - for (auto i : c10::irange(begin, end)) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem( - (char*)ptr_ + i * segment_size_, - segment_size_, - 0, - handles_.at(i).value(), - 0, - getHcclComm())); - } - ASCEND_LOGD( - "NPUCachingAllocator map: segment_size=%zu", segment_size_); - return rangeFromHandles(begin, end); - } - - // unmaps all the completely empty segment_size_ segments between - // [begin, begin + size), returns the offset where the range begin, - // and the actual size unmapped (multiple of segment_size_) - SegmentRange unmap(SegmentRange range) { - auto begin = segmentRight(range.ptr); - auto end = segmentLeft(range.ptr + range.size); - if (begin >= end) { - return SegmentRange{range.ptr, 0}; - } - unmapHandles(begin, end); - return rangeFromHandles(begin, end); - } - - char* ptr() const { - return (char*)ptr_; - } - - size_t size() const { - return max_handles_ * segment_size_; - } + NPU_CHECK_ERROR(status, "aclrtMallocPhysical"); + handles_.at(i) = handle; + } + for (auto i : c10::irange(begin, end)) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtMapMem((char *)ptr_ + i * segment_size_, segment_size_, 0, + handles_.at(i).value(), 0, getHcclComm())); + } + ASCEND_LOGD("NPUCachingAllocator map: segment_size=%zu", segment_size_); + return rangeFromHandles(begin, end); + } + + // unmaps all the completely empty segment_size_ segments between + // [begin, begin + size), returns the offset where the range begin, + // and the actual size unmapped (multiple of segment_size_) + SegmentRange unmap(SegmentRange range) + { + auto begin = segmentRight(range.ptr); + auto end = segmentLeft(range.ptr + range.size); + if (begin >= end) { + return SegmentRange{ range.ptr, 0 }; + } + unmapHandles(begin, end); + return rangeFromHandles(begin, end); + } + + char *ptr() const + { + return (char *)ptr_; + } + + size_t size() const + { + return max_handles_ * segment_size_; + } void setHcclComm(std::shared_ptr hcclComm) { TORCH_INTERNAL_ASSERT(hcclComm, "hcclComm is null.", PTA_ERROR(ErrCode::INTERNAL)); hcclComm_ = hcclComm; HCCL_CHECK_ERROR(at_npu::hccl::HcclCommSetMemoryRangeFace(hcclComm_->getHcclComm(), ptr_, - segment_size_ * max_handles_, 0, 1)); + segment_size_ * max_handles_, 0, 1)); for (auto i : c10::irange(handles_.size())) { HCCL_CHECK_ERROR(at_npu::hccl::HcclCommActivateCommMemoryFace(hcclComm_->getHcclComm(), - (char*)ptr_ + i * segment_size_, - segment_size_, - 0, - handles_.at(i).value(), - 0)); - } - } - - ~ExpandableSegment() { - forEachAllocatedRange( - [&](size_t begin, size_t end) { unmapHandles(begin, end); }); - NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr_, getHcclComm())); - ASCEND_LOGD("NPUCachingAllocator free by AclrtReleaseMemAddress"); - } - - private: - void unmapHandles(size_t begin, size_t end) { - // note: unlike aclrtFree, MemUnmap and MemRelease do - // not appear to synchronize in all cases, so we have to wait for the - // 
stream to finish before this memory is truly free. - - // cannot call c10::npu::stream_synchronize because - // it might grab the GIL which can lead to a deadlock - // Locking order must be GIL -> Allocator Lock - NPU_CHECK_ERROR(aclrtSynchronizeStream(stream_)); -#ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuStreamSynchronization( - reinterpret_cast(stream_)); + (char *)ptr_ + i * segment_size_, segment_size_, 0, handles_.at(i).value(), 0)); + } } + + ~ExpandableSegment() + { + forEachAllocatedRange([&](size_t begin, size_t end) { unmapHandles(begin, end); }); + NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr_, getHcclComm())); + ASCEND_LOGD("NPUCachingAllocator free by AclrtReleaseMemAddress"); + } + +private: + void unmapHandles(size_t begin, size_t end) + { + // note: unlike aclrtFree, MemUnmap and MemRelease do + // not appear to synchronize in all cases, so we have to wait for the + // stream to finish before this memory is truly free. + + // cannot call c10::npu::stream_synchronize because + // it might grab the GIL which can lead to a deadlock + // Locking order must be GIL -> Allocator Lock + NPU_CHECK_ERROR(aclrtSynchronizeStream(stream_)); +#ifndef BUILD_LIBTORCH + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuStreamSynchronization(reinterpret_cast(stream_)); + } #endif - for (auto i : c10::irange(begin, end)) { - aclrtDrvMemHandle h = handles_.at(i).value(); - handles_.at(i) = c10::nullopt; - NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char*)ptr_ + segment_size_ * i, getHcclComm())); - NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); - } - ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); - trimHandles(); - } - - void trimHandles() { - while (!handles_.empty() && !handles_.back()) { - handles_.pop_back(); - } - } - - void forEachAllocatedRange(std::function fn) { - auto start = 0; - for (auto i : c10::irange(handles_.size())) { - if (handles_.at(i) && (i == 0 || !handles_.at(i - 1))) { - start = i; - } - if (handles_.at(i) && (i + 1 == handles_.size() || !handles_.at(i + 1))) { - fn(start, i + 1); - } - } - } - - size_t numSegments(size_t size) { - return (size + segment_size_ - 1) / segment_size_; - } - - size_t segmentLeft(char* p) { - auto size = p - ptr(); - return size / segment_size_; - } - - size_t segmentRight(char* p) { - auto size = p - ptr(); - return numSegments(size); - } - - SegmentRange rangeFromHandles(size_t begin, size_t end) { - return SegmentRange( - ptr() + segment_size_ * begin, segment_size_ * (end - begin)); - } + for (auto i : c10::irange(begin, end)) { + aclrtDrvMemHandle h = handles_.at(i).value(); + handles_.at(i) = c10::nullopt; + NPU_CHECK_ERROR(c10_npu::acl::AclrtUnmapMem((char *)ptr_ + segment_size_ * i, getHcclComm())); + NPU_CHECK_ERROR(c10_npu::acl::AclrtFreePhysical(h)); + } + ASCEND_LOGD("NPUCachingAllocator unmap: segment_size=%zu", segment_size_); + trimHandles(); + } + + void trimHandles() + { + while (!handles_.empty() && !handles_.back()) { + handles_.pop_back(); + } + } + + void forEachAllocatedRange(std::function fn) + { + auto start = 0; + for (auto i : c10::irange(handles_.size())) { + if (handles_.at(i) && (i == 0 || !handles_.at(i - 1))) { + start = i; + } + if (handles_.at(i) && (i + 1 == handles_.size() || !handles_.at(i + 1))) { + fn(start, i + 1); + } + } + } + + size_t 
numSegments(size_t size) + { + return (size + segment_size_ - 1) / segment_size_; + } + + size_t segmentLeft(char *p) + { + auto size = p - ptr(); + return size / segment_size_; + } + + size_t segmentRight(char *p) + { + auto size = p - ptr(); + return numSegments(size); + } + + SegmentRange rangeFromHandles(size_t begin, size_t end) + { + return SegmentRange(ptr() + segment_size_ * begin, segment_size_ * (end - begin)); + } HcclComm getHcclComm() { @@ -557,122 +553,118 @@ struct ExpandableSegment { return nullptr; } - int device_; - aclrtStream stream_; - void* ptr_{}; - size_t max_handles_; - size_t segment_size_; - std::vector> handles_; - std::shared_ptr hcclComm_; + int device_; + aclrtStream stream_; + void *ptr_{}; + size_t max_handles_; + size_t segment_size_; + std::vector> handles_; + std::shared_ptr hcclComm_; }; -static bool BlockComparatorSize(const Block* a, const Block* b) { - if (a->stream != b->stream) { - return reinterpret_cast(a->stream) < - reinterpret_cast(b->stream); - } - if (a->size != b->size) { - return a->size < b->size; - } - return reinterpret_cast(a->ptr) < - reinterpret_cast(b->ptr); +static bool BlockComparatorSize(const Block *a, const Block *b) +{ + if (a->stream != b->stream) { + return reinterpret_cast(a->stream) < reinterpret_cast(b->stream); + } + if (a->size != b->size) { + return a->size < b->size; + } + return reinterpret_cast(a->ptr) < reinterpret_cast(b->ptr); } -static bool BlockComparatorAddress(const Block* a, const Block* b) { - if (a->stream != b->stream) { - return reinterpret_cast(a->stream) < - reinterpret_cast(b->stream); - } - return reinterpret_cast(a->ptr) < - reinterpret_cast(b->ptr); +static bool BlockComparatorAddress(const Block *a, const Block *b) +{ + if (a->stream != b->stream) { + return reinterpret_cast(a->stream) < reinterpret_cast(b->stream); + } + return reinterpret_cast(a->ptr) < reinterpret_cast(b->ptr); } struct AllocParams { - AllocParams( - int device, - size_t size, - aclrtStream stream, - BlockPool* pool, - size_t alloc_size, - DeviceStats& stats) - : search_key(device, stream, size), - pool(pool), - alloc_size(alloc_size), - block(nullptr), - err(ACL_ERROR_NONE) {} - - int device() const { return search_key.device; } - aclrtStream stream() const { return search_key.stream; } - size_t size() const { return search_key.size; } - - Block search_key; - BlockPool* pool; - size_t alloc_size; - Block* block; - StatTypes stat_types = {false}; - aclError err; + AllocParams(int device, size_t size, aclrtStream stream, BlockPool *pool, size_t alloc_size, DeviceStats &stats) + : search_key(device, stream, size), pool(pool), alloc_size(alloc_size), block(nullptr), err(ACL_ERROR_NONE) + {} + + int device() const + { + return search_key.device; + } + aclrtStream stream() const + { + return search_key.stream; + } + size_t size() const + { + return search_key.size; + } + + Block search_key; + BlockPool *pool; + size_t alloc_size; + Block *block; + StatTypes stat_types = { false }; + aclError err; }; class EventPool { public: - using Event = std::unique_ptr>; - // Explicit device count - EventPool() : pools_(c10_npu::device_count()) {} - - Event get(int device) { - TORCH_INTERNAL_ASSERT(0 <= device, PTA_ERROR(ErrCode::VALUE)); - TORCH_INTERNAL_ASSERT(device < static_cast(pools_.size()), PTA_ERROR(ErrCode::VALUE)); - auto& pool = pools_[device]; - auto destructor = [&pool](c10_npu::NPUEvent* event) { - std::lock_guard g(pool.mutex_); - pool.event_pool_.push_back(std::unique_ptr(event)); - }; + using Event = std::unique_ptr>; + // 
Explicit device count + EventPool() : pools_(c10_npu::device_count()) {} - // Try to acquire an event from the per-device pool. + Event get(int device) { - std::lock_guard g(pool.mutex_); - if (!pool.event_pool_.empty()) { - auto* event = pool.event_pool_.back().release(); - pool.event_pool_.pop_back(); - return Event(event, destructor); - } + TORCH_INTERNAL_ASSERT(0 <= device, PTA_ERROR(ErrCode::VALUE)); + TORCH_INTERNAL_ASSERT(device < static_cast(pools_.size()), PTA_ERROR(ErrCode::VALUE)); + auto &pool = pools_[device]; + auto destructor = [&pool](c10_npu::NPUEvent *event) { + std::lock_guard g(pool.mutex_); + pool.event_pool_.push_back(std::unique_ptr(event)); + }; + + // Try to acquire an event from the per-device pool. + { + std::lock_guard g(pool.mutex_); + if (!pool.event_pool_.empty()) { + auto *event = pool.event_pool_.back().release(); + pool.event_pool_.pop_back(); + return Event(event, destructor); + } + } + // otherwise, allocate a new event that will be returned to the pool on + // destruction. + return Event(std::make_unique(ACL_EVENT_CAPTURE_STREAM_PROGRESS).release(), destructor); } - // otherwise, allocate a new event that will be returned to the pool on - // destruction. - return Event( - std::make_unique(ACL_EVENT_CAPTURE_STREAM_PROGRESS).release(), - destructor); - } - void empty_cache() { - for (auto& pool : pools_) { - std::lock_guard g(pool.mutex_); - pool.event_pool_.clear(); + void empty_cache() + { + for (auto &pool : pools_) { + std::lock_guard g(pool.mutex_); + pool.event_pool_.clear(); + } } - } private: - struct PerDevicePool { - alignas(64) std::mutex mutex_; - std::vector> event_pool_; - }; - std::vector pools_; + struct PerDevicePool { + alignas(64) std::mutex mutex_; + std::vector> event_pool_; + }; + std::vector pools_; }; // NPU graphs helper struct PrivatePool { - PrivatePool() - : large_blocks(false, this), - small_blocks(true, this) {} - PrivatePool(const PrivatePool&) = delete; - PrivatePool(PrivatePool&&) = delete; - PrivatePool& operator=(const PrivatePool&) = delete; + PrivatePool() : large_blocks(false, this), small_blocks(true, this) {} + PrivatePool(const PrivatePool &) = delete; + PrivatePool(PrivatePool &&) = delete; + PrivatePool &operator = (const PrivatePool &) = delete; // Number of live graphs using this pool - int use_count{1}; + int use_count{ 1 }; // Number of unfreed npuMallocs made for this pool. When use_count and // npuMalloc_count drop to zero, we can delete this PrivatePool from // graph_pools. - int npuMalloc_count{0}; + int npuMalloc_count{ 0 }; // Instead of maintaining private BlockPools here, I could stuff all blocks // (private or no) into the top-level large_blocks and small_blocks, and // distinguish private blocks by adding a "pool id" check above the stream @@ -683,198 +675,172 @@ struct PrivatePool { }; struct MempoolIdHash { - std::size_t operator()(const MempoolId_t& mempool_id) const noexcept + std::size_t operator () (const MempoolId_t &mempool_id) const noexcept { return mempool_id.first != 0 ? 
mempool_id.first : mempool_id.second; } }; - } // namespace class CachingAllocatorConfig { - public: - - static size_t max_split_size() { - return instance().m_max_split_size; - } - - static double garbage_collection_threshold() { - return instance().m_garbage_collection_threshold; - } - - static bool expandable_segments() { - return instance().m_expandable_segments; - } - - static size_t base_addr_aligned_size() - { - return instance().m_base_addr_aligned_size; - } - - static bool page_size_1g_enable() - { - return instance().m_page_size_1g; - } - - static CachingAllocatorConfig &instance() { - static CachingAllocatorConfig *s_instance = ([]() { - auto inst = new CachingAllocatorConfig(); - const char* env = getenv("PYTORCH_NPU_ALLOC_CONF"); - inst->parseArgs(env); - return inst; - })(); - return *s_instance; - } - - void parseArgs(const char* env); - - private: - - size_t m_max_split_size; - double m_garbage_collection_threshold; - bool m_expandable_segments; - bool set_expandable_segments_flag = false; - size_t m_base_addr_aligned_size = kAlignRoundLarge; - bool m_page_size_1g = false; // 新增1G页配置标志 - - CachingAllocatorConfig() - : m_max_split_size(std::numeric_limits::max()), - m_garbage_collection_threshold(0), - m_expandable_segments(false), - m_base_addr_aligned_size(kAlignRoundLarge) - { - } +public: + static size_t max_split_size() + { + return instance().m_max_split_size; + } - void lexArgs(const char* env, std::vector& config); - void consumeToken( - const std::vector& config, - size_t i, - const char c); - size_t parseMaxSplitSize(const std::vector& config, size_t i); - size_t parseGarbageCollectionThreshold( - const std::vector& config, - size_t i); - size_t parseExpandableSegments( - const std::vector& config, - size_t i); - size_t parseAddrAlignSize( - const std::vector& config, - size_t i); - size_t parsePageSize( - const std::vector& config, - size_t i); -}; + static double garbage_collection_threshold() + { + return instance().m_garbage_collection_threshold; + } + + static bool expandable_segments() + { + return instance().m_expandable_segments; + } + + static size_t base_addr_aligned_size() + { + return instance().m_base_addr_aligned_size; + } + + static bool page_size_1g_enable() + { + return instance().m_page_size_1g; + } + + static CachingAllocatorConfig &instance() + { + static CachingAllocatorConfig *s_instance = ([]() { + auto inst = new CachingAllocatorConfig(); + const char *env = getenv("PYTORCH_NPU_ALLOC_CONF"); + inst->parseArgs(env); + return inst; + })(); + return *s_instance; + } -void CachingAllocatorConfig::lexArgs( - const char* env, - std::vector& config) { - std::vector buf; + void parseArgs(const char *env); + +private: + size_t m_max_split_size; + double m_garbage_collection_threshold; + bool m_expandable_segments; + bool set_expandable_segments_flag = false; + size_t m_base_addr_aligned_size = kAlignRoundLarge; + bool m_page_size_1g = false; // 新增1G页配置标志 + + CachingAllocatorConfig() + : m_max_split_size(std::numeric_limits::max()), + m_garbage_collection_threshold(0), + m_expandable_segments(false), + m_base_addr_aligned_size(kAlignRoundLarge) + {} + + void lexArgs(const char *env, std::vector &config); + void consumeToken(const std::vector &config, size_t i, const char c); + size_t parseMaxSplitSize(const std::vector &config, size_t i); + size_t parseGarbageCollectionThreshold(const std::vector &config, size_t i); + size_t parseExpandableSegments(const std::vector &config, size_t i); + size_t parseAddrAlignSize(const std::vector &config, size_t i); 
+ size_t parsePageSize(const std::vector &config, size_t i); +}; - size_t env_length = strlen(env); - for (size_t i = 0; i < env_length; i++) { - if (env[i] == ',' || env[i] == ':' || env[i] == '[' || env[i] == ']') { - if (!buf.empty()) { +void CachingAllocatorConfig::lexArgs(const char *env, std::vector &config) +{ + std::vector buf; + + size_t env_length = strlen(env); + for (size_t i = 0; i < env_length; i++) { + if (env[i] == ',' || env[i] == ':' || env[i] == '[' || env[i] == ']') { + if (!buf.empty()) { + config.emplace_back(buf.begin(), buf.end()); + buf.clear(); + } + config.emplace_back(1, env[i]); + } else if (env[i] != ' ') { + buf.emplace_back(static_cast(env[i])); + } + } + if (!buf.empty()) { config.emplace_back(buf.begin(), buf.end()); - buf.clear(); - } - config.emplace_back(1, env[i]); - } else if (env[i] != ' ') { - buf.emplace_back(static_cast(env[i])); - } - } - if (!buf.empty()) { - config.emplace_back(buf.begin(), buf.end()); - } + } } -void CachingAllocatorConfig::consumeToken( - const std::vector& config, - size_t i, - const char c) { - TORCH_CHECK( - i < config.size() && config[i].compare(std::string(1, c)) == 0, - "Error parsing CachingAllocator settings, expected ", c, PTA_ERROR(ErrCode::PARAM)); +void CachingAllocatorConfig::consumeToken(const std::vector &config, size_t i, const char c) +{ + TORCH_CHECK(i < config.size() && config[i].compare(std::string(1, c)) == 0, + "Error parsing CachingAllocator settings, expected ", c, PTA_ERROR(ErrCode::PARAM)); } -size_t CachingAllocatorConfig::parseMaxSplitSize( - const std::vector& config, - size_t i) { - consumeToken(config, ++i, ':'); - if (++i < config.size()) { - size_t val1 = static_cast(stoi(config[i])); - TORCH_CHECK( - val1 > kLargeBuffer / (1024 * 1024), - "CachingAllocator option max_split_size_mb too small, must be > ", - kLargeBuffer / (1024 * 1024), PTA_ERROR(ErrCode::VALUE)); - val1 = std::max(val1, kLargeBuffer / (1024 * 1024)); - val1 = std::min(val1, (std::numeric_limits::max() / (1024 * 1024))); - m_max_split_size = val1 * 1024 * 1024; - } else { - TORCH_CHECK(false, "Error, expecting max_split_size_mb value", PTA_ERROR(ErrCode::PARAM)); - } - return i; +size_t CachingAllocatorConfig::parseMaxSplitSize(const std::vector &config, size_t i) +{ + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + size_t val1 = static_cast(stoi(config[i])); + TORCH_CHECK(val1 > kLargeBuffer / (1024 * 1024), + "CachingAllocator option max_split_size_mb too small, must be > ", kLargeBuffer / (1024 * 1024), + PTA_ERROR(ErrCode::VALUE)); + val1 = std::max(val1, kLargeBuffer / (1024 * 1024)); + val1 = std::min(val1, (std::numeric_limits::max() / (1024 * 1024))); + m_max_split_size = val1 * 1024 * 1024; + } else { + TORCH_CHECK(false, "Error, expecting max_split_size_mb value", PTA_ERROR(ErrCode::PARAM)); + } + return i; } -size_t CachingAllocatorConfig::parseGarbageCollectionThreshold( - const std::vector& config, - size_t i) { - consumeToken(config, ++i, ':'); - if (++i < config.size()) { - double val1 = stod(config[i]); - TORCH_CHECK( - val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", PTA_ERROR(ErrCode::VALUE)); - TORCH_CHECK( - val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", PTA_ERROR(ErrCode::VALUE)); - m_garbage_collection_threshold = val1; - } else { - TORCH_CHECK( - false, "Error, expecting garbage_collection_threshold value", PTA_ERROR(ErrCode::VALUE)); - } - return i; +size_t CachingAllocatorConfig::parseGarbageCollectionThreshold(const std::vector &config, size_t i) +{ + 
consumeToken(config, ++i, ':'); + if (++i < config.size()) { + double val1 = stod(config[i]); + TORCH_CHECK(val1 > 0, "garbage_collect_threshold too small, set it 0.0~1.0", PTA_ERROR(ErrCode::VALUE)); + TORCH_CHECK(val1 < 1.0, "garbage_collect_threshold too big, set it 0.0~1.0", PTA_ERROR(ErrCode::VALUE)); + m_garbage_collection_threshold = val1; + } else { + TORCH_CHECK(false, "Error, expecting garbage_collection_threshold value", PTA_ERROR(ErrCode::VALUE)); + } + return i; } -size_t CachingAllocatorConfig::parseExpandableSegments( - const std::vector& config, - size_t i) { - consumeToken(config, ++i, ':'); - if (++i < config.size()) { - TORCH_CHECK( - i < config.size() && (config[i] == "True" || config[i] == "False"), - "Expected a single True/False argument for expandable_segments", PTA_ERROR(ErrCode::PARAM)); - m_expandable_segments = (config[i] == "True"); - if (m_expandable_segments) { - void* ptr = nullptr; - auto status = c10_npu::acl::AclrtReserveMemAddress(&ptr, 512, 0, NULL, 1); - if (status == ACL_ERROR_NONE) { - NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr)); - } else { - NPU_CHECK_SUPPORTED_OR_ERROR(status, "aclrtReserveMemAddress"); - TORCH_NPU_WARN_ONCE("expandable_segments setting failure, now change to `False`."); - m_expandable_segments = false; +size_t CachingAllocatorConfig::parseExpandableSegments(const std::vector &config, size_t i) +{ + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK(i < config.size() && (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for expandable_segments", PTA_ERROR(ErrCode::PARAM)); + m_expandable_segments = (config[i] == "True"); + if (m_expandable_segments) { + void *ptr = nullptr; + auto status = c10_npu::acl::AclrtReserveMemAddress(&ptr, 512, 0, nullptr, 1); + if (status == ACL_ERROR_NONE) { + NPU_CHECK_ERROR(c10_npu::acl::AclrtReleaseMemAddress(ptr)); + } else { + NPU_CHECK_SUPPORTED_OR_ERROR(status, "aclrtReserveMemAddress"); + TORCH_NPU_WARN_ONCE("expandable_segments setting failure, now change to `False`."); + m_expandable_segments = false; + } } + } else { + TORCH_CHECK(false, "Error, expecting expandable_segments value", PTA_ERROR(ErrCode::PARAM)); } - } else { - TORCH_CHECK( - false, "Error, expecting expandable_segments value", PTA_ERROR(ErrCode::PARAM)); - } - return i; + return i; } -size_t CachingAllocatorConfig::parseAddrAlignSize( - const std::vector& config, - size_t i) +size_t CachingAllocatorConfig::parseAddrAlignSize(const std::vector &config, size_t i) { consumeToken(config, ++i, ':'); if (++i < config.size()) { size_t val = static_cast(stoi(config[i])); TORCH_CHECK(config[i].length() == std::to_string(val).length(), - "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", - OPS_ERROR(ErrCode::VALUE)); + "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); TORCH_CHECK(val >= 0, "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", - OPS_ERROR(ErrCode::VALUE)); + OPS_ERROR(ErrCode::VALUE)); TORCH_CHECK(val <= kAlignRoundLarge / 1024, - "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", - OPS_ERROR(ErrCode::VALUE)); + "CachingAllocator option base_addr_aligned_kb error, must be [0~16], dtype is int", + OPS_ERROR(ErrCode::VALUE)); m_base_addr_aligned_size = val * 1024; } else { TORCH_CHECK(false, "Error, expecting base_addr_aligned_kb value", OPS_ERROR(ErrCode::VALUE)); @@ -882,64 +848,65 @@ size_t 
CachingAllocatorConfig::parseAddrAlignSize( return i; } -size_t CachingAllocatorConfig::parsePageSize(const std::vector& config, size_t i) +size_t CachingAllocatorConfig::parsePageSize(const std::vector &config, size_t i) { TORCH_CHECK(i + 2 < config.size(), "page_size requires format 'page_size:1g'", OPS_ERROR(ErrCode::VALUE)); - TORCH_CHECK(config[i+1] == ":", "Expected ':' after page_size", OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(config[i + 1] == ":", "Expected ':' after page_size", OPS_ERROR(ErrCode::VALUE)); - if (config[i+2] == "1g") { + if (config[i + 2] == "1g") { m_page_size_1g = true; } else { - TORCH_CHECK(false, "Unsupported page_size value: ", config[i+2], OPS_ERROR(ErrCode::VALUE)); + TORCH_CHECK(false, "Unsupported page_size value: ", config[i + 2], OPS_ERROR(ErrCode::VALUE)); } return i + 2; // 返回最后处理的索引位置 } -void CachingAllocatorConfig::parseArgs(const char* env) { - // If empty, set the default values - m_max_split_size = std::numeric_limits::max(); - m_garbage_collection_threshold = 0; - - if (env == nullptr) { - return; - } - - std::vector config; - lexArgs(env, config); - - for (size_t i = 0; i < config.size(); i++) { - if (config[i].compare("max_split_size_mb") == 0) { - i = parseMaxSplitSize(config, i); - } else if (config[i].compare("garbage_collection_threshold") == 0) { - i = parseGarbageCollectionThreshold(config, i); - } else if (config[i] == "expandable_segments") { - set_expandable_segments_flag = true; - i = parseExpandableSegments(config, i); - } else if (config[i] == "base_addr_aligned_kb") { - i = parseAddrAlignSize(config, i); - } else if (config[i] == "page_size") { - i = parsePageSize(config, i); - } else { - TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i], PTA_ERROR(ErrCode::PARAM)); - } - - if (i + 1 < config.size()) { - consumeToken(config, ++i, ','); - } - } - - if (m_expandable_segments) { - if (set_expandable_segments_flag) { - TORCH_CHECK(m_max_split_size == std::numeric_limits::max() && m_garbage_collection_threshold == 0, - "`max_split_size_mb` or `garbage_collection_threshold`, cannot be enabled with " - "`expandable_segments`, please set `expandable_segments` to `False`.", - OPS_ERROR(ErrCode::PARAM)); - } else if (m_max_split_size != std::numeric_limits::max() || m_garbage_collection_threshold != 0) { - m_expandable_segments = false; - TORCH_NPU_WARN_ONCE("`max_split_size_mb` or `garbage_collection_threshold` is enabled, and the " - "`expandable_segments` is changed to `False` by default."); - } - } +void CachingAllocatorConfig::parseArgs(const char *env) +{ + // If empty, set the default values + m_max_split_size = std::numeric_limits::max(); + m_garbage_collection_threshold = 0; + + if (env == nullptr) { + return; + } + + std::vector config; + lexArgs(env, config); + + for (size_t i = 0; i < config.size(); i++) { + if (config[i].compare("max_split_size_mb") == 0) { + i = parseMaxSplitSize(config, i); + } else if (config[i].compare("garbage_collection_threshold") == 0) { + i = parseGarbageCollectionThreshold(config, i); + } else if (config[i] == "expandable_segments") { + set_expandable_segments_flag = true; + i = parseExpandableSegments(config, i); + } else if (config[i] == "base_addr_aligned_kb") { + i = parseAddrAlignSize(config, i); + } else if (config[i] == "page_size") { + i = parsePageSize(config, i); + } else { + TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", config[i], PTA_ERROR(ErrCode::PARAM)); + } + + if (i + 1 < config.size()) { + consumeToken(config, ++i, ','); + } + } + + if 
(m_expandable_segments) { + if (set_expandable_segments_flag) { + TORCH_CHECK(m_max_split_size == std::numeric_limits::max() && m_garbage_collection_threshold == 0, + "`max_split_size_mb` or `garbage_collection_threshold`, cannot be enabled with " + "`expandable_segments`, please set `expandable_segments` to `False`.", + OPS_ERROR(ErrCode::PARAM)); + } else if (m_max_split_size != std::numeric_limits::max() || m_garbage_collection_threshold != 0) { + m_expandable_segments = false; + TORCH_NPU_WARN_ONCE("`max_split_size_mb` or `garbage_collection_threshold` is enabled, and the " + "`expandable_segments` is changed to `False` by default."); + } + } } bool checkConfigExpandableSegments() @@ -953,748 +920,696 @@ bool isConfig1GPageSizeEnable() } class DeviceCachingAllocator { - private: - - // lock around all operations - mutable std::recursive_mutex mutex; +private: + // lock around all operations + mutable std::recursive_mutex mutex; - // device statistics - DeviceStats stats; + // device statistics + DeviceStats stats; - // unallocated cached blocks larger than 1 MB - BlockPool large_blocks; + // unallocated cached blocks larger than 1 MB + BlockPool large_blocks; - // unallocated cached blocks 1 MB or smaller - BlockPool small_blocks; + // unallocated cached blocks 1 MB or smaller + BlockPool small_blocks; - // allocated or in use by a stream - ska::flat_hash_set active_blocks; + // allocated or in use by a stream + ska::flat_hash_set active_blocks; - // captures_underway tracks if we are diverting some - // allocations to a specific pool. - // Most of the time it's empty, in which case malloc can avoid calling - // aclrtStreamGetCaptureInfo in the hot path. - std::vector>> - captures_underway; + // captures_underway tracks if we are diverting some + // allocations to a specific pool. + // Most of the time it's empty, in which case malloc can avoid calling + // aclrtStreamGetCaptureInfo in the hot path. + std::vector>> captures_underway; - // See free() for this thing's purpose - std::vector needs_events_deferred_until_no_capture; + // See free() for this thing's purpose + std::vector needs_events_deferred_until_no_capture; - // outstanding acl events - ska::flat_hash_map< - c10_npu::NPUStream, - std::deque>> - npu_events; + // outstanding acl events + ska::flat_hash_map>> npu_events; - // record used memory. - size_t total_allocated_memory = 0; + // record used memory. + size_t total_allocated_memory = 0; - // record maximum allowed memory. - size_t allowed_memory_maximum = 0; + // record maximum allowed memory. 
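(For reference: every option handled by CachingAllocatorConfig above arrives through the single PYTORCH_NPU_ALLOC_CONF environment variable, parsed once when instance() is first used. The snippet below is a hedged illustration of the accepted syntax; the concrete numbers are examples, not recommendations.)

    import os

    # Must be in the environment before the first NPU allocation, because
    # CachingAllocatorConfig::instance() reads PYTORCH_NPU_ALLOC_CONF only once.
    os.environ["PYTORCH_NPU_ALLOC_CONF"] = (
        "max_split_size_mb:128,"            # must be > 20 (kLargeBuffer in MiB)
        "garbage_collection_threshold:0.6"  # must be strictly between 0.0 and 1.0
    )

    # Other keys recognized by parseArgs (expandable_segments is rejected or
    # silently disabled when combined with the two options above):
    #   expandable_segments:True
    #   base_addr_aligned_kb:16   (accepted range is 0..16)
    #   page_size:1g              (requests 1 GiB physical pages)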
+ size_t allowed_memory_maximum = 0; - // all live expandable segments - std::vector expandable_segments_; + // all live expandable segments + std::vector expandable_segments_; - bool set_fraction = false; + bool set_fraction = false; - bool record_history = false; + bool record_history = false; - std::atomic context_recorder_; - size_t alloc_trace_next = 0; - RecordContext record_context_ = RecordContext::NEVER; - size_t alloc_trace_max_entries_ = 1; - std::vector* - alloc_trace; // pointer because we need to intentionally leak this on - // deallocation it can hold references to Python state which - // will already be destroyed when we are in exit handlers + std::atomic context_recorder_; + size_t alloc_trace_next = 0; + RecordContext record_context_ = RecordContext::NEVER; + size_t alloc_trace_max_entries_ = 1; + std::vector *alloc_trace; // pointer because we need to intentionally leak this on + // deallocation it can hold references to Python state which + // will already be destroyed when we are in exit handlers - // XXX - maybe we should generalize and have multiple events - std::vector oom_observers_; + // XXX - maybe we should generalize and have multiple events + std::vector oom_observers_; std::shared_ptr hcclComm_; - // Private pools for NPU graphs - ska::flat_hash_map, MempoolIdHash> - graph_pools; - - // Pools no longer referenced by any graph. Their BlockPools are eligible for - // free_blocks. Can't be a vector or deque because we might erase entries in - // any order. Could be an std::list, but we don't care much, access and - // insert/erase are rare. - ska::flat_hash_map - graph_pools_freeable; - - // mapping from block to a stream_set, containing streams on which the block - // was used while npugraph capturing - std::unordered_map block_to_npugraph_stream_uses; - public: - - DeviceCachingAllocator() : - large_blocks(false), - small_blocks(true), - alloc_trace(new std::vector()) { - stats.max_split_size = static_cast(CachingAllocatorConfig::max_split_size()); - context_recorder_.store(nullptr); - } - - void recordHistory(bool enabled, CreateContextFn context_recorder, - size_t alloc_trace_max_entries, RecordContext when) - { - std::unique_lock lock(mutex); - TORCH_CHECK(when == RecordContext::NEVER || context_recorder, PTA_ERROR(ErrCode::INTERNAL)); - record_history = enabled; - context_recorder_.store(record_history ? context_recorder : nullptr); - alloc_trace_max_entries_ = std::max(size_t(1), alloc_trace_max_entries); - record_context_ = enabled ? 
when : RecordContext::NEVER; - alloc_trace_next = 0; - alloc_trace->clear(); - } - - bool isHistoryEnabled() { return record_history; } - - void attachOutOfMemoryObserver(OutOfMemoryObserver observer) - { - oom_observers_.emplace_back(observer); - } - - bool checkUceInMemPool() - { - auto memUceInfo_ = c10_npu::get_mem_uce_info(); - auto info = memUceInfo_.info; - const auto all_blocks = get_all_blocks(); - bool any_found = false; - aclrtMemUceInfo temp_info[memUceInfo_.retSize]; - size_t temp_retsize = 0; - - for (int i = 0; i < memUceInfo_.retSize; ++i) { - void* addr = info[i].addr; - size_t length = info[i].len; - bool found = false; - - // Calculate the start and end address for info[i] - void* addr_end = static_cast(addr) + length - 1; - - // Iterate through all blocks and check if there's an overlap with addr - for (const Block* const head_block : all_blocks) { - void* block_start = head_block->ptr; - void* block_end = static_cast(head_block->ptr) + head_block->size - 1; - - // If there is an overlap, mark the block as unsafe - if (addr <= block_end && addr_end >= block_start) { - const_cast(head_block)->is_safe = false; - ASCEND_LOGI("Memory block with UCE fault error found in the NPUCachingAllocator and was marked as unsafe"); - found = true; - any_found = true; - // Set the unsafe flag only once - if (c10_npu::get_npu_data_unsafe_flag() == false) { - c10_npu::set_npu_data_unsafe_flag(true); - } - } - } - - if (found) { - // update memuceinfo - temp_info[temp_retsize++] = info[i]; - } - } - - std::memcpy(memUceInfo_.info, temp_info, temp_retsize * sizeof(aclrtMemUceInfo)); - memUceInfo_.retSize = temp_retsize; - - c10_npu::set_mem_uce_info(memUceInfo_); - if (!any_found) { - return false; - } - return true; - } - - void markAllBlockUnsafe() - { - for (auto& active_block : active_blocks) { - active_block->is_safe = false; - } - return; - } - - // Must be called outside of `mutex` or deadlocks are possible with Python - std::shared_ptr maybeGatherContext(RecordContext level) - { - if (record_context_ < level) { - return nullptr; - } - return context_recorder_.load()(); - } - - // All public methods (except the above) acquire the allocator mutex. - // Thus, do not call a public method from another public method. - - Block* malloc(int device, size_t orig_size, aclrtStream stream, uint8_t allocator_type = 0) - { - // done outside the lock because we don't know what locks the recorder needs - // to have... - auto context = maybeGatherContext(RecordContext::STATE); - - std::unique_lock lock(mutex); - - if (device == -1) { - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - } + // Private pools for NPU graphs + ska::flat_hash_map, MempoolIdHash> graph_pools; - if (C10_LIKELY(captures_underway.empty())) { - // Processes end-of-life events for outstanding allocations used on - // multiple streams (checks if their NPU-side uses are complete and - // recycles their memory if so) - // - // Q. Why skip process_events if a capture might be underway? - // A. process_events involves npuEventQueries, illegal during NPU graph - // capture. - // Dumb simple solution: defer reclaiming these allocations until after - // capture. Cross-stream memory use is uncommon, so the deferral's - // effect on memory use during capture should be small. - process_events(context); - } - auto size = round_size(orig_size); - auto& pool = get_pool(size, stream); - - // 开环境变量 大池子放1G内存块 - const size_t alloc_size = IsMallocPage1GMem(pool.is_small) - ? 
kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) - : get_allocation_size(size); - AllocParams params(device, size, stream, &pool, alloc_size, stats); - params.stat_types = get_stat_types_for_pool(pool); - - // First, try to get a block from the existing pool. - bool block_found = - // Search pool - get_free_block(params) || - // Trigger callbacks and retry search - (trigger_free_memory_callbacks(params) && get_free_block(params)); - // Can't reuse an existing block; try to get a new one. - if (!block_found) { - // Do garbage collection if the flag is set. - if (C10_UNLIKELY(set_fraction && - CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { - garbage_collect_cached_blocks(context); - } - // Attempt allocate - block_found = alloc_block(params, false, context, lock) || - // Free enough available cached blocks to satisfy alloc and retry - // alloc. - (release_available_cached_blocks(params, context) && - alloc_block(params, false, context, lock)); - } - - if (!block_found && C10_LIKELY(captures_underway.empty())) { - ASCEND_LOGE( - "Get a block from the existing pool failed. Try to free cached blocks and reallocate. This error log " - "can be ignored."); - // Free all non-split cached blocks and retry alloc. - c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, true); - block_found = (release_cached_blocks(true, context) && alloc_block(params, true, context, lock)); - } - - if (!block_found) { - if (params.err == ACL_ERROR_RT_MEMORY_ALLOCATION) { - size_t device_free; - size_t device_total; - NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); + // Pools no longer referenced by any graph. Their BlockPools are eligible for + // free_blocks. Can't be a vector or deque because we might erase entries in + // any order. Could be an std::list, but we don't care much, access and + // insert/erase are rare. + ska::flat_hash_map graph_pools_freeable; - std::string allowed_info; - if (set_fraction) { - allowed_info = format_size(allowed_memory_maximum) + " allowed; "; - } - stats.num_ooms += 1; - - record_trace( - TraceEntry::OOM, - device_free, - params.size(), - params.stream(), - params.device(), - std::move(context)); - auto observers_local = oom_observers_; - - // Make sure we do not have the device lock before calling our - // observers which might need hold the GIL - // It is safe to release at this point because will no longer - // be reading any allocator state. - - lock.unlock(); - - for (const auto& obs : observers_local) { - obs(device, - alloc_size, - set_fraction ? allowed_memory_maximum : device_total, - device_free); - } - // "total capacity": total global memory on NPU - // "allowed": memory is allowed to use, which set by fraction. - // "already allocated": memory allocated by the program using the - // caching allocator - // "free": free memory as reported by the NPU API - // "cached": memory held by the allocator but not used by the program - // - // The "allocated" amount does not include memory allocated outside - // of the caching allocator, such as memory allocated by other programs - // or memory held by the driver. - // - // The sum of "allocated" + "free" + "cached" may be less than the - // total capacity due to memory held by the driver and usage by other - // programs. - // - // Note that at this point free_cached_blocks has already returned all - // possible "cached" memory to the driver. The only remaining "cached" - // memory is split from a larger block that is partially in-use. 
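The expression `kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer)` above is the standard integer round-up to a multiple of the allocation granularity, and `round_size()` later in this class applies the same pattern after padding the request by 32 bytes. A small self-contained sketch; the granularity values below are illustrative, not read from this file:

    #include <cstddef>
    #include <cstdio>

    // Round `size` up to the next multiple of `granularity` (granularity > 0).
    constexpr size_t round_up(size_t size, size_t granularity)
    {
        return granularity * ((size + granularity - 1) / granularity);
    }

    int main()
    {
        constexpr size_t kOneGiB = size_t{1} << 30;            // stand-in page granularity
        std::printf("%zu\n", round_up(1, kOneGiB));            // 1 GiB
        std::printf("%zu\n", round_up(kOneGiB + 1, kOneGiB));  // 2 GiB
        std::printf("%zu\n", round_up(700 + 32, 512));         // 1024, the round_size() pattern
        return 0;                                              // (assumes a 512-byte minimum block)
    }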
- AT_ERROR( - "NPU out of memory. Tried to allocate ", - format_size(alloc_size), - " (NPU ", device, "; ", - format_size(device_total), - " total capacity; ", - format_size(stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current), - " already allocated; ", - format_size(stats.active_bytes[static_cast(StatType::AGGREGATE)].current), - " current active; ", - format_size(device_free), - " free; ", - allowed_info, - format_size(stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current), - " reserved in total by PyTorch)", - " If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation."); - } else { - NPU_CHECK_ERROR(params.err); - } - } - - int64_t ori_block_ptr = int64_t(params.block->ptr); - size_t align_round = CachingAllocatorConfig::base_addr_aligned_size(); - if (params.size() >= kRoundLarge && CachingAllocatorConfig::expandable_segments() && align_round != 0 && - ori_block_ptr % align_round != 0) { - char* align_ptr = reinterpret_cast((ori_block_ptr + align_round) - (ori_block_ptr % align_round)); - size_t offset_size = align_ptr - (char*)params.block->ptr; - if (offset_size + params.size() <= params.block->size) { - auto size = params.block->size; - Block* remaining = params.block; - - Block* block = new Block(params.device(), params.stream(), size - offset_size, params.pool, align_ptr); - block->expandable_segment_ = remaining->expandable_segment_; - block->next = remaining->next; - if (block->next) { - block->next->prev = block; - } - block->prev = remaining; - - remaining->next = block; - remaining->size = offset_size; - params.pool->blocks.insert(remaining); - - params.block = block; - } - } - - bool split_remainder = should_split(params.block, params.size()); - return alloc_found_block( - std::move(params), orig_size, std::move(context), split_remainder, allocator_type); - } - - Block* alloc_found_block( - AllocParams params, - size_t orig_size, - std::shared_ptr context, - bool split_remainder, - uint8_t allocator_type) - { - auto size = params.size(); - auto device = params.device(); - auto pool = params.pool; - auto stream = params.stream(); - - TORCH_INTERNAL_ASSERT( - params.err == ACL_ERROR_NONE && params.block != nullptr && - params.block->ptr != nullptr, PTA_ERROR(ErrCode::PTR)); - Block* block = params.block; - Block* remaining = nullptr; - - const bool already_split = block->is_split(); - if (split_remainder) { - remaining = block; - - block = new Block(device, stream, size, pool, block->ptr); - block->expandable_segment_ = remaining->expandable_segment_; - block->prev = remaining->prev; - if (block->prev) { - block->prev->next = block; - } - block->next = remaining; - - remaining->prev = block; - remaining->ptr = static_cast(remaining->ptr) + size; - remaining->size -= size; - pool->blocks.insert(remaining); - - if (already_split && !block->expandable_segment_) { - // An already-split inactive block is being shrunk by size bytes. - update_stat_array( - stats.inactive_split_bytes, - -static_cast(block->size), - params.stat_types); - } else if (!block->expandable_segment_) { - // A new split inactive block is being created from a previously unsplit - // block, size remaining->size bytes. 
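The `align_ptr` arithmetic above bumps a block's start address up to the next multiple of the configured alignment and hands the skipped-over prefix back to the pool as its own block; the surrounding `if` guarantees the formula is only used when the address is not already aligned. The computation in isolation, with illustrative values:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // For an address that is NOT already aligned (the caller checks ptr % align != 0),
    // return the next multiple of `align` above `ptr`.
    uintptr_t align_up(uintptr_t ptr, uintptr_t align)
    {
        assert(align != 0 && ptr % align != 0);
        return (ptr + align) - (ptr % align);
    }

    int main()
    {
        const uintptr_t align = uintptr_t{2} * 1024 * 1024;  // hypothetical 2 MiB alignment
        const uintptr_t ptr = 0x10001000;                    // hypothetical unaligned block start
        const uintptr_t aligned = align_up(ptr, align);
        assert(aligned % align == 0 && aligned > ptr);
        std::printf("prefix returned to the pool: %zu bytes\n",
                    static_cast<size_t>(aligned - ptr));     // 0x1FF000 bytes in this example
        return 0;
    }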
- for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { - update_stat( - stats.inactive_split_bytes[stat_type], - static_cast(remaining->size)); - update_stat(stats.inactive_split[stat_type], 1); - }); - } - } else if (already_split && !block->expandable_segment_) { - // An already-split block is becoming active - for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { - update_stat( - stats.inactive_split_bytes[stat_type], - -static_cast(block->size)); - update_stat(stats.inactive_split[stat_type], -1); - }); - } - - block->allocated = true; - block->requested_size = orig_size; - if (block->is_safe == false) { - ASCEND_LOGI("Unsafe memory block is passively refreshed by releasing and mallocing memory again"); - } - block->is_safe = true; - - block->context_when_allocated = std::move(context); - record_trace( - TraceEntry::ALLOC, - int64_t(block->ptr), - orig_size, - block->stream, - block->device, - block->context_when_allocated); - - active_blocks.insert(block); - - for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { - update_stat(stats.allocation[stat_type], 1); - update_stat( - stats.allocated_bytes[stat_type], - static_cast(block->size)); - update_stat(stats.active[stat_type], 1); - update_stat( - stats.active_bytes[stat_type], - static_cast(block->size)); - update_stat( - stats.requested_bytes[stat_type], - static_cast(block->requested_size)); - }); - - if (block->size >= CachingAllocatorConfig::max_split_size()) - update_stat(stats.oversize_allocations, 1); - - ASCEND_LOGD("PTA CachingAllocator malloc: malloc = %zu, cached = %lu, allocated = %lu", - block->size, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); + // mapping from block to a stream_set, containing streams on which the block + // was used while npugraph capturing + std::unordered_map block_to_npugraph_stream_uses; -#ifndef BUILD_LIBTORCH - if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t heapDesc{block->device, block->ptr, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current}; - torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &heapDesc); - mstxMemVirtualRangeDesc_t regionDesc{block->device, block->ptr, block->size}; - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, ®ionDesc); - } - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), - allocator_type, - reinterpret_cast(block->ptr), - block->size, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif +public: + DeviceCachingAllocator() : large_blocks(false), small_blocks(true), alloc_trace(new std::vector()) + { + stats.max_split_size = static_cast(CachingAllocatorConfig::max_split_size()); + context_recorder_.store(nullptr); + } - return block; -} + void recordHistory(bool enabled, CreateContextFn context_recorder, size_t alloc_trace_max_entries, + RecordContext when) + { + std::unique_lock lock(mutex); + TORCH_CHECK(when == RecordContext::NEVER || 
context_recorder, PTA_ERROR(ErrCode::INTERNAL)); + record_history = enabled; + context_recorder_.store(record_history ? context_recorder : nullptr); + alloc_trace_max_entries_ = std::max(size_t(1), alloc_trace_max_entries); + record_context_ = enabled ? when : RecordContext::NEVER; + alloc_trace_next = 0; + alloc_trace->clear(); + } + bool isHistoryEnabled() + { + return record_history; + } - void free(Block* block, uint8_t allocator_type = 0) - { - std::shared_ptr context = - maybeGatherContext(RecordContext::ALL); - std::lock_guard lock(mutex); + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) + { + oom_observers_.emplace_back(observer); + } - block->allocated = false; + bool checkUceInMemPool() + { + auto memUceInfo_ = c10_npu::get_mem_uce_info(); + auto info = memUceInfo_.info; + const auto all_blocks = get_all_blocks(); + bool any_found = false; + aclrtMemUceInfo temp_info[memUceInfo_.retSize]; + size_t temp_retsize = 0; + + for (int i = 0; i < memUceInfo_.retSize; ++i) { + void *addr = info[i].addr; + size_t length = info[i].len; + bool found = false; + + // Calculate the start and end address for info[i] + void *addr_end = static_cast(addr) + length - 1; + + // Iterate through all blocks and check if there's an overlap with addr + for (const Block * const head_block : all_blocks) { + void *block_start = head_block->ptr; + void *block_end = static_cast(head_block->ptr) + head_block->size - 1; + + // If there is an overlap, mark the block as unsafe + if (addr <= block_end && addr_end >= block_start) { + const_cast(head_block)->is_safe = false; + ASCEND_LOGI( + "Memory block with UCE fault error found in the NPUCachingAllocator and was marked as unsafe"); + found = true; + any_found = true; + // Set the unsafe flag only once + if (c10_npu::get_npu_data_unsafe_flag() == false) { + c10_npu::set_npu_data_unsafe_flag(true); + } + } + } - // following logic might modifying underlaying Block, causing the size - // changed. We store ahead for reporting - auto orig_block_ptr = block->ptr; - auto orig_block_size = block->size; + if (found) { + // update memuceinfo + temp_info[temp_retsize++] = info[i]; + } + } - StatTypes stat_types = get_stat_types_for_pool(*(block->pool)); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - update_stat(stats.allocation[stat_type], -1); - update_stat(stats.allocated_bytes[stat_type], -block->size); - }); - - record_trace( - TraceEntry::FREE_REQUESTED, - int64_t(block->ptr), - block->requested_size, - block->stream, - block->device, - context ? context : block->context_when_allocated); - - if (block->size >= CachingAllocatorConfig::max_split_size()) - update_stat(stats.oversize_allocations, -1); - - if (!block->stream_uses.empty() && c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { - if (C10_UNLIKELY(!captures_underway.empty())) { - // It's forbidden to npuEventQuery an event recorded during NPU graph - // capture. 
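checkUceInMemPool() above flags a block as unsafe when the reported fault range `[addr, addr + len)` overlaps the block's `[ptr, ptr + size)`; with inclusive end addresses (`end = start + len - 1`) that is the classic interval-overlap test `addr <= block_end && addr_end >= block_start`. A self-contained sketch of the predicate:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // True if [a_start, a_start + a_len) and [b_start, b_start + b_len) overlap.
    // Uses the same inclusive-end comparison as the allocator's UCE scan.
    bool ranges_overlap(uintptr_t a_start, size_t a_len, uintptr_t b_start, size_t b_len)
    {
        if (a_len == 0 || b_len == 0) {
            return false;
        }
        const uintptr_t a_end = a_start + a_len - 1;
        const uintptr_t b_end = b_start + b_len - 1;
        return a_start <= b_end && a_end >= b_start;
    }

    int main()
    {
        assert(ranges_overlap(0x1000, 0x100, 0x10f0, 0x100));   // partial overlap
        assert(!ranges_overlap(0x1000, 0x100, 0x1100, 0x100));  // adjacent, no overlap
        assert(ranges_overlap(0x1000, 0x1000, 0x1400, 0x10));   // fully contained
        return 0;
    }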
We conservatively defer recording end-of-life events until - // the next call to process_events() (which won't happen until no - // captures are underway) - needs_events_deferred_until_no_capture.push_back(block); - } else { - insert_events(block); - } - } else { - free_block(block, context, allocator_type); - } + std::memcpy(memUceInfo_.info, temp_info, temp_retsize * sizeof(aclrtMemUceInfo)); + memUceInfo_.retSize = temp_retsize; - ASCEND_LOGD("PTA CachingAllocator free: free = %zu, cached = %lu, allocated = %lu", - orig_block_size, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); -#ifndef BUILD_LIBTORCH - if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { - mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain(torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); - mstxMemVirtualRangeDesc_t desc{block->device, orig_block_ptr, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current}; - torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); - torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); - } - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), - allocator_type, - reinterpret_cast(orig_block_ptr), - -orig_block_size, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif - } - - void* getBaseAllocation(Block* block, size_t* outSize) { - std::lock_guard lock(mutex); - while (block->prev) { - block = block->prev; - } - void* basePtr = block->ptr; - if (outSize) { - size_t size = 0; - while (block) { - size += block->size; - block = block->next; - } - *outSize = size; - } - return basePtr; - } - - void recordStream(Block* block, c10_npu::NPUStream stream) { - std::lock_guard lock(mutex); - block->stream_uses.insert(stream); - if (C10_UNLIKELY(!captures_underway.empty())) { - block_to_npugraph_stream_uses[block].insert(stream); + c10_npu::set_mem_uce_info(memUceInfo_); + if (!any_found) { + return false; } + return true; } - void eraseStream(Block* block, c10_npu::NPUStream stream) { - std::shared_ptr context = - maybeGatherContext(RecordContext::ALL); - std::lock_guard lock(mutex); - block->stream_uses.erase(stream); - - // free block, lazy destory block related events - for (auto it = npu_events[stream].begin(); it != npu_events[stream].end();) { - if (block != it->second) { - it++; - continue; - } - it = npu_events[stream].erase(it); - block->event_count--; - if (block->event_count == 0) { - free_block(block, context); - break; - } - } - } - - /** set memory fraction to limit maximum allocated memory **/ - void setMemoryFraction(double fraction) { - size_t device_free; - size_t device_total; - NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); - allowed_memory_maximum = static_cast(fraction * device_total); - set_fraction = true; - } - - /** returns cached blocks to the system allocator **/ - void emptyCache(int device, bool check_error) + void markAllBlockUnsafe() { - std::shared_ptr context = maybeGatherContext(RecordContext::ALL); - std::lock_guard lock(mutex); - c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, 
check_error); - release_cached_blocks(check_error, context); + for (auto &active_block : active_blocks) { + active_block->is_safe = false; + } + return; } - void buildServerMemMapForHccl(std::shared_ptr hcclComm) + // Must be called outside of `mutex` or deadlocks are possible with Python + std::shared_ptr maybeGatherContext(RecordContext level) { - std::unique_lock lock(mutex); - TORCH_INTERNAL_ASSERT(!hcclComm_, "Build HCCL server group redundancy.", PTA_ERROR(ErrCode::INTERNAL)); - hcclComm_ = hcclComm; - for (auto &expandable_segments: expandable_segments_) { - expandable_segments->setHcclComm(hcclComm); + if (record_context_ < level) { + return nullptr; } + return context_recorder_.load()(); } - void release_and_free_events() - { - std::unique_lock lock(mutex); - std::shared_ptr context = maybeGatherContext(RecordContext::ALL); - for (auto& st : npu_events) { - for (auto& e : st.second) { - EventPool::Event event = std::move(e.first); - Block* block = e.second; - block->event_count--; - if (block->event_count == 0) { - free_block(block, context); - } - } - } - npu_events.clear(); - } - - /** Retrieves info (total size + largest block) of the memory cache **/ - void cacheInfo(size_t* total, size_t* largest) { - std::lock_guard lock(mutex); - cache_info_aux(large_blocks, total, largest); - cache_info_aux(small_blocks, total, largest); - for (const auto& gp : graph_pools) { - cache_info_aux(gp.second->large_blocks, total, largest); - cache_info_aux(gp.second->small_blocks, total, largest); - } - } - - /** Returns a copy of the memory allocator stats **/ - DeviceStats getStats() { - std::lock_guard lock(mutex); - return stats; - } - - /** Resets the historical accumulation stats for the device **/ - void resetAccumulatedStats() { - std::lock_guard lock(mutex); - - for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { - reset_accumulated_stat(stats.allocation[statType]); - reset_accumulated_stat(stats.segment[statType]); - reset_accumulated_stat(stats.active[statType]); - reset_accumulated_stat(stats.inactive_split[statType]); - reset_accumulated_stat(stats.allocated_bytes[statType]); - reset_accumulated_stat(stats.reserved_bytes[statType]); - reset_accumulated_stat(stats.active_bytes[statType]); - reset_accumulated_stat(stats.inactive_split_bytes[statType]); - reset_accumulated_stat(stats.requested_bytes[statType]); - } - - stats.num_alloc_retries = 0; - stats.num_ooms = 0; - reset_accumulated_stat(stats.oversize_allocations); - reset_accumulated_stat(stats.oversize_segments); - } - - /** Resets the historical peak stats for the device **/ - void resetPeakStats() { - std::lock_guard lock(mutex); - - for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { - reset_peak_stat(stats.allocation[statType]); - reset_peak_stat(stats.segment[statType]); - reset_peak_stat(stats.active[statType]); - reset_peak_stat(stats.inactive_split[statType]); - reset_peak_stat(stats.allocated_bytes[statType]); - reset_peak_stat(stats.reserved_bytes[statType]); - reset_peak_stat(stats.active_bytes[statType]); - reset_peak_stat(stats.inactive_split_bytes[statType]); - reset_peak_stat(stats.requested_bytes[statType]); - } - - reset_peak_stat(stats.oversize_allocations); - reset_peak_stat(stats.oversize_segments); - } - - /** Dump a complete snapshot of the memory held by the allocator. Potentially VERY expensive. **/ - std::vector snapshot() + // All public methods (except the above) acquire the allocator mutex. 
+ // Thus, do not call a public method from another public method. + + Block *malloc(int device, size_t orig_size, aclrtStream stream, uint8_t allocator_type = 0) { - std::lock_guard lock(mutex); + // done outside the lock because we don't know what locks the recorder needs + // to have... + auto context = maybeGatherContext(RecordContext::STATE); - std::unordered_map pool_to_id; - pool_to_id.reserve(graph_pools.size() + graph_pools_freeable.size()); - for (const auto& pair : graph_pools) { - pool_to_id[pair.second.get()] = pair.first; + std::unique_lock lock(mutex); + + if (device == -1) { + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); } - for (const auto& pair : graph_pools_freeable) { - pool_to_id[pair.second] = pair.first; + + if (C10_LIKELY(captures_underway.empty())) { + // Processes end-of-life events for outstanding allocations used on + // multiple streams (checks if their NPU-side uses are complete and + // recycles their memory if so) + // + // Q. Why skip process_events if a capture might be underway? + // A. process_events involves npuEventQueries, illegal during NPU graph + // capture. + // Dumb simple solution: defer reclaiming these allocations until after + // capture. Cross-stream memory use is uncommon, so the deferral's + // effect on memory use during capture should be small. + process_events(context); + } + auto size = round_size(orig_size); + auto &pool = get_pool(size, stream); + + // 开环境变量 大池子放1G内存块 + const size_t alloc_size = IsMallocPage1GMem(pool.is_small) ? + kExtraLargeBuffer * ((size + kExtraLargeBuffer - 1) / kExtraLargeBuffer) : + get_allocation_size(size); + AllocParams params(device, size, stream, &pool, alloc_size, stats); + params.stat_types = get_stat_types_for_pool(pool); + + // First, try to get a block from the existing pool. + bool block_found = + // Search pool + get_free_block(params) || + // Trigger callbacks and retry search + (trigger_free_memory_callbacks(params) && get_free_block(params)); + // Can't reuse an existing block; try to get a new one. + if (!block_found) { + // Do garbage collection if the flag is set. + if (C10_UNLIKELY(set_fraction && CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { + garbage_collect_cached_blocks(context); + } + // Attempt allocate + block_found = alloc_block(params, false, context, lock) || + // Free enough available cached blocks to satisfy alloc and retry + // alloc. + (release_available_cached_blocks(params, context) && alloc_block(params, false, context, lock)); } - size_t total_active = 0; - std::vector result; - const auto all_blocks = get_all_blocks(); + if (!block_found && C10_LIKELY(captures_underway.empty())) { + ASCEND_LOGE( + "Get a block from the existing pool failed. Try to free cached blocks and reallocate. This error log " + "can be ignored."); + // Free all non-split cached blocks and retry alloc. 
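The allocation path above is a short-circuiting ladder: reuse a cached block, run the free-memory callbacks and search again, optionally garbage-collect, ask the device for new memory, release size-matched cached blocks and retry, and only then flush every cached block before reporting out-of-memory. A compressed sketch of that control flow with stand-in steps (the real steps are the functions named in the hunk above):

    #include <cstdio>
    #include <functional>
    #include <vector>

    // Later steps run only when earlier ones fail, like the || chain in malloc().
    bool allocate_with_fallbacks(const std::vector<std::function<bool()>> &steps)
    {
        for (const auto &step : steps) {
            if (step()) {
                return true;
            }
        }
        return false;  // the caller would raise the out-of-memory error here
    }

    int main()
    {
        int attempts = 0;
        const bool ok = allocate_with_fallbacks({
            [&] { ++attempts; return false; },  // search the cached free blocks
            [&] { ++attempts; return false; },  // free-memory callbacks, search again
            [&] { ++attempts; return false; },  // allocate a fresh segment from the device
            [&] { ++attempts; return true; },   // release cached blocks, then retry allocation
        });
        std::printf("succeeded=%d after %d steps\n", ok ? 1 : 0, attempts);
        return 0;
    }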
+ c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, true); + block_found = (release_cached_blocks(true, context) && alloc_block(params, true, context, lock)); + } - for (const Block* const head_block : all_blocks) { - // For expandable segments, we report one segment for each continguous - // mapped range of memory - if (head_block->prev && head_block->prev->mapped) { - continue; - } - result.emplace_back(); - SegmentInfo& segment_info = result.back(); - segment_info.device = head_block->device; - segment_info.address = reinterpret_cast(head_block->ptr); - segment_info.stream = head_block->stream; - segment_info.is_large = (!head_block->pool->is_small); - segment_info.is_expandable = head_block->expandable_segment_; - segment_info.context_when_allocated = - head_block->context_when_segment_allocated; - auto mempool_id = pool_to_id.find(head_block->pool->owner_PrivatePool); - if (mempool_id != pool_to_id.end()) { - segment_info.owner_private_pool_id = mempool_id->second; - } - const Block* block = head_block; - while (block != nullptr && block->mapped) { - segment_info.blocks.emplace_back(); - BlockInfo& block_info = segment_info.blocks.back(); + if (!block_found) { + if (params.err == ACL_ERROR_RT_MEMORY_ALLOCATION) { + size_t device_free; + size_t device_total; + NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); - block_info.size = block->size; - block_info.requested_size = block->requested_size; - block_info.allocated = block->allocated; - block_info.active = block->allocated || (block->event_count > 0); + std::string allowed_info; + if (set_fraction) { + allowed_info = format_size(allowed_memory_maximum) + " allowed; "; + } + stats.num_ooms += 1; - segment_info.total_size += block_info.size; - if (block_info.allocated) { - segment_info.allocated_size += block_info.size; + record_trace(TraceEntry::OOM, device_free, params.size(), params.stream(), params.device(), + std::move(context)); + auto observers_local = oom_observers_; + + // Make sure we do not have the device lock before calling our + // observers which might need hold the GIL + // It is safe to release at this point because will no longer + // be reading any allocator state. + + lock.unlock(); + + for (const auto &obs : observers_local) { + obs(device, alloc_size, set_fraction ? allowed_memory_maximum : device_total, device_free); } - if (block_info.active) { + // "total capacity": total global memory on NPU + // "allowed": memory is allowed to use, which set by fraction. + // "already allocated": memory allocated by the program using the + // caching allocator + // "free": free memory as reported by the NPU API + // "cached": memory held by the allocator but not used by the program + // + // The "allocated" amount does not include memory allocated outside + // of the caching allocator, such as memory allocated by other programs + // or memory held by the driver. + // + // The sum of "allocated" + "free" + "cached" may be less than the + // total capacity due to memory held by the driver and usage by other + // programs. + // + // Note that at this point free_cached_blocks has already returned all + // possible "cached" memory to the driver. The only remaining "cached" + // memory is split from a larger block that is partially in-use. + AT_ERROR("NPU out of memory. 
Tried to allocate ", format_size(alloc_size), " (NPU ", device, "; ", + format_size(device_total), " total capacity; ", + format_size(stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current), + " already allocated; ", + format_size(stats.active_bytes[static_cast(StatType::AGGREGATE)].current), + " current active; ", format_size(device_free), " free; ", allowed_info, + format_size(stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current), + " reserved in total by PyTorch)", + " If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation."); + } else { + NPU_CHECK_ERROR(params.err); + } + } + + int64_t ori_block_ptr = int64_t(params.block->ptr); + size_t align_round = CachingAllocatorConfig::base_addr_aligned_size(); + if (params.size() >= kRoundLarge && CachingAllocatorConfig::expandable_segments() && align_round != 0 && + ori_block_ptr % align_round != 0) { + char *align_ptr = reinterpret_cast((ori_block_ptr + align_round) - (ori_block_ptr % align_round)); + size_t offset_size = align_ptr - (char *)params.block->ptr; + if (offset_size + params.size() <= params.block->size) { + auto size = params.block->size; + Block *remaining = params.block; + + Block *block = new Block(params.device(), params.stream(), size - offset_size, params.pool, align_ptr); + block->expandable_segment_ = remaining->expandable_segment_; + block->next = remaining->next; + if (block->next) { + block->next->prev = block; + } + block->prev = remaining; + + remaining->next = block; + remaining->size = offset_size; + params.pool->blocks.insert(remaining); + + params.block = block; + } + } + + bool split_remainder = should_split(params.block, params.size()); + return alloc_found_block(std::move(params), orig_size, std::move(context), split_remainder, allocator_type); + } + + Block *alloc_found_block(AllocParams params, size_t orig_size, std::shared_ptr context, + bool split_remainder, uint8_t allocator_type) + { + auto size = params.size(); + auto device = params.device(); + auto pool = params.pool; + auto stream = params.stream(); + + TORCH_INTERNAL_ASSERT(params.err == ACL_ERROR_NONE && params.block != nullptr && params.block->ptr != nullptr, + PTA_ERROR(ErrCode::PTR)); + Block *block = params.block; + Block *remaining = nullptr; + + const bool already_split = block->is_split(); + if (split_remainder) { + remaining = block; + + block = new Block(device, stream, size, pool, block->ptr); + block->expandable_segment_ = remaining->expandable_segment_; + block->prev = remaining->prev; + if (block->prev) { + block->prev->next = block; + } + block->next = remaining; + + remaining->prev = block; + remaining->ptr = static_cast(remaining->ptr) + size; + remaining->size -= size; + pool->blocks.insert(remaining); + + if (already_split && !block->expandable_segment_) { + // An already-split inactive block is being shrunk by size bytes. + update_stat_array(stats.inactive_split_bytes, -static_cast(block->size), + params.stat_types); + } else if (!block->expandable_segment_) { + // A new split inactive block is being created from a previously unsplit + // block, size remaining->size bytes. 
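When `split_remainder` is set, `alloc_found_block()` carves the requested size off the front of the block it found: a fresh `Block` covers `[ptr, ptr + size)`, the original block shrinks to the tail, goes back into the free pool, and the prev/next chain is stitched around the new node. A minimal sketch of that list surgery on a toy block type (field and function names here are illustrative):

    #include <cassert>
    #include <cstddef>

    struct MiniBlock {
        char *ptr;
        size_t size;
        MiniBlock *prev;
        MiniBlock *next;
    };

    // Split `found` so the returned node holds exactly `request` bytes at the front;
    // `found` becomes the remainder and stays linked immediately after the new node.
    MiniBlock *split_front(MiniBlock *found, size_t request)
    {
        assert(found != nullptr && request < found->size);
        MiniBlock *block = new MiniBlock{found->ptr, request, found->prev, found};
        if (block->prev) {
            block->prev->next = block;
        }
        found->prev = block;
        found->ptr += request;
        found->size -= request;
        return block;  // the remainder (`found`) would be reinserted into the free pool here
    }

    int main()
    {
        char backing[1024];
        MiniBlock *seg = new MiniBlock{backing, sizeof(backing), nullptr, nullptr};
        MiniBlock *alloc = split_front(seg, 256);
        assert(alloc->size == 256 && seg->size == 768 && seg->ptr == backing + 256);
        assert(alloc->next == seg && seg->prev == alloc);
        delete alloc;
        delete seg;
        return 0;
    }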
+ for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { + update_stat(stats.inactive_split_bytes[stat_type], static_cast(remaining->size)); + update_stat(stats.inactive_split[stat_type], 1); + }); + } + } else if (already_split && !block->expandable_segment_) { + // An already-split block is becoming active + for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { + update_stat(stats.inactive_split_bytes[stat_type], -static_cast(block->size)); + update_stat(stats.inactive_split[stat_type], -1); + }); + } + + block->allocated = true; + block->requested_size = orig_size; + if (block->is_safe == false) { + ASCEND_LOGI("Unsafe memory block is passively refreshed by releasing and mallocing memory again"); + } + block->is_safe = true; + + block->context_when_allocated = std::move(context); + record_trace(TraceEntry::ALLOC, int64_t(block->ptr), orig_size, block->stream, block->device, + block->context_when_allocated); + + active_blocks.insert(block); + + for_each_selected_stat_type(params.stat_types, [&](size_t stat_type) { + update_stat(stats.allocation[stat_type], 1); + update_stat(stats.allocated_bytes[stat_type], static_cast(block->size)); + update_stat(stats.active[stat_type], 1); + update_stat(stats.active_bytes[stat_type], static_cast(block->size)); + update_stat(stats.requested_bytes[stat_type], static_cast(block->requested_size)); + }); + + if (block->size >= CachingAllocatorConfig::max_split_size()) { + update_stat(stats.oversize_allocations, 1); + } + + ASCEND_LOGD("PTA CachingAllocator malloc: malloc = %zu, cached = %lu, allocated = %lu", block->size, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); + +#ifndef BUILD_LIBTORCH + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain( + torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t heapDesc{ block->device, block->ptr, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current }; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &heapDesc); + mstxMemVirtualRangeDesc_t regionDesc{ block->device, block->ptr, block->size }; + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsRegister(msleaksDomain, ®ionDesc); + } + torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), + block->device, static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), allocator_type, + reinterpret_cast(block->ptr), block->size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream) }); +#endif + + return block; + } + + + void free(Block *block, uint8_t allocator_type = 0) + { + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + std::lock_guard lock(mutex); + + block->allocated = false; + + // following logic might modifying underlaying Block, causing the size + // changed. 
We store ahead for reporting + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + + StatTypes stat_types = get_stat_types_for_pool(*(block->pool)); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + update_stat(stats.allocation[stat_type], -1); + update_stat(stats.allocated_bytes[stat_type], -block->size); + }); + + record_trace(TraceEntry::FREE_REQUESTED, int64_t(block->ptr), block->requested_size, block->stream, + block->device, context ? context : block->context_when_allocated); + + if (block->size >= CachingAllocatorConfig::max_split_size()) { + update_stat(stats.oversize_allocations, -1); + } + + if (!block->stream_uses.empty() && c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { + if (C10_UNLIKELY(!captures_underway.empty())) { + // It's forbidden to npuEventQuery an event recorded during NPU graph + // capture. We conservatively defer recording end-of-life events until + // the next call to process_events() (which won't happen until no + // captures are underway) + needs_events_deferred_until_no_capture.push_back(block); + } else { + insert_events(block); + } + } else { + free_block(block, context, allocator_type); + } + + ASCEND_LOGD("PTA CachingAllocator free: free = %zu, cached = %lu, allocated = %lu", orig_block_size, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); +#ifndef BUILD_LIBTORCH + if (torch_npu::profiler::MstxMgr::GetInstance()->isMsleaksEnable()) { + mstxDomainHandle_t msleaksDomain = torch_npu::profiler::MstxMgr::GetInstance()->createLeaksDomain( + torch_npu::profiler::DOMAIN_MSLEAKS.c_str()); + mstxMemVirtualRangeDesc_t desc{ block->device, orig_block_ptr, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current }; + torch_npu::profiler::MstxMgr::GetInstance()->memHeapRegister(msleaksDomain, &desc); + torch_npu::profiler::MstxMgr::GetInstance()->memRegionsUnregister(msleaksDomain, orig_block_ptr); + } + torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), + block->device, static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), allocator_type, + reinterpret_cast(orig_block_ptr), -orig_block_size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream) }); +#endif + } + + void *getBaseAllocation(Block *block, size_t *outSize) + { + std::lock_guard lock(mutex); + while (block->prev) { + block = block->prev; + } + void *basePtr = block->ptr; + if (outSize) { + size_t size = 0; + while (block) { + size += block->size; + block = block->next; + } + *outSize = size; + } + return basePtr; + } + + void recordStream(Block *block, c10_npu::NPUStream stream) + { + std::lock_guard lock(mutex); + block->stream_uses.insert(stream); + if (C10_UNLIKELY(!captures_underway.empty())) { + block_to_npugraph_stream_uses[block].insert(stream); + } + } + + void eraseStream(Block *block, c10_npu::NPUStream stream) + { + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + std::lock_guard lock(mutex); + block->stream_uses.erase(stream); + + // free block, lazy destory block related events + for (auto it = npu_events[stream].begin(); it != npu_events[stream].end();) { + if (block != it->second) { + it++; + continue; + } + it = npu_events[stream].erase(it); + block->event_count--; + if (block->event_count == 0) { + 
free_block(block, context); + break; + } + } + } + + /* * set memory fraction to limit maximum allocated memory * */ + void setMemoryFraction(double fraction) + { + size_t device_free; + size_t device_total; + NPU_CHECK_ERROR(aclrtGetMemInfo(ACL_HBM_MEM, &device_free, &device_total)); + allowed_memory_maximum = static_cast(fraction * device_total); + set_fraction = true; + } + + /* * returns cached blocks to the system allocator * */ + void emptyCache(int device, bool check_error) + { + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + std::lock_guard lock(mutex); + c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, check_error); + release_cached_blocks(check_error, context); + } + + void buildServerMemMapForHccl(std::shared_ptr hcclComm) + { + std::unique_lock lock(mutex); + TORCH_INTERNAL_ASSERT(!hcclComm_, "Build HCCL server group redundancy.", PTA_ERROR(ErrCode::INTERNAL)); + hcclComm_ = hcclComm; + for (auto &expandable_segments : expandable_segments_) { + expandable_segments->setHcclComm(hcclComm); + } + } + + void release_and_free_events() + { + std::unique_lock lock(mutex); + std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + for (auto &st : npu_events) { + for (auto &e : st.second) { + EventPool::Event event = std::move(e.first); + Block *block = e.second; + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + } + } + npu_events.clear(); + } + + /* * Retrieves info (total size + largest block) of the memory cache * */ + void cacheInfo(size_t *total, size_t *largest) + { + std::lock_guard lock(mutex); + cache_info_aux(large_blocks, total, largest); + cache_info_aux(small_blocks, total, largest); + for (const auto &gp : graph_pools) { + cache_info_aux(gp.second->large_blocks, total, largest); + cache_info_aux(gp.second->small_blocks, total, largest); + } + } + + /* * Returns a copy of the memory allocator stats * */ + DeviceStats getStats() + { + std::lock_guard lock(mutex); + return stats; + } + + /* * Resets the historical accumulation stats for the device * */ + void resetAccumulatedStats() + { + std::lock_guard lock(mutex); + + for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { + reset_accumulated_stat(stats.allocation[statType]); + reset_accumulated_stat(stats.segment[statType]); + reset_accumulated_stat(stats.active[statType]); + reset_accumulated_stat(stats.inactive_split[statType]); + reset_accumulated_stat(stats.allocated_bytes[statType]); + reset_accumulated_stat(stats.reserved_bytes[statType]); + reset_accumulated_stat(stats.active_bytes[statType]); + reset_accumulated_stat(stats.inactive_split_bytes[statType]); + reset_accumulated_stat(stats.requested_bytes[statType]); + } + + stats.num_alloc_retries = 0; + stats.num_ooms = 0; + reset_accumulated_stat(stats.oversize_allocations); + reset_accumulated_stat(stats.oversize_segments); + } + + /* * Resets the historical peak stats for the device * */ + void resetPeakStats() + { + std::lock_guard lock(mutex); + + for (size_t statType = 0; statType < static_cast(StatType::NUM_TYPES); ++statType) { + reset_peak_stat(stats.allocation[statType]); + reset_peak_stat(stats.segment[statType]); + reset_peak_stat(stats.active[statType]); + reset_peak_stat(stats.inactive_split[statType]); + reset_peak_stat(stats.allocated_bytes[statType]); + reset_peak_stat(stats.reserved_bytes[statType]); + reset_peak_stat(stats.active_bytes[statType]); + reset_peak_stat(stats.inactive_split_bytes[statType]); + 
reset_peak_stat(stats.requested_bytes[statType]); + } + + reset_peak_stat(stats.oversize_allocations); + reset_peak_stat(stats.oversize_segments); + } + + /* * Dump a complete snapshot of the memory held by the allocator. Potentially VERY expensive. * */ + std::vector snapshot() + { + std::lock_guard lock(mutex); + + std::unordered_map pool_to_id; + pool_to_id.reserve(graph_pools.size() + graph_pools_freeable.size()); + for (const auto &pair : graph_pools) { + pool_to_id[pair.second.get()] = pair.first; + } + for (const auto &pair : graph_pools_freeable) { + pool_to_id[pair.second] = pair.first; + } + + size_t total_active = 0; + std::vector result; + const auto all_blocks = get_all_blocks(); + + for (const Block * const head_block : all_blocks) { + // For expandable segments, we report one segment for each continguous + // mapped range of memory + if (head_block->prev && head_block->prev->mapped) { + continue; + } + result.emplace_back(); + SegmentInfo &segment_info = result.back(); + segment_info.device = head_block->device; + segment_info.address = reinterpret_cast(head_block->ptr); + segment_info.stream = head_block->stream; + segment_info.is_large = (!head_block->pool->is_small); + segment_info.is_expandable = head_block->expandable_segment_; + segment_info.context_when_allocated = head_block->context_when_segment_allocated; + auto mempool_id = pool_to_id.find(head_block->pool->owner_PrivatePool); + if (mempool_id != pool_to_id.end()) { + segment_info.owner_private_pool_id = mempool_id->second; + } + const Block *block = head_block; + while (block != nullptr && block->mapped) { + segment_info.blocks.emplace_back(); + BlockInfo &block_info = segment_info.blocks.back(); + + block_info.size = block->size; + block_info.requested_size = block->requested_size; + block_info.allocated = block->allocated; + block_info.active = block->allocated || (block->event_count > 0); + + segment_info.total_size += block_info.size; + if (block_info.allocated) { + segment_info.allocated_size += block_info.size; + } + if (block_info.active) { segment_info.active_size += block_info.size; segment_info.requested_size += block_info.requested_size; } @@ -1705,42 +1620,37 @@ class DeviceCachingAllocator { } std::sort(result.begin(), result.end(), - [](const SegmentInfo& a, const SegmentInfo& b) { - return a.address < b.address; - }); + [](const SegmentInfo &a, const SegmentInfo &b) { return a.address < b.address; }); record_trace(TraceEntry::SNAPSHOT, 0, total_active, nullptr, 0, nullptr); return result; } - std::vector trace() - { - std::lock_guard lock(mutex); - std::vector result; - result.reserve(alloc_trace->size()); - result.insert(result.end(), alloc_trace->begin() + alloc_trace_next, - alloc_trace->end()); - result.insert(result.end(), alloc_trace->begin(), - alloc_trace->begin() + alloc_trace_next); + std::vector trace() + { + std::lock_guard lock(mutex); + std::vector result; + result.reserve(alloc_trace->size()); + result.insert(result.end(), alloc_trace->begin() + alloc_trace_next, alloc_trace->end()); + result.insert(result.end(), alloc_trace->begin(), alloc_trace->begin() + alloc_trace_next); - return result; - } + return result; + } - static size_t round_size(size_t size) { - size = size + 32; - if (size < kMinBlockSize) { - return kMinBlockSize; - } else { - return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); + static size_t round_size(size_t size) + { + size = size + 32; + if (size < kMinBlockSize) { + return kMinBlockSize; + } else { + return kMinBlockSize * ((size + 
kMinBlockSize - 1) / kMinBlockSize); + } } - } // See Note [Interaction with NPU graph capture] // Called by NPUGraph::capture_begin - void beginAllocateToPool( - MempoolId_t mempool_id, - std::function filter) + void beginAllocateToPool(MempoolId_t mempool_id, std::function filter) { std::lock_guard lock(mutex); auto it = graph_pools.find(mempool_id); @@ -1755,11 +1665,8 @@ class DeviceCachingAllocator { TORCH_INTERNAL_ASSERT(it->second->use_count > 0); it->second->use_count++; } - for (auto it2 = captures_underway.begin(); it2 != captures_underway.end(); - ++it2) { - TORCH_CHECK( - it2->first != mempool_id, - "beginAllocateToPool: already recording to mempool_id"); + for (auto it2 = captures_underway.begin(); it2 != captures_underway.end(); ++it2) { + TORCH_CHECK(it2->first != mempool_id, "beginAllocateToPool: already recording to mempool_id"); } captures_underway.emplace_back(mempool_id, std::move(filter)); } @@ -1774,8 +1681,7 @@ class DeviceCachingAllocator { return; } } - TORCH_CHECK( - false, "endAllocatePool: not currently recording to mempool_id"); + TORCH_CHECK(false, "endAllocatePool: not currently recording to mempool_id"); } // Called by NPUGraph::reset @@ -1799,314 +1705,260 @@ class DeviceCachingAllocator { // Allows free_cached_blocks to begin npuFreeing this pool's memory, // and makes sure this pool wasn't somehow made freeable already. // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) - bool inserted = - graph_pools_freeable.insert({mempool_id, it->second.get()}).second; + bool inserted = graph_pools_freeable.insert({ mempool_id, it->second.get() }).second; TORCH_INTERNAL_ASSERT(inserted); } } - private: - - // All private methods do not acquire the allocator mutex. - - std::vector get_all_blocks() const { - std::vector blocks; - blocks.insert(blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end()); - blocks.insert(blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end()); - for (const auto& gp : graph_pools) { - blocks.insert( - blocks.end(), - gp.second->small_blocks.blocks.begin(), - gp.second->small_blocks.blocks.end()); - blocks.insert( - blocks.end(), - gp.second->large_blocks.blocks.begin(), - gp.second->large_blocks.blocks.end()); - } - blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); - return blocks; - } - - // returns the smallest possible address in any segment - // where there is enough free address space to fit size - // may be composed of free and unmapped segments - Block* find_expandable_block( - int device, - aclrtStream stream, - BlockPool* pool, - size_t size) { - Block key(device, stream, 0); - - auto allocatable = [](Block* b) { - return b && !b->allocated && b->event_count == 0 && - b->stream_uses.empty(); - }; - auto has_available_address_space = [&](Block* b) { - size_t bytes = 0; - while (bytes < size && allocatable(b)) { - bytes += b->size; - b = b->next; - } - return bytes >= size; - }; - for (auto it = pool->unmapped.lower_bound(&key); - it != pool->unmapped.end() && (*it)->stream == stream; - ++it) { - Block* c = *it; - // we found the lowest address of an unmapped segment - // but there might be a free segment we can also use - // right before it - if (allocatable(c->prev)) { - c = c->prev; - } - if (has_available_address_space(c)) { - return c; - } - } - auto segment_size = pool->is_small ? kSmallBuffer : ( - c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? 
kLargeBufferForHccl : kLargeBuffer - ); - // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 - if (IsMallocPage1GMem(pool->is_small)) { - segment_size = kExtraLargeBuffer; - } - auto segment = new ExpandableSegment(device, stream, segment_size); - if (hcclComm_) { - segment->setHcclComm(hcclComm_); - } - expandable_segments_.emplace_back(segment); - - ExpandableSegment* es = expandable_segments_.back(); - Block* candidate = new Block(device, stream, es->size(), pool, es->ptr()); - candidate->mapped = false; - candidate->expandable_segment_ = es; - pool->unmapped.insert(candidate); - return candidate; - } - - bool map_block( - Block* to_map, - size_t size, - const std::shared_ptr& ctx) - { - TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size, PTA_ERROR(ErrCode::VALUE)); - TORCH_INTERNAL_ASSERT( - !to_map->context_when_allocated); // unmapped blocks should not keep - // history - auto mapped_range = - to_map->expandable_segment_->map(SegmentRange{to_map->ptr, size}); - // failed to map the memory - if (mapped_range.size == 0) { - return false; - } - TORCH_INTERNAL_ASSERT( - mapped_range.ptr == to_map->ptr && mapped_range.size >= size, PTA_ERROR(ErrCode::INTERNAL)); - - BlockPool& pool = *to_map->pool; - pool.unmapped.erase(to_map); - to_map->mapped = true; - - if (mapped_range.size < to_map->size) { - // to_map -> remaining -> to_map->next(?) - Block* remaining = new Block( - to_map->device, - to_map->stream, - to_map->size - mapped_range.size, - &pool, - static_cast(to_map->ptr) + mapped_range.size); - remaining->mapped = false; - remaining->expandable_segment_ = to_map->expandable_segment_; - remaining->splice(to_map, to_map->next); - pool.unmapped.insert(remaining); - to_map->size = mapped_range.size; - } - - try_merge_blocks(to_map, to_map->prev, pool); - try_merge_blocks(to_map, to_map->next, pool); - - pool.blocks.insert(to_map); - - // update statistics - total_allocated_memory += mapped_range.size; - StatTypes stat_types = get_stat_types_for_pool(*to_map->pool); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - update_stat(stats.reserved_bytes[stat_type], mapped_range.size); - }); - record_trace( - TraceEntry::SEGMENT_MAP, - int64_t(mapped_range.ptr), - mapped_range.size, - to_map->stream, - to_map->device, - ctx); - if (!to_map->prev && !to_map->context_when_segment_allocated) { - to_map->context_when_segment_allocated = ctx; - } - - return true; - } - - Block* try_allocate_expandable_block( - int device, - aclrtStream stream, - BlockPool* pool, - size_t size, - const std::shared_ptr& ctx) - { - Block* candidate = find_expandable_block(device, stream, pool, size); - // Candidate is now a list free/unmapped blocks with at least size room: - // unmapped -> null - // unmapped -> free -> * - // free -> unmapped -> * - - if (!candidate->mapped && - !map_block(candidate, std::min(candidate->size, size), ctx)) { - return nullptr; - } - TORCH_INTERNAL_ASSERT(candidate->mapped, PTA_ERROR(ErrCode::INTERNAL)); - - while (candidate->size < size) { - // invariant: free -> unmapped -> * - // map_block will map some of unmapped and merge with free - auto remaining = size - candidate->size; - auto new_candidate = candidate->next; - if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) { - return nullptr; - } - candidate = new_candidate; - } - pool->blocks.erase(candidate); - return candidate; - } - - - /** moves a block into a pool of cached free blocks **/ - void free_block( - Block* block, - const std::shared_ptr& context, - uint8_t allocator_type = 0) - 
{ - AT_ASSERT(!block->allocated && block->event_count == 0, PTA_ERROR(ErrCode::VALUE)); - - record_trace( - TraceEntry::FREE_COMPLETED, - int64_t(block->ptr), - block->requested_size, - block->stream, - block->device, - context ? context : block->context_when_allocated); - - block->context_when_allocated = nullptr; - size_t original_block_size = block->size; - auto orig_block_ptr = block->ptr; - size_t requested_size = block->requested_size; - - auto& pool = *block->pool; - int64_t net_change_inactive_split_blocks = 0; - int64_t net_change_inactive_split_size = 0; - - const std::array merge_candidates = {block->prev, block->next}; - for (Block* merge_candidate : merge_candidates) { - const int64_t subsumed_size = static_cast(try_merge_blocks(block, merge_candidate, pool)); - if (subsumed_size > 0) { - net_change_inactive_split_blocks -= 1; - net_change_inactive_split_size -= subsumed_size; - } - } - - active_blocks.erase(block); - pool.blocks.insert(block); - - if (block->is_split()) { - net_change_inactive_split_blocks += 1; - net_change_inactive_split_size += static_cast(block->size); - } - - StatTypes stat_types = get_stat_types_for_pool(pool); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - // inactive_split tries to capture the idea that blocks - // cannot be freed when requested, but fully free pages - // of expandable blocks can always be freed. - // The logic to track this as statistic is pretty involved, - // so we simply just exclude expandable segements from - // inactive_split - if (!block->expandable_segment_) { - update_stat( - stats.inactive_split[stat_type], net_change_inactive_split_blocks); - update_stat( - stats.inactive_split_bytes[stat_type], - net_change_inactive_split_size); - } - update_stat(stats.active[stat_type], -1); - update_stat(stats.active_bytes[stat_type], -original_block_size); - update_stat( - stats.requested_bytes[stat_type], - -static_cast(requested_size)); - }); -#ifndef BUILD_LIBTORCH - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_BLOCK_FREE), - allocator_type, - reinterpret_cast(orig_block_ptr), - -original_block_size, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif - } +private: + // All private methods do not acquire the allocator mutex. - /** combine previously split blocks. returns the size of the subsumed block, or 0 on failure. 
**/ - size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) { - if (!src || src->allocated || src->event_count > 0 || - !src->stream_uses.empty() || dst->mapped != src->mapped) { - return 0; + std::vector get_all_blocks() const + { + std::vector blocks; + blocks.insert(blocks.end(), small_blocks.blocks.begin(), small_blocks.blocks.end()); + blocks.insert(blocks.end(), large_blocks.blocks.begin(), large_blocks.blocks.end()); + for (const auto &gp : graph_pools) { + blocks.insert(blocks.end(), gp.second->small_blocks.blocks.begin(), gp.second->small_blocks.blocks.end()); + blocks.insert(blocks.end(), gp.second->large_blocks.blocks.begin(), gp.second->large_blocks.blocks.end()); + } + blocks.insert(blocks.end(), active_blocks.begin(), active_blocks.end()); + return blocks; } - AT_ASSERT(dst->is_split() && src->is_split(), PTA_ERROR(ErrCode::VALUE)); + // returns the smallest possible address in any segment + // where there is enough free address space to fit size + // may be composed of free and unmapped segments + Block *find_expandable_block(int device, aclrtStream stream, BlockPool *pool, size_t size) + { + Block key(device, stream, 0); + + auto allocatable = [](Block *b) { return b && !b->allocated && b->event_count == 0 && b->stream_uses.empty(); }; + auto has_available_address_space = [&](Block *b) { + size_t bytes = 0; + while (bytes < size && allocatable(b)) { + bytes += b->size; + b = b->next; + } + return bytes >= size; + }; + for (auto it = pool->unmapped.lower_bound(&key); it != pool->unmapped.end() && (*it)->stream == stream; ++it) { + Block *c = *it; + // we found the lowest address of an unmapped segment + // but there might be a free segment we can also use + // right before it + if (allocatable(c->prev)) { + c = c->prev; + } + if (has_available_address_space(c)) { + return c; + } + } + auto segment_size = pool->is_small ? + kSmallBuffer : + (c10_npu::option::OptionsManager::IsHcclZeroCopyEnable() ? kLargeBufferForHccl : kLargeBuffer); + // 此处申请虚拟内存,segment_size是页大小,实际虚拟内存巨大 + if (IsMallocPage1GMem(pool->is_small)) { + segment_size = kExtraLargeBuffer; + } + auto segment = new ExpandableSegment(device, stream, segment_size); + if (hcclComm_) { + segment->setHcclComm(hcclComm_); + } + expandable_segments_.emplace_back(segment); - if (dst->prev == src) { - dst->ptr = src->ptr; - dst->prev = src->prev; - if (dst->prev) { - dst->prev->next = dst; - } - } else { - dst->next = src->next; - if (dst->next) { - dst->next->prev = dst; - } + ExpandableSegment *es = expandable_segments_.back(); + Block *candidate = new Block(device, stream, es->size(), pool, es->ptr()); + candidate->mapped = false; + candidate->expandable_segment_ = es; + pool->unmapped.insert(candidate); + return candidate; + } + + bool map_block(Block *to_map, size_t size, const std::shared_ptr &ctx) + { + TORCH_INTERNAL_ASSERT(!to_map->mapped && size <= to_map->size, PTA_ERROR(ErrCode::VALUE)); + TORCH_INTERNAL_ASSERT(!to_map->context_when_allocated); // unmapped blocks should not keep + // history + auto mapped_range = to_map->expandable_segment_->map(SegmentRange{ to_map->ptr, size }); + // failed to map the memory + if (mapped_range.size == 0) { + return false; + } + TORCH_INTERNAL_ASSERT(mapped_range.ptr == to_map->ptr && mapped_range.size >= size, + PTA_ERROR(ErrCode::INTERNAL)); + + BlockPool &pool = *to_map->pool; + pool.unmapped.erase(to_map); + to_map->mapped = true; + + if (mapped_range.size < to_map->size) { + // to_map -> remaining -> to_map->next(?) 
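find_expandable_block() above walks forward from a candidate and sums the sizes of consecutive reusable neighbours until the request fits, so a single allocation can be backed by a mix of already-free and not-yet-mapped pieces of one reserved virtual range. The accumulation reduced to its core, on a toy node type:

    #include <cassert>
    #include <cstddef>

    struct Node {
        size_t size;
        bool allocatable;  // free or unmapped: not in use and with no pending events
        Node *next;
    };

    // True if consecutive allocatable nodes starting at `b` cover at least `want` bytes.
    bool has_available_address_space(const Node *b, size_t want)
    {
        size_t bytes = 0;
        while (bytes < want && b != nullptr && b->allocatable) {
            bytes += b->size;
            b = b->next;
        }
        return bytes >= want;
    }

    int main()
    {
        Node c{64, true, nullptr};
        Node b{128, true, &c};
        Node a{256, true, &b};
        assert(has_available_address_space(&a, 400));   // 256 + 128 + 64 covers 400
        c.allocatable = false;
        assert(!has_available_address_space(&a, 400));  // chain of reusable blocks too short
        return 0;
    }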
+ Block *remaining = new Block(to_map->device, to_map->stream, to_map->size - mapped_range.size, &pool, + static_cast(to_map->ptr) + mapped_range.size); + remaining->mapped = false; + remaining->expandable_segment_ = to_map->expandable_segment_; + remaining->splice(to_map, to_map->next); + pool.unmapped.insert(remaining); + to_map->size = mapped_range.size; + } + + try_merge_blocks(to_map, to_map->prev, pool); + try_merge_blocks(to_map, to_map->next, pool); + + pool.blocks.insert(to_map); + + // update statistics + total_allocated_memory += mapped_range.size; + StatTypes stat_types = get_stat_types_for_pool(*to_map->pool); + for_each_selected_stat_type(stat_types, + [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], mapped_range.size); }); + record_trace(TraceEntry::SEGMENT_MAP, int64_t(mapped_range.ptr), mapped_range.size, to_map->stream, + to_map->device, ctx); + if (!to_map->prev && !to_map->context_when_segment_allocated) { + to_map->context_when_segment_allocated = ctx; + } + + return true; } - const size_t subsumed_size = src->size; - dst->size += subsumed_size; - auto erased = - src->mapped ? pool.blocks.erase(src) : pool.unmapped.erase(src); - delete src; - src = nullptr; + Block *try_allocate_expandable_block(int device, aclrtStream stream, BlockPool *pool, size_t size, + const std::shared_ptr &ctx) + { + Block *candidate = find_expandable_block(device, stream, pool, size); + // Candidate is now a list free/unmapped blocks with at least size room: + // unmapped -> null + // unmapped -> free -> * + // free -> unmapped -> * + + if (!candidate->mapped && !map_block(candidate, std::min(candidate->size, size), ctx)) { + return nullptr; + } + TORCH_INTERNAL_ASSERT(candidate->mapped, PTA_ERROR(ErrCode::INTERNAL)); + + while (candidate->size < size) { + // invariant: free -> unmapped -> * + // map_block will map some of unmapped and merge with free + auto remaining = size - candidate->size; + auto new_candidate = candidate->next; + if (!map_block(new_candidate, std::min(remaining, candidate->next->size), ctx)) { + return nullptr; + } + candidate = new_candidate; + } + pool->blocks.erase(candidate); + return candidate; + } - return subsumed_size; - } - BlockPool& get_pool(size_t size, aclrtStream stream) + /* * moves a block into a pool of cached free blocks * */ + void free_block(Block *block, const std::shared_ptr &context, uint8_t allocator_type = 0) + { + AT_ASSERT(!block->allocated && block->event_count == 0, PTA_ERROR(ErrCode::VALUE)); + + record_trace(TraceEntry::FREE_COMPLETED, int64_t(block->ptr), block->requested_size, block->stream, + block->device, context ? 
context : block->context_when_allocated); + + block->context_when_allocated = nullptr; + size_t original_block_size = block->size; + auto orig_block_ptr = block->ptr; + size_t requested_size = block->requested_size; + + auto &pool = *block->pool; + int64_t net_change_inactive_split_blocks = 0; + int64_t net_change_inactive_split_size = 0; + + const std::array merge_candidates = { block->prev, block->next }; + for (Block *merge_candidate : merge_candidates) { + const int64_t subsumed_size = static_cast(try_merge_blocks(block, merge_candidate, pool)); + if (subsumed_size > 0) { + net_change_inactive_split_blocks -= 1; + net_change_inactive_split_size -= subsumed_size; + } + } + + active_blocks.erase(block); + pool.blocks.insert(block); + + if (block->is_split()) { + net_change_inactive_split_blocks += 1; + net_change_inactive_split_size += static_cast(block->size); + } + + StatTypes stat_types = get_stat_types_for_pool(pool); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + // inactive_split tries to capture the idea that blocks + // cannot be freed when requested, but fully free pages + // of expandable blocks can always be freed. + // The logic to track this as statistic is pretty involved, + // so we simply just exclude expandable segements from + // inactive_split + if (!block->expandable_segment_) { + update_stat(stats.inactive_split[stat_type], net_change_inactive_split_blocks); + update_stat(stats.inactive_split_bytes[stat_type], net_change_inactive_split_size); + } + update_stat(stats.active[stat_type], -1); + update_stat(stats.active_bytes[stat_type], -original_block_size); + update_stat(stats.requested_bytes[stat_type], -static_cast(requested_size)); + }); +#ifndef BUILD_LIBTORCH + torch_npu::profiler::reportMemoryDataToNpuProfiler({ static_cast(c10::DeviceType::PrivateUse1), + block->device, static_cast(torch_npu::profiler::MemoryDataType::MEMORY_BLOCK_FREE), allocator_type, + reinterpret_cast(orig_block_ptr), -original_block_size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream) }); +#endif + } + + /* * combine previously split blocks. returns the size of the subsumed block, or 0 on failure. * */ + size_t try_merge_blocks(Block *dst, Block *src, BlockPool &pool) + { + if (!src || src->allocated || src->event_count > 0 || !src->stream_uses.empty() || dst->mapped != src->mapped) { + return 0; + } + + AT_ASSERT(dst->is_split() && src->is_split(), PTA_ERROR(ErrCode::VALUE)); + + if (dst->prev == src) { + dst->ptr = src->ptr; + dst->prev = src->prev; + if (dst->prev) { + dst->prev->next = dst; + } + } else { + dst->next = src->next; + if (dst->next) { + dst->next->prev = dst; + } + } + + const size_t subsumed_size = src->size; + dst->size += subsumed_size; + auto erased = src->mapped ? pool.blocks.erase(src) : pool.unmapped.erase(src); + delete src; + src = nullptr; + + return subsumed_size; + } + + BlockPool &get_pool(size_t size, aclrtStream stream) { // captures_underway is a conservative guess that the current stream may be // capturing. It's only non-empty if some thread has begun and not yet ended // a capture, so it's usually 0, and we can short-circuit // npuStreamCaptureStatus (which does a TLS lookup). 
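The pointer surgery in try_merge_blocks above amounts to absorbing a free neighbour into dst and unlinking it from the doubly linked block list. A self-contained sketch of that coalescing step under simplified types (Chunk is an illustrative stand-in, not the real Block):

    #include <cstddef>
    #include <iostream>

    struct Chunk {
        size_t size;
        char *ptr;
        Chunk *prev = nullptr;
        Chunk *next = nullptr;
    };

    // Absorb src (an adjacent free chunk) into dst and unlink it; returns the subsumed size.
    size_t merge_into(Chunk *dst, Chunk *src)
    {
        if (src == nullptr) {
            return 0;
        }
        if (dst->prev == src) {          // src sits immediately before dst
            dst->ptr = src->ptr;
            dst->prev = src->prev;
            if (dst->prev) {
                dst->prev->next = dst;
            }
        } else {                         // src sits immediately after dst
            dst->next = src->next;
            if (dst->next) {
                dst->next->prev = dst;
            }
        }
        size_t subsumed = src->size;
        dst->size += subsumed;
        delete src;
        return subsumed;
    }

    int main()
    {
        char base[3072];
        Chunk *a = new Chunk{1024, base};
        Chunk *b = new Chunk{2048, base + 1024};
        a->next = b;
        b->prev = a;
        std::cout << merge_into(a, b) << " " << a->size << "\n";  // 2048 3072
        delete a;
        return 0;
    }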
if (C10_UNLIKELY(!captures_underway.empty())) { - for (auto& entry : captures_underway) { + for (auto &entry : captures_underway) { if (entry.second(stream)) { auto it1 = graph_pools.find(entry.first); TORCH_INTERNAL_ASSERT(it1 != graph_pools.end()); - if (size <= kSmallSize) { - return it1->second->small_blocks; - } else { - return it1->second->large_blocks; - } + if (size <= kSmallSize) { + return it1->second->small_blocks; + } else { + return it1->second->large_blocks; + } } } } @@ -2117,178 +1969,171 @@ class DeviceCachingAllocator { } } - StatTypes get_stat_types_for_pool(const BlockPool& pool) { - StatTypes stat_types = {false}; - stat_types[static_cast(StatType::AGGREGATE)] = true; - stat_types[static_cast( - pool.is_small ? StatType::SMALL_POOL : StatType::LARGE_POOL)] = true; - return stat_types; - } + StatTypes get_stat_types_for_pool(const BlockPool &pool) + { + StatTypes stat_types = { false }; + stat_types[static_cast(StatType::AGGREGATE)] = true; + stat_types[static_cast(pool.is_small ? StatType::SMALL_POOL : StatType::LARGE_POOL)] = true; + return stat_types; + } - bool should_split(const Block* block, size_t size) { - size_t remaining = block->size - size; - if (block->pool->is_small || - CachingAllocatorConfig::expandable_segments()) { - return remaining >= kMinBlockSize; - } else { - return (size < CachingAllocatorConfig::max_split_size()) && (remaining > kSmallSize); + bool should_split(const Block *block, size_t size) + { + size_t remaining = block->size - size; + if (block->pool->is_small || CachingAllocatorConfig::expandable_segments()) { + return remaining >= kMinBlockSize; + } else { + return (size < CachingAllocatorConfig::max_split_size()) && (remaining > kSmallSize); + } } - } - static size_t get_allocation_size(size_t size) { - if (size <= kSmallSize) { - return kSmallBuffer; - } else if (size < kMinLargeAlloc) { - return kLargeBuffer; - } else { - return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); - } - } - - bool get_free_block(AllocParams& p) { - BlockPool& pool = *p.pool; - - if (C10_UNLIKELY(set_fraction && - CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { - // Track block reuse interval only when garbage collection is enabled. - for (auto& b : pool.blocks) { - ++b->gc_count; - } - } - auto it = pool.blocks.lower_bound(&p.search_key); - if (it == pool.blocks.end() || (*it)->stream != p.stream()) { - return false; - } - - if ((*it)->expandable_segment_) { - if (CachingAllocatorConfig::expandable_segments()) { - // if we are allocated to the part of the block that is expandable - // for the purposes of "best fit" we consider its size to be the size it - // can expand to, not the size it currently is. This means that we - // sometimes have to search for blocks with bigger 'size' before - // choosing this segment. - auto expandable_size = [](Block* b) { - return b->size + (b->next && !b->next->mapped ? b->next->size : 0); - }; - auto next = it; - next++; - while ((*it)->expandable_segment_ && next != pool.blocks.end() && - (*next)->stream == p.stream() && - expandable_size(*next) < expandable_size(*it)) { - it = next++; - } - } else { - // Rarely expandable segments has been turned off after we have - // already allocated some blocks as expandable. For instance, - // since we cannot share expandable memory via IPC, someone might - // temporarily disable it. 
In this case we need to honor this request - // by only finding non-expandable blocks - do { - it++; - } while (it != pool.blocks.end() && (*it)->expandable_segment_ && - (*it)->stream == p.stream()); + static size_t get_allocation_size(size_t size) + { + if (size <= kSmallSize) { + return kSmallBuffer; + } else if (size < kMinLargeAlloc) { + return kLargeBuffer; + } else { + return kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + } + } + + bool get_free_block(AllocParams &p) + { + BlockPool &pool = *p.pool; + + if (C10_UNLIKELY(set_fraction && CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { + // Track block reuse interval only when garbage collection is enabled. + for (auto &b : pool.blocks) { + ++b->gc_count; + } + } + auto it = pool.blocks.lower_bound(&p.search_key); if (it == pool.blocks.end() || (*it)->stream != p.stream()) { - return false; - } - } - } - - // Do not return an oversized block for a large request - if ((p.size() < CachingAllocatorConfig::max_split_size()) && - ((*it)->size >= CachingAllocatorConfig::max_split_size())) { - return false; - } - // Allow oversized block size to be rounded up but within a limit - if ((p.size() >= CachingAllocatorConfig::max_split_size()) && ((*it)->size >= p.size() + kLargeBuffer)) { - return false; - } - p.block = *it; - (*it)->gc_count = 0; // Denote this block has been used - pool.blocks.erase(it); - return true; - } - - bool trigger_free_memory_callbacks(AllocParams& p) { - bool freed_memory = false; - for (const auto& name : FreeNPUMemoryCallbacksRegistry()->Keys()) { - freed_memory |= - FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); - } - return freed_memory; - } - - void garbage_collect_cached_blocks(const std::shared_ptr& ctx) - { - // Free unused cached blocks to reclaim NPU memory. - // Unlike release_cached_blocks(), this does not enforce synchronization and - // therefore should be of less overheads. - - size_t gc_threshold = static_cast( - CachingAllocatorConfig::garbage_collection_threshold() * - allowed_memory_maximum); - // No need to trigger GC yet - if (total_allocated_memory <= gc_threshold) { - return; - } - const auto target_size = total_allocated_memory - gc_threshold; - size_t gc_reclaimed = 0; - - // Calculate the total age of the free-able blocks. We'll use it later to - // get "avg age" threshold. - double total_age = 0.0; - int freeable_block_count = 0; - for (auto& b : large_blocks.blocks) { - if (!b->is_split()) { - total_age += b->gc_count; - ++freeable_block_count; - } - } - // No free-able blocks? - if (freeable_block_count == 0) { - return; - } - - c10_npu::npuSynchronizeDevice(true); - - // Repeat GC until we reach reclaim > target size. - bool block_freed = true; - while (gc_reclaimed < target_size && block_freed == true && - freeable_block_count > 0) { - // Free blocks exceeding this age threshold first. - double age_threshold = total_age / freeable_block_count; - // Stop iteration if we can no longer free a block. - block_freed = false; - - // Free blocks of > avg age. Don't stop upon reaching the target_size, - // we don't want this GC to be triggered frequently. 
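The comment above describes the reclaim policy: free cached blocks whose age is at or above the running average, and keep looping until enough memory has been reclaimed rather than stopping at the exact target. A standalone sketch of that heuristic over plain structs (CachedBlock is an illustrative stand-in, and the sizes and ages below are made up):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct CachedBlock {
        size_t size;
        unsigned gc_count;   // how many allocation attempts this block has sat through
        bool released = false;
    };

    // Release blocks older than the average age until the reclaim target is met.
    size_t run_gc(std::vector<CachedBlock> &blocks, size_t target)
    {
        double total_age = 0.0;
        int freeable = 0;
        for (const auto &b : blocks) {
            total_age += b.gc_count;
            ++freeable;
        }
        size_t reclaimed = 0;
        bool block_freed = true;
        while (reclaimed < target && block_freed && freeable > 0) {
            double age_threshold = total_age / freeable;
            block_freed = false;
            for (auto &b : blocks) {
                if (!b.released && b.gc_count >= age_threshold) {
                    b.released = true;
                    block_freed = true;
                    reclaimed += b.size;
                    total_age -= b.gc_count;
                    --freeable;
                }
            }
        }
        return reclaimed;
    }

    int main()
    {
        std::vector<CachedBlock> blocks = {{64, 10}, {128, 4}, {32, 8}, {256, 1}};
        std::cout << run_gc(blocks, 90) << "\n";  // 96: the two oldest blocks (ages 10 and 8) go first
        return 0;
    }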
- auto it = large_blocks.blocks.begin(); - while (it != large_blocks.blocks.end()) { - Block* block = *it; - ++it; - if (!block->is_split() && block->gc_count >= age_threshold) { - block_freed = true; - gc_reclaimed += block->size; - total_age -= block->gc_count; // Decrement the age - freeable_block_count--; // One less block that can be freed - release_block(block, ctx); - - ASCEND_LOGD("PTA CachingAllocator gc: free = %zu, cached = %lu, allocated = %lu", - block->size, - stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, - stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); - } - } - } - } - - bool alloc_block( - AllocParams& p, - bool isRetry, - const std::shared_ptr& ctx, - std::unique_lock& lock) + return false; + } + + if ((*it)->expandable_segment_) { + if (CachingAllocatorConfig::expandable_segments()) { + // if we are allocated to the part of the block that is expandable + // for the purposes of "best fit" we consider its size to be the size it + // can expand to, not the size it currently is. This means that we + // sometimes have to search for blocks with bigger 'size' before + // choosing this segment. + auto expandable_size = [](Block *b) { + return b->size + (b->next && !b->next->mapped ? b->next->size : 0); + }; + auto next = it; + next++; + while ((*it)->expandable_segment_ && next != pool.blocks.end() && (*next)->stream == p.stream() && + expandable_size(*next) < expandable_size(*it)) { + it = next++; + } + } else { + // Rarely expandable segments has been turned off after we have + // already allocated some blocks as expandable. For instance, + // since we cannot share expandable memory via IPC, someone might + // temporarily disable it. In this case we need to honor this request + // by only finding non-expandable blocks + do { + it++; + } while (it != pool.blocks.end() && (*it)->expandable_segment_ && (*it)->stream == p.stream()); + if (it == pool.blocks.end() || (*it)->stream != p.stream()) { + return false; + } + } + } + + // Do not return an oversized block for a large request + if ((p.size() < CachingAllocatorConfig::max_split_size()) && + ((*it)->size >= CachingAllocatorConfig::max_split_size())) { + return false; + } + // Allow oversized block size to be rounded up but within a limit + if ((p.size() >= CachingAllocatorConfig::max_split_size()) && ((*it)->size >= p.size() + kLargeBuffer)) { + return false; + } + p.block = *it; + (*it)->gc_count = 0; // Denote this block has been used + pool.blocks.erase(it); + return true; + } + + bool trigger_free_memory_callbacks(AllocParams &p) + { + bool freed_memory = false; + for (const auto &name : FreeNPUMemoryCallbacksRegistry()->Keys()) { + freed_memory |= FreeNPUMemoryCallbacksRegistry()->Create(name)->Execute(); + } + return freed_memory; + } + + void garbage_collect_cached_blocks(const std::shared_ptr &ctx) + { + // Free unused cached blocks to reclaim NPU memory. + // Unlike release_cached_blocks(), this does not enforce synchronization and + // therefore should be of less overheads. + + size_t gc_threshold = + static_cast(CachingAllocatorConfig::garbage_collection_threshold() * allowed_memory_maximum); + // No need to trigger GC yet + if (total_allocated_memory <= gc_threshold) { + return; + } + const auto target_size = total_allocated_memory - gc_threshold; + size_t gc_reclaimed = 0; + + // Calculate the total age of the free-able blocks. We'll use it later to + // get "avg age" threshold. 
+ double total_age = 0.0; + int freeable_block_count = 0; + for (auto &b : large_blocks.blocks) { + if (!b->is_split()) { + total_age += b->gc_count; + ++freeable_block_count; + } + } + // No free-able blocks? + if (freeable_block_count == 0) { + return; + } + + c10_npu::npuSynchronizeDevice(true); + + // Repeat GC until we reach reclaim > target size. + bool block_freed = true; + while (gc_reclaimed < target_size && block_freed == true && freeable_block_count > 0) { + // Free blocks exceeding this age threshold first. + double age_threshold = total_age / freeable_block_count; + // Stop iteration if we can no longer free a block. + block_freed = false; + + // Free blocks of > avg age. Don't stop upon reaching the target_size, + // we don't want this GC to be triggered frequently. + auto it = large_blocks.blocks.begin(); + while (it != large_blocks.blocks.end()) { + Block *block = *it; + ++it; + if (!block->is_split() && block->gc_count >= age_threshold) { + block_freed = true; + gc_reclaimed += block->size; + total_age -= block->gc_count; // Decrement the age + freeable_block_count--; // One less block that can be freed + release_block(block, ctx); + + ASCEND_LOGD("PTA CachingAllocator gc: free = %zu, cached = %lu, allocated = %lu", block->size, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); + } + } + } + } + + bool alloc_block(AllocParams &p, bool isRetry, const std::shared_ptr &ctx, + std::unique_lock &lock) { size_t size = p.alloc_size; - void* ptr = nullptr; + void *ptr = nullptr; if (isRetry) { stats.num_alloc_retries += 1; @@ -2319,324 +2164,288 @@ class DeviceCachingAllocator { if (IsMallocPage1GMem(p.pool->is_small)) { policy = aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE1G_ONLY; } - p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy); + p.err = c10_npu::acl::AclrtMallocAlign32(&ptr, size, policy); + } + if (p.err != ACL_ERROR_NONE) { + return false; + } + } + + ASCEND_LOGD("NPUCachingAllocator malloc by AclrtMallocAlign32: size=%zu", size); + + if (p.pool->owner_PrivatePool) { + // The block is for a NPU graph's PrivatePool. + p.pool->owner_PrivatePool->npuMalloc_count++; + } + + total_allocated_memory += size; + p.block = new Block(p.device(), p.stream(), size, p.pool, (char *)ptr); + for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) { + update_stat(stats.segment[stat_type], 1); + update_stat(stats.reserved_bytes[stat_type], size); + }); + if (size >= CachingAllocatorConfig::max_split_size()) { + update_stat(stats.oversize_segments, 1); + } + ASCEND_LOGD("pta_memory acl_malloc: malloc = %zu, ret = %d", size, p.err); + + // p.block came from new, not npuMalloc. It should not be nullptr here. + TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); + + record_trace(TraceEntry::SEGMENT_ALLOC, int64_t(p.block->ptr), p.block->size, p.stream(), p.device(), ctx); + p.block->context_when_segment_allocated = ctx; + return true; + } + + /* * Free one or more oversize blocks to the system allocator. But only enough to satisfy the target size * */ + bool release_available_cached_blocks(const AllocParams &p, const std::shared_ptr &ctx) + { + if (CachingAllocatorConfig::max_split_size() == std::numeric_limits::max()) { + return false; + } + BlockPool &pool = *p.pool; + Block key = p.search_key; + key.size = + (key.size < CachingAllocatorConfig::max_split_size()) ? 
CachingAllocatorConfig::max_split_size() : key.size; + auto it = pool.blocks.lower_bound(&key); + + c10_npu::npuSynchronizeDevice(true); + + if (it == pool.blocks.end() || (*it)->stream != p.stream()) { + // No single block is large enough; free multiple oversize blocks, starting with the largest + if (it == pool.blocks.begin()) { + return false; + } + size_t totalReleased = 0; + // Back up one item. Now on the largest block for the correct stream + --it; + while ((totalReleased < key.size) && ((*it)->size >= CachingAllocatorConfig::max_split_size()) && + ((*it)->stream == p.stream())) { + auto cur = it; + totalReleased += (*it)->size; + if (it != pool.blocks.begin()) { + --it; + release_block(*cur, ctx); + } else { + release_block(*cur, ctx); + break; + } } - if (p.err != ACL_ERROR_NONE) { + if (totalReleased < key.size) { return false; } + } else { + release_block(*it, ctx); } + return true; + } - ASCEND_LOGD("NPUCachingAllocator malloc by AclrtMallocAlign32: size=%zu", size); + bool release_cached_blocks(bool check_error, const std::shared_ptr &context) + { + // Make sure event deque from taskqueue, then synchronize Event + c10_npu::npuSynchronizeDevice(check_error); + + // First ensure that all blocks that can't currently be allocated due to + // outstanding events are returned to the pool. + synchronize_and_free_events(check_error, context); + + // Free all non-split cached blocks + release_blocks(large_blocks, context); + release_blocks(small_blocks, context); + + for (auto it = graph_pools_freeable.begin(); it != graph_pools_freeable.end();) { + // See notifyCaptureDestroy for the strategy here. + TORCH_INTERNAL_ASSERT(it->second->use_count == 0); + release_blocks(it->second->small_blocks, context); + release_blocks(it->second->large_blocks, context); + if (it->second->npuMalloc_count == 0) { + auto erase_count = graph_pools.erase(it->first); + TORCH_INTERNAL_ASSERT(erase_count == 1); + it = graph_pools_freeable.erase(it); + } else { + ++it; + } + } - if (p.pool->owner_PrivatePool) { - // The block is for a NPU graph's PrivatePool. - p.pool->owner_PrivatePool->npuMalloc_count++; + return true; + } + + void release_expandable_segment(Block *block) + { + TORCH_INTERNAL_ASSERT(block->size == block->expandable_segment_->size(), "block disagrees with segment", + PTA_ERROR(ErrCode::INTERNAL)); + TORCH_INTERNAL_ASSERT(!block->mapped, PTA_ERROR(ErrCode::INTERNAL)); + auto it = std::find(expandable_segments_.begin(), expandable_segments_.end(), block->expandable_segment_); + TORCH_INTERNAL_ASSERT(it != expandable_segments_.end(), PTA_ERROR(ErrCode::INTERNAL)); + expandable_segments_.erase(it); + block->pool->unmapped.erase(block); + delete block->expandable_segment_; + block->expandable_segment_ = nullptr; + delete block; + block = nullptr; + } + + void release_block(Block *block, const std::shared_ptr &context) + { + TORCH_INTERNAL_ASSERT(!block->expandable_segment_, PTA_ERROR(ErrCode::VALUE)); + ASCEND_LOGD("NPUCachingAllocator free by aclrtFree: size=%zu", block->size); + + record_trace(TraceEntry::SEGMENT_FREE, int64_t(block->ptr), block->size, block->stream, block->device, + context ? context : block->context_when_segment_allocated); + + aclrtFree((void *)block->ptr); + total_allocated_memory -= block->size; + + auto *pool = block->pool; + if (pool->owner_PrivatePool) { + // The npuFreed block belonged to a NPU graph's PrivatePool. 
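release_available_cached_blocks above frees oversize cached blocks largest-first until the pending request can be covered. A simplified, standalone sketch of that eviction order over plain sizes (the oversize threshold parameter stands in for max_split_size() and the numbers are assumptions for illustration):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Walk from the largest cached block downwards, releasing oversize blocks until
    // the requested amount is covered or no eligible block remains.
    size_t plan_release(std::vector<size_t> cached, size_t need, size_t oversize_threshold)
    {
        std::sort(cached.begin(), cached.end());  // so we can walk from the largest block down
        size_t released = 0;
        for (auto it = cached.rbegin(); it != cached.rend() && released < need; ++it) {
            if (*it < oversize_threshold) {
                break;  // only oversize blocks are eligible on this path
            }
            released += *it;
        }
        return released;
    }

    int main()
    {
        std::vector<size_t> cached = {64, 96, 128, 256};         // cached block sizes, e.g. in MiB
        std::cout << plan_release(cached, 300, 100) << "\n";     // 384: frees 256 then 128
        std::cout << plan_release(cached, 600, 100) << "\n";     // 384: still short, so the caller fails
        return 0;
    }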
+ TORCH_INTERNAL_ASSERT(pool->owner_PrivatePool->npuMalloc_count > 0); + pool->owner_PrivatePool->npuMalloc_count--; } - total_allocated_memory += size; - p.block = new Block(p.device(), p.stream(), size, p.pool, (char*)ptr); - for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) { - update_stat(stats.segment[stat_type], 1); - update_stat(stats.reserved_bytes[stat_type], size); + StatTypes stat_types = get_stat_types_for_pool(*pool); + for_each_selected_stat_type(stat_types, [&](size_t stat_type) { + update_stat(stats.segment[stat_type], -1); + update_stat(stats.reserved_bytes[stat_type], -block->size); }); - if (size >= CachingAllocatorConfig::max_split_size()) { - update_stat(stats.oversize_segments, 1); + + if (block->size >= CachingAllocatorConfig::max_split_size()) { + update_stat(stats.oversize_segments, -1); } - ASCEND_LOGD("pta_memory acl_malloc: malloc = %zu, ret = %d", size, p.err); + ASCEND_LOGD("pta_memory acl_free: free_size = %zu", block->size); - // p.block came from new, not npuMalloc. It should not be nullptr here. - TORCH_INTERNAL_ASSERT(p.block != nullptr && p.block->ptr != nullptr); + pool->blocks.erase(block); + delete block; + block = nullptr; + } - record_trace( - TraceEntry::SEGMENT_ALLOC, - int64_t(p.block->ptr), - p.block->size, - p.stream(), - p.device(), - ctx); - p.block->context_when_segment_allocated = ctx; - return true; + void unmap_block(Block *block, const std::shared_ptr &context) + { + auto unmapped = block->expandable_segment_->unmap(SegmentRange{ block->ptr, block->size }); + if (unmapped.size == 0) { + return; + } + block->pool->blocks.erase(block); + + ptrdiff_t before_size = static_cast(unmapped.ptr) - static_cast(block->ptr); + if (before_size > 0) { + // prev? -> before_free -> block + Block *before_free = new Block(block->device, block->stream, before_size, block->pool, block->ptr); + before_free->expandable_segment_ = block->expandable_segment_; + before_free->splice(block->prev, block); + block->pool->blocks.insert(before_free); + } + + auto after_size = block->size - (before_size + unmapped.size); + if (after_size > 0) { + // block -> after_free -> next? + Block *after_free = new Block(block->device, block->stream, after_size, block->pool, + static_cast(unmapped.ptr) + unmapped.size); + after_free->expandable_segment_ = block->expandable_segment_; + after_free->splice(block, block->next); + block->pool->blocks.insert(after_free); + } + + block->ptr = unmapped.ptr; + block->size = unmapped.size; + block->mapped = false; + + try_merge_blocks(block, block->prev, *block->pool); + try_merge_blocks(block, block->next, *block->pool); + block->pool->unmapped.insert(block); + + // update statistics + total_allocated_memory -= unmapped.size; + StatTypes stat_types = get_stat_types_for_pool(*block->pool); + for_each_selected_stat_type(stat_types, + [&](size_t stat_type) { update_stat(stats.reserved_bytes[stat_type], -unmapped.size); }); + + if (block->pool->owner_PrivatePool) { + // The npuFreed block belonged to a NPU graph's PrivatePool. + TORCH_INTERNAL_ASSERT(block->pool->owner_PrivatePool->npuMalloc_count > 0); + block->pool->owner_PrivatePool->npuMalloc_count--; + } + + record_trace(TraceEntry::SEGMENT_UNMAP, int64_t(unmapped.ptr), unmapped.size, block->stream, block->device, + context ? context : block->context_when_segment_allocated); } - /** Free one or more oversize blocks to the system allocator. 
But only enough to satisfy the target size **/ - bool release_available_cached_blocks(const AllocParams& p, - const std::shared_ptr& ctx) - { - if (CachingAllocatorConfig::max_split_size() == std::numeric_limits::max()) { - return false; - } - BlockPool &pool = *p.pool; - Block key = p.search_key; - key.size = - (key.size < CachingAllocatorConfig::max_split_size()) ? CachingAllocatorConfig::max_split_size() : key.size; - auto it = pool.blocks.lower_bound(&key); - - c10_npu::npuSynchronizeDevice(true); - - if (it == pool.blocks.end() || (*it)->stream != p.stream()) { - // No single block is large enough; free multiple oversize blocks, starting with the largest - if (it == pool.blocks.begin()) { - return false; - } - size_t totalReleased = 0; - // Back up one item. Now on the largest block for the correct stream - --it; - while ((totalReleased < key.size) && ((*it)->size >= CachingAllocatorConfig::max_split_size()) && - ((*it)->stream == p.stream())) { - auto cur = it; - totalReleased += (*it)->size; - if (it != pool.blocks.begin()) { - --it; - release_block(*cur, ctx); - } else { - release_block(*cur, ctx); - break; + void release_blocks(BlockPool &pool, const std::shared_ptr &context) + { + std::vector to_unmap; + // Frees all non-split blocks + auto it = pool.blocks.begin(); + while (it != pool.blocks.end()) { + Block *block = *it; + ++it; + if (block->expandable_segment_) { + // unmapping will mutate the free pool + // so just gather what needs to be freed + // to avoid invalidating the iterator + to_unmap.push_back(block); + } else if (!block->prev && !block->next) { + release_block(block, context); + } } - } - if (totalReleased < key.size) { - return false; - } - } else { - release_block(*it, ctx); - } - return true; - } - - bool release_cached_blocks(bool check_error, const std::shared_ptr& context) - { - // Make sure event deque from taskqueue, then synchronize Event - c10_npu::npuSynchronizeDevice(check_error); - - // First ensure that all blocks that can't currently be allocated due to - // outstanding events are returned to the pool. - synchronize_and_free_events(check_error, context); - - // Free all non-split cached blocks - release_blocks(large_blocks, context); - release_blocks(small_blocks, context); - - for (auto it = graph_pools_freeable.begin(); - it != graph_pools_freeable.end();) { - // See notifyCaptureDestroy for the strategy here. 
- TORCH_INTERNAL_ASSERT(it->second->use_count == 0); - release_blocks(it->second->small_blocks, context); - release_blocks(it->second->large_blocks, context); - if (it->second->npuMalloc_count == 0) { - auto erase_count = graph_pools.erase(it->first); - TORCH_INTERNAL_ASSERT(erase_count == 1); - it = graph_pools_freeable.erase(it); - } else { - ++it; - } - } - - return true; - } - - void release_expandable_segment(Block* block) { - TORCH_INTERNAL_ASSERT( - block->size == block->expandable_segment_->size(), - "block disagrees with segment", PTA_ERROR(ErrCode::INTERNAL)); - TORCH_INTERNAL_ASSERT(!block->mapped, PTA_ERROR(ErrCode::INTERNAL)); - auto it = std::find( - expandable_segments_.begin(), - expandable_segments_.end(), - block->expandable_segment_); - TORCH_INTERNAL_ASSERT(it != expandable_segments_.end(), PTA_ERROR(ErrCode::INTERNAL)); - expandable_segments_.erase(it); - block->pool->unmapped.erase(block); - delete block->expandable_segment_; - block->expandable_segment_ = nullptr; - delete block; - block = nullptr; - } - - void release_block( - Block* block, - const std::shared_ptr& context) - { - TORCH_INTERNAL_ASSERT(!block->expandable_segment_, PTA_ERROR(ErrCode::VALUE)); - ASCEND_LOGD("NPUCachingAllocator free by aclrtFree: size=%zu", block->size); - - record_trace( - TraceEntry::SEGMENT_FREE, - int64_t(block->ptr), - block->size, - block->stream, - block->device, - context ? context : block->context_when_segment_allocated); - - aclrtFree((void*)block->ptr); - total_allocated_memory -= block->size; - - auto* pool = block->pool; - if (pool->owner_PrivatePool) { - // The npuFreed block belonged to a NPU graph's PrivatePool. - TORCH_INTERNAL_ASSERT(pool->owner_PrivatePool->npuMalloc_count > 0); - pool->owner_PrivatePool->npuMalloc_count--; - } - - StatTypes stat_types = get_stat_types_for_pool(*pool); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - update_stat(stats.segment[stat_type], -1); - update_stat(stats.reserved_bytes[stat_type], -block->size); - }); - - if (block->size >= CachingAllocatorConfig::max_split_size()) - update_stat(stats.oversize_segments, -1); - ASCEND_LOGD("pta_memory acl_free: free_size = %zu", block->size); - - pool->blocks.erase(block); - delete block; - block = nullptr; - } - - void unmap_block( - Block* block, - const std::shared_ptr& context) - { - auto unmapped = block->expandable_segment_->unmap( - SegmentRange{block->ptr, block->size}); - if (unmapped.size == 0) { - return; - } - block->pool->blocks.erase(block); - - ptrdiff_t before_size = - static_cast(unmapped.ptr) - static_cast(block->ptr); - if (before_size > 0) { - // prev? -> before_free -> block - Block* before_free = new Block( - block->device, block->stream, before_size, block->pool, block->ptr); - before_free->expandable_segment_ = block->expandable_segment_; - before_free->splice(block->prev, block); - block->pool->blocks.insert(before_free); - } - - auto after_size = block->size - (before_size + unmapped.size); - if (after_size > 0) { - // block -> after_free -> next? 
- Block* after_free = new Block( - block->device, - block->stream, - after_size, - block->pool, - static_cast(unmapped.ptr) + unmapped.size); - after_free->expandable_segment_ = block->expandable_segment_; - after_free->splice(block, block->next); - block->pool->blocks.insert(after_free); - } - - block->ptr = unmapped.ptr; - block->size = unmapped.size; - block->mapped = false; - - try_merge_blocks(block, block->prev, *block->pool); - try_merge_blocks(block, block->next, *block->pool); - block->pool->unmapped.insert(block); - - // update statistics - total_allocated_memory -= unmapped.size; - StatTypes stat_types = get_stat_types_for_pool(*block->pool); - for_each_selected_stat_type(stat_types, [&](size_t stat_type) { - update_stat(stats.reserved_bytes[stat_type], -unmapped.size); - }); - - if (block->pool->owner_PrivatePool) { - // The npuFreed block belonged to a NPU graph's PrivatePool. - TORCH_INTERNAL_ASSERT( - block->pool->owner_PrivatePool->npuMalloc_count > 0); - block->pool->owner_PrivatePool->npuMalloc_count--; - } - - record_trace( - TraceEntry::SEGMENT_UNMAP, - int64_t(unmapped.ptr), - unmapped.size, - block->stream, - block->device, - context ? context : block->context_when_segment_allocated); - } - - void release_blocks( - BlockPool& pool, - const std::shared_ptr& context) - { - std::vector to_unmap; - // Frees all non-split blocks - auto it = pool.blocks.begin(); - while (it != pool.blocks.end()) { - Block *block = *it; - ++it; - if (block->expandable_segment_) { - // unmapping will mutate the free pool - // so just gather what needs to be freed - // to avoid invalidating the iterator - to_unmap.push_back(block); - } else if (!block->prev && !block->next) { - release_block(block, context); - } - } - for (Block* block : to_unmap) { - unmap_block(block, context); - if (!block->prev && !block->next) { - release_expandable_segment(block); - } - } - } - - EventPool::Event create_event_internal(int idx) { - // Leak the event pool to avoid shutdown issues. - static auto* event_pool = new EventPool(); - return event_pool->get(idx); - } - - void synchronize_and_free_events(bool check_error, const std::shared_ptr& context) - { - // This function syncs, so capture should not be underway. Might as well - // make sure capture-deferred end of life events get processed too. - TORCH_INTERNAL_ASSERT(captures_underway.empty()); - insert_events_deferred_until_no_capture(context); - - // Synchronize on outstanding events and then free associated blocks. - for (auto& st : npu_events) { - for (auto& e : st.second) { - EventPool::Event event = std::move(e.first); - Block* block = e.second; - if (check_error) { - NPU_CHECK_ERROR(aclrtSynchronizeEvent(*event)); - } else { - NPU_CHECK_WARN(aclrtSynchronizeEvent(*event)); + for (Block *block : to_unmap) { + unmap_block(block, context); + if (!block->prev && !block->next) { + release_expandable_segment(block); + } } + } + + EventPool::Event create_event_internal(int idx) + { + // Leak the event pool to avoid shutdown issues. + static auto *event_pool = new EventPool(); + return event_pool->get(idx); + } + + void synchronize_and_free_events(bool check_error, const std::shared_ptr &context) + { + // This function syncs, so capture should not be underway. Might as well + // make sure capture-deferred end of life events get processed too. + TORCH_INTERNAL_ASSERT(captures_underway.empty()); + insert_events_deferred_until_no_capture(context); + + // Synchronize on outstanding events and then free associated blocks. 
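Both synchronize_and_free_events here and process_events further below rely on the same invariant: a block stays pinned while its event_count is non-zero and is handed back to free_block only once the last recorded event has completed. A toy sketch of that bookkeeping (ToyBlock and the deque are illustrative stand-ins, not the real Block/EventPool types):

    #include <deque>
    #include <iostream>

    struct ToyBlock {
        int event_count = 0;
        bool freed = false;
    };

    // Drain pending events; each completion drops the block's count, and the block is
    // "freed" only when its last event has finished.
    void drain_events(std::deque<ToyBlock *> &pending)
    {
        while (!pending.empty()) {
            ToyBlock *block = pending.front();
            pending.pop_front();
            // In the allocator this is where aclrtSynchronizeEvent / event->query() happens.
            block->event_count--;
            if (block->event_count == 0) {
                block->freed = true;  // stands in for free_block(block, context)
            }
        }
    }

    int main()
    {
        ToyBlock b;
        b.event_count = 2;                        // the block was recorded on two streams
        std::deque<ToyBlock *> pending = {&b, &b};
        drain_events(pending);
        std::cout << b.freed << "\n";             // 1: freed only after both events completed
        return 0;
    }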
+ for (auto &st : npu_events) { + for (auto &e : st.second) { + EventPool::Event event = std::move(e.first); + Block *block = e.second; + if (check_error) { + NPU_CHECK_ERROR(aclrtSynchronizeEvent(*event)); + } else { + NPU_CHECK_WARN(aclrtSynchronizeEvent(*event)); + } #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventSynchronization(reinterpret_cast(event.get())); - } + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuEventSynchronization(reinterpret_cast(event.get())); + } #endif - ASCEND_LOGI("Event: aclrtSynchronizeEvent is successfully executed, event=%p", event.get()); + ASCEND_LOGI("Event: aclrtSynchronizeEvent is successfully executed, event=%p", event.get()); - block->event_count--; - if (block->event_count == 0) { - free_block(block, context); + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + } } - } - } - npu_events.clear(); - } + npu_events.clear(); + } - void remove_npugraph_stream_uses(Block* block) + void remove_npugraph_stream_uses(Block *block) { // remove stream uses added during npugraph capture // (i.e., block->stream_uses - block->npugraph_stream_uses) - if (C10_UNLIKELY( - block_to_npugraph_stream_uses.find(block) != block_to_npugraph_stream_uses.end())) { + if (C10_UNLIKELY(block_to_npugraph_stream_uses.find(block) != block_to_npugraph_stream_uses.end())) { stream_set streams(std::move(block->stream_uses)); AT_ASSERT(block->stream_uses.empty()); - for (auto& stream : streams) { - if (block_to_npugraph_stream_uses[block].find(stream) == - block_to_npugraph_stream_uses[block].end()) { + for (auto &stream : streams) { + if (block_to_npugraph_stream_uses[block].find(stream) == block_to_npugraph_stream_uses[block].end()) { block->stream_uses.insert(stream); } } @@ -2644,7 +2453,7 @@ class DeviceCachingAllocator { } } - void insert_events(Block* block) + void insert_events(Block *block) { aclrtContext compiler_ctx = aclrtContext(); aclError ret_ctx = aclrtGetCurrentContext(&compiler_ctx); @@ -2652,7 +2461,7 @@ class DeviceCachingAllocator { stream_set streams(std::move(block->stream_uses)); AT_ASSERT(block->stream_uses.empty(), PTA_ERROR(ErrCode::VALUE)); - for (auto& stream : streams) { + for (auto &stream : streams) { NPU_CHECK_ERROR(c10_npu::SetDevice(stream.device_index())); EventPool::Event event = create_event_internal(stream.device_index()); @@ -2667,11 +2476,10 @@ class DeviceCachingAllocator { } } - void insert_events_deferred_until_no_capture( - const std::shared_ptr& context) + void insert_events_deferred_until_no_capture(const std::shared_ptr &context) { if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) { - for (auto* block : needs_events_deferred_until_no_capture) { + for (auto *block : needs_events_deferred_until_no_capture) { TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); // only streams recorded before npugraph will be used to insert events // since we know all streams recorded during npugraph must have @@ -2687,85 +2495,77 @@ class DeviceCachingAllocator { } } - void process_events(const std::shared_ptr& context) - { - insert_events_deferred_until_no_capture(context); - - // Process outstanding npuEvents. Events that are completed are removed - // from the queue, and the 'event_count' for the corresponding allocation - // is decremented. 
Stops at the first event which has not been completed. - // Since events on different devices or streams may occur out of order, - // the processing of some events may be delayed. - for (auto it = npu_events.begin(); it != npu_events.end();) { - while (!it->second.empty()) { - auto& e = it->second.front(); - EventPool::Event event = std::move(e.first); - Block* block = e.second; - - if (!event->query()) { - e.first = std::move(event); - break; - } - - block->event_count--; - if (block->event_count == 0) { - free_block(block, context); - } - it->second.pop_front(); - } - - if (it->second.empty()) { - it = npu_events.erase(it); - } else { - it++; - } - } - } - - // Accumulates sizes of all memory blocks for given device in given pool - void cache_info_aux(BlockPool& blocks, size_t* total, size_t* largest) { - for (auto it = blocks.blocks.begin(); it != blocks.blocks.end(); ++it) { - size_t blocksize = (*it)->size; - *total += blocksize; - if (blocksize > *largest) { - *largest = blocksize; - } - } - } - - void record_trace( - TraceEntry::Action action, - int64_t addr, - size_t size, - aclrtStream stream, - int device, - std::shared_ptr context) - { - if (!record_history) {return;} - - auto te = TraceEntry( - action, - device, - addr, - size, - stream, - record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr); - - if (record_history) { - if (alloc_trace->size() < alloc_trace_max_entries_) { - alloc_trace->emplace_back(te); - } else { - (*alloc_trace)[alloc_trace_next++] = te; - if (alloc_trace_next == alloc_trace_max_entries_) { - alloc_trace_next = 0; - } - } - } - } + void process_events(const std::shared_ptr &context) + { + insert_events_deferred_until_no_capture(context); + + // Process outstanding npuEvents. Events that are completed are removed + // from the queue, and the 'event_count' for the corresponding allocation + // is decremented. Stops at the first event which has not been completed. + // Since events on different devices or streams may occur out of order, + // the processing of some events may be delayed. + for (auto it = npu_events.begin(); it != npu_events.end();) { + while (!it->second.empty()) { + auto &e = it->second.front(); + EventPool::Event event = std::move(e.first); + Block *block = e.second; + + if (!event->query()) { + e.first = std::move(event); + break; + } + + block->event_count--; + if (block->event_count == 0) { + free_block(block, context); + } + it->second.pop_front(); + } + + if (it->second.empty()) { + it = npu_events.erase(it); + } else { + it++; + } + } + } + + // Accumulates sizes of all memory blocks for given device in given pool + void cache_info_aux(BlockPool &blocks, size_t *total, size_t *largest) + { + for (auto it = blocks.blocks.begin(); it != blocks.blocks.end(); ++it) { + size_t blocksize = (*it)->size; + *total += blocksize; + if (blocksize > *largest) { + *largest = blocksize; + } + } + } + + void record_trace(TraceEntry::Action action, int64_t addr, size_t size, aclrtStream stream, int device, + std::shared_ptr context) + { + if (!record_history) { + return; + } + + auto te = TraceEntry(action, device, addr, size, stream, + record_context_ >= RecordContext::ALLOC ? 
std::move(context) : nullptr); + if (record_history) { + if (alloc_trace->size() < alloc_trace_max_entries_) { + alloc_trace->emplace_back(te); + } else { + (*alloc_trace)[alloc_trace_next++] = te; + if (alloc_trace_next == alloc_trace_max_entries_) { + alloc_trace_next = 0; + } + } + } + } }; -static void uncached_delete(void* ptr) +static void uncached_delete(void *ptr) { if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { c10_npu::npuSynchronizeDevice(false); @@ -2774,266 +2574,257 @@ static void uncached_delete(void* ptr) NPU_CHECK_ERROR(aclrtFree(ptr)); } -void local_raw_delete(void* ptr); +void local_raw_delete(void *ptr); class NpuCachingAllocator : public NPUAllocator { - private: - - std::mutex mutex; - - // allocated blocks by device pointer - ska::flat_hash_map allocated_blocks; +private: + std::mutex mutex; - void add_allocated_block(Block* block) { - std::lock_guard lock(mutex); - allocated_blocks[block->ptr] = block; - } + // allocated blocks by device pointer + ska::flat_hash_map allocated_blocks; - public: + void add_allocated_block(Block *block) + { + std::lock_guard lock(mutex); + allocated_blocks[block->ptr] = block; + } - std::vector> device_allocator; +public: + std::vector> device_allocator; - Block* get_allocated_block(void* ptr, bool remove = false) { - std::lock_guard lock(mutex); - auto it = allocated_blocks.find(ptr); - if (it == allocated_blocks.end()) { - return nullptr; - } - Block* block = it->second; - if (remove) { - allocated_blocks.erase(it); + Block *get_allocated_block(void *ptr, bool remove = false) + { + std::lock_guard lock(mutex); + auto it = allocated_blocks.find(ptr); + if (it == allocated_blocks.end()) { + return nullptr; + } + Block *block = it->second; + if (remove) { + allocated_blocks.erase(it); + } + return block; } - return block; - } - void init(int device_count) override + void init(int device_count) override { - int size = static_cast(device_allocator.size()); - if (size < device_count) { - device_allocator.resize(device_count); - for (const auto i : c10::irange(size, device_count)) { - device_allocator[i] = std::make_unique(); - } + int size = static_cast(device_allocator.size()); + if (size < device_count) { + device_allocator.resize(device_count); + for (const auto i : c10::irange(size, device_count)) { + device_allocator[i] = std::make_unique(); + } + } } - } - bool initialized() override + bool initialized() override { return !device_allocator.empty(); } - /** allocates a block which is safe to use from the provided stream */ - void malloc(void** devPtr, int device, size_t size, aclrtStream stream) { - TORCH_INTERNAL_ASSERT(0 <= device && static_cast(device) < device_allocator.size(), - "Allocator not initialized for device ", device, ": did you call init?", - PTA_ERROR(ErrCode::PARAM)); - Block* block = device_allocator[device]->malloc(device, size, stream); + /* * allocates a block which is safe to use from the provided stream */ + void malloc(void **devPtr, int device, size_t size, aclrtStream stream) + { + TORCH_INTERNAL_ASSERT(0 <= device && static_cast(device) < device_allocator.size(), + "Allocator not initialized for device ", device, ": did you call init?", PTA_ERROR(ErrCode::PARAM)); + Block *block = device_allocator[device]->malloc(device, size, stream); - add_allocated_block(block); - *devPtr = static_cast(block->ptr); + add_allocated_block(block); + *devPtr = static_cast(block->ptr); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if 
(C10_UNLIKELY(trigger)) { - trigger->traceNpuMemoryAllocation( - reinterpret_cast(*devPtr)); + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuMemoryAllocation(reinterpret_cast(*devPtr)); + } +#endif } + + void free(void *ptr) + { + if (!ptr) { + return; + } + Block *block = get_allocated_block(ptr, true); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } +#ifndef BUILD_LIBTORCH + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); + if (C10_UNLIKELY(trigger)) { + trigger->traceNpuMemoryDeallocation(reinterpret_cast(block->ptr)); + } #endif - } + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + device_allocator[block->device]->free(block); + } + + void setMemoryFraction(double fraction, int device) override + { + TORCH_INTERNAL_ASSERT(0 <= device && device < device_allocator.size(), "Allocator not initialized for device ", + device, ": did you call init?", PTA_ERROR(ErrCode::PARAM)); + TORCH_INTERNAL_ASSERT(0 <= fraction && fraction <= 1, "invalid fraction:", fraction, + ". Please set within (0, 1).", PTA_ERROR(ErrCode::PARAM)); + + c10_npu::SetDevice(device); - void free(void* ptr) { - if (!ptr) { - return; + device_allocator[device]->setMemoryFraction(fraction); } - Block* block = get_allocated_block(ptr, true); - if (!block) { - AT_ERROR("invalid device pointer: ", ptr); + + void recordHistory(bool enabled, CreateContextFn context_recorder, size_t alloc_trace_max_entries, + RecordContext when) override + { + for (auto &allocator : device_allocator) { + allocator->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when); + } } -#ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); - if (C10_UNLIKELY(trigger)) { - trigger->traceNpuMemoryDeallocation( - reinterpret_cast(block->ptr)); + + bool isHistoryEnabled() override + { + int device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + return device_allocator[device]->isHistoryEnabled(); + } + + void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override + { + for (auto &allocator : device_allocator) { + allocator->attachOutOfMemoryObserver(observer); + } + } + + bool checkUceInMemPool(int device) override + { + return device_allocator[device]->checkUceInMemPool(); + } + + bool checkBlockIsSafe(const c10::DataPtr &ptr) override + { + if (!ptr.get()) { + return true; + } + if (ptr.get_deleter() != &local_raw_delete) { + return true; + } + Block *block = get_allocated_block(ptr.get()); + TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); + return block->is_safe; + } + + void markAllBlockUnsafe(int device) override + { + return device_allocator[device]->markAllBlockUnsafe(); + } + + void updateBlockToSafe(const c10::DataPtr &ptr) override + { + if (!ptr.get()) { + return; + } + if (ptr.get_deleter() != &local_raw_delete) { + return; + } + Block *block = get_allocated_block(ptr.get()); + TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); + if (block->is_safe == false) { + ASCEND_LOGI("Triggers to refresh the data of the unsafe memory block and remove the unsafe flag"); + } + block->is_safe = true; + } + + void cleanEvent() override + { + int count = static_cast(device_allocator.size()); + for (int i = 0; i < count; i++) { + device_allocator[i]->release_and_free_events(); + } + } + + 
void emptyCache(bool check_error) override + { + int count = static_cast(device_allocator.size()); + for (int i = 0; i < count; i++) { + device_allocator[i]->emptyCache(i, check_error); + } + } + + void *getBaseAllocation(void *ptr, size_t *outSize) override + { + Block *block = get_allocated_block(ptr); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr); + } + return device_allocator[block->device]->getBaseAllocation(block, outSize); + } + + void recordStream(const c10::DataPtr &ptr, c10_npu::NPUStream stream) override + { + // Empty tensor's storage().data() might be a null ptr. As there is no + // blocks associated with those tensors, it is fine to do nothing here. + if (!ptr.get()) { + return; + } + + // If a tensor is not allocated by this instance, simply skip + // This usually happens when NPU tensors are shared across processes, + // we have implemented reference counting based sharing mechanism to + // guarantee tensors won't be accidentally freed by one process while + // they are still being used in another + if (ptr.get_deleter() != &local_raw_delete) { + return; + } + + Block *block = get_allocated_block(ptr.get()); + // block must not be null reaching here + TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); + device_allocator[block->device]->recordStream(block, stream); + } + + void eraseStream(const c10::DataPtr &ptr, c10_npu::NPUStream stream) + { + if (!ptr.get()) { + return; + } + + // If a tensor is not allocated by this instance, simply skip + // This usually happens when NPU tensors are shared across processes, + // we have implemented reference counting based sharing mechanism to + // guarantee tensors won't be accidentally freed by one process while + // they are still being used in another + if (ptr.get_deleter() != &local_raw_delete) { + TORCH_NPU_WARN_ONCE("Tensor not is not allocated by NPUCachingAllocator, skip eraseStream."); + return; + } + + Block *block = get_allocated_block(ptr.get()); + if (!block) { + AT_ERROR("invalid device pointer: ", ptr.get()); + } + + if (block->stream != c10_npu::getCurrentNPUStream(block->device).stream(false)) { + // If the Stream applying for tensor block different from + // the stream of submiting event wait task in HCCL synchronize() + // method, the recordSteam can not be erased. + // New tensor creation may use the block before HCCL op is complete. + return; + } + + device_allocator[block->device]->eraseStream(block, stream); + } + + SnapshotInfo snapshot() override + { + SnapshotInfo result; + int count = static_cast(device_allocator.size()); + for (int i = 0; i < count; i++) { + result.device_traces.emplace_back(device_allocator[i]->trace()); + auto snap = device_allocator[i]->snapshot(); + result.segments.insert(result.segments.end(), snap.begin(), snap.end()); + } + return result; } -#endif - auto orig_block_ptr = block->ptr; - auto orig_block_size = block->size; - device_allocator[block->device]->free(block); - } - - void setMemoryFraction(double fraction, int device) override - { - TORCH_INTERNAL_ASSERT( - 0 <= device && device < device_allocator.size(), - "Allocator not initialized for device ", - device, - ": did you call init?", PTA_ERROR(ErrCode::PARAM)); - TORCH_INTERNAL_ASSERT( - 0 <= fraction && fraction <= 1, - "invalid fraction:", - fraction, - ". 
Please set within (0, 1).", PTA_ERROR(ErrCode::PARAM)); - - c10_npu::SetDevice(device); - - device_allocator[device]->setMemoryFraction(fraction); - } - - void recordHistory(bool enabled, CreateContextFn context_recorder, - size_t alloc_trace_max_entries, - RecordContext when) override - { - for (auto& allocator : device_allocator) { - allocator->recordHistory(enabled, context_recorder, - alloc_trace_max_entries, when); - } - } - - bool isHistoryEnabled() override - { - int device = 0; - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - return device_allocator[device]->isHistoryEnabled(); - } - - void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override - { - for (auto& allocator : device_allocator) { - allocator->attachOutOfMemoryObserver(observer); - } - } - - bool checkUceInMemPool(int device) override - { - return device_allocator[device]-> checkUceInMemPool(); - } - - bool checkBlockIsSafe(const c10::DataPtr& ptr) override - { - if (!ptr.get()) { - return true; - } - if (ptr.get_deleter() != &local_raw_delete) { - return true; - } - Block* block = get_allocated_block(ptr.get()); - TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); - return block->is_safe; - } - - void markAllBlockUnsafe(int device) override - { - return device_allocator[device]-> markAllBlockUnsafe(); - } - - void updateBlockToSafe(const c10::DataPtr &ptr) override - { - if (!ptr.get()) { - return; - } - if (ptr.get_deleter() != &local_raw_delete) { - return; - } - Block* block = get_allocated_block(ptr.get()); - TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); - if (block->is_safe == false) { - ASCEND_LOGI("Triggers to refresh the data of the unsafe memory block and remove the unsafe flag"); - } - block->is_safe = true; - } - - void cleanEvent() override - { - int count = static_cast(device_allocator.size()); - for (int i = 0; i < count; i++) - device_allocator[i]->release_and_free_events(); - } - - void emptyCache(bool check_error) override - { - int count = static_cast(device_allocator.size()); - for (int i = 0; i < count; i++) - device_allocator[i]->emptyCache(i, check_error); - } - - void* getBaseAllocation(void* ptr, size_t* outSize) override - { - Block* block = get_allocated_block(ptr); - if (!block) { - AT_ERROR("invalid device pointer: ", ptr); - } - return device_allocator[block->device]->getBaseAllocation(block, outSize); - } - - void recordStream(const c10::DataPtr& ptr, c10_npu::NPUStream stream) override - { - // Empty tensor's storage().data() might be a null ptr. As there is no - // blocks associated with those tensors, it is fine to do nothing here. 
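The frontend methods above (malloc, free, recordStream and friends) all hinge on the allocated_blocks map: every pointer handed out is registered so that later calls can recover the owning Block. A compact sketch of that registry, using std::unordered_map as a stand-in for the ska::flat_hash_map in the real code (FakeBlock is an assumption for illustration):

    #include <cstddef>
    #include <iostream>
    #include <mutex>
    #include <unordered_map>

    struct FakeBlock {
        std::size_t size;
    };

    class BlockRegistry {
    public:
        // Record a freshly allocated pointer so free()/recordStream() can find its block later.
        void add(void *ptr, FakeBlock *block)
        {
            std::lock_guard<std::mutex> lock(mutex_);
            allocated_[ptr] = block;
        }

        // Look up the owning block; optionally erase the entry, as the free path does.
        FakeBlock *get(void *ptr, bool remove = false)
        {
            std::lock_guard<std::mutex> lock(mutex_);
            auto it = allocated_.find(ptr);
            if (it == allocated_.end()) {
                return nullptr;
            }
            FakeBlock *block = it->second;
            if (remove) {
                allocated_.erase(it);
            }
            return block;
        }

    private:
        std::mutex mutex_;
        std::unordered_map<void *, FakeBlock *> allocated_;
    };

    int main()
    {
        BlockRegistry registry;
        FakeBlock block{4096};
        int dummy = 0;
        registry.add(&dummy, &block);
        std::cout << (registry.get(&dummy) != nullptr) << "\n";        // 1: lookup keeps the entry
        std::cout << (registry.get(&dummy, true) != nullptr) << "\n";  // 1: free-style lookup erases it
        std::cout << (registry.get(&dummy) != nullptr) << "\n";        // 0: already removed
        return 0;
    }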
- if (!ptr.get()) { - return; - } - - // If a tensor is not allocated by this instance, simply skip - // This usually happens when NPU tensors are shared across processes, - // we have implemented reference counting based sharing mechanism to - // guarantee tensors won't be accidentally freed by one process while - // they are still being used in another - if (ptr.get_deleter() != &local_raw_delete) { - return; - } - - Block* block = get_allocated_block(ptr.get()); - // block must not be null reaching here - TORCH_INTERNAL_ASSERT(block != nullptr, "No allocated block can be found", PTA_ERROR(ErrCode::NOT_FOUND)); - device_allocator[block->device]->recordStream(block, stream); - } - - void eraseStream(const c10::DataPtr& ptr, c10_npu::NPUStream stream) - { - if (!ptr.get()) { - return; - } - - // If a tensor is not allocated by this instance, simply skip - // This usually happens when NPU tensors are shared across processes, - // we have implemented reference counting based sharing mechanism to - // guarantee tensors won't be accidentally freed by one process while - // they are still being used in another - if (ptr.get_deleter() != &local_raw_delete) { - TORCH_NPU_WARN_ONCE("Tensor not is not allocated by NPUCachingAllocator, skip eraseStream."); - return; - } - - Block* block = get_allocated_block(ptr.get()); - if (!block) { - AT_ERROR("invalid device pointer: ", ptr.get()); - } - - if (block->stream != c10_npu::getCurrentNPUStream(block->device).stream(false)) { - // If the Stream applying for tensor block different from - // the stream of submiting event wait task in HCCL synchronize() - // method, the recordSteam can not be erased. - // New tensor creation may use the block before HCCL op is complete. - return; - } - - device_allocator[block->device]->eraseStream(block, stream); - } - - SnapshotInfo snapshot() override - { - SnapshotInfo result; - int count = static_cast(device_allocator.size()); - for (int i = 0; i < count; i++) { - result.device_traces.emplace_back(device_allocator[i]->trace()); - auto snap = device_allocator[i]->snapshot(); - result.segments.insert(result.segments.end(), snap.begin(), snap.end()); - } - return result; - } // CUDAGraph interactions - void beginAllocateToPool( - c10::DeviceIndex device, - MempoolId_t mempool_id, + void beginAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id, std::function filter) override { assertValidDevice(device); @@ -3052,116 +2843,117 @@ class NpuCachingAllocator : public NPUAllocator { device_allocator[device]->releasePool(std::move(mempool_id)); } - c10::DataPtr allocate(size_t size) override - { - constexpr size_t one_exa_bytes = 1152921504606846976ULL; - if (size >= one_exa_bytes) { - AT_ERROR("NPU out of memory. 
Tried to allocate more than 1EB memory."); - } - int device = 0; - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - void* devPtr = nullptr; - void (*deleteFunc)(void*) = &local_raw_delete; - - if (size != 0) { - if (c10_npu::option::OptionsManager::CheckForceUncached()) { - deleteFunc = &uncached_delete; - size_t alloc_size = size + 32; - NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, - aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); - ASCEND_LOGD("Without NPUCachingAllocator, malloc by " - "AclrtMallocAlign32: size=%zu", - alloc_size); - } else { - this->malloc(&devPtr, device, size, c10_npu::getCurrentNPUStreamNoWait(device)); - } - } - return {devPtr, devPtr, deleteFunc, c10::Device(c10::DeviceType::PrivateUse1, device)}; - } - - c10::DeleterFnPtr raw_deleter() const override - { - if (c10_npu::option::OptionsManager::CheckForceUncached()) { - return &uncached_delete; - } else { - return &local_raw_delete; - } - } - - void cacheInfo(int dev_id, size_t* cachedAndFree, size_t* largestBlock) override - { - device_allocator[dev_id]->cacheInfo(cachedAndFree, largestBlock); - } - - void assertValidDevice(int device) - { - const auto device_num = device_allocator.size(); - TORCH_CHECK(0 <= device && device < static_cast(device_num), "Invalid device argument ", device, - ": did you call init?", PTA_ERROR(ErrCode::PARAM)); - } - - DeviceStats getDeviceStats(int device) override - { - assertValidDevice(device); - return device_allocator[device]->getStats(); - } - - void resetAccumulatedStats(int device) override - { - assertValidDevice(device); - device_allocator[device]->resetAccumulatedStats(); - } - - void resetPeakStats(int device) override - { - assertValidDevice(device); - device_allocator[device]->resetPeakStats(); - } - - void* raw_alloc(size_t nbytes) override - { - if (nbytes == 0) { - return nullptr; - } - int device = 0; - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - void* r = nullptr; - malloc(&r, device, nbytes, c10_npu::getCurrentNPUStreamNoWait(device)); - return r; - } - - void* raw_alloc_with_stream(size_t nbytes, aclrtStream stream) override - { - if (nbytes == 0) { - return nullptr; - } - int device; - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - void* r = nullptr; - malloc(&r, device, nbytes, stream); - return r; - } - - void raw_delete(void* ptr) override - { - this->free(ptr); - } - - void FreeDeviceCachedMemory(int device) override - { - device_allocator[device]->emptyCache(device, true); - } - - std::string name() override - { - return "native"; - } - - // Note [COW/lazy_clone is not supported yet] - void copy_data(void* dest, const void* src, std::size_t count) const final { - default_copy_data(dest, src, count); - } - + c10::DataPtr allocate(size_t size) override + { + constexpr size_t one_exa_bytes = 1152921504606846976ULL; + if (size >= one_exa_bytes) { + AT_ERROR("NPU out of memory. 
Tried to allocate more than 1EB memory."); + } + int device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + void *devPtr = nullptr; + void (*deleteFunc)(void *) = &local_raw_delete; + + if (size != 0) { + if (c10_npu::option::OptionsManager::CheckForceUncached()) { + deleteFunc = &uncached_delete; + size_t alloc_size = size + 32; + NPU_CHECK_ERROR(c10_npu::acl::AclrtMallocAlign32(&devPtr, alloc_size, + aclrtMemMallocPolicy::ACL_MEM_MALLOC_HUGE_FIRST)); + ASCEND_LOGD("Without NPUCachingAllocator, malloc by " + "AclrtMallocAlign32: size=%zu", + alloc_size); + } else { + this->malloc(&devPtr, device, size, c10_npu::getCurrentNPUStreamNoWait(device)); + } + } + return { devPtr, devPtr, deleteFunc, c10::Device(c10::DeviceType::PrivateUse1, device) }; + } + + c10::DeleterFnPtr raw_deleter() const override + { + if (c10_npu::option::OptionsManager::CheckForceUncached()) { + return &uncached_delete; + } else { + return &local_raw_delete; + } + } + + void cacheInfo(int dev_id, size_t *cachedAndFree, size_t *largestBlock) override + { + device_allocator[dev_id]->cacheInfo(cachedAndFree, largestBlock); + } + + void assertValidDevice(int device) + { + const auto device_num = device_allocator.size(); + TORCH_CHECK(0 <= device && device < static_cast(device_num), "Invalid device argument ", device, + ": did you call init?", PTA_ERROR(ErrCode::PARAM)); + } + + DeviceStats getDeviceStats(int device) override + { + assertValidDevice(device); + return device_allocator[device]->getStats(); + } + + void resetAccumulatedStats(int device) override + { + assertValidDevice(device); + device_allocator[device]->resetAccumulatedStats(); + } + + void resetPeakStats(int device) override + { + assertValidDevice(device); + device_allocator[device]->resetPeakStats(); + } + + void *raw_alloc(size_t nbytes) override + { + if (nbytes == 0) { + return nullptr; + } + int device = 0; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + void *r = nullptr; + malloc(&r, device, nbytes, c10_npu::getCurrentNPUStreamNoWait(device)); + return r; + } + + void *raw_alloc_with_stream(size_t nbytes, aclrtStream stream) override + { + if (nbytes == 0) { + return nullptr; + } + int device; + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + void *r = nullptr; + malloc(&r, device, nbytes, stream); + return r; + } + + void raw_delete(void *ptr) override + { + this->free(ptr); + } + + void FreeDeviceCachedMemory(int device) override + { + device_allocator[device]->emptyCache(device, true); + } + + std::string name() override + { + return "native"; + } + + // Note [COW/lazy_clone is not supported yet] + void copy_data(void *dest, const void *src, std::size_t count) const final + { + default_copy_data(dest, src, count); + } + void buildServerMemMapForHccl(int device, std::shared_ptr hcclComm) { device_allocator[device]->buildServerMemMapForHccl(hcclComm); @@ -3173,69 +2965,72 @@ NpuCachingAllocator caching_allocator; REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &caching_allocator); -void local_raw_delete(void* ptr) +void local_raw_delete(void *ptr) { - caching_allocator.free(ptr); + caching_allocator.free(ptr); } -void* MallocBlock(size_t size, void *stream, int device) { - if (device == -1) { - NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - } - if ((device < 0) || (device > static_cast(caching_allocator.device_allocator.size()))) { - return nullptr; - } - AT_ASSERT(caching_allocator.device_allocator[device], PTA_ERROR(ErrCode::NOT_FOUND)); - AT_ASSERT(stream, PTA_ERROR(ErrCode::NOT_FOUND)); - auto block = 
caching_allocator.device_allocator[device]->malloc(device, size, stream, - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); - AT_ASSERT(block, PTA_ERROR(ErrCode::NOT_FOUND)); - return reinterpret_cast(block); +void *MallocBlock(size_t size, void *stream, int device) +{ + if (device == -1) { + NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); + } + if ((device < 0) || (device > static_cast(caching_allocator.device_allocator.size()))) { + return nullptr; + } + AT_ASSERT(caching_allocator.device_allocator[device], PTA_ERROR(ErrCode::NOT_FOUND)); + AT_ASSERT(stream, PTA_ERROR(ErrCode::NOT_FOUND)); + auto block = caching_allocator.device_allocator[device]->malloc(device, size, stream, + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); + AT_ASSERT(block, PTA_ERROR(ErrCode::NOT_FOUND)); + return reinterpret_cast(block); } -void FreeBlock(void *handle) { - Block* block = reinterpret_cast(handle); - AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); - caching_allocator.assertValidDevice(block->device); - AT_ASSERT(caching_allocator.device_allocator[block->device], PTA_ERROR(ErrCode::NOT_FOUND)); - auto orig_block_ptr = block->ptr; - auto orig_block_size = block->size; - caching_allocator.device_allocator[block->device]->free(block, - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); +void FreeBlock(void *handle) +{ + Block *block = reinterpret_cast(handle); + AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); + caching_allocator.assertValidDevice(block->device); + AT_ASSERT(caching_allocator.device_allocator[block->device], PTA_ERROR(ErrCode::NOT_FOUND)); + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + caching_allocator.device_allocator[block->device]->free(block, + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); } -void* GetBlockPtr(const void *handle) { - const Block* block = reinterpret_cast(handle); - AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); - return block->ptr; +void *GetBlockPtr(const void *handle) +{ + const Block *block = reinterpret_cast(handle); + AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); + return block->ptr; } -size_t GetBlockSize(const void *handle) { - const Block* block = reinterpret_cast(handle); - AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); - return block->size; +size_t GetBlockSize(const void *handle) +{ + const Block *block = reinterpret_cast(handle); + AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); + return block->size; } struct BackendStaticInitializer { BackendStaticInitializer() { - allocator.store(&caching_allocator); + allocator.store(&caching_allocator); } }; -std::atomic allocator; +std::atomic allocator; BackendStaticInitializer backend_static_initializer; -std::mutex* getFreeMutex() { - static std::mutex npu_free_mutex; - return &npu_free_mutex; +std::mutex *getFreeMutex() +{ + static std::mutex npu_free_mutex; + return &npu_free_mutex; } - } // namespace NPUCachingAllocator } // namespace c10_npu namespace c10_npu { - // uid_ is incremented when a user creates a MemPool, // for example: using graph_pool_handle() or c10_npu::MemPool(). // @@ -3246,17 +3041,17 @@ namespace c10_npu { // passed to a function, either by user or NPUGraphs. For example, // default value of MempoolId_t for capture_begin function is {0, 0}. // That's why uid_ and uuid_ start at 1. 
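A minimal usage sketch of the id scheme described above, assuming `allocator` is an existing NPUCachingAllocator::NPUAllocator* (the variable and the surrounding scope are illustrative):

    // User-created pools get ids of the form {0, uid}; pools created
    // internally (e.g. for NPU graph capture) get {uuid, 0}.
    c10_npu::MemPool user_pool(allocator, /*is_user_created=*/true);
    auto id = user_pool.id();                     // id.first == 0, id.second >= 1
    {
        c10_npu::MemPoolContext ctx(&user_pool);  // user_pool becomes the active pool
        // getActiveMemPool() now reports &user_pool for allocations made here
    }                                             // previous active pool restored on destruction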
-std::atomic MemPool::uid_{1}; -std::atomic MemPool::uuid_{1}; +std::atomic MemPool::uid_{ 1 }; +std::atomic MemPool::uuid_{ 1 }; -MemPool::MemPool(NPUCachingAllocator::NPUAllocator* allocator, bool is_user_created) +MemPool::MemPool(NPUCachingAllocator::NPUAllocator *allocator, bool is_user_created) : allocator_(allocator), is_user_created_(is_user_created) { if (is_user_created_) { - id_ = {0, uid_++}; + id_ = { 0, uid_++ }; } else { - id_ = {uuid_++, 0}; + id_ = { uuid_++, 0 }; } } @@ -3265,7 +3060,7 @@ MempoolId_t MemPool::id() return id_; } -NPUCachingAllocator::NPUAllocator* MemPool::allocator() +NPUCachingAllocator::NPUAllocator *MemPool::allocator() { return allocator_; } @@ -3273,10 +3068,9 @@ NPUCachingAllocator::NPUAllocator* MemPool::allocator() // Note that active_mempool_ is a global variable here // and not inside MemPoolContext class, because in windows we // can't use __declspec(dllexport) and __declspec(thread) -static thread_local MemPool* active_mempool_ = nullptr; +static thread_local MemPool *active_mempool_ = nullptr; -MemPoolContext::MemPoolContext(MemPool* mempool) - : prev_mempool_(active_mempool_) +MemPoolContext::MemPoolContext(MemPool *mempool) : prev_mempool_(active_mempool_) { active_mempool_ = mempool; } @@ -3286,9 +3080,8 @@ MemPoolContext::~MemPoolContext() active_mempool_ = prev_mempool_; } -MemPool* MemPoolContext::getActiveMemPool() +MemPool *MemPoolContext::getActiveMemPool() { return active_mempool_; } - } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.h b/torch_npu/csrc/core/npu/NPUCachingAllocator.h index 12104d59a9..21eaf80ad5 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.h +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.h @@ -113,7 +113,7 @@ struct BlockInfo { struct SegmentInfo { int64_t device = 0; int64_t address = 0; - aclrtStream stream = 0; + aclrtStream stream = nullptr; int64_t total_size = 0; int64_t requested_size = 0; int64_t allocated_size = 0; diff --git a/torch_npu/csrc/core/npu/NPUEvent.cpp b/torch_npu/csrc/core/npu/NPUEvent.cpp index 39220d766b..0e6857cf51 100644 --- a/torch_npu/csrc/core/npu/NPUEvent.cpp +++ b/torch_npu/csrc/core/npu/NPUEvent.cpp @@ -70,7 +70,9 @@ void NPUEvent::record() void NPUEvent::recordOnce(const NPUStream& stream) { - if (!was_recorded_) record(stream); + if (!was_recorded_) { + record(stream); + } } void NPUEvent::record(const NPUStream& stream) diff --git a/torch_npu/csrc/core/npu/NPUGuard.h b/torch_npu/csrc/core/npu/NPUGuard.h index 63f9bdf209..995beb9dfc 100644 --- a/torch_npu/csrc/core/npu/NPUGuard.h +++ b/torch_npu/csrc/core/npu/NPUGuard.h @@ -9,293 +9,309 @@ #include namespace c10_npu { - // This code is kind of boilerplatey. See Note [Whither the DeviceGuard // boilerplate] -/// A variant of DeviceGuard that is specialized for NPU. It accepts -/// integer indices (interpreting them as NPU devices) and is a little -/// more efficient than DeviceGuard (it compiles to straight line -/// NPUSetDevice/NPUGetDevice calls); however, it can only be used -/// from code that links against NPU directly. +// / A variant of DeviceGuard that is specialized for NPU. It accepts +// / integer indices (interpreting them as NPU devices) and is a little +// / more efficient than DeviceGuard (it compiles to straight line +// / NPUSetDevice/NPUGetDevice calls); however, it can only be used +// / from code that links against NPU directly. 
struct NPUGuard { - /// No default constructor; see Note [Omitted default constructor from RAII] - explicit NPUGuard() = delete; - - /// Set the current NPU device to the passed device index. - explicit NPUGuard(c10::DeviceIndex device_index) : guard_(device_index) {} - - /// Sets the current NPU device to the passed device. Errors if the passed - /// device is not a NPU device. - explicit NPUGuard(c10::Device device) : guard_(device) {} - - // Copy is not allowed - NPUGuard(const NPUGuard&) = delete; - NPUGuard& operator=(const NPUGuard&) = delete; - - // Move is not allowed (there is no uninitialized state) - NPUGuard(NPUGuard&& other) = delete; - NPUGuard& operator=(NPUGuard&& other) = delete; - - /// Sets the NPU device to the given device. Errors if the given device - /// is not a NPU device. - void set_device(c10::Device device) { - guard_.set_device(device); - } - - /// Sets the NPU device to the given device. Errors if the given device - /// is not a NPU device. (This method is provided for uniformity with - /// DeviceGuard). - void reset_device(c10::Device device) { - guard_.reset_device(device); - } - - /// Sets the NPU device to the given device index. - void set_index(c10::DeviceIndex device_index) { - guard_.set_index(device_index); - } - - /// Returns the device that was set upon construction of the guard - c10::Device original_device() const { - return guard_.original_device(); - } - - /// Returns the last device that was set via `set_device`, if any, otherwise - /// the device passed during construction. - c10::Device current_device() const { - return guard_.current_device(); - } + // / No default constructor; see Note [Omitted default constructor from RAII] + explicit NPUGuard() = delete; + + // / Set the current NPU device to the passed device index. + explicit NPUGuard(c10::DeviceIndex device_index) : guard_(device_index) {} + + // / Sets the current NPU device to the passed device. Errors if the passed + // / device is not a NPU device. + explicit NPUGuard(c10::Device device) : guard_(device) {} + + // Copy is not allowed + NPUGuard(const NPUGuard &) = delete; + NPUGuard &operator = (const NPUGuard &) = delete; + + // Move is not allowed (there is no uninitialized state) + NPUGuard(NPUGuard &&other) = delete; + NPUGuard &operator = (NPUGuard &&other) = delete; + + // / Sets the NPU device to the given device. Errors if the given device + // / is not a NPU device. + void set_device(c10::Device device) + { + guard_.set_device(device); + } + + // / Sets the NPU device to the given device. Errors if the given device + // / is not a NPU device. (This method is provided for uniformity with + // / DeviceGuard). + void reset_device(c10::Device device) + { + guard_.reset_device(device); + } + + // / Sets the NPU device to the given device index. + void set_index(c10::DeviceIndex device_index) + { + guard_.set_index(device_index); + } + + // / Returns the device that was set upon construction of the guard + c10::Device original_device() const + { + return guard_.original_device(); + } + + // / Returns the last device that was set via `set_device`, if any, otherwise + // / the device passed during construction. + c10::Device current_device() const + { + return guard_.current_device(); + } private: - /// The guard for the current device. - c10::impl::InlineDeviceGuard guard_; + // / The guard for the current device. + c10::impl::InlineDeviceGuard guard_; }; -/// A variant of OptionalDeviceGuard that is specialized for NPU. See -/// NPUGuard for when you can use this. 
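The guard above follows the usual RAII discipline of its DeviceGuard counterpart; a minimal usage sketch (the device index and the wrapping function are illustrative assumptions):

    void run_on_second_device()
    {
        c10_npu::NPUGuard guard(/*device_index=*/1); // sets the current NPU device to 1
        // ... enqueue work on device 1 ...
    } // the guard's destructor restores the device that was current on entry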
+// / A variant of OptionalDeviceGuard that is specialized for NPU. See +// / NPUGuard for when you can use this. struct OptionalNPUGuard { - /// Create an uninitialized OptionalNPUGuard. - explicit OptionalNPUGuard() : guard_() {} - - /// Set the current NPU device to the passed Device, if it is not nullopt. - explicit OptionalNPUGuard(c10::optional device_opt) : guard_(device_opt) {} - - /// Set the current NPU device to the passed device index, if it is not - /// nullopt - explicit OptionalNPUGuard(c10::optional device_index_opt) - : guard_(device_index_opt) {} - - // Copy is not allowed - OptionalNPUGuard(const OptionalNPUGuard&) = delete; - OptionalNPUGuard& operator=(const OptionalNPUGuard&) = delete; - - // See Note [Move construction for RAII guards is tricky] - OptionalNPUGuard(OptionalNPUGuard&& other) = delete; - - // See Note [Move assignment for RAII guards is tricky] - OptionalNPUGuard& operator=(OptionalNPUGuard&& other) = delete; - - /// Sets the NPU device to the given device, initializing the guard if it - /// is not already initialized. Errors if the given device is not a NPU - /// device. - void set_device(c10::Device device) { - guard_.set_device(device); - } - - /// Sets the NPU device to the given device, initializing the guard if it is - /// not already initialized. Errors if the given device is not a NPU device. - /// (This method is provided for uniformity with OptionalDeviceGuard). - void reset_device(c10::Device device) { - guard_.reset_device(device); - } - - /// Sets the NPU device to the given device index, initializing the guard if - /// it is not already initialized. - void set_index(c10::DeviceIndex device_index) { - guard_.set_index(device_index); - } - - /// Returns the device that was set immediately prior to initialization of the - /// guard, or nullopt if the guard is uninitialized. - c10::optional original_device() const { - return guard_.original_device(); - } - - /// Returns the most recent device that was set using this device guard, - /// either from construction, or via set_device, if the guard is initialized, - /// or nullopt if the guard is uninitialized. - c10::optional current_device() const { - return guard_.current_device(); - } - - /// Restore the original NPU device, resetting this guard to uninitialized - /// state. - void reset() { - guard_.reset(); - } + // / Create an uninitialized OptionalNPUGuard. + explicit OptionalNPUGuard() : guard_() {} + + // / Set the current NPU device to the passed Device, if it is not nullopt. + explicit OptionalNPUGuard(c10::optional device_opt) : guard_(device_opt) {} + + // / Set the current NPU device to the passed device index, if it is not + // / nullopt + explicit OptionalNPUGuard(c10::optional device_index_opt) : guard_(device_index_opt) {} + + // Copy is not allowed + OptionalNPUGuard(const OptionalNPUGuard &) = delete; + OptionalNPUGuard &operator = (const OptionalNPUGuard &) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalNPUGuard(OptionalNPUGuard &&other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalNPUGuard &operator = (OptionalNPUGuard &&other) = delete; + + // / Sets the NPU device to the given device, initializing the guard if it + // / is not already initialized. Errors if the given device is not a NPU + // / device. + void set_device(c10::Device device) + { + guard_.set_device(device); + } + + // / Sets the NPU device to the given device, initializing the guard if it is + // / not already initialized. 
Errors if the given device is not a NPU device. + // / (This method is provided for uniformity with OptionalDeviceGuard). + void reset_device(c10::Device device) + { + guard_.reset_device(device); + } + + // / Sets the NPU device to the given device index, initializing the guard if + // / it is not already initialized. + void set_index(c10::DeviceIndex device_index) + { + guard_.set_index(device_index); + } + + // / Returns the device that was set immediately prior to initialization of the + // / guard, or nullopt if the guard is uninitialized. + c10::optional original_device() const + { + return guard_.original_device(); + } + + // / Returns the most recent device that was set using this device guard, + // / either from construction, or via set_device, if the guard is initialized, + // / or nullopt if the guard is uninitialized. + c10::optional current_device() const + { + return guard_.current_device(); + } + + // / Restore the original NPU device, resetting this guard to uninitialized + // / state. + void reset() + { + guard_.reset(); + } private: - c10::impl::InlineOptionalDeviceGuard guard_; + c10::impl::InlineOptionalDeviceGuard guard_; }; -/// A variant of StreamGuard that is specialized for NPU. See NPUGuard -/// for when you can use this. +// / A variant of StreamGuard that is specialized for NPU. See NPUGuard +// / for when you can use this. struct NPUStreamGuard { - /// No default constructor, see Note [Omitted default constructor from RAII] - explicit NPUStreamGuard() = delete; - - /// Set the current NPU device to the device associated with the passed - /// stream, and set the current NPU stream on that device to the passed - /// stream. Errors if the Stream is not a NPU stream. - explicit NPUStreamGuard(c10::Stream stream) : guard_(stream) {} - - /// Copy is disallowed - NPUStreamGuard(const NPUStreamGuard&) = delete; - NPUStreamGuard& operator=(const NPUStreamGuard&) = delete; - - /// Move is disallowed, as NPUStreamGuard does not have an uninitialized - /// state, which is required for moves on types with nontrivial destructors. - NPUStreamGuard(NPUStreamGuard&& other) = delete; - NPUStreamGuard& operator=(NPUStreamGuard&& other) = delete; - - /// Resets the currently set stream to the original stream and - /// the currently set device to the original device. Then, - /// set the current device to the device associated with the passed stream, - /// and set the current stream on that device to the passed stream. - /// Errors if the stream passed is not a NPU stream. - /// - /// NOTE: this implementation may skip some stream/device setting if - /// it can prove that it is unnecessary. - /// - /// WARNING: reset_stream does NOT preserve previously set streams on - /// different devices. If you need to set streams on multiple devices - /// on NPU, use NPUMultiStreamGuard instead. - void reset_stream(c10::Stream stream) { - guard_.reset_stream(stream); - } - - /// Returns the NPU stream that was set at the time the guard was constructed. - NPUStream original_stream() const { - return NPUStream(NPUStream::UNCHECKED, guard_.original_stream()); - } - - /// Returns the most recent NPU stream that was set using this device guard, - /// either from construction, or via set_stream. - NPUStream current_stream() const { - return NPUStream(NPUStream::UNCHECKED, guard_.current_stream()); - } - - /// Returns the most recent NPU device that was set using this device guard, - /// either from construction, or via set_device/reset_device/set_index. 
- c10::Device current_device() const { - return guard_.current_device(); - } - - /// Returns the NPU device that was set at the most recent reset_stream(), - /// or otherwise the device at construction time. - c10::Device original_device() const { - return guard_.original_device(); - } + // / No default constructor, see Note [Omitted default constructor from RAII] + explicit NPUStreamGuard() = delete; + + // / Set the current NPU device to the device associated with the passed + // / stream, and set the current NPU stream on that device to the passed + // / stream. Errors if the Stream is not a NPU stream. + explicit NPUStreamGuard(c10::Stream stream) : guard_(stream) {} + + // / Copy is disallowed + NPUStreamGuard(const NPUStreamGuard &) = delete; + NPUStreamGuard &operator = (const NPUStreamGuard &) = delete; + + // / Move is disallowed, as NPUStreamGuard does not have an uninitialized + // / state, which is required for moves on types with nontrivial destructors. + NPUStreamGuard(NPUStreamGuard &&other) = delete; + NPUStreamGuard &operator = (NPUStreamGuard &&other) = delete; + + // / Resets the currently set stream to the original stream and + // / the currently set device to the original device. Then, + // / set the current device to the device associated with the passed stream, + // / and set the current stream on that device to the passed stream. + // / Errors if the stream passed is not a NPU stream. + // / + // / NOTE: this implementation may skip some stream/device setting if + // / it can prove that it is unnecessary. + // / + // / WARNING: reset_stream does NOT preserve previously set streams on + // / different devices. If you need to set streams on multiple devices + // / on NPU, use NPUMultiStreamGuard instead. + void reset_stream(c10::Stream stream) + { + guard_.reset_stream(stream); + } + + // / Returns the NPU stream that was set at the time the guard was constructed. + NPUStream original_stream() const + { + return NPUStream(NPUStream::UNCHECKED, guard_.original_stream()); + } + + // / Returns the most recent NPU stream that was set using this device guard, + // / either from construction, or via set_stream. + NPUStream current_stream() const + { + return NPUStream(NPUStream::UNCHECKED, guard_.current_stream()); + } + + // / Returns the most recent NPU device that was set using this device guard, + // / either from construction, or via set_device/reset_device/set_index. + c10::Device current_device() const + { + return guard_.current_device(); + } + + // / Returns the NPU device that was set at the most recent reset_stream(), + // / or otherwise the device at construction time. + c10::Device original_device() const + { + return guard_.original_device(); + } private: - c10::impl::InlineStreamGuard guard_; + c10::impl::InlineStreamGuard guard_; }; -/// A variant of OptionalStreamGuard that is specialized for NPU. See NPUGuard -/// for when you can use this. +// / A variant of OptionalStreamGuard that is specialized for NPU. See NPUGuard +// / for when you can use this. struct OptionalNPUStreamGuard { - /// Create an uninitialized guard. - explicit OptionalNPUStreamGuard() : guard_() {} - - /// Set the current NPU device to the device associated with the passed - /// stream, and set the current NPU stream on that device to the passed - /// stream. Errors if the Stream is not a NPU stream. 
- explicit OptionalNPUStreamGuard(c10::Stream stream) : guard_(stream) {} - - /// Set the current device to the device associated with the passed stream, - /// and set the current stream on that device to the passed stream, - /// if the passed stream is not nullopt. - explicit OptionalNPUStreamGuard(c10::optional stream_opt) - : guard_(stream_opt) {} - - /// Copy is disallowed - OptionalNPUStreamGuard(const OptionalNPUStreamGuard&) = delete; - OptionalNPUStreamGuard& operator=(const OptionalNPUStreamGuard&) = delete; - - // See Note [Move construction for RAII guards is tricky] - OptionalNPUStreamGuard(OptionalNPUStreamGuard&& other) = delete; - - // See Note [Move assignment for RAII guards is tricky] - OptionalNPUStreamGuard& operator=(OptionalNPUStreamGuard&& other) = delete; - - /// Resets the currently set NPU stream to the original stream and - /// the currently set device to the original device. Then, - /// set the current device to the device associated with the passed stream, - /// and set the current stream on that device to the passed stream. - /// Initializes the guard if it was not previously initialized. - void reset_stream(c10::Stream stream) { - guard_.reset_stream(stream); - } - - /// Returns the NPU stream that was set at the time the guard was most - /// recently initialized, or nullopt if the guard is uninitialized. - c10::optional original_stream() const { - auto r = guard_.original_stream(); - if (r.has_value()) { - return c10::make_optional(NPUStream(NPUStream::UNCHECKED, r.value())); - } else { - return c10::nullopt; + // / Create an uninitialized guard. + explicit OptionalNPUStreamGuard() : guard_() {} + + // / Set the current NPU device to the device associated with the passed + // / stream, and set the current NPU stream on that device to the passed + // / stream. Errors if the Stream is not a NPU stream. + explicit OptionalNPUStreamGuard(c10::Stream stream) : guard_(stream) {} + + // / Set the current device to the device associated with the passed stream, + // / and set the current stream on that device to the passed stream, + // / if the passed stream is not nullopt. + explicit OptionalNPUStreamGuard(c10::optional stream_opt) : guard_(stream_opt) {} + + // / Copy is disallowed + OptionalNPUStreamGuard(const OptionalNPUStreamGuard &) = delete; + OptionalNPUStreamGuard &operator = (const OptionalNPUStreamGuard &) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalNPUStreamGuard(OptionalNPUStreamGuard &&other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalNPUStreamGuard &operator = (OptionalNPUStreamGuard &&other) = delete; + + // / Resets the currently set NPU stream to the original stream and + // / the currently set device to the original device. Then, + // / set the current device to the device associated with the passed stream, + // / and set the current stream on that device to the passed stream. + // / Initializes the guard if it was not previously initialized. + void reset_stream(c10::Stream stream) + { + guard_.reset_stream(stream); } - } - - /// Returns the most recent NPU stream that was set using this stream guard, - /// either from construction, or via reset_stream, if the guard is - /// initialized, or nullopt if the guard is uninitialized. 
- c10::optional current_stream() const { - auto r = guard_.current_stream(); - if (r.has_value()) { - return c10::make_optional(NPUStream(NPUStream::UNCHECKED, r.value())); - } else { - return c10::nullopt; + + // / Returns the NPU stream that was set at the time the guard was most + // / recently initialized, or nullopt if the guard is uninitialized. + c10::optional original_stream() const + { + auto r = guard_.original_stream(); + if (r.has_value()) { + return c10::make_optional(NPUStream(NPUStream::UNCHECKED, r.value())); + } else { + return c10::nullopt; + } } - } - /// Restore the original NPU device and stream, resetting this guard to - /// uninitialized state. - void reset() { - guard_.reset(); - } + // / Returns the most recent NPU stream that was set using this stream guard, + // / either from construction, or via reset_stream, if the guard is + // / initialized, or nullopt if the guard is uninitialized. + c10::optional current_stream() const + { + auto r = guard_.current_stream(); + if (r.has_value()) { + return c10::make_optional(NPUStream(NPUStream::UNCHECKED, r.value())); + } else { + return c10::nullopt; + } + } + + // / Restore the original NPU device and stream, resetting this guard to + // / uninitialized state. + void reset() + { + guard_.reset(); + } private: - c10::impl::InlineOptionalStreamGuard guard_; + c10::impl::InlineOptionalStreamGuard guard_; }; -/// A variant of MultiStreamGuard that is specialized for NPU. +// / A variant of MultiStreamGuard that is specialized for NPU. struct NPUMultiStreamGuard { - explicit NPUMultiStreamGuard(at::ArrayRef streams) - : guard_(unwrapStreams(streams)) {} + explicit NPUMultiStreamGuard(at::ArrayRef streams) : guard_(unwrapStreams(streams)) {} - /// Copy is disallowed - NPUMultiStreamGuard(const NPUMultiStreamGuard&) = delete; - NPUMultiStreamGuard& operator=(const NPUMultiStreamGuard&) = delete; + // / Copy is disallowed + NPUMultiStreamGuard(const NPUMultiStreamGuard &) = delete; + NPUMultiStreamGuard &operator = (const NPUMultiStreamGuard &) = delete; - // See Note [Move construction for RAII guards is tricky] - NPUMultiStreamGuard(NPUMultiStreamGuard&& other) = delete; + // See Note [Move construction for RAII guards is tricky] + NPUMultiStreamGuard(NPUMultiStreamGuard &&other) = delete; - // See Note [Move assignment for RAII guards is tricky] - NPUMultiStreamGuard& operator=(NPUMultiStreamGuard&& other) = delete; + // See Note [Move assignment for RAII guards is tricky] + NPUMultiStreamGuard &operator = (NPUMultiStreamGuard &&other) = delete; private: - c10::impl::InlineMultiStreamGuard guard_; - - static std::vector unwrapStreams(at::ArrayRef NPUStreams) { - std::vector streams; - streams.reserve(NPUStreams.size()); - for (const NPUStream& NPUStream : NPUStreams) { - streams.push_back(NPUStream); + c10::impl::InlineMultiStreamGuard guard_; + + static std::vector unwrapStreams(at::ArrayRef NPUStreams) + { + std::vector streams; + streams.reserve(NPUStreams.size()); + for (const NPUStream &NPUStream : NPUStreams) { + streams.push_back(NPUStream); + } + return streams; } - return streams; - } }; - } // namespace c10_npu \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 83e33e3b84..8e0b28e03f 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -20,91 +20,105 @@ #include namespace c10_npu { - -struct timeval delay = {0, 1}; +struct timeval delay = { 0, 1 }; namespace { - class CallBackManager { public: 
CallBackManager() {} ~CallBackManager() {} - void SetExec(const ACL_EXEC_FUNC& func) { + void SetExec(const ACL_EXEC_FUNC &func) + { this->execFunc = func; } - void SetCopy(const ACL_COPY_FUNC& func) { + void SetCopy(const ACL_COPY_FUNC &func) + { this->copyFunc = func; } - void SetRelease(const ACL_RELEASE_FUNC& func) { + void SetRelease(const ACL_RELEASE_FUNC &func) + { this->releaseFunc = func; } - void SetCopyReleaseParam(const ACL_COPY_RELEASE_PARM_FUNC& func) { + void SetCopyReleaseParam(const ACL_COPY_RELEASE_PARM_FUNC &func) + { this->copyReleaseParamFunc = func; } - void SetReleaseParam(const ACL_RELEASE_PARAM_FUNC& func) { + void SetReleaseParam(const ACL_RELEASE_PARAM_FUNC &func) + { this->releaseParamFunc = func; } - void SetNew(const ACL_NEW_FUNC& func) { + void SetNew(const ACL_NEW_FUNC &func) + { this->newFunc = func; } - void SetDelete(const ACL_DELETE_FUNC& func) { + void SetDelete(const ACL_DELETE_FUNC &func) + { this->deleteFunc = func; } - void *getCurrentParams(void* head, int offset) + void *getCurrentParams(void *head, int offset) { - return (uint8_t*)head + sizePerParams * offset; + return (uint8_t *)head + sizePerParams * offset; } - int Call(void* head, int offset) { + int Call(void *head, int offset) + { TORCH_CHECK(this->execFunc, "Failed to find execution function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto dstPtr = (uint8_t*)head + sizePerParams * offset; + auto dstPtr = (uint8_t *)head + sizePerParams * offset; return this->execFunc(dstPtr); } - void Copy(void* dstHead, int offset, void* src) { + void Copy(void *dstHead, int offset, void *src) + { TORCH_CHECK(this->copyFunc, "Failed to find copy function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto dstPtr = (uint8_t*)dstHead + sizePerParams * offset; + auto dstPtr = (uint8_t *)dstHead + sizePerParams * offset; return this->copyFunc(dstPtr, src); } - void Release(void* head, int offset, ReleaseQueue& releaseQueue) { + void Release(void *head, int offset, ReleaseQueue &releaseQueue) + { TORCH_CHECK(this->releaseFunc, "Failed to find release function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto ptr = (uint8_t*)head + sizePerParams * offset; + auto ptr = (uint8_t *)head + sizePerParams * offset; return this->releaseFunc(ptr, releaseQueue); } - void CopyRealseParam(void* dstHead, int offset, void* src) { - TORCH_CHECK(this->copyReleaseParamFunc, "Failed to find copy release params function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto dstPtr = (uint8_t*)dstHead + sizePerParams * offset; + void CopyRealseParam(void *dstHead, int offset, void *src) + { + TORCH_CHECK(this->copyReleaseParamFunc, "Failed to find copy release params function.", + PTA_ERROR(ErrCode::NOT_FOUND)); + auto dstPtr = (uint8_t *)dstHead + sizePerParams * offset; return this->copyReleaseParamFunc(dstPtr, src); } - void ReleaseParam(void* head, int offset) { + void ReleaseParam(void *head, int offset) + { TORCH_CHECK(this->releaseParamFunc, "Failed to find release params function.", PTA_ERROR(ErrCode::NOT_FOUND)); - auto ptr = (uint8_t*)head + sizePerParams * offset; + auto ptr = (uint8_t *)head + sizePerParams * offset; return this->releaseParamFunc(ptr); } - void* Init(int capacity) { + void *Init(int capacity) + { TORCH_CHECK(this->newFunc, "Failed to find new function.", PTA_ERROR(ErrCode::NOT_FOUND)); - void* ptr = this->newFunc(capacity, sizePerParams); // not check as CUDA + void *ptr = this->newFunc(capacity, sizePerParams); // not check as CUDA return ptr; } - void DeInit(void* ptr) { + void DeInit(void *ptr) + { if (ptr != nullptr) { 
TORCH_CHECK(this->deleteFunc, "Failed to find delete function.", PTA_ERROR(ErrCode::NOT_FOUND)); this->deleteFunc(ptr); ptr = nullptr; } } + private: int sizePerParams = 0; ACL_EXEC_FUNC execFunc = nullptr; @@ -116,22 +130,24 @@ private: ACL_RELEASE_PARAM_FUNC releaseParamFunc = nullptr; }; // class CallBackManager -CallBackManager& manager() { +CallBackManager &manager() +{ static CallBackManager instance; return instance; } -CallBackManager& releaseManager() { +CallBackManager &releaseManager() +{ static CallBackManager releaseinstance; return releaseinstance; } } // namespace namespace register_queue_cb { -NPUCallBackRegisterBuilder::NPUCallBackRegisterBuilder(const ACL_EXEC_FUNC& execFunc, - const ACL_COPY_FUNC& copyFunc, const ACL_RELEASE_FUNC& releaseFunc, - const ACL_NEW_FUNC& newFunc, const ACL_DELETE_FUNC& deleteFunc, - const ACL_COPY_RELEASE_PARM_FUNC& copyReleaseParamF, const ACL_RELEASE_PARAM_FUNC& releaseParamF) { +NPUCallBackRegisterBuilder::NPUCallBackRegisterBuilder(const ACL_EXEC_FUNC &execFunc, const ACL_COPY_FUNC ©Func, + const ACL_RELEASE_FUNC &releaseFunc, const ACL_NEW_FUNC &newFunc, const ACL_DELETE_FUNC &deleteFunc, + const ACL_COPY_RELEASE_PARM_FUNC ©ReleaseParamF, const ACL_RELEASE_PARAM_FUNC &releaseParamF) +{ manager().SetExec(execFunc); manager().SetCopy(copyFunc); manager().SetRelease(releaseFunc); @@ -153,7 +169,7 @@ static constexpr size_t kQueueCapacity = 4096; static std::string repo_error; static std::string acl_error; -std::string get_func_error_msg(void* error_paras) +std::string get_func_error_msg(void *error_paras) { auto queueParam = static_cast(error_paras); auto type = queueParam->paramType; @@ -169,9 +185,8 @@ std::string get_func_error_msg(void* error_paras) result << "the current working operator name is " << op_name; } else if (type == c10_npu::queue::ASYNC_MEMCPY) { auto cur_paras = static_cast(queueParam->paramVal); - result << "the current copy params are srclen=" << cur_paras->srcLen - << ", dstlen=" << cur_paras->dstLen - << ", kind=" << cur_paras->kind; + result << "the current copy params are srclen=" << cur_paras->srcLen << ", dstlen=" << cur_paras->dstLen << + ", kind=" << cur_paras->kind; } else { auto cur_paras = static_cast(queueParam->paramVal); result << "the current working event is " << cur_paras->event; @@ -179,7 +194,8 @@ std::string get_func_error_msg(void* error_paras) return result.str(); } -RepoStatus Repository::GetStatus() const { +RepoStatus Repository::GetStatus() const +{ if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call GetStatus(). !!"); } @@ -187,7 +203,8 @@ RepoStatus Repository::GetStatus() const { return repo_status.load(); } -void Repository::SetStatus(RepoStatus desired) { +void Repository::SetStatus(RepoStatus desired) +{ if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call SetStatus(). !!"); return; @@ -196,7 +213,8 @@ void Repository::SetStatus(RepoStatus desired) { repo_status = desired; } -void Repository::ChangeStatus(RepoStatus expected, RepoStatus desired) { +void Repository::ChangeStatus(RepoStatus expected, RepoStatus desired) +{ if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call ChangeStatus(). 
!!"); return; @@ -287,7 +305,7 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) c10_npu::option::oom_observer(); } } - + #ifndef BUILD_LIBTORCH if (gilState) { PyEval_RestoreThread(gilState); @@ -296,14 +314,15 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) if (check_error) { throw std::runtime_error("The Inner error is reported as above. " - "The process exits for this inner error, and " + repo_error + ".\n" + - "Since the operator is called asynchronously, the stacktrace may be inaccurate. " - "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + - "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " - "resulting in performance degradation. " - "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + - PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error); + "The process exits for this inner error, and " + + repo_error + ".\n" + + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " + "If you want to get the accurate stacktrace, " + "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " + "resulting in performance degradation. " + "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + + PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error); } else { ASCEND_LOGE("Inner error happend, detail: %s", repo_error); } @@ -319,7 +338,8 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) return NPU_STATUS_SUCCESS; } -bool Repository::WriteQueue(void* cur_paras) { +bool Repository::WriteQueue(void *cur_paras) +{ std::lock_guard lock(mu_enqueue); if (GetStatus() == RepoStatus::STOP_EXIT) { @@ -379,7 +399,7 @@ bool Repository::ReadQueue() } repo_error = get_func_error_msg(manager().getCurrentParams(datas, read_idx.idx)); ASCEND_LOGE("---Thread---%llu: device = %d, write_idx = %u, read_idx = %u, status = %d, ret = %d", - std::this_thread::get_id(), device_idx, write_idx.idx, read_idx.idx, GetStatus(), ret); + std::this_thread::get_id(), device_idx, write_idx.idx, read_idx.idx, GetStatus(), ret); while (!IsEmptyQueue()) { // ignore other tasks manager().Release(datas, read_idx.idx, releaseQueue); read_idx.idx = (read_idx.idx + 1) & (kQueueCapacity - 1); @@ -410,7 +430,8 @@ bool Repository::ReadQueue() return true; } -void Repository::Enqueue(void* cur_paras) { +void Repository::Enqueue(void *cur_paras) +{ if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call Enqueue(). !!"); return; @@ -463,14 +484,15 @@ void Repository::Enqueue(void* cur_paras) { } throw std::runtime_error("The Inner error is reported as above. " - "The process exits for this inner error, and " + repo_error + ".\n" + - "Since the operator is called asynchronously, the stacktrace may be inaccurate. " - "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + - "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " - "resulting in performance degradation. " - "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + - PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error); + "The process exits for this inner error, and " + + repo_error + ".\n" + + "Since the operator is called asynchronously, the stacktrace may be inaccurate. 
" + "If you want to get the accurate stacktrace, " + "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " + "resulting in performance degradation. " + "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + + PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error); } if (GetStatus() != RUN && GetStatus() != INIT) { @@ -487,7 +509,7 @@ void Repository::Enqueue(void* cur_paras) { } else if (type == c10_npu::queue::ASYNC_MEMCPY) { auto cur_paras = static_cast(queueParam->paramVal); ASCEND_LOGW("Task queue thread is exit, can't call Enqueue() for copy, srclen=%zu, dstlen is %zu, kind=%d", - cur_paras->srcLen, cur_paras->dstLen, cur_paras->kind); + cur_paras->srcLen, cur_paras->dstLen, cur_paras->kind); } else { auto cur_paras = static_cast(queueParam->paramVal); ASCEND_LOGW("Task queue thread is exit, can't call Enqueue() for event, event is=%p", cur_paras->event); @@ -543,7 +565,8 @@ void Repository::Enqueue(void* cur_paras) { SetWriteWorking(false); } -void Repository::Dequeue() { +void Repository::Dequeue() +{ if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call Dequeue(). !!"); return; @@ -594,8 +617,7 @@ void Repository::Dequeue() { continue; } __sync_synchronize(); - notify_empty = need_empty && - IsEmptyQueue(); // need_empty && (ret == false || IsEmptyQueue()); + notify_empty = need_empty && IsEmptyQueue(); // need_empty && (ret == false || IsEmptyQueue()); while (notify_empty) { s = eventfd_write(efd_empty, u); if (s != 0) { @@ -623,7 +645,8 @@ void Repository::Dequeue() { SetReadWorking(false); } -void Repository::ReleaseResource() { +void Repository::ReleaseResource() +{ manager().DeInit(datas); if (efd_read > 0) { close(efd_read); @@ -652,12 +675,13 @@ void Repository::SetQueueErrMsg(const char *errmsg) error_msg = errmsg; } -const char* Repository::GetQueueErrMsg() +const char *Repository::GetQueueErrMsg() { return error_msg; } -Repository::~Repository() { +Repository::~Repository() +{ if (initialized) { if (consumer.joinable()) { SetStatus(NEED_EXIT); @@ -669,15 +693,18 @@ Repository::~Repository() { } } -bool Repository::IsFullQueue() const { +bool Repository::IsFullQueue() const +{ return ((write_idx.idx + 1) & (kQueueCapacity - 1)) == read_idx.idx; } -bool Repository::CheckInit() const { +bool Repository::CheckInit() const +{ return initialized; } -void StartConsume(Repository* repo, c10::DeviceIndex device_id) { +void StartConsume(Repository *repo, c10::DeviceIndex device_id) +{ SetThreadName(ThreadType::aclThread); SetThreadAffinity(device_id); @@ -693,7 +720,8 @@ void StartConsume(Repository* repo, c10::DeviceIndex device_id) { return; } -void Repository::InitRepo(c10::DeviceIndex device_id) { +void Repository::InitRepo(c10::DeviceIndex device_id) +{ if (datas == nullptr) { datas = manager().Init(kQueueCapacity); ASCEND_LOGI("TaskQueue is enable"); @@ -724,7 +752,7 @@ std::string Repository::GetPara() } static constexpr size_t kReleaseQueueCapacity = 8192; -bool ReleaseQueue::WriteToReleaseQueue(void* cur_paras) +bool ReleaseQueue::WriteToReleaseQueue(void *cur_paras) { if (IsFullQueue()) { return false; @@ -737,7 +765,8 @@ bool ReleaseQueue::WriteToReleaseQueue(void* cur_paras) return true; } -void ReleaseQueue::PushToReleaseQueue(void* cur_paras) { +void ReleaseQueue::PushToReleaseQueue(void *cur_paras) +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call PushToReleaseQueue(). 
!!"); return; @@ -752,7 +781,8 @@ void ReleaseQueue::PushToReleaseQueue(void* cur_paras) { } } -bool ReleaseQueue::ReadFromReleaseQueue() { +bool ReleaseQueue::ReadFromReleaseQueue() +{ if (IsEmptyQueue()) { return false; } @@ -766,7 +796,8 @@ bool ReleaseQueue::ReadFromReleaseQueue() { return true; } -void ReleaseQueue::PopFromReleaseQueue() { +void ReleaseQueue::PopFromReleaseQueue() +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call PopFromReleaseQueue(). !!"); return; @@ -786,7 +817,8 @@ void ReleaseQueue::PopFromReleaseQueue() { } } -void StartRelease(ReleaseQueue* releaseQue) { +void StartRelease(ReleaseQueue *releaseQue) +{ SetThreadName(ThreadType::releaseThread); SetThreadAffinity(releaseQue->GetDeviceID()); @@ -809,7 +841,8 @@ void ReleaseQueue::InitReleaseQueue(c10::DeviceIndex device_id) device_idx = device_id; } -ReleaseQueue::~ReleaseQueue() { +ReleaseQueue::~ReleaseQueue() +{ if (initialized) { if (releaser.joinable()) { SetStatus(NEED_EXIT); @@ -819,11 +852,13 @@ ReleaseQueue::~ReleaseQueue() { releaseManager().DeInit(datas); } -bool ReleaseQueue::IsFullQueue() const { +bool ReleaseQueue::IsFullQueue() const +{ return ((write_idx.idx + 1) % kReleaseQueueCapacity) == read_idx.idx; } -RepoStatus ReleaseQueue::GetStatus() const { +RepoStatus ReleaseQueue::GetStatus() const +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call GetStatus(). !!"); } @@ -837,7 +872,8 @@ c10::DeviceIndex ReleaseQueue::GetDeviceID() const } -void ReleaseQueue::SetStatus(RepoStatus desired) { +void ReleaseQueue::SetStatus(RepoStatus desired) +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call SetStatus(). !!"); return; @@ -846,7 +882,8 @@ void ReleaseQueue::SetStatus(RepoStatus desired) { repo_status = desired; } -void ReleaseQueue::ChangeStatus(RepoStatus expected, RepoStatus desired) { +void ReleaseQueue::ChangeStatus(RepoStatus expected, RepoStatus desired) +{ if (initialized == false) { ASCEND_LOGE("Release queue is not initialized, shouldn't call ChangeStatus(). 
!!"); return; diff --git a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp index e4f3051141..b60117c61b 100644 --- a/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AsyncTaskQueueInterface.cpp @@ -11,19 +11,20 @@ #endif namespace c10_npu { namespace queue { -std::atomic QueueParas::g_correlation_id{0}; +std::atomic QueueParas::g_correlation_id{ 0 }; std::map CopyParas::COPY_PARAS_MAP{ - {ACL_MEMCPY_HOST_TO_HOST, "acl_memcpy_host_to_host"}, - {ACL_MEMCPY_HOST_TO_DEVICE, "acl_memcpy_host_to_device"}, - {ACL_MEMCPY_DEVICE_TO_HOST, "acl_memcpy_device_to_host"}, - {ACL_MEMCPY_DEVICE_TO_DEVICE, "acl_memcpy_device_to_device"}, + { ACL_MEMCPY_HOST_TO_HOST, "acl_memcpy_host_to_host" }, + { ACL_MEMCPY_HOST_TO_DEVICE, "acl_memcpy_host_to_device" }, + { ACL_MEMCPY_DEVICE_TO_HOST, "acl_memcpy_device_to_host" }, + { ACL_MEMCPY_DEVICE_TO_DEVICE, "acl_memcpy_device_to_device" }, }; std::map EventParas::EVENT_PARAS_MAP{ - {RECORD_EVENT, "record_event"}, - {WAIT_EVENT, "wait_event"}, - {LAZY_DESTROY_EVENT, "destroy_event"}, + { RECORD_EVENT, "record_event" }, + { WAIT_EVENT, "wait_event" }, + { LAZY_DESTROY_EVENT, "destroy_event" }, }; -void CopyParas::Copy(CopyParas& other) { +void CopyParas::Copy(CopyParas &other) +{ this->dst = other.dst; this->dstLen = other.dstLen; this->src = other.src; @@ -31,19 +32,15 @@ void CopyParas::Copy(CopyParas& other) { this->kind = other.kind; } -void EventParas::Copy(EventParas& other) { +void EventParas::Copy(EventParas &other) +{ this->event = other.event; this->eventAllocatorType = other.eventAllocatorType; } class AsyncCopyTask { public: - AsyncCopyTask( - void* dst, - size_t dstLen, - void* src, - size_t srcLen, - aclrtMemcpyKind kind); + AsyncCopyTask(void *dst, size_t dstLen, void *src, size_t srcLen, aclrtMemcpyKind kind); ~AsyncCopyTask() = default; void LaunchCopyTask(); @@ -53,13 +50,10 @@ private: class EventTask { public: - explicit EventTask( - aclrtEvent event, - EventAllocatorType allocatorType = RESERVED) + explicit EventTask(aclrtEvent event, EventAllocatorType allocatorType = RESERVED) : eventParam_(event, allocatorType){}; ~EventTask() = default; - void LaunchRecordTask( - c10_npu::NPUStream npuStream); + void LaunchRecordTask(c10_npu::NPUStream npuStream); void LaunchWaitTask(c10_npu::NPUStream npuStream); void LaunchLazyDestroyTask(c10::DeviceIndex device_index); @@ -67,12 +61,8 @@ private: EventParas eventParam_; }; -AsyncCopyTask::AsyncCopyTask( - void* dst, - size_t dstLen, - void* src, - size_t srcLen, - aclrtMemcpyKind kind) { +AsyncCopyTask::AsyncCopyTask(void *dst, size_t dstLen, void *src, size_t srcLen, aclrtMemcpyKind kind) +{ copyParam_.dst = dst; copyParam_.dstLen = dstLen; copyParam_.src = src; @@ -80,7 +70,8 @@ AsyncCopyTask::AsyncCopyTask( copyParam_.kind = kind; } -void AsyncCopyTask::LaunchCopyTask() { +void AsyncCopyTask::LaunchCopyTask() +{ RECORD_FUNCTION(CopyParas::COPY_PARAS_MAP[copyParam_.kind], std::vector({})); auto cur_stream = c10_npu::getCurrentNPUStream(); if (!cur_stream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { @@ -90,108 +81,106 @@ void AsyncCopyTask::LaunchCopyTask() { QueueParas params(ASYNC_MEMCPY, sizeof(CopyParas), ©Param_); c10_npu::enCurrentNPUStream(¶ms); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, CopyParas::COPY_PARAS_MAP[copyParam_.kind], params.correlation_id); + 
at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, CopyParas::COPY_PARAS_MAP[copyParam_.kind], + params.correlation_id); #endif } else { c10_npu::NPUStream stream = c10_npu::getCurrentNPUStream(); - NPU_CHECK_ERROR(aclrtMemcpyAsync( - copyParam_.dst, - copyParam_.dstLen, - copyParam_.src, - copyParam_.srcLen, - copyParam_.kind, - stream)); + NPU_CHECK_ERROR(aclrtMemcpyAsync(copyParam_.dst, copyParam_.dstLen, copyParam_.src, copyParam_.srcLen, + copyParam_.kind, stream)); } } -aclError LaunchAsyncCopyTask( - void* dst, - size_t dstLen, - void* src, - size_t srcLen, - aclrtMemcpyKind kind) { +aclError LaunchAsyncCopyTask(void *dst, size_t dstLen, void *src, size_t srcLen, aclrtMemcpyKind kind) +{ AsyncCopyTask copyTask(dst, dstLen, src, srcLen, kind); copyTask.LaunchCopyTask(); return ACL_ERROR_NONE; } -void EventTask::LaunchRecordTask(c10_npu::NPUStream npuStream) { +void EventTask::LaunchRecordTask(c10_npu::NPUStream npuStream) +{ RECORD_FUNCTION(EventParas::EVENT_PARAS_MAP[RECORD_EVENT], std::vector({})); if (!npuStream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[RECORD_EVENT]); #endif - uint64_t prof_correlation_id = 0; - { - c10_npu::NPUStreamGuard guard(npuStream); - QueueParas params(RECORD_EVENT, sizeof(EventParas), &eventParam_); - c10_npu::NPUEventManager::GetInstance().IncreaseUnrecordedCount(eventParam_.event); - c10_npu::enCurrentNPUStream(¶ms); - prof_correlation_id = params.correlation_id; - } - ASCEND_LOGI("Event: LaunchRecordTask is successfully executed, event=%p", eventParam_.event); + uint64_t prof_correlation_id = 0; + { + c10_npu::NPUStreamGuard guard(npuStream); + QueueParas params(RECORD_EVENT, sizeof(EventParas), &eventParam_); + c10_npu::NPUEventManager::GetInstance().IncreaseUnrecordedCount(eventParam_.event); + c10_npu::enCurrentNPUStream(¶ms); + prof_correlation_id = params.correlation_id; + } + ASCEND_LOGI("Event: LaunchRecordTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[RECORD_EVENT], prof_correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[RECORD_EVENT], + prof_correlation_id); #endif } else { NPU_CHECK_ERROR(aclrtRecordEvent(eventParam_.event, npuStream)); - ASCEND_LOGI("Event: aclrtRecordEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), eventParam_.event); + ASCEND_LOGI("Event: aclrtRecordEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), + eventParam_.event); } } -aclError LaunchRecordEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) { +aclError LaunchRecordEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) +{ EventTask recordTask(event); recordTask.LaunchRecordTask(npuStream); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventRecord( - reinterpret_cast(event), - reinterpret_cast(npuStream.stream(false)) - ); + trigger->traceNpuEventRecord(reinterpret_cast(event), + reinterpret_cast(npuStream.stream(false))); } #endif return ACL_ERROR_NONE; } -void EventTask::LaunchWaitTask(c10_npu::NPUStream npuStream) { +void 
EventTask::LaunchWaitTask(c10_npu::NPUStream npuStream) +{ RECORD_FUNCTION(EventParas::EVENT_PARAS_MAP[WAIT_EVENT], std::vector({})); if (!npuStream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { #ifndef BUILD_LIBTORCH at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(0, EventParas::EVENT_PARAS_MAP[WAIT_EVENT]); #endif - uint64_t prof_correlation_id = 0; - { - c10_npu::NPUStreamGuard guard(npuStream); - QueueParas params(WAIT_EVENT, sizeof(EventParas), &eventParam_); - c10_npu::enCurrentNPUStream(¶ms); - prof_correlation_id = params.correlation_id; - } - ASCEND_LOGI("Event: LaunchWaitTask is successfully executed, event=%p", eventParam_.event); + uint64_t prof_correlation_id = 0; + { + c10_npu::NPUStreamGuard guard(npuStream); + QueueParas params(WAIT_EVENT, sizeof(EventParas), &eventParam_); + c10_npu::enCurrentNPUStream(¶ms); + prof_correlation_id = params.correlation_id; + } + ASCEND_LOGI("Event: LaunchWaitTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[WAIT_EVENT], prof_correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[WAIT_EVENT], + prof_correlation_id); #endif } else { NPU_CHECK_ERROR(aclrtStreamWaitEvent(npuStream, eventParam_.event)); - ASCEND_LOGI("Event: aclrtStreamWaitEvent is successfully executed, stream=%p, event=%p", npuStream.stream(false), eventParam_.event); + ASCEND_LOGI("Event: aclrtStreamWaitEvent is successfully executed, stream=%p, event=%p", + npuStream.stream(false), eventParam_.event); } } -aclError LaunchWaitEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) { +aclError LaunchWaitEventTask(aclrtEvent event, c10_npu::NPUStream npuStream) +{ EventTask waitTask(event); waitTask.LaunchWaitTask(npuStream); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { - trigger->traceNpuEventWait( - reinterpret_cast(event), + trigger->traceNpuEventWait(reinterpret_cast(event), reinterpret_cast(npuStream.stream(false))); } #endif return ACL_ERROR_NONE; } -void EventTask::LaunchLazyDestroyTask(c10::DeviceIndex device_index) { +void EventTask::LaunchLazyDestroyTask(c10::DeviceIndex device_index) +{ RECORD_FUNCTION(EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT], std::vector({})); auto cur_stream = c10_npu::getCurrentNPUStream(); if (!cur_stream.isSyncLaunchStream() && c10_npu::option::OptionsManager::GetTaskQueueEnable()) { @@ -202,17 +191,19 @@ void EventTask::LaunchLazyDestroyTask(c10::DeviceIndex device_index) { c10_npu::enCurrentNPUStream(¶ms, device_index); ASCEND_LOGI("Event: LaunchLazyDestroyTask is successfully executed, event=%p", eventParam_.event); #ifndef BUILD_LIBTORCH - at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT], params.correlation_id); + at_npu::native::NpuUtils::ProfReportMarkDataToNpuProfiler(1, EventParas::EVENT_PARAS_MAP[LAZY_DESTROY_EVENT], + params.correlation_id); #endif } else { NPU_CHECK_ERROR(c10_npu::NPUEventManager::GetInstance().LazyDestroy(eventParam_.event), "aclrtDestroyEvent"); } } -aclError LaunchLazyDestroyEventTask(aclrtEvent event, c10::DeviceIndex device_index) { +aclError LaunchLazyDestroyEventTask(aclrtEvent event, c10::DeviceIndex device_index) +{ EventTask 
lazyDestroyTask(event); #ifndef BUILD_LIBTORCH - const c10_npu::impl::PyCallbackTrigger* trigger = c10_npu::impl::NPUTrace::getTrace(); + const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); if (C10_UNLIKELY(trigger)) { trigger->traceNpuEventDeletion(reinterpret_cast(event)); } diff --git a/torch_npu/csrc/core/npu/register/FunctionLoader.cpp b/torch_npu/csrc/core/npu/register/FunctionLoader.cpp index 17930667c8..28270dd4e9 100644 --- a/torch_npu/csrc/core/npu/register/FunctionLoader.cpp +++ b/torch_npu/csrc/core/npu/register/FunctionLoader.cpp @@ -5,22 +5,25 @@ namespace c10_npu { namespace option { - -FunctionLoader::FunctionLoader(const std::string& name) { +FunctionLoader::FunctionLoader(const std::string &name) +{ this->fileName = name + ".so"; } -FunctionLoader::~FunctionLoader() { +FunctionLoader::~FunctionLoader() +{ if (this->handle != nullptr) { dlclose(this->handle); } } -void FunctionLoader::Set(const std::string& name) { +void FunctionLoader::Set(const std::string &name) +{ this->registry[name] = nullptr; } -void* FunctionLoader::Get(const std::string& name) { +void *FunctionLoader::Get(const std::string &name) +{ if (this->handle == nullptr) { auto handle = dlopen(this->fileName.c_str(), RTLD_LAZY | RTLD_GLOBAL); if (handle == nullptr) { @@ -49,40 +52,44 @@ void* FunctionLoader::Get(const std::string& name) { } namespace register_function { - FunctionRegister* FunctionRegister::GetInstance() { - static FunctionRegister instance; - return &instance; - } - void FunctionRegister::Register(const std::string& name, ::std::unique_ptr& ptr) { - std::lock_guard lock(mu_); - registry.emplace(name, std::move(ptr)); - } +FunctionRegister *FunctionRegister::GetInstance() +{ + static FunctionRegister instance; + return &instance; +} +void FunctionRegister::Register(const std::string &name, ::std::unique_ptr &ptr) +{ + std::lock_guard lock(mu_); + registry.emplace(name, std::move(ptr)); +} - void FunctionRegister::Register(const std::string& name, const std::string& funcName) { - auto itr = registry.find(name); - if (itr == registry.end()) { - AT_ERROR(name, " library should register first."); - return; - } - itr->second->Set(funcName); +void FunctionRegister::Register(const std::string &name, const std::string &funcName) +{ + auto itr = registry.find(name); + if (itr == registry.end()) { + AT_ERROR(name, " library should register first."); + return; } + itr->second->Set(funcName); +} - void* FunctionRegister::Get(const std::string& soName, const std::string& funcName) { - auto itr = registry.find(soName); - if (itr != registry.end()) { - return itr->second->Get(funcName); - } - return nullptr; +void *FunctionRegister::Get(const std::string &soName, const std::string &funcName) +{ + auto itr = registry.find(soName); + if (itr != registry.end()) { + return itr->second->Get(funcName); } + return nullptr; +} - FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr) { - FunctionRegister::GetInstance()->Register(name, ptr); - } - FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string& soName, const std::string& funcName) { - FunctionRegister::GetInstance()->Register(soName, funcName); - } +FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string &name, ::std::unique_ptr &ptr) +{ + FunctionRegister::GetInstance()->Register(name, ptr); +} +FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string &soName, const std::string &funcName) +{ + FunctionRegister::GetInstance()->Register(soName, 
funcName); +} } // namespace register_function - - } // namespace option } // namespace c10_npu diff --git a/torch_npu/csrc/core/npu/register/OptionRegister.cpp b/torch_npu/csrc/core/npu/register/OptionRegister.cpp index e37543bf83..8f7f17a011 100644 --- a/torch_npu/csrc/core/npu/register/OptionRegister.cpp +++ b/torch_npu/csrc/core/npu/register/OptionRegister.cpp @@ -7,12 +7,13 @@ namespace c10_npu { namespace option { - -OptionInterface::OptionInterface(OptionCallBack callback) { +OptionInterface::OptionInterface(OptionCallBack callback) +{ this->callback = callback; } -void OptionInterface::Set(const std::string& in) { +void OptionInterface::Set(const std::string &in) +{ this->val = in; if (this->callback != nullptr) { if (c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { @@ -25,24 +26,27 @@ void OptionInterface::Set(const std::string& in) { } } -std::string OptionInterface::Get() { +std::string OptionInterface::Get() +{ return val; } namespace register_options { -OptionRegister* OptionRegister::GetInstance() { +OptionRegister *OptionRegister::GetInstance() +{ static OptionRegister instance; return &instance; } -void OptionRegister::Register(const std::string& name, - ::std::unique_ptr& ptr) { +void OptionRegister::Register(const std::string &name, ::std::unique_ptr &ptr) +{ std::lock_guard lock(mu_); registry.emplace(name, std::move(ptr)); } -void OptionRegister::Set(const std::string& name, const std::string& val) { +void OptionRegister::Set(const std::string &name, const std::string &val) +{ auto itr = registry.find(name); if (itr != registry.end()) { itr->second->Set(val); @@ -51,7 +55,8 @@ void OptionRegister::Set(const std::string& name, const std::string& val) { } } -c10::optional OptionRegister::Get(const std::string& name) { +c10::optional OptionRegister::Get(const std::string &name) +{ auto itr = registry.find(name); if (itr != registry.end()) { return itr->second->Get(); @@ -59,17 +64,16 @@ c10::optional OptionRegister::Get(const std::string& name) { return c10::nullopt; // default value } -OptionInterfaceBuilder::OptionInterfaceBuilder( - const std::string& name, - ::std::unique_ptr& ptr, - const std::string& type) { +OptionInterfaceBuilder::OptionInterfaceBuilder(const std::string &name, ::std::unique_ptr &ptr, + const std::string &type) +{ OptionRegister::GetInstance()->Register(name, ptr); // init the value if env variable. 
if (type == "env") { std::string env_name = name; std::transform(env_name.begin(), env_name.end(), env_name.begin(), ::toupper); - char* env_val = std::getenv(env_name.c_str()); + char *env_val = std::getenv(env_name.c_str()); if (env_val != nullptr) { std::string val(env_val); OptionRegister::GetInstance()->Set(name, val); @@ -78,19 +82,21 @@ OptionInterfaceBuilder::OptionInterfaceBuilder( } } // namespace register_options -void SetOption(const std::string& key, const std::string& val) { +void SetOption(const std::string &key, const std::string &val) +{ register_options::OptionRegister::GetInstance()->Set(key, val); } -void SetOption(const std::map& options) { +void SetOption(const std::map &options) +{ for (auto item : options) { SetOption(item.first, item.second); } } -c10::optional GetOption(const std::string& key) { +c10::optional GetOption(const std::string &key) +{ return register_options::OptionRegister::GetInstance()->Get(key); } - } // namespace option } // namespace c10_npu -- Gitee From 2fd18b24645472a07e1e26eb95c871ed47c2b761 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Wed, 26 Mar 2025 02:02:01 +0000 Subject: [PATCH 236/358] !19492 cleancode Merge pull request !19492 from yuhaiyan/v2.6.0-dev1 --- torch_npu/csrc/framework/OpParamMaker.cpp | 2 +- .../{AoeUtils.cpp => AoeDumpGraphManager.cpp} | 31 +- .../aoe/{AoeUtils.h => AoeDumpGraphManager.h} | 2 +- .../framework/autograd/FunctionsManual.cpp | 106 +++--- .../csrc/framework/autograd/FunctionsManual.h | 14 +- .../framework/autograd/VariableTypeManual.cpp | 139 ++++---- .../framework/contiguous/ContiguousOpt.cpp | 310 +++++++++--------- .../csrc/framework/contiguous/ContiguousOpt.h | 46 +-- .../framework/contiguous/ContiguousUtils.cpp | 80 ++--- .../csrc/framework/contiguous/ReshapeOpt.cpp | 87 ++--- .../csrc/framework/contiguous/ReshapeOpt.h | 12 +- .../contiguous/contiguous_register.h | 68 ++-- .../csrc/framework/interface/AclInterface.cpp | 223 +++++++------ .../interface/AclOpCompileInterface.cpp | 190 +++++------ .../interface/AclOpCompileInterface.h | 3 +- .../csrc/framework/interface/EnvVariables.cpp | 52 +-- .../framework/interface/HcclInterface.cpp | 3 +- .../csrc/framework/interface/HcclInterface.h | 7 +- .../csrc/framework/interface/LibAscendHal.cpp | 9 +- .../interface/MsProfilerInterface.cpp | 2 +- .../framework/interface/MstxInterface.cpp | 2 +- 21 files changed, 734 insertions(+), 654 deletions(-) rename torch_npu/csrc/framework/aoe/{AoeUtils.cpp => AoeDumpGraphManager.cpp} (64%) rename torch_npu/csrc/framework/aoe/{AoeUtils.h => AoeDumpGraphManager.h} (98%) diff --git a/torch_npu/csrc/framework/OpParamMaker.cpp b/torch_npu/csrc/framework/OpParamMaker.cpp index 90c4fc4fed..038d97c44c 100644 --- a/torch_npu/csrc/framework/OpParamMaker.cpp +++ b/torch_npu/csrc/framework/OpParamMaker.cpp @@ -8,7 +8,7 @@ #include "torch_npu/csrc/distributed/HCCLUtils.hpp" #include "torch_npu/csrc/framework/OpCmdHelper.h" #include "torch_npu/csrc/framework/OpParamMaker.h" -#include "torch_npu/csrc/framework/aoe/AoeUtils.h" +#include "torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h" #include "torch_npu/csrc/framework/interface/HcclInterface.h" #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/NpuUtils.h" diff --git a/torch_npu/csrc/framework/aoe/AoeUtils.cpp b/torch_npu/csrc/framework/aoe/AoeDumpGraphManager.cpp similarity index 64% rename from torch_npu/csrc/framework/aoe/AoeUtils.cpp rename to torch_npu/csrc/framework/aoe/AoeDumpGraphManager.cpp index f5f1411166..676edada7b 100644 --- 
a/torch_npu/csrc/framework/aoe/AoeUtils.cpp +++ b/torch_npu/csrc/framework/aoe/AoeDumpGraphManager.cpp @@ -1,45 +1,52 @@ #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" -#include "torch_npu/csrc/framework/aoe/AoeUtils.h" +#include "torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h" namespace at_npu { namespace native { namespace aoe { -void AoeDumpGraphManager::SetDumpGraphPath(const std::string& dump_path) { +void AoeDumpGraphManager::SetDumpGraphPath(const std::string& dump_path) +{ autotune_graphdumppath = dump_path; } -std::string AoeDumpGraphManager::GetDumpGraphPath() const { +std::string AoeDumpGraphManager::GetDumpGraphPath() const +{ return autotune_graphdumppath; } -aclGraphDumpOption* AoeDumpGraphManager::CreateGraphDumpOption() { +aclGraphDumpOption* AoeDumpGraphManager::CreateGraphDumpOption() +{ AclGraphDumpOption = AclCreateGraphDumpOpt(); return AclGraphDumpOption; } -void AoeDumpGraphManager::DestropyGraphDumpOption() { +void AoeDumpGraphManager::DestropyGraphDumpOption() +{ AclDestroyGraphDumpOpt(AclGraphDumpOption); - AclGraphDumpOption = NULL; + AclGraphDumpOption = nullptr; } -void AoeDumpGraphManager::EnableAoe() { +void AoeDumpGraphManager::EnableAoe() +{ aoe_enable = true; } -bool AoeDumpGraphManager::IsAoeEnabled() const { +bool AoeDumpGraphManager::IsAoeEnabled() const +{ return aoe_enable; } -bool AoeDumpGraphManager::IsInWhitelist(const std::string &opName) const { - if (white_list_.find(opName) != white_list_.end()) - { +bool AoeDumpGraphManager::IsInWhitelist(const std::string &opName) const +{ + if (white_list_.find(opName) != white_list_.end()) { return true; } return false; } -AoeDumpGraphManager& aoe_manager() { +AoeDumpGraphManager& aoe_manager() +{ static AoeDumpGraphManager instance; return instance; } diff --git a/torch_npu/csrc/framework/aoe/AoeUtils.h b/torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h similarity index 98% rename from torch_npu/csrc/framework/aoe/AoeUtils.h rename to torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h index 54bac615b3..cae67e2d59 100644 --- a/torch_npu/csrc/framework/aoe/AoeUtils.h +++ b/torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h @@ -24,7 +24,7 @@ public: bool aoe_enable = false; // to save graph for autotune, default path is ./ std::string autotune_graphdumppath = "./"; - aclGraphDumpOption* AclGraphDumpOption = NULL; + aclGraphDumpOption* AclGraphDumpOption = nullptr; std::unordered_set white_list_ = { "Abs", "AccumulateNV2", diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp index 7bba1b6d88..b832378548 100644 --- a/torch_npu/csrc/framework/autograd/FunctionsManual.cpp +++ b/torch_npu/csrc/framework/autograd/FunctionsManual.cpp @@ -19,41 +19,50 @@ using at::IntArrayRef; using at::TensorList; using at::areAnyTensorSubclassLike; -Tensor apply_loss_reduction(const Tensor& unreduced, int64_t reduction) { - if (reduction == at::Reduction::Mean) { - return unreduced.mean(); - } else if (reduction == at::Reduction::Sum) { - return unreduced.sum(); - } - return unreduced; +Tensor apply_loss_reduction(const Tensor& unreduced, int64_t reduction) +{ + if (reduction == at::Reduction::Mean) { + return unreduced.mean(); + } else if (reduction == at::Reduction::Sum) { + return unreduced.sum(); + } + return unreduced; } -bool any_variable_defined(const variable_list& variables) { - for (const auto& variable : variables) { - if (variable.defined()) { - return true; +bool any_variable_defined(const variable_list& variables) +{ + 
for (const auto& variable : variables) { + if (variable.defined()) { + return true; + } } - } - return false; + return false; } -bool isDefined(const c10::optional& t) { - return t.has_value() && t->defined(); +bool isDefined(const c10::optional& t) +{ + return t.has_value() && t->defined(); } -Tensor toNonOptTensor(const c10::optional& t) { - return t.has_value() ? *t : Tensor(); +Tensor toNonOptTensor(const c10::optional& t) +{ + return t.has_value() ? *t : Tensor(); } -Tensor toNonOptFwGrad(const c10::optional& t) { - return (t.has_value() && t->defined()) ? t->_fw_grad(/* level */ 0) : Tensor(); +Tensor toNonOptFwGrad(const c10::optional& t) +{ + // 0: level 0 + return (t.has_value() && t->defined()) ? t->_fw_grad(0) : Tensor(); } -Tensor toNonOptPrimal(const c10::optional& t) { - return (t.has_value() && t->defined()) ? t->_fw_primal(/* level */ 0) : Tensor(); +Tensor toNonOptPrimal(const c10::optional& t) +{ + // 0: level 0 + return (t.has_value() && t->defined()) ? t->_fw_primal(0) : Tensor(); } -void copy_range(variable_list& out, IndexRange range, const Tensor& t) { +void copy_range(variable_list& out, IndexRange range, const Tensor& t) +{ AT_ASSERT(range.second <= out.size(), OPS_ERROR(ErrCode::PARAM)); AT_ASSERTM(range.second - range.first == 1, "inconsistent range for Tensor output", @@ -61,7 +70,8 @@ void copy_range(variable_list& out, IndexRange range, const Tensor& t) { out[range.first] = t; } -void copy_range(variable_list& out, IndexRange range, at::ArrayRef t) { +void copy_range(variable_list& out, IndexRange range, at::ArrayRef t) +{ AT_ASSERT(range.second <= out.size(), OPS_ERROR(ErrCode::PARAM)); AT_ASSERTM(range.second - range.first == t.size(), "inconsistent range for TensorList output", @@ -70,35 +80,39 @@ void copy_range(variable_list& out, IndexRange range, at::ArrayRef t) { } template -T not_implemented_base(const char* name, const char* reason) { - std::string msg = c10::str("the derivative for '", name, "' is not implemented."); - if (strlen(reason) > 0) { - msg = c10::str(msg, " ", reason); - }; - TORCH_CHECK_NOT_IMPLEMENTED(false, msg); +T not_implemented_base(const char* name, const char* reason) +{ + std::string msg = c10::str("the derivative for '", name, "' is not implemented."); + if (strlen(reason) > 0) { + msg = c10::str(msg, " ", reason); + }; + TORCH_CHECK_NOT_IMPLEMENTED(false, msg); } -Tensor not_implemented(const char* name, const char* reason) { - return not_implemented_base(name, reason); +Tensor not_implemented(const char* name, const char* reason) +{ + return not_implemented_base(name, reason); } -std::vector not_implemented_list(const char* name, const char* reason) { - return not_implemented_base>(name, reason); +std::vector not_implemented_list(const char* name, const char* reason) +{ + return not_implemented_base>(name, reason); } -Tensor maybe_multiply(const Tensor& t, const Scalar& s) { - bool is_one = false; - if (s.isFloatingPoint()) { - is_one = s.toSymFloat() == 1; - } else if (s.isIntegral(true)) { - is_one = s.toSymInt() == 1; - } - - if (is_one) { - return t; - } else { - return t * s; - } +Tensor maybe_multiply(const Tensor& t, const Scalar& s) +{ + bool is_one = false; + if (s.isFloatingPoint()) { + is_one = s.toSymFloat() == 1; + } else if (s.isIntegral(true)) { + is_one = s.toSymInt() == 1; + } + + if (is_one) { + return t; + } else { + return t * s; + } } } // namespace details diff --git a/torch_npu/csrc/framework/autograd/FunctionsManual.h b/torch_npu/csrc/framework/autograd/FunctionsManual.h index 522edbdb32..495a97eb2b 
100644 --- a/torch_npu/csrc/framework/autograd/FunctionsManual.h +++ b/torch_npu/csrc/framework/autograd/FunctionsManual.h @@ -20,13 +20,13 @@ namespace details { // A simple way to imperatively compute index ranges for slots // that have been flattened struct IndexRangeGenerator { - IndexRange range(size_t range_size) { - i += range_size; - return {i - range_size, i}; - } - size_t size() { return i; } - private: - size_t i = 0; + IndexRange range(size_t range_size) { + i += range_size; + return {i - range_size, i}; + } + size_t size() const { return i; } + private: + size_t i = 0; }; Tensor toNonOptFwGrad(const c10::optional& t); diff --git a/torch_npu/csrc/framework/autograd/VariableTypeManual.cpp b/torch_npu/csrc/framework/autograd/VariableTypeManual.cpp index 43acdcb2f2..7ef8237d0f 100644 --- a/torch_npu/csrc/framework/autograd/VariableTypeManual.cpp +++ b/torch_npu/csrc/framework/autograd/VariableTypeManual.cpp @@ -21,93 +21,104 @@ namespace at_npu { namespace autograd { namespace VariableType { -std::vector allTypesForBackends(at::ArrayRef backends) { - std::vector res; - res.reserve(backends.size()); - for (auto p : backends) { - for (const auto s : c10::irange(static_cast(ScalarType::NumOptions))) { - auto& type = getDeprecatedTypeProperties(static_cast(p), static_cast(s)); - res.emplace_back(&type); +std::vector allTypesForBackends(at::ArrayRef backends) +{ + std::vector res; + res.reserve(backends.size()); + for (auto p : backends) { + for (const auto s : c10::irange(static_cast(ScalarType::NumOptions))) { + auto& type = getDeprecatedTypeProperties(static_cast(p), static_cast(s)); + res.emplace_back(&type); + } } - } - return res; + return res; } -C10_EXPORT std::vector allCPUTypes() { - return allTypesForBackends({ Backend::CPU, Backend::SparseCPU }); +C10_EXPORT std::vector allCPUTypes() +{ + return allTypesForBackends({ Backend::CPU, Backend::SparseCPU }); } namespace { -const Variable& checked_cast_variable(const Tensor& t, const char* name, int pos) { - if (!t.defined()) { - AT_ERROR("Expected a proper Tensor but got None (or an undefined Tensor in C++) ", - "for argument #", pos, " '", name, "'"); - } - return t; +const Variable& checked_cast_variable(const Tensor& t, const char* name, int pos) +{ + if (!t.defined()) { + AT_ERROR("Expected a proper Tensor but got None (or an undefined Tensor in C++) ", + "for argument #", pos, " '", name, "'"); + } + return t; } -Variable& checked_cast_variable(Tensor& t, const char* name, int pos) { - if (!t.defined()) { - AT_ERROR("Expected a proper Tensor but got None (or an undefined Tensor in C++) ", - "for argument #", pos, " '", name, "'"); - } - return t; +Variable& checked_cast_variable(Tensor& t, const char* name, int pos) +{ + if (!t.defined()) { + AT_ERROR("Expected a proper Tensor but got None (or an undefined Tensor in C++) ", + "for argument #", pos, " '", name, "'"); + } + return t; } } // namespace -const Tensor& unpack(const Tensor& t, const char* name, int pos) { - return checked_cast_variable(t, name, pos); +const Tensor& unpack(const Tensor& t, const char* name, int pos) +{ + return checked_cast_variable(t, name, pos); } -Tensor& unpack(Tensor& t, const char* name, int pos) { - return checked_cast_variable(t, name, pos); +Tensor& unpack(Tensor& t, const char* name, int pos) +{ + return checked_cast_variable(t, name, pos); } -Tensor unpack_opt(const Tensor& t, const char* name, int pos) { - if (!t.defined()) { - return Tensor(); - } - return unpack(t, name, pos); +Tensor unpack_opt(const Tensor& t, const char* name, int 
pos) +{ + if (!t.defined()) { + return Tensor(); + } + return unpack(t, name, pos); } -std::vector unpack(at::TensorList tl, const char* name, int pos) { - std::vector ret(tl.size()); - for (const auto i : c10::irange(tl.size())) { - const auto &t = tl[i]; - if (!t.defined()) { - continue; +std::vector unpack(at::TensorList tl, const char* name, int pos) +{ + std::vector ret(tl.size()); + for (const auto i : c10::irange(tl.size())) { + const auto &t = tl[i]; + if (!t.defined()) { + continue; + } + ret[i] = static_cast(t); } - ret[i] = static_cast(t); - } - return ret; + (void) name; + (void) pos; + return ret; } namespace { // Taken from codegened version -Tensor _fw_primal(c10::DispatchKeySet ks, const Tensor& self, int64_t level) { - auto& self_ = unpack(self, "self", 0); - std::shared_ptr grad_fn; - if (compute_requires_grad(self)) { - grad_fn = std::make_shared(); - grad_fn->set_next_edges(collect_next_edges(self)); - } - - auto result = ([&]() { - at::AutoDispatchBelowAutograd guard; - return at::redispatch::_fw_primal(ks& c10::after_autograd_keyset, self_, level); - })(); - - if (grad_fn) { - set_history(flatten_tensor_args(result), grad_fn); - } - if (isFwGradDefined(self)) { - // Modified from original codegen - // We explicitly want to ignore the forward grad at the given level - TORCH_CHECK(level == 0, "Invalid level given to _fw_primal", OPS_ERROR(ErrCode::VALUE)); - // End modified from original codegen - } - return result; +Tensor _fw_primal(c10::DispatchKeySet ks, const Tensor& self, int64_t level) +{ + auto& self_ = unpack(self, "self", 0); + std::shared_ptr grad_fn; + if (compute_requires_grad(self)) { + grad_fn = std::make_shared(); + grad_fn->set_next_edges(collect_next_edges(self)); + } + + auto result = ([&]() { + at::AutoDispatchBelowAutograd guard; + return at::redispatch::_fw_primal(ks& c10::after_autograd_keyset, self_, level); + })(); + + if (grad_fn) { + set_history(flatten_tensor_args(result), grad_fn); + } + if (isFwGradDefined(self)) { + // Modified from original codegen + // We explicitly want to ignore the forward grad at the given level + TORCH_CHECK(level == 0, "Invalid level given to _fw_primal", OPS_ERROR(ErrCode::VALUE)); + // End modified from original codegen + } + return result; } } // namespace diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp index cc134627cf..f0be2dcb98 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp +++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.cpp @@ -11,73 +11,78 @@ ska::flat_hash_map TransContiguous::cached_contiguo ContiguousTensorDesc TransContiguous::GetTensorDescInfo( - const at::Tensor &src, const OptimizationCases &opt_cases) { - auto src_base_info = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); - c10::SmallVector src_size_inferred; - c10::SmallVector src_stride_inferred; - c10::SmallVector src_storage_size_inferred = - src_base_info.storage_sizes_; - if (src.dim() == 0) { - src_size_inferred = {1}; - src_stride_inferred = {1}; - if (src_storage_size_inferred.size() == 0) { - src_storage_size_inferred = {1}; + const at::Tensor &src, const OptimizationCases &opt_cases) +{ + auto src_base_info = torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc(); + c10::SmallVector src_size_inferred; + c10::SmallVector src_stride_inferred; + c10::SmallVector src_storage_size_inferred = src_base_info.storage_sizes_; + if (src.dim() == 0) { + src_size_inferred = {1}; + src_stride_inferred = {1}; + if 
(src_storage_size_inferred.size() == 0) { + src_storage_size_inferred = {1}; + } + } else { + src_size_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.sizes()); + src_stride_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.strides()); + } + ContiguousTensorDesc src_desc = { + src.is_contiguous(), src_size_inferred, + src_stride_inferred, src.storage_offset(), + src_base_info.base_sizes_, src_base_info.base_strides_, + src_storage_size_inferred, src_base_info.base_offset_, + src_base_info.npu_format_, opt_cases}; + if (src_desc.opt_cases_.empty()) { + src_desc.find_match_optimization_cases(); } - } else { - src_size_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.sizes()); - src_stride_inferred = CalcuOpUtil::ConvertIntArrayRefToSmallVector(src.strides()); - } - ContiguousTensorDesc src_desc = { - src.is_contiguous(), src_size_inferred, - src_stride_inferred, src.storage_offset(), - src_base_info.base_sizes_, src_base_info.base_strides_, - src_storage_size_inferred, src_base_info.base_offset_, - src_base_info.npu_format_, opt_cases}; - if (src_desc.opt_cases_.empty()) { - src_desc.find_match_optimization_cases(); - } - return src_desc; + return src_desc; } -bool TransContiguous::CheckClone(const at::Tensor &src, at::Tensor &self) { - // self tensor may not be temporary constructed empty tensor from src, so: - // 1. contiguous storage is needed:storage_offset and numels eq - // 2. full memory copy: size match between src and self - if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() && - src.sizes().equals(self.sizes()) && - self.sizes().equals(torch_npu::NPUBridge::GetNpuStorageImpl(self)->get_npu_desc().base_sizes_)) { - return true; - } - return false; +bool TransContiguous::CheckClone(const at::Tensor &src, at::Tensor &self) +{ + // self tensor may not be temporary constructed empty tensor from src, so: + // 1. contiguous storage is needed:storage_offset and numels eq + // 2. 
full memory copy: size match between src and self + if (StorageDescHelper::OffsetAreMatch(&self) && self.is_contiguous() && + src.sizes().equals(self.sizes()) && + self.sizes().equals(torch_npu::NPUBridge::GetNpuStorageImpl(self)->get_npu_desc().base_sizes_)) { + return true; + } + return false; } -bool TransContiguous::can_optimize_(ContiguousTensorDesc &tensor_desc) { - for (auto opt_case : tensor_desc.opt_cases_) { - bool res = register_opt::CopyOptRegister::GetInstance()->CanOptimize( - opt_case, tensor_desc); - if (res) { - // refresh patterns to only keep optimized pattern - tensor_desc.opt_cases_.clear(); - tensor_desc.opt_cases_.emplace_back(opt_case); - return true; +bool TransContiguous::can_optimize_(ContiguousTensorDesc &tensor_desc) +{ + for (auto opt_case : tensor_desc.opt_cases_) { + bool res = register_opt::CopyOptRegister::GetInstance()->CanOptimize( + opt_case, tensor_desc); + if (res) { + // refresh patterns to only keep optimized pattern + tensor_desc.opt_cases_.clear(); + tensor_desc.opt_cases_.emplace_back(opt_case); + return true; + } } - } - return false; + return false; } -bool TransContiguous::CanOptimize(ContiguousTensorDesc &tensor_desc) { - return can_optimize_(tensor_desc); +bool TransContiguous::CanOptimize(ContiguousTensorDesc &tensor_desc) +{ + return can_optimize_(tensor_desc); } bool TransContiguous::CanOptimize(const at::Tensor &tensor, - const OptimizationCases &opt_cases) { - ContiguousTensorDesc tensor_desc = GetTensorDescInfo(tensor, opt_cases); - return can_optimize_(tensor_desc); + const OptimizationCases &opt_cases) +{ + ContiguousTensorDesc tensor_desc = GetTensorDescInfo(tensor, opt_cases); + return can_optimize_(tensor_desc); } bool TransContiguous::contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) { - if (!CheckClone(src, self)) { + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) +{ + if (!CheckClone(src, self)) { return false; } for (auto &opt_case : src_desc.opt_cases_) { @@ -87,97 +92,99 @@ bool TransContiguous::contiguous_optimize_with_anyformat_( return true; } } - return false; + return false; } - size_t GetHash_(const c10::SmallVector& small_vector_size) - { - size_t seed = 0; - for (size_t i = 0; i < small_vector_size.size(); i++) { - seed ^= static_cast(small_vector_size[i]) + (seed << 6) + (seed >> 2); - } - return seed; +size_t GetHash_(const c10::SmallVector& small_vector_size) +{ + size_t seed = 0; + for (size_t i = 0; i < small_vector_size.size(); i++) { + seed ^= static_cast(small_vector_size[i]) + (seed << 6) + (seed >> 2); } + return seed; +} - size_t GetHash_(const ContiguousTensorDesc &src_desc) - { - size_t hash_src_desc = (GetHash_(src_desc.sizes_)<<52) + - (GetHash_(src_desc.base_sizes_)<<40) + - (GetHash_(src_desc.strides_)<<28) + - (GetHash_(src_desc.base_strides_)<<16) + - (static_cast(src_desc.offset_) << 4) + - src_desc.npu_format_; - return hash_src_desc; +size_t GetHash_(const ContiguousTensorDesc &src_desc) +{ + size_t hash_src_desc = (GetHash_(src_desc.sizes_)<<52) + + (GetHash_(src_desc.base_sizes_)<<40) + + (GetHash_(src_desc.strides_)<<28) + + (GetHash_(src_desc.base_strides_)<<16) + + (static_cast(src_desc.offset_) << 4) + + src_desc.npu_format_; + return hash_src_desc; +} + +bool equalDesc(const ContiguousTensorDesc &src_desc, const ContiguousTensorDesc &desc_desc) +{ + if (src_desc.sizes_ == desc_desc.sizes_ && + src_desc.base_sizes_ == desc_desc.base_sizes_ && + src_desc.strides_ == desc_desc.strides_ && + 
src_desc.base_strides_ == desc_desc.base_strides_ && + src_desc.offset_ == desc_desc.offset_ && + src_desc.npu_format_ == desc_desc.npu_format_) { + return true; } + return false; +} - bool equalDesc(const ContiguousTensorDesc &src_desc, const ContiguousTensorDesc &desc_desc) - { - if (src_desc.sizes_ == desc_desc.sizes_ && - src_desc.base_sizes_ == desc_desc.base_sizes_ && - src_desc.strides_ == desc_desc.strides_ && - src_desc.base_strides_ == desc_desc.base_strides_ && - src_desc.offset_ == desc_desc.offset_ && - src_desc.npu_format_ == desc_desc.npu_format_) { - return true; - } +bool TransContiguous::cached_contiguous_optimize_with_anyformat_( + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) +{ + // No cached, try caching + if (!CheckClone(src, self)) { return false; } - - bool TransContiguous::cached_contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc) - { - // No cached, try caching - if (!CheckClone(src, self)) { - return false; - } - src_desc.hash_src_desc = GetHash_(src_desc); - auto it = TransContiguous::cached_contiguous_opt.find(src_desc.hash_src_desc); - if (it != TransContiguous::cached_contiguous_opt.end()) { - // Cached - if (equalDesc(src_desc, it->second.contiguous_tensor_desc)) { - src_desc.cached_contiguous = true; - auto &opt_case = it->second.cached_opt_case; - return register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, - src, src_desc); - } - return contiguous_optimize_with_anyformat_(self, src, src_desc); + src_desc.hash_src_desc = GetHash_(src_desc); + auto it = TransContiguous::cached_contiguous_opt.find(src_desc.hash_src_desc); + if (it != TransContiguous::cached_contiguous_opt.end()) { + // Cached + if (equalDesc(src_desc, it->second.contiguous_tensor_desc)) { + src_desc.cached_contiguous = true; + auto &opt_case = it->second.cached_opt_case; + return register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, + src, src_desc); } + return contiguous_optimize_with_anyformat_(self, src, src_desc); + } - for (auto &opt_case : src_desc.opt_cases_) { - bool res = false; - if (TransContiguous::cached_contiguous_opt.size() >= CachedMaxSize) { - res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, src, src_desc); - } else { - src_desc.cached_contiguous = false; - res = register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, src, src_desc); - } - if (res) { - return true; - } + for (auto &opt_case : src_desc.opt_cases_) { + bool res = false; + if (TransContiguous::cached_contiguous_opt.size() >= CachedMaxSize) { + res = register_opt::CopyOptRegister::GetInstance()->Run(opt_case, self, src, src_desc); + } else { + src_desc.cached_contiguous = false; + res = register_opt::CopyOptRegister::GetInstance()->CachedRun(opt_case, self, src, src_desc); + } + if (res) { + return true; } - return false; } + return false; +} bool TransContiguous::ContiguousOptimizeWithAnyFormat( at::Tensor &self, const at::Tensor &src, - const OptimizationCases &opt_cases) { - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - return contiguous_optimize_with_anyformat_(self, src, src_desc); + const OptimizationCases &opt_cases) +{ + ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); + return contiguous_optimize_with_anyformat_(self, src, src_desc); } c10::optional TransContiguous::ContiguousOptimizeWithAnyFormat( - const at::Tensor &src, const OptimizationCases &opt_cases) { - TORCH_CHECK(src.device().type() == 
c10::DeviceType::PrivateUse1, - "Expected all tensors to be on the same device. " - "Expected NPU tensor, please check whether the input tensor device is correct.", - OPS_ERROR(ErrCode::TYPE)); - auto self = OpPreparation::ApplyTensorWithFormat( - src.sizes(), src.options(), torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); - ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); - if (cached_contiguous_optimize_with_anyformat_(self, src, src_desc)) { - return self; - } - return c10::nullopt; + const at::Tensor &src, const OptimizationCases &opt_cases) +{ + TORCH_CHECK(src.device().type() == c10::DeviceType::PrivateUse1, + "Expected all tensors to be on the same device. " + "Expected NPU tensor, please check whether the input tensor device is correct.", + OPS_ERROR(ErrCode::TYPE)); + auto self = OpPreparation::ApplyTensorWithFormat( + src.sizes(), src.options(), torch_npu::NPUBridge::GetNpuStorageImpl(src)->get_npu_desc().npu_format_); + ContiguousTensorDesc src_desc = GetTensorDescInfo(src, opt_cases); + if (cached_contiguous_optimize_with_anyformat_(self, src, src_desc)) { + return self; + } + return c10::nullopt; } bool TransContiguous::ContiguousOptimizeWithBaseFormat( @@ -196,33 +203,32 @@ bool TransContiguous::ContiguousOptimizeWithBaseFormat( return cached_contiguous_optimize_with_anyformat_(self, src, src_desc); } - - at::Tensor TransContiguous::view_tensor(const at::Tensor& self, - int64_t offset, - const c10::IntArrayRef& sizes, - const c10::IntArrayRef& strides) - { - at::Tensor self_; - if (self.is_quantized()) { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype(), - get_qtensorimpl(self)->quantizer()); - } else { - self_ = at::detail::make_tensor( - c10::TensorImpl::VIEW, - c10::Storage(self.storage()), - self.key_set(), - self.dtype()); - } - auto* self_tmp_ = self_.unsafeGetTensorImpl(); - self_tmp_->set_storage_offset(offset); - self_tmp_->set_sizes_and_strides(sizes, strides); - at::namedinference::propagate_names(self_, self); - return self_; +at::Tensor TransContiguous::view_tensor(const at::Tensor& self, + int64_t offset, + const c10::IntArrayRef& sizes, + const c10::IntArrayRef& strides) +{ + at::Tensor self_; + if (self.is_quantized()) { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype(), + get_qtensorimpl(self)->quantizer()); + } else { + self_ = at::detail::make_tensor( + c10::TensorImpl::VIEW, + c10::Storage(self.storage()), + self.key_set(), + self.dtype()); } + auto* self_tmp_ = self_.unsafeGetTensorImpl(); + self_tmp_->set_storage_offset(offset); + self_tmp_->set_sizes_and_strides(sizes, strides); + at::namedinference::propagate_names(self_, self); + return self_; +} } // namespace native } // namespace at_npu \ No newline at end of file diff --git a/torch_npu/csrc/framework/contiguous/ContiguousOpt.h b/torch_npu/csrc/framework/contiguous/ContiguousOpt.h index d7eef99903..8e99100041 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousOpt.h +++ b/torch_npu/csrc/framework/contiguous/ContiguousOpt.h @@ -28,37 +28,37 @@ namespace native { class TransContiguous { public: - TransContiguous() {} - virtual ~TransContiguous() {} - static bool CheckClone(const at::Tensor &src, at::Tensor &self); - static ContiguousTensorDesc - GetTensorDescInfo(const at::Tensor &src, + TransContiguous() {} + virtual ~TransContiguous() {} + static bool CheckClone(const at::Tensor &src, at::Tensor 
&self); + static ContiguousTensorDesc + GetTensorDescInfo(const at::Tensor &src, const OptimizationCases &opt_cases = optCasesDefault); - static bool can_optimize_(ContiguousTensorDesc &tensor_desc); - static bool CanOptimize(ContiguousTensorDesc &tensor_desc); - static bool CanOptimize(const at::Tensor &tensor, + static bool can_optimize_(ContiguousTensorDesc &tensor_desc); + static bool CanOptimize(ContiguousTensorDesc &tensor_desc); + static bool CanOptimize(const at::Tensor &tensor, const OptimizationCases &opt_cases); - static bool - contiguous_optimize_with_anyformat_(at::Tensor &self, const at::Tensor &src, + static bool + contiguous_optimize_with_anyformat_(at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc); - static bool ContiguousOptimizeWithAnyFormat( - at::Tensor &self, const at::Tensor &src, - const OptimizationCases &opt_cases = optCasesAnyFormat); - static c10::optional ContiguousOptimizeWithAnyFormat( - const at::Tensor &src, - const OptimizationCases &opt_cases = optCasesAnyFormat); - static bool ContiguousOptimizeWithBaseFormat( - at::Tensor &self, const at::Tensor &src, - const OptimizationCases &opt_cases = optCasesDefault, - bool OpenCombined = true); + static bool ContiguousOptimizeWithAnyFormat( + at::Tensor &self, const at::Tensor &src, + const OptimizationCases &opt_cases = optCasesAnyFormat); + static c10::optional ContiguousOptimizeWithAnyFormat( + const at::Tensor &src, + const OptimizationCases &opt_cases = optCasesAnyFormat); + static bool ContiguousOptimizeWithBaseFormat( + at::Tensor &self, const at::Tensor &src, + const OptimizationCases &opt_cases = optCasesDefault, + bool OpenCombined = true); static bool cached_contiguous_optimize_with_anyformat_( - at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc); + at::Tensor &self, const at::Tensor &src, ContiguousTensorDesc &src_desc); static ska::flat_hash_map cached_contiguous_opt; static at::Tensor view_tensor(const at::Tensor& self, int64_t offset, const c10::IntArrayRef& sizes, const c10::IntArrayRef& strides); private: - static OptimizationCases optCasesDefault; - static OptimizationCases optCasesAnyFormat; + static OptimizationCases optCasesDefault; + static OptimizationCases optCasesAnyFormat; }; } // namespace native diff --git a/torch_npu/csrc/framework/contiguous/ContiguousUtils.cpp b/torch_npu/csrc/framework/contiguous/ContiguousUtils.cpp index 0ad4891669..0360215edf 100644 --- a/torch_npu/csrc/framework/contiguous/ContiguousUtils.cpp +++ b/torch_npu/csrc/framework/contiguous/ContiguousUtils.cpp @@ -3,55 +3,59 @@ namespace at_npu { namespace native { -void ContiguousTensorDesc::refresh_contiguous_using_size_and_stride() { - if (c10::multiply_integers(sizes_) == 0) { - is_contiguous_ = true; - } - int64_t infer_axis_size = 1; - for (int64_t dim = static_cast(sizes_.size()) - 1; dim >= 0; dim--) { - if (sizes_[dim] != 1) { - if (strides_[dim] == infer_axis_size) { - infer_axis_size *= sizes_[dim]; - } else { - is_contiguous_ = false; - return; - } +void ContiguousTensorDesc::refresh_contiguous_using_size_and_stride() +{ + if (c10::multiply_integers(sizes_) == 0) { + is_contiguous_ = true; } - } - is_contiguous_ = true; + int64_t infer_axis_size = 1; + for (int64_t dim = static_cast(sizes_.size()) - 1; dim >= 0; dim--) { + if (sizes_[dim] != 1) { + if (strides_[dim] == infer_axis_size) { + infer_axis_size *= sizes_[dim]; + } else { + is_contiguous_ = false; + return; + } + } + } + is_contiguous_ = true; } void ContiguousTensorDesc::reset_optimization_cases( - 
const OptimizationCases &opt_cases) { - opt_cases_ = opt_cases; + const OptimizationCases &opt_cases) +{ + opt_cases_ = opt_cases; } -void ContiguousTensorDesc::add_optimization_case(const std::string &opt_case) { - opt_cases_.emplace_back(opt_case); +void ContiguousTensorDesc::add_optimization_case(const std::string &opt_case) +{ + opt_cases_.emplace_back(opt_case); } -void ContiguousTensorDesc::find_match_optimization_cases() { - for (const auto i : c10::irange(sizes_.size())) { - if (strides_[i] == 0) { - opt_cases_.emplace_back("broadcast"); - return; +void ContiguousTensorDesc::find_match_optimization_cases() +{ + for (const auto i : c10::irange(sizes_.size())) { + if (strides_[i] == 0) { + opt_cases_.emplace_back("broadcast"); + return; + } } - } - for (const auto i : c10::irange(strides_.size() - 1)) { - if (strides_[i] < strides_[i + 1]) { - opt_cases_.emplace_back("permute"); - return; + for (const auto i : c10::irange(strides_.size() - 1)) { + if (strides_[i] < strides_[i + 1]) { + opt_cases_.emplace_back("permute"); + return; + } + } + + // Considering combined-cases, we cannot split slice cases any further. + if (c10::multiply_integers(sizes_) < c10::multiply_integers(base_sizes_)) { + opt_cases_.emplace_back("slice"); + opt_cases_.emplace_back("select"); + opt_cases_.emplace_back("indexing"); + return; } - } - - // Considering combined-cases, we cannot split slice cases any further. - if (c10::multiply_integers(sizes_) < c10::multiply_integers(base_sizes_)) { - opt_cases_.emplace_back("slice"); - opt_cases_.emplace_back("select"); - opt_cases_.emplace_back("indexing"); - return; - } } } // namespace native diff --git a/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp b/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp index 48a3d88ea3..3536977542 100644 --- a/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp +++ b/torch_npu/csrc/framework/contiguous/ReshapeOpt.cpp @@ -4,26 +4,28 @@ namespace at_npu { namespace native { -bool can_use_memecpy_for_NZ_format(const ContiguousTensorDesc &tensor_desc) { - int64_t tensor_shape_size = static_cast(tensor_desc.sizes_.size()); - int64_t base_shape_size = static_cast(tensor_desc.base_sizes_.size()); - // No padding&&offset!=0 at the same time. e.g. x(3, 15, 16)[1:] - if (((tensor_desc.sizes_[tensor_shape_size - 1] % 16 != 0) || - (tensor_desc.sizes_[tensor_shape_size - 2] % 16 != 0)) && - tensor_desc.offset_ != 0) { - return false; - } - // Make sure that sizes of last 2 dims don't change - if (tensor_desc.sizes_[tensor_shape_size - 1] != - tensor_desc.base_sizes_[base_shape_size - 1] || - tensor_desc.sizes_[tensor_shape_size - 2] != - tensor_desc.base_sizes_[base_shape_size - 2]) { - return false; - } - return true; +bool can_use_memecpy_for_NZ_format(const ContiguousTensorDesc &tensor_desc) +{ + int64_t tensor_shape_size = static_cast(tensor_desc.sizes_.size()); + int64_t base_shape_size = static_cast(tensor_desc.base_sizes_.size()); + // No padding&&offset!=0 at the same time. e.g. 
x(3, 15, 16)[1:] + if (((tensor_desc.sizes_[tensor_shape_size - 1] % 16 != 0) || + (tensor_desc.sizes_[tensor_shape_size - 2] % 16 != 0)) && + tensor_desc.offset_ != 0) { + return false; + } + // Make sure that sizes of last 2 dims don't change + if (tensor_desc.sizes_[tensor_shape_size - 1] != + tensor_desc.base_sizes_[base_shape_size - 1] || + tensor_desc.sizes_[tensor_shape_size - 2] != + tensor_desc.base_sizes_[base_shape_size - 2]) { + return false; + } + return true; } -bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &tensor_desc) { +bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &tensor_desc) +{ // torch.flatten(x) case should be removed if (tensor_desc.sizes_.size() < 2) { return false; @@ -49,35 +51,38 @@ bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &tensor_desc) { } bool check_reshape_match(const ContiguousTensorDesc &self_desc, - const ContiguousTensorDesc &src_desc) { - // For all format, both src and self are taken into consideration - if (check_reshape_match(src_desc) && check_reshape_match(self_desc)) { - // tensor numels eqs for self and src tensor. i.e. make sure that storage - // keep same. - if (!(self_desc.sizes_ == src_desc.sizes_)) { - return false; + const ContiguousTensorDesc &src_desc) +{ + // For all format, both src and self are taken into consideration + if (check_reshape_match(src_desc) && check_reshape_match(self_desc)) { + // tensor numels eqs for self and src tensor. i.e. make sure that storage + // keep same. + if (!(self_desc.sizes_ == src_desc.sizes_)) { + return false; + } + + return true; } + return false; +} +bool check_reshape_match(const ContiguousTensorDesc &tensor_desc) +{ + // (case 1) Reshape tensor should be contiguous + if (!tensor_desc.is_contiguous_) { + return false; + } + // (case2) for other format, sizes at key dims should remain unchanged + if (!FormatHelper::IsBaseFormatType(tensor_desc.npu_format_)) { + return can_use_memcpy_for_other_format(tensor_desc); + } return true; - } - return false; } -bool check_reshape_match(const ContiguousTensorDesc &tensor_desc) { - // (case 1) Reshape tensor should be contiguous - if (!tensor_desc.is_contiguous_) { - return false; - } - // (case2) for other format, sizes at key dims should remain unchanged - if (!FormatHelper::IsBaseFormatType(tensor_desc.npu_format_)) { +bool CanUseMemcpyForOtherFormat(const at::Tensor &tensor) +{ + ContiguousTensorDesc tensor_desc = TransContiguous::GetTensorDescInfo(tensor); return can_use_memcpy_for_other_format(tensor_desc); - } - return true; -} - -bool CanUseMemcpyForOtherFormat(const at::Tensor &tensor) { - ContiguousTensorDesc tensor_desc = TransContiguous::GetTensorDescInfo(tensor); - return can_use_memcpy_for_other_format(tensor_desc); } } // namespace native diff --git a/torch_npu/csrc/framework/contiguous/ReshapeOpt.h b/torch_npu/csrc/framework/contiguous/ReshapeOpt.h index 7a721806ef..018f3853f8 100644 --- a/torch_npu/csrc/framework/contiguous/ReshapeOpt.h +++ b/torch_npu/csrc/framework/contiguous/ReshapeOpt.h @@ -8,15 +8,15 @@ namespace at_npu { namespace native { -bool can_use_memecpy_for_NZ_format(const ContiguousTensorDesc &); -bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &); +bool can_use_memecpy_for_NZ_format(const ContiguousTensorDesc &tensor_desc); +bool can_use_memcpy_for_other_format(const ContiguousTensorDesc &tensor_desc); bool check_reshape_match_flex(const ContiguousTensorDesc &, const ContiguousTensorDesc &); -bool check_reshape_match(const ContiguousTensorDesc &, - const 
ContiguousTensorDesc &); +bool check_reshape_match(const ContiguousTensorDesc &self_desc, + const ContiguousTensorDesc &src_desc); bool check_reshape_match_flex(const ContiguousTensorDesc &); -bool check_reshape_match(const ContiguousTensorDesc &); -bool CanUseMemcpyForOtherFormat(const at::Tensor &); +bool check_reshape_match(const ContiguousTensorDesc &tensor_desc); +bool CanUseMemcpyForOtherFormat(const at::Tensor &tensor); } // namespace native } // namespace at_npu diff --git a/torch_npu/csrc/framework/contiguous/contiguous_register.h b/torch_npu/csrc/framework/contiguous/contiguous_register.h index 2fe9c4636d..adfc7d84a9 100644 --- a/torch_npu/csrc/framework/contiguous/contiguous_register.h +++ b/torch_npu/csrc/framework/contiguous/contiguous_register.h @@ -18,13 +18,13 @@ namespace native { class ContiguousOpt { public: - ContiguousOpt() {} - virtual ~ContiguousOpt() = default; - virtual bool Optimizer(at::Tensor &self, const at::Tensor &src, + ContiguousOpt() {} + virtual ~ContiguousOpt() = default; + virtual bool Optimizer(at::Tensor &self, const at::Tensor &src, const ContiguousTensorDesc &src_desc) = 0; - virtual bool CanOptimizer(const ContiguousTensorDesc &src_desc) { - return false; - } + virtual bool CanOptimizer(const ContiguousTensorDesc &src_desc) { + return false; + } virtual bool CachedOptimizer(at::Tensor &self, const at::Tensor &src, const ContiguousTensorDesc &src_desc) { @@ -35,32 +35,32 @@ public: namespace register_opt { class CopyOptRegister { public: - ~CopyOptRegister() = default; - static CopyOptRegister *GetInstance() { - static CopyOptRegister instance; - return &instance; - } - void Register(std::string &name, ::std::unique_ptr &ptr) { - std::lock_guard lock(mu_); - registry.emplace(name, std::move(ptr)); - } + ~CopyOptRegister() = default; + static CopyOptRegister *GetInstance() { + static CopyOptRegister instance; + return &instance; + } + void Register(std::string &name, ::std::unique_ptr &ptr) { + std::lock_guard lock(mu_); + registry.emplace(name, std::move(ptr)); + } - bool CanOptimize(std::string &name, const ContiguousTensorDesc &src_desc) { - auto itr = registry.find(name); - if (itr != registry.end()) { - return itr->second->CanOptimizer(src_desc); + bool CanOptimize(std::string &name, const ContiguousTensorDesc &src_desc) { + auto itr = registry.find(name); + if (itr != registry.end()) { + return itr->second->CanOptimizer(src_desc); + } + return false; } - return false; - } - bool Run(const std::string &name, at::Tensor &self, const at::Tensor &src, + bool Run(const std::string &name, at::Tensor &self, const at::Tensor &src, const ContiguousTensorDesc &src_desc) { - auto itr = registry.find(name); - if (itr != registry.end()) { - return itr->second->Optimizer(self, src, src_desc); + auto itr = registry.find(name); + if (itr != registry.end()) { + return itr->second->Optimizer(self, src, src_desc); + } + return false; } - return false; - } bool CachedRun(const std::string &name, at::Tensor &self, const at::Tensor &src, const ContiguousTensorDesc &src_desc) @@ -73,17 +73,17 @@ public: } private: - CopyOptRegister() {} - mutable std::mutex mu_; - mutable std::map> registry; + CopyOptRegister() {} + mutable std::mutex mu_; + mutable std::map> registry; }; // class CopyOptRegister class CopyOptBuilder { public: - CopyOptBuilder(std::string name, ::std::unique_ptr &ptr) { - CopyOptRegister::GetInstance()->Register(name, ptr); - } - ~CopyOptBuilder() = default; + CopyOptBuilder(std::string name, ::std::unique_ptr &ptr) { + 
CopyOptRegister::GetInstance()->Register(name, ptr); + } + ~CopyOptBuilder() = default; }; // class CopyOptBuilder } // namespace register_opt diff --git a/torch_npu/csrc/framework/interface/AclInterface.cpp b/torch_npu/csrc/framework/interface/AclInterface.cpp index d2fcc24d7d..fd6310c866 100644 --- a/torch_npu/csrc/framework/interface/AclInterface.cpp +++ b/torch_npu/csrc/framework/interface/AclInterface.cpp @@ -26,111 +26,120 @@ LOAD_FUNCTION(aclprofFinalize) LOAD_FUNCTION(aclprofCreateConfig) LOAD_FUNCTION(aclprofDestroyConfig) -aclprofStepInfoPtr init_stepinfo() { - typedef aclprofStepInfoPtr(*npdInitFunc)(); - static npdInitFunc func = nullptr; - if (func == nullptr) { - func = (npdInitFunc)GET_FUNC(aclprofCreateStepInfo); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofCreateStepInfo", PROF_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(); - return ret; +aclprofStepInfoPtr init_stepinfo() +{ + typedef aclprofStepInfoPtr(*npdInitFunc)(); + static npdInitFunc func = nullptr; + if (func == nullptr) { + func = (npdInitFunc)GET_FUNC(aclprofCreateStepInfo); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofCreateStepInfo", PROF_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(); + return ret; } -NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) { - typedef NpdStatus(*npdDestroyFunc)(aclprofStepInfoPtr); - static npdDestroyFunc func = nullptr; - if (func == nullptr) { - func = (npdDestroyFunc)GET_FUNC(aclprofDestroyStepInfo); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofDestroyStepInfo", PROF_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(stepInfo); - return ret; +NpdStatus destroy_stepinfo(aclprofStepInfoPtr stepInfo) +{ + typedef NpdStatus(*npdDestroyFunc)(aclprofStepInfoPtr); + static npdDestroyFunc func = nullptr; + if (func == nullptr) { + func = (npdDestroyFunc)GET_FUNC(aclprofDestroyStepInfo); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofDestroyStepInfo", PROF_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(stepInfo); + return ret; } -NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) { - typedef NpdStatus(*npdStartProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); - static npdStartProfiling func = nullptr; - if (func == nullptr) { - func = (npdStartProfiling)GET_FUNC(aclprofGetStepTimestamp); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp", PROF_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(stepInfo, stepTag, stream); - return ret; +NpdStatus start_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) +{ + typedef NpdStatus(*npdStartProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); + static npdStartProfiling func = nullptr; + if (func == nullptr) { + func = (npdStartProfiling)GET_FUNC(aclprofGetStepTimestamp); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp", PROF_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(stepInfo, stepTag, stream); + return ret; } -NpdStatus stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) { - typedef NpdStatus(*npdStopProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); - static npdStopProfiling func = nullptr; - if (func == nullptr) { - func = (npdStopProfiling)GET_FUNC(aclprofGetStepTimestamp); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp", PROF_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(stepInfo, stepTag, stream); - return ret; +NpdStatus 
stop_deliver_op(aclprofStepInfoPtr stepInfo, aclprofStepTag stepTag, aclrtStream stream) +{ + typedef NpdStatus(*npdStopProfiling)(aclprofStepInfoPtr, aclprofStepTag, aclrtStream); + static npdStopProfiling func = nullptr; + if (func == nullptr) { + func = (npdStopProfiling)GET_FUNC(aclprofGetStepTimestamp); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofGetStepTimestamp", PROF_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(stepInfo, stepTag, stream); + return ret; } const char *AclGetErrMsg() { - typedef const char *(*aclGetErrMsg)(); - static aclGetErrMsg func = nullptr; - if (func == nullptr) { - func = (aclGetErrMsg)GET_FUNC(aclGetRecentErrMsg); - } - if (func != nullptr) { - return func(); - } - return ""; + typedef const char *(*aclGetErrMsg)(); + static aclGetErrMsg func = nullptr; + if (func == nullptr) { + func = (aclGetErrMsg)GET_FUNC(aclGetRecentErrMsg); + } + if (func != nullptr) { + return func(); + } + return ""; } -aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) { - typedef aclError(*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); - static AclrtCreateEventWithFlagFunc func = nullptr; - if (func == nullptr) { - func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventWithFlag); - } - TORCH_CHECK(func, "Failed to find function ", "aclrtCreateEventWithFlag", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(event, flag); +aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) +{ + typedef aclError(*AclrtCreateEventWithFlagFunc)(aclrtEvent*, uint32_t); + static AclrtCreateEventWithFlagFunc func = nullptr; + if (func == nullptr) { + func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventWithFlag); + } + TORCH_CHECK(func, "Failed to find function ", "aclrtCreateEventWithFlag", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(event, flag); } -aclError AclProfilingInit(const char *profilerResultPath, size_t length) { - typedef aclError (*AclProfInitFunc) (const char *, size_t); - static AclProfInitFunc func = nullptr; - if (func == nullptr) { - func = (AclProfInitFunc)GET_FUNC(aclprofInit); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofInit", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(profilerResultPath, length); +aclError AclProfilingInit(const char *profilerResultPath, size_t length) +{ + typedef aclError (*AclProfInitFunc) (const char *, size_t); + static AclProfInitFunc func = nullptr; + if (func == nullptr) { + func = (AclProfInitFunc)GET_FUNC(aclprofInit); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofInit", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(profilerResultPath, length); } -aclError AclProfilingStart(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfStartFunc) (const aclprofConfig *); - static AclProfStartFunc func = nullptr; - if (func == nullptr) { - func = (AclProfStartFunc)GET_FUNC(aclprofStart); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofStart", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(profilerConfig); +aclError AclProfilingStart(const aclprofConfig *profilerConfig) +{ + typedef aclError (*AclProfStartFunc) (const aclprofConfig *); + static AclProfStartFunc func = nullptr; + if (func == nullptr) { + func = (AclProfStartFunc)GET_FUNC(aclprofStart); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofStart", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(profilerConfig); } -aclError AclProfilingStop(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfStopFunc) (const aclprofConfig*); - static AclProfStopFunc 
func = nullptr; - if (func == nullptr) { - func = (AclProfStopFunc)GET_FUNC(aclprofStop); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofStop", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(profilerConfig); +aclError AclProfilingStop(const aclprofConfig *profilerConfig) +{ + typedef aclError (*AclProfStopFunc) (const aclprofConfig*); + static AclProfStopFunc func = nullptr; + if (func == nullptr) { + func = (AclProfStopFunc)GET_FUNC(aclprofStop); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofStop", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(profilerConfig); } -aclError AclProfilingFinalize() { - typedef aclError (*AclProfFinalizeFunc) (); - static AclProfFinalizeFunc func = nullptr; - if (func == nullptr) { - func = (AclProfFinalizeFunc)GET_FUNC(aclprofFinalize); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofFinalize", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(); +aclError AclProfilingFinalize() +{ + typedef aclError (*AclProfFinalizeFunc) (); + static AclProfFinalizeFunc func = nullptr; + if (func == nullptr) { + func = (AclProfFinalizeFunc)GET_FUNC(aclprofFinalize); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofFinalize", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(); } aclprofConfig *AclProfilingCreateConfig( @@ -138,25 +147,27 @@ aclprofConfig *AclProfilingCreateConfig( uint32_t deviceNums, aclprofAicoreMetrics aicoreMetrics, aclprofAicoreEvents *aicoreEvents, - uint64_t dataTypeConfig) { - typedef aclprofConfig *(*AclProfCreateConfigFunc) \ + uint64_t dataTypeConfig) +{ + typedef aclprofConfig *(*AclProfCreateConfigFunc) \ (uint32_t *, uint32_t, aclprofAicoreMetrics, const aclprofAicoreEvents *, uint64_t); - static AclProfCreateConfigFunc func = nullptr; - if (func == nullptr) { - func = (AclProfCreateConfigFunc)GET_FUNC(aclprofCreateConfig); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofCreateConfig", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(deviceIdList, deviceNums, aicoreMetrics, aicoreEvents, dataTypeConfig); + static AclProfCreateConfigFunc func = nullptr; + if (func == nullptr) { + func = (AclProfCreateConfigFunc)GET_FUNC(aclprofCreateConfig); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofCreateConfig", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(deviceIdList, deviceNums, aicoreMetrics, aicoreEvents, dataTypeConfig); } -aclError AclProfilingDestroyConfig(const aclprofConfig *profilerConfig) { - typedef aclError (*AclProfDestroyConfigFunc) (const aclprofConfig *); - static AclProfDestroyConfigFunc func = nullptr; - if (func == nullptr) { - func = (AclProfDestroyConfigFunc)GET_FUNC(aclprofDestroyConfig); - } - TORCH_CHECK(func, "Failed to find function ", "aclprofDestroyConfig", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(profilerConfig); +aclError AclProfilingDestroyConfig(const aclprofConfig *profilerConfig) +{ + typedef aclError (*AclProfDestroyConfigFunc) (const aclprofConfig *); + static AclProfDestroyConfigFunc func = nullptr; + if (func == nullptr) { + func = (AclProfDestroyConfigFunc)GET_FUNC(aclprofDestroyConfig); + } + TORCH_CHECK(func, "Failed to find function ", "aclprofDestroyConfig", PROF_ERROR(ErrCode::NOT_FOUND)); + return func(profilerConfig); } #undef LOAD_ASCEND_DUMP_FUNCTION @@ -171,8 +182,9 @@ REGISTER_LIBRARY(libascend_dump) LOAD_ASCEND_DUMP_FUNCTION(aclopStartDumpArgs) LOAD_ASCEND_DUMP_FUNCTION(aclopStopDumpArgs) -aclError AclopStartDumpArgs(uint32_t dumpType, const char *path) { - typedef aclError(*AclopStartDumpArgsFunc)(uint32_t, const char *); 
+aclError AclopStartDumpArgs(uint32_t dumpType, const char *path) +{ + typedef aclError(*AclopStartDumpArgsFunc)(uint32_t, const char *); static AclopStartDumpArgsFunc func = nullptr; if (func == nullptr) { func = (AclopStartDumpArgsFunc)GET_ASCEND_DUMP_FUNC(aclopStartDumpArgs); @@ -184,8 +196,9 @@ aclError AclopStartDumpArgs(uint32_t dumpType, const char *path) { return func(dumpType, path); } -aclError AclopStopDumpArgs(uint32_t dumpType) { - typedef aclError(*AclopStopDumpArgsFunc)(uint32_t); +aclError AclopStopDumpArgs(uint32_t dumpType) +{ + typedef aclError(*AclopStopDumpArgsFunc)(uint32_t); static AclopStopDumpArgsFunc func = nullptr; if (func == nullptr) { func = (AclopStopDumpArgsFunc)GET_ASCEND_DUMP_FUNC(aclopStopDumpArgs); diff --git a/torch_npu/csrc/framework/interface/AclOpCompileInterface.cpp b/torch_npu/csrc/framework/interface/AclOpCompileInterface.cpp index 663e0ec4a8..246f91870d 100644 --- a/torch_npu/csrc/framework/interface/AclOpCompileInterface.cpp +++ b/torch_npu/csrc/framework/interface/AclOpCompileInterface.cpp @@ -3,8 +3,8 @@ #include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" -#include "third_party/acl/inc/acl/acl_base.h" #include "torch_npu/csrc/core/npu/register/OptionsManager.h" +#include "third_party/acl/inc/acl/acl_base.h" namespace at_npu { @@ -28,7 +28,8 @@ namespace at_npu LOAD_FUNCTION(aclrtCtxSetSysParamOpt) LOAD_FUNCTION(aclrtSetSysParamOpt) -aclError AclSetCompileopt(aclCompileOpt opt, const char *value) { +aclError AclSetCompileopt(aclCompileOpt opt, const char *value) +{ bool ge_init_disable = c10_npu::option::OptionsManager::CheckGeInitDisable(); if (ge_init_disable) { return ACL_ERROR_NONE; @@ -43,120 +44,125 @@ aclError AclSetCompileopt(aclCompileOpt opt, const char *value) { return ret; } -c10::optional AclGetCompileoptSize(aclCompileOpt opt) { - typedef aclError (*aclGetCompileoptSizeFunc)(aclCompileOpt opt); - static aclGetCompileoptSizeFunc func = nullptr; - if (func == nullptr) { - func = (aclGetCompileoptSizeFunc)GET_FUNC(aclGetCompileoptSize); - } - if (func == nullptr) { - return c10::nullopt; - } else { - return func(opt); - } +c10::optional AclGetCompileoptSize(aclCompileOpt opt) +{ + typedef aclError (*aclGetCompileoptSizeFunc)(aclCompileOpt opt); + static aclGetCompileoptSizeFunc func = nullptr; + if (func == nullptr) { + func = (aclGetCompileoptSizeFunc)GET_FUNC(aclGetCompileoptSize); + } + if (func == nullptr) { + return c10::nullopt; + } else { + return func(opt); + } } -aclError AclGetCompileopt(aclCompileOpt opt, char *value, size_t length) { - typedef aclError (*aclGetCompileoptFunc)(aclCompileOpt opt, char *value, size_t length); - static aclGetCompileoptFunc func = nullptr; - if (func == nullptr) { - func = (aclGetCompileoptFunc)GET_FUNC(aclGetCompileopt); - } - if (func == nullptr) { - return ACL_ERROR_GE_FAILURE; - } else { - return func(opt, value, length); - } +aclError AclGetCompileopt(aclCompileOpt opt, char *value, size_t length) +{ + typedef aclError (*aclGetCompileoptFunc)(aclCompileOpt opt, char *value, size_t length); + static aclGetCompileoptFunc func = nullptr; + if (func == nullptr) { + func = (aclGetCompileoptFunc)GET_FUNC(aclGetCompileopt); + } + if (func == nullptr) { + return ACL_ERROR_GE_FAILURE; + } else { + return func(opt, value, length); + } } aclError AclGenGraphAndDumpForOp(const char *opType, int numInputs, const aclTensorDesc *const inputDesc[], const aclDataBuffer *const 
inputs[], int numOutputs, const aclTensorDesc *const outputDesc[], aclDataBuffer *const outputs[], const aclopAttr *attr, aclopEngineType engineType, const char *graphDumpPath, - aclGraphDumpOption* graphdumpOpt) { - typedef aclError(*AclGenGraphAndDumpForOpFunc)(const char *, int, - const aclTensorDesc *const [], const aclDataBuffer *const [], - int, const aclTensorDesc *const [], aclDataBuffer *const [], - const aclopAttr *, aclopEngineType, const char *, aclGraphDumpOption*); - static AclGenGraphAndDumpForOpFunc func = nullptr; - if (func == nullptr) { - func = (AclGenGraphAndDumpForOpFunc)GET_FUNC(aclGenGraphAndDumpForOp); - } - TORCH_CHECK(func, "Failed to find function ", "aclGenGraphAndDumpForOp", OPS_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(opType, numInputs, inputDesc, inputs, numOutputs, - outputDesc, outputs, attr, engineType, graphDumpPath, graphdumpOpt); - return ret; + aclGraphDumpOption* graphdumpOpt) +{ + typedef aclError(*AclGenGraphAndDumpForOpFunc)(const char *, int, + const aclTensorDesc *const [], const aclDataBuffer *const [], + int, const aclTensorDesc *const [], aclDataBuffer *const [], + const aclopAttr *, aclopEngineType, const char *, aclGraphDumpOption*); + static AclGenGraphAndDumpForOpFunc func = nullptr; + if (func == nullptr) { + func = (AclGenGraphAndDumpForOpFunc)GET_FUNC(aclGenGraphAndDumpForOp); + } + TORCH_CHECK(func, "Failed to find function ", "aclGenGraphAndDumpForOp", OPS_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(opType, numInputs, inputDesc, inputs, numOutputs, + outputDesc, outputs, attr, engineType, graphDumpPath, graphdumpOpt); + return ret; } -aclGraphDumpOption* AclCreateGraphDumpOpt() { - typedef aclGraphDumpOption*(*AclCreateGraphDumpOptFunc)(); - static AclCreateGraphDumpOptFunc func = nullptr; - if (func == nullptr) { - func = (AclCreateGraphDumpOptFunc)GET_FUNC(aclCreateGraphDumpOpt); - } - TORCH_CHECK(func, "Failed to find function ", "aclCreateGraphDumpOpt", OPS_ERROR(ErrCode::NOT_FOUND)); - return func(); +aclGraphDumpOption* AclCreateGraphDumpOpt() +{ + typedef aclGraphDumpOption*(*AclCreateGraphDumpOptFunc)(); + static AclCreateGraphDumpOptFunc func = nullptr; + if (func == nullptr) { + func = (AclCreateGraphDumpOptFunc)GET_FUNC(aclCreateGraphDumpOpt); + } + TORCH_CHECK(func, "Failed to find function ", "aclCreateGraphDumpOpt", OPS_ERROR(ErrCode::NOT_FOUND)); + return func(); } -aclError AclDestroyGraphDumpOpt(aclGraphDumpOption* aclGraphDumpOpt) { - typedef aclError(*AclDestroyGraphDumpOptFunc)(aclGraphDumpOption*); - static AclDestroyGraphDumpOptFunc func = nullptr; - if (func == nullptr) { - func = (AclDestroyGraphDumpOptFunc)GET_FUNC(aclDestroyGraphDumpOpt); - } - TORCH_CHECK(func, "Failed to find function ", "aclDestroyGraphDumpOpt", OPS_ERROR(ErrCode::NOT_FOUND)); - return func(aclGraphDumpOpt); +aclError AclDestroyGraphDumpOpt(aclGraphDumpOption* aclGraphDumpOpt) +{ + typedef aclError(*AclDestroyGraphDumpOptFunc)(aclGraphDumpOption*); + static AclDestroyGraphDumpOptFunc func = nullptr; + if (func == nullptr) { + func = (AclDestroyGraphDumpOptFunc)GET_FUNC(aclDestroyGraphDumpOpt); + } + TORCH_CHECK(func, "Failed to find function ", "aclDestroyGraphDumpOpt", OPS_ERROR(ErrCode::NOT_FOUND)); + return func(aclGraphDumpOpt); } aclError AclopCompileAndExecuteV2(const char *opType, int numInputs, aclTensorDesc *inputDesc[], aclDataBuffer *inputs[], int numOutputs, aclTensorDesc *outputDesc[], aclDataBuffer *outputs[], aclopAttr *attr, aclopEngineType engineType, aclopCompileType compileFlag, - const char *opPath, aclrtStream 
stream) { - typedef aclError(*AclopCompileAndExecuteV2Func)(const char *, - int, aclTensorDesc * [], aclDataBuffer * [], - int, aclTensorDesc * [], aclDataBuffer * [], - aclopAttr *, aclopEngineType, aclopCompileType, - const char *, aclrtStream); - static AclopCompileAndExecuteV2Func func = nullptr; - if (func == nullptr) { - func = (AclopCompileAndExecuteV2Func)GET_FUNC(aclopCompileAndExecuteV2); - } - TORCH_CHECK(func, "Failed to find function ", "aclopCompileAndExecuteV2", OPS_ERROR(ErrCode::NOT_FOUND)); - auto ret = func(opType, numInputs, inputDesc, inputs, numOutputs, - outputDesc, outputs, attr, engineType, compileFlag, opPath, stream); - return ret; + const char *opPath, aclrtStream stream) +{ + typedef aclError(*AclopCompileAndExecuteV2Func)(const char *, + int, aclTensorDesc * [], aclDataBuffer * [], + int, aclTensorDesc * [], aclDataBuffer * [], + aclopAttr *, aclopEngineType, aclopCompileType, + const char *, aclrtStream); + static AclopCompileAndExecuteV2Func func = nullptr; + if (func == nullptr) { + func = (AclopCompileAndExecuteV2Func)GET_FUNC(aclopCompileAndExecuteV2); + } + TORCH_CHECK(func, "Failed to find function ", "aclopCompileAndExecuteV2", OPS_ERROR(ErrCode::NOT_FOUND)); + auto ret = func(opType, numInputs, inputDesc, inputs, numOutputs, + outputDesc, outputs, attr, engineType, compileFlag, opPath, stream); + return ret; } -aclError AclrtCtxSetSysParamOpt(aclSysParamOpt opt, int64_t value) { - typedef aclError (*AclrtCtxSetSysParamOptFunc)(aclSysParamOpt opt, int64_t value); - static AclrtCtxSetSysParamOptFunc func = nullptr; - if (func == nullptr) { - func = (AclrtCtxSetSysParamOptFunc)GET_FUNC(aclrtCtxSetSysParamOpt); - } - if (func == nullptr) { - TORCH_WARN("Failed to find this aclrtCtxSetSysParamOpt function!"); - return ACL_ERROR_NONE; - } - auto ret = func(opt, value); - return ret; +aclError AclrtCtxSetSysParamOpt(aclSysParamOpt opt, int64_t value) +{ + typedef aclError (*AclrtCtxSetSysParamOptFunc)(aclSysParamOpt opt, int64_t value); + static AclrtCtxSetSysParamOptFunc func = nullptr; + if (func == nullptr) { + func = (AclrtCtxSetSysParamOptFunc)GET_FUNC(aclrtCtxSetSysParamOpt); + } + if (func == nullptr) { + TORCH_WARN("Failed to find this aclrtCtxSetSysParamOpt function!"); + return ACL_ERROR_NONE; + } + auto ret = func(opt, value); + return ret; } aclError AclrtSetSysParamOpt(aclSysParamOpt opt, int64_t value) { - typedef aclError (*AclrtSetSysParamOptFunc)(aclSysParamOpt opt, int64_t value); - static AclrtSetSysParamOptFunc func = nullptr; - if (func == nullptr) - { - func = (AclrtSetSysParamOptFunc)GET_FUNC(aclrtSetSysParamOpt); - } - if (func == nullptr) - { - TORCH_WARN("Failed to find this aclrtSetSysParamOpt function!"); - return ACL_ERROR_NONE; - } - auto ret = func(opt, value); - return ret; + typedef aclError (*AclrtSetSysParamOptFunc)(aclSysParamOpt opt, int64_t value); + static AclrtSetSysParamOptFunc func = nullptr; + if (func == nullptr) { + func = (AclrtSetSysParamOptFunc)GET_FUNC(aclrtSetSysParamOpt); + } + if (func == nullptr) { + TORCH_WARN("Failed to find this aclrtSetSysParamOpt function!"); + return ACL_ERROR_NONE; + } + auto ret = func(opt, value); + return ret; } } // namespace native diff --git a/torch_npu/csrc/framework/interface/AclOpCompileInterface.h b/torch_npu/csrc/framework/interface/AclOpCompileInterface.h index dd5cd079e6..de614fb4bb 100644 --- a/torch_npu/csrc/framework/interface/AclOpCompileInterface.h +++ b/torch_npu/csrc/framework/interface/AclOpCompileInterface.h @@ -10,7 +10,8 @@ namespace native { * @ingroup 
AscendCL * @brief an interface set compile flag * - * @param flag [IN] flag: ACL_OPCOMPILE_DEFAULT represent static compile while ACL_OPCOMPILE_FUZZ represent dynamic compile + * @param flag [IN] flag: ACL_OPCOMPILE_DEFAULT represent static compile + while ACL_OPCOMPILE_FUZZ represent dynamic compile * * @retval ACL_ERROR_NONE The function is successfully executed. * @retval OtherValues Failure diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index 347601aea8..c0892c91fe 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -5,7 +5,7 @@ #include "torch_npu/csrc/framework/utils/ForceJitCompileList.h" #include "torch_npu/csrc/framework/utils/ForceAclnnList.h" #include "torch_npu/csrc/framework/interface/AclOpCompileInterface.h" -#include "torch_npu/csrc/framework/aoe/AoeUtils.h" +#include "torch_npu/csrc/framework/aoe/AoeDumpGraphManager.h" #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/core/npu/NpuVariables.h" #include "torch_npu/csrc/core/npu/register/OptionRegister.h" @@ -13,11 +13,12 @@ namespace at_npu { namespace native { namespace env { -void ValidPathCheck(const std::string& file_path) { - char abs_path[PATH_MAX] = {'\0'}; - if (realpath(file_path.c_str(), abs_path) == nullptr) { - TORCH_CHECK(0, "configPath path Fails, path ", (char*)file_path.c_str(), PTA_ERROR(ErrCode::PTR)); - } +void ValidPathCheck(const std::string& file_path) +{ + char abs_path[PATH_MAX] = {'\0'}; + if (realpath(file_path.c_str(), abs_path) == nullptr) { + TORCH_CHECK(0, "configPath path Fails, path ", (char*)file_path.c_str(), PTA_ERROR(ErrCode::PTR)); + } } REGISTER_OPTION_HOOK(autotune, [](const std::string& val) { @@ -66,7 +67,8 @@ REGISTER_OPTION_HOOK(jitCompileInit, [](const std::string &val) { SET_OPTION_WITH_CACHE(isJitDisable, ("disable" == val) ? true : false); }) -bool CheckJitDisable() { +bool CheckJitDisable() +{ return GET_OPTION_WITH_CACHE(isJitDisable); } @@ -93,24 +95,25 @@ REGISTER_OPTION_HOOK(ACL_PRECISION_MODE, [](const std::string &val) { NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_PRECISION_MODE, val.c_str())); }) -bool IsAllowFP32ToFP16() { - // For Ascend910B1 and subsequent device, the default precision mode is must_keep_origin_dtype, - // and the default value for others is allow_fp32_to_fp16. - bool is_allow_fp32_to_fp16 = c10_npu::GetSocVersion() < c10_npu::SocVersion::Ascend910B1; - - static const std::string precision_mode = "ACL_PRECISION_MODE"; - auto precision_mode_val = c10_npu::option::GetOption(precision_mode); - if (precision_mode_val.has_value()) { - if (precision_mode_val.value() == "must_keep_origin_dtype") { - is_allow_fp32_to_fp16 = false; - } else if (precision_mode_val.value() == "allow_fp32_to_fp16") { - is_allow_fp32_to_fp16 = true; - } else { - ASCEND_LOGW("Unsupported precision mode value, using default value according to soc version."); +bool IsAllowFP32ToFP16() +{ + // For Ascend910B1 and subsequent device, the default precision mode is must_keep_origin_dtype, + // and the default value for others is allow_fp32_to_fp16. 
+ bool is_allow_fp32_to_fp16 = c10_npu::GetSocVersion() < c10_npu::SocVersion::Ascend910B1; + + static const std::string precision_mode = "ACL_PRECISION_MODE"; + auto precision_mode_val = c10_npu::option::GetOption(precision_mode); + if (precision_mode_val.has_value()) { + if (precision_mode_val.value() == "must_keep_origin_dtype") { + is_allow_fp32_to_fp16 = false; + } else if (precision_mode_val.value() == "allow_fp32_to_fp16") { + is_allow_fp32_to_fp16 = true; + } else { + ASCEND_LOGW("Unsupported precision mode value, using default value according to soc version."); + } } - } - return is_allow_fp32_to_fp16; + return is_allow_fp32_to_fp16; } REGISTER_OPTION_HOOK(ACL_OP_SELECT_IMPL_MODE, [](const std::string &val) { @@ -135,7 +138,8 @@ REGISTER_OPTION_HOOK(OP_HOOK_ENABLE, [](const std::string &val) { SET_OPTION_WITH_CACHE(isOpHookEnable, "enable" == val); }) -bool CheckOpHookEnable() { +bool CheckOpHookEnable() +{ return GET_OPTION_WITH_CACHE(isOpHookEnable); } diff --git a/torch_npu/csrc/framework/interface/HcclInterface.cpp b/torch_npu/csrc/framework/interface/HcclInterface.cpp index 19525ea62a..3b6b6eabcc 100644 --- a/torch_npu/csrc/framework/interface/HcclInterface.cpp +++ b/torch_npu/csrc/framework/interface/HcclInterface.cpp @@ -24,7 +24,8 @@ extern HcclResult HcclSetConfig(HcclConfig config, HcclConfigValue configValue) func = (HcclSetConfigFunc)GET_FUNC(HcclSetConfig); } if (func == nullptr) { - TORCH_NPU_WARN("Failed to find this HcclSetConfig function, get real hccl config, need to upgrade hccl version!"); + TORCH_NPU_WARN( + "Failed to find this HcclSetConfig function, get real hccl config, need to upgrade hccl version!"); return HcclResult::HCCL_SUCCESS; } return func(config, configValue); diff --git a/torch_npu/csrc/framework/interface/HcclInterface.h b/torch_npu/csrc/framework/interface/HcclInterface.h index b2cb3fd35d..1fb157a3c4 100644 --- a/torch_npu/csrc/framework/interface/HcclInterface.h +++ b/torch_npu/csrc/framework/interface/HcclInterface.h @@ -1,3 +1,6 @@ +#ifndef __PLUGIN_NATIVE_NPU_INTERFACE_HCCLINTERFACE__ +#define __PLUGIN_NATIVE_NPU_INTERFACE_HCCLINTERFACE__ + #include "third_party/hccl/inc/hccl/hccl.h" namespace at_npu { @@ -16,4 +19,6 @@ extern HcclResult HcclSetConfig(HcclConfig config, HcclConfigValue configValue); } // namespace hccl } // namespace native -} // namespace at_npu \ No newline at end of file +} // namespace at_npu + +#endif \ No newline at end of file diff --git a/torch_npu/csrc/framework/interface/LibAscendHal.cpp b/torch_npu/csrc/framework/interface/LibAscendHal.cpp index 536821cd48..dba95bd4e8 100644 --- a/torch_npu/csrc/framework/interface/LibAscendHal.cpp +++ b/torch_npu/csrc/framework/interface/LibAscendHal.cpp @@ -22,7 +22,8 @@ constexpr uint32_t ERR_FREQ = 0; constexpr uint32_t ERR_VER = 0; constexpr uint32_t FREQ_CONFIG = 24; -int64_t getFreq() { +int64_t getFreq() +{ using getReqFun = int32_t (*)(uint32_t, int32_t, int32_t, int64_t*); static getReqFun getFreqInfo = nullptr; if (getFreqInfo == nullptr) { @@ -39,7 +40,8 @@ int64_t getFreq() { return ERR_FREQ; } -int64_t getVer() { +int64_t getVer() +{ using getReqFun = int32_t (*)(int32_t*); static getReqFun getVerInfo = nullptr; if (getVerInfo == nullptr) { @@ -57,7 +59,8 @@ int64_t getVer() { return ver; } -bool isSyscntEnable() { +bool isSyscntEnable() +{ constexpr int32_t supportVersion = 0x071905; return getVer() >= supportVersion && getFreq() != ERR_FREQ; } diff --git a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp 
b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp index a36fbb8d50..4dacd70fb1 100644 --- a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp +++ b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp @@ -1,5 +1,5 @@ -#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/framework/interface/MsProfilerInterface.h" +#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" #include "third_party/acl/inc/acl/acl_prof.h" diff --git a/torch_npu/csrc/framework/interface/MstxInterface.cpp b/torch_npu/csrc/framework/interface/MstxInterface.cpp index 4024a63e27..b46dec9cc4 100644 --- a/torch_npu/csrc/framework/interface/MstxInterface.cpp +++ b/torch_npu/csrc/framework/interface/MstxInterface.cpp @@ -1,5 +1,5 @@ -#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/framework/interface/MstxInterface.h" +#include "torch_npu/csrc/core/npu/NPUException.h" #include "torch_npu/csrc/core/npu/register/FunctionLoader.h" #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" -- Gitee From b39647f851bedaa03d6cf0a8aa988b0d9d76c92c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Wed, 26 Mar 2025 02:06:57 +0000 Subject: [PATCH 237/358] =?UTF-8?q?!19460=20cleanCode=20Fix=20Merge=20pull?= =?UTF-8?q?=20request=20!19460=20from=20=E5=8F=B6=E5=AD=90=E5=87=A1/v2.6.0?= =?UTF-8?q?=5Fcleancode=5F0322?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/HCCLUtils.cpp | 2 +- torch_npu/csrc/distributed/LCCLUtils.cpp | 2 +- .../csrc/distributed/ParallelTcpServer.cpp | 4 +-- .../csrc/distributed/ProcessGroupHCCL.cpp | 26 +++++++++---------- .../csrc/distributed/ProcessGroupHCCL.hpp | 4 +-- torch_npu/distributed/distributed_c10d.py | 3 ++- 6 files changed, 20 insertions(+), 21 deletions(-) diff --git a/torch_npu/csrc/distributed/HCCLUtils.cpp b/torch_npu/csrc/distributed/HCCLUtils.cpp index ad0d86d320..62b008e376 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.cpp +++ b/torch_npu/csrc/distributed/HCCLUtils.cpp @@ -103,7 +103,7 @@ HcclDataType getHcclDataType(at::ScalarType type) std::string getHcclDataTypeSerialString(HcclDataType type) { const auto& iter = kHcclDataTypeToStringMap.find(type); - if (iter != kHcclDataTypeToStringMap.end()) { + if (iter != kHcclDataTypeToStringMap.cend()) { return iter->second; } else { TORCH_NPU_WARN_ONCE("Can not serialize undefined hccl data type."); diff --git a/torch_npu/csrc/distributed/LCCLUtils.cpp b/torch_npu/csrc/distributed/LCCLUtils.cpp index b4e62d9f29..913bb52d04 100644 --- a/torch_npu/csrc/distributed/LCCLUtils.cpp +++ b/torch_npu/csrc/distributed/LCCLUtils.cpp @@ -63,7 +63,7 @@ at_npu::lccl::LcclDataType getLcclDataType(at::ScalarType type) std::string getLcclDataTypeSerialString(at_npu::lccl::LcclDataType type) { const auto& iter = kLcclDataTypeToStringMap.find(type); - if (iter != kLcclDataTypeToStringMap.end()) { + if (iter != kLcclDataTypeToStringMap.cend()) { return iter->second; } else { TORCH_NPU_WARN_ONCE("Cannot serialize undefined LCCL data type."); diff --git a/torch_npu/csrc/distributed/ParallelTcpServer.cpp b/torch_npu/csrc/distributed/ParallelTcpServer.cpp index e7047c0ce0..a9121dc582 100644 --- a/torch_npu/csrc/distributed/ParallelTcpServer.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpServer.cpp @@ -206,7 +206,7 @@ void ParallelTcpServer::WakeupWaitingClients(const std::string &key) noexcept return; 
} - for (auto it : pos->second) { + for (auto it : std::as_const(pos->second)) { if (--socketWaitKeyNum_[it] <= 0) { stopWaitingSockets.emplace_back(it); socketWaitKeyNum_.erase(it); @@ -332,7 +332,7 @@ int ParallelTcpServer::SetNonBlocking(int fd) noexcept return -1; } - auto ret = fcntl(fd, F_SETFL, old | O_NONBLOCK); + auto ret = fcntl(fd, F_SETFL, static_cast(old) | O_NONBLOCK); if (ret != 0) { LOG(ERROR) << "set fd flags failed " << errno << " : " << strerror(errno); return -1; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index d07b3ea3b0..6dd9b13e11 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -51,8 +51,6 @@ static int32_t defaultExecTimeout = 1800; constexpr const char* P2P_DEVICE_KEY = "_p2p"; using hcclUs = std::chrono::steady_clock::time_point; -#define DURATION_US(x) (std::chrono::duration_cast(x)) -#define TIME_NOW() ({ std::chrono::steady_clock::now(); }) constexpr int32_t MAX_GROUP_NAME_LEN = 128; @@ -1760,7 +1758,7 @@ int64_t ProcessGroupHCCL::getStreamId(bool p2p, int peer) TORCH_CHECK(peer >= 0, "In p2p scenarios, the passed 'dst rank id' is error.", DIST_ERROR(ErrCode::PARAM)); key = getKeySendRecv(rank_, peer); } - if ((!hcclStreams_.count(key)) || hcclStreams_[key].empty()) { + if ((hcclStreams_.count(key) == 0) || hcclStreams_[key].empty()) { return -1; } return hcclStreams_[key][0].id(); @@ -2286,13 +2284,13 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() if (options_->hccl_config.find("group_name") != options_->hccl_config.end()) { if (std::holds_alternative(options_->hccl_config["group_name"])) { - auto groupName = std::get(options_->hccl_config["group_name"]); - uint32_t udiLength = groupName.length(); - if (groupName.length() >= UDI_MAX_LENGTH) { + auto hcclGroupName = std::get(options_->hccl_config["group_name"]); + uint32_t udiLength = hcclGroupName.length(); + if (hcclGroupName.length() >= UDI_MAX_LENGTH) { udiLength = UDI_MAX_LENGTH - 1; TORCH_NPU_WARN("The length of group_name has exceeded the limit UDI_MAX_LENGTH which will be truncated to UDI_MAX_LENGTH - 1."); } - strncpy(config.hcclUdi, groupName.c_str(), udiLength); + strncpy(config.hcclUdi, hcclGroupName.c_str(), udiLength); config.hcclUdi[udiLength] = '\0'; } else { TORCH_CHECK(false, "Value type of group_name should be string.", DIST_ERROR(ErrCode::TYPE)); @@ -2454,7 +2452,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( // to avoid to much task pushed to the stream, leading to stream overflow // insert sync point fluxLimit(key, i) c10_npu::NPUStream& hcclStream = hcclStreams[i]; - hcclUs startut = TIME_NOW(); + hcclUs startut = std::chrono::steady_clock::now(); HCCL_CHECK_ERROR(fn(inputs[i], outputs[i], hcclComms[i]->getHcclComm(), hcclStream, work->is_dispatched), opTypeToString(opType).c_str()); if (c10_npu::option::OptionsManager::GetMultiStreamMemoryReuse() == c10_npu::option::ERASE_RECORD_STREAM) { work->recorded_outputs_.push_back( @@ -2616,7 +2614,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( // to avoid to much task pushed to the stream, leading to stream overflow // insert sync point fluxLimit(key, i) c10_npu::NPUStream& hcclStream = hcclStreams[0]; - hcclUs startut = TIME_NOW(); + hcclUs startut = std::chrono::steady_clock::now(); HCCL_CHECK_ERROR(fn(inputs[i], outputs[i], hcclComms[0]->getHcclComm(), hcclStream, work->is_dispatched), opTypeToString(opType).c_str()); if 
(c10_npu::option::OptionsManager::GetMultiStreamMemoryReuse() == c10_npu::option::ERASE_RECORD_STREAM) { work->recorded_outputs_.push_back( @@ -2794,7 +2792,7 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( // to avoid to much task pushed to the stream, leading to stream overflow // insert sync point fluxLimit(key, i) c10_npu::NPUStream& hcclStream = hcclStreams_[key][i]; - hcclUs startut = TIME_NOW(); + hcclUs startut = std::chrono::steady_clock::now(); HCCL_CHECK_ERROR(fn(tensors[i], hcclComms[i]->getHcclComm(), hcclStream, work->is_dispatched, p2pTargetRank), opTypeToString(opType).c_str()); } } @@ -3277,7 +3275,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_scatter_base_uneven( check_split_sizes(inputSplitSizes, inputTensor, size_); int inputSize = static_cast(inputSplitSizes.size()); - int inputRowSize = static_cast(inputTensor.size(0) ? inputTensor.numel() / inputTensor.size(0) : 1); + int inputRowSize = static_cast(inputTensor.size(0) != 0 ? inputTensor.numel() / inputTensor.size(0) : 1); std::vector inputCounts; std::vector inputSpl; inputSpl.push_back(0); @@ -3365,7 +3363,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_allgather_base_uneven( check_split_sizes(outputSplitSizes, outputTensor, size_); int outputSize = static_cast(outputSplitSizes.size()); - int outputRowSize = static_cast(outputTensor.size(0) ? outputTensor.numel() / outputTensor.size(0) : 1); + int outputRowSize = static_cast(outputTensor.size(0) != 0 ? outputTensor.numel() / outputTensor.size(0) : 1); std::vector outputCounts; std::vector outputSpl; outputSpl.push_back(0); @@ -4423,8 +4421,8 @@ c10::intrusive_ptr ProcessGroupHCCL::alltoall_base( int inputSize = static_cast(inputSplitSizes.size()); int outSize = static_cast(outputSplitSizes.size()); - int inputRowSize = static_cast(inputTensor.size(0) ? inputTensor.numel() / inputTensor.size(0) : 1); - int outputRowSize = static_cast(outputTensor.size(0) ? outputTensor.numel() / outputTensor.size(0) : 1); + int inputRowSize = static_cast(inputTensor.size(0) != 0 ? inputTensor.numel() / inputTensor.size(0) : 1); + int outputRowSize = static_cast(outputTensor.size(0) != 0 ? outputTensor.numel() / outputTensor.size(0) : 1); std::vector inputCounts; std::vector inputSpl; std::vector outputCounts; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index 7785d14d1c..c2c47c2160 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -252,10 +252,10 @@ public: // return intrusive_ptr of the object static c10::intrusive_ptr create( - bool is_high_priority_stream = false, + bool _is_high_priority_stream = false, std::chrono::milliseconds timeout = kNoTimeout) { - return c10::make_intrusive(is_high_priority_stream); + return c10::make_intrusive(_is_high_priority_stream); } std::unordered_map> hccl_config; diff --git a/torch_npu/distributed/distributed_c10d.py b/torch_npu/distributed/distributed_c10d.py index ef66ea772b..7f95f0648c 100644 --- a/torch_npu/distributed/distributed_c10d.py +++ b/torch_npu/distributed/distributed_c10d.py @@ -157,7 +157,8 @@ def _gather_object(obj, object_gather_list=None, dst=0, group=None): group_size, dtype=torch.long, device=current_device ) object_size_list = [ - object_sizes_tensor[i].unsqueeze(dim=0) for i in range(group_size) + object_sizes_tensor[i].unsqueeze(dim=0) + for i in range(group_size) ] # Allgather tensor sizes. 
An all-gather is needed here despite this being a # gather, since each rank needs to broadcast a tensor of the same (maximal) -- Gitee From e7d101743e42894a3dff5f535a8af202d456bca1 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 26 Mar 2025 03:26:21 +0000 Subject: [PATCH 238/358] !19549 Update op_plugin commit id Merge pull request !19549 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 2a0dc12795..ba18531b88 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 2a0dc1279515725e1d49a3e14df88e8748d4f17b +Subproject commit ba18531b882ecc932fc290de2116a1b3d2e5a962 -- Gitee From b9399759e4140e8728c439cd754d86e729e9d97d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Wed, 26 Mar 2025 06:52:42 +0000 Subject: [PATCH 239/358] =?UTF-8?q?!19560=20Change=20ACL=5FOP=5FINIT=5FMOD?= =?UTF-8?q?E=20check=20to=20warning=20level=20Merge=20pull=20request=20!19?= =?UTF-8?q?560=20from=20=E5=A7=9C=E6=80=A1=E6=96=87/v2.6.0=5Flz?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/register/OptionsManager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 9dd23d0d17..0f7d84c295 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -485,7 +485,7 @@ uint32_t OptionsManager::GetAclOpInitMode() int64_t acl_op_init_mode = (buf_val != nullptr) ? strtol(buf_val, nullptr, 10) : 0; std::unordered_map aclOpInitMode = getAclOpInitMode(); if (aclOpInitMode.find(acl_op_init_mode) == aclOpInitMode.end()) { - TORCH_CHECK(false, "ACL_OP_INIT_MODE should be 0, 1 or 2", PTA_ERROR(ErrCode::VALUE)); + TORCH_NPU_WARN_ONCE("Get env ACL_OP_INIT_MODE not in [0, 1, 2], so reset it to the default value 0."); } return static_cast(acl_op_init_mode); }(); -- Gitee From 78e0afca9233f87172c223c58fe8a34d525a6b3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Wed, 26 Mar 2025 09:54:59 +0000 Subject: [PATCH 240/358] =?UTF-8?q?!19576=20fix=20emptycache=20coredump=20?= =?UTF-8?q?Merge=20pull=20request=20!19576=20from=20=E6=9D=9C=E9=87=91?= =?UTF-8?q?=E8=88=AA/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/InitNpuBindings.cpp | 6 +++--- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/torch_npu/csrc/InitNpuBindings.cpp b/torch_npu/csrc/InitNpuBindings.cpp index 899887e10b..41ed3072a9 100644 --- a/torch_npu/csrc/InitNpuBindings.cpp +++ b/torch_npu/csrc/InitNpuBindings.cpp @@ -67,9 +67,9 @@ PyObject* THPModule_npu_shutdown(PyObject* self, PyObject* arg) at_npu::native::CachingHostAllocator_emptyCache(); try { ASCEND_LOGI("NPU shutdown NPUCachingAllocator emptyCache."); - c10_npu::NPUCachingAllocator::emptyCache(check_error); - } catch (std::exception& e) { - ASCEND_LOGE("NPUCachingAllocator::emptyCache failed err=:%s", e.what()); + c10_npu::NPUCachingAllocator::emptyCache(false); + } catch (...) 
{ + ASCEND_LOGE("NPUCachingAllocator::emptyCache failed"); } ASCEND_LOGI("NPU shutdown NpuSysCtrl Finalize."); diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 7afdec6037..ebc64af1ac 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -2414,10 +2414,15 @@ private: for (auto &e : st.second) { EventPool::Event event = std::move(e.first); Block *block = e.second; - if (check_error) { - NPU_CHECK_ERROR(aclrtSynchronizeEvent(*event)); + auto err = aclrtSynchronizeEvent(*event); + if (err != ACL_ERROR_NONE) { + if (check_error) { + NPU_CHECK_ERROR(err); + } else { + ASCEND_LOGE("Event: aclrtSynchronizeEvent failed, event = %p", event.get()); + } } else { - NPU_CHECK_WARN(aclrtSynchronizeEvent(*event)); + ASCEND_LOGI("Event: aclrtSynchronizeEvent is successfully executed, event=%p", event.get()); } #ifndef BUILD_LIBTORCH const c10_npu::impl::PyCallbackTrigger *trigger = c10_npu::impl::NPUTrace::getTrace(); @@ -2425,7 +2430,6 @@ private: trigger->traceNpuEventSynchronization(reinterpret_cast(event.get())); } #endif - ASCEND_LOGI("Event: aclrtSynchronizeEvent is successfully executed, event=%p", event.get()); block->event_count--; if (block->event_count == 0) { -- Gitee From 308289edf860dc94331fd7b32d60748b7228ffe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E5=85=88=E7=90=AA?= <11608950+zhou-xianqi@user.noreply.gitee.com> Date: Wed, 26 Mar 2025 11:32:00 +0000 Subject: [PATCH 241/358] =?UTF-8?q?!19567=20optimize=5Flogger=20Merge=20pu?= =?UTF-8?q?ll=20request=20!19567=20from=20=E5=91=A8=E5=85=88=E7=90=AA/v2.6?= =?UTF-8?q?.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../analysis/prof_parse/_cann_file_parser.py | 16 +++++----- .../analysis/prof_parse/_fwk_file_parser.py | 30 ++++++++++++------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py index b70b3e049d..a053bbdebd 100644 --- a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py @@ -52,9 +52,9 @@ class CANNFileParser: r"^memory_record_\d{1,20}.*\.csv", r"^memory_record_slice_\d{1,20}.*\.csv"], CANNDataEnum.GE_OPERATOR_MEMORY: [r"^ge_operator_memory_\d{1,20}.*\.csv", - r"^ge_operator_memory_slice_\d{1,20}.*\.csv", - r"^operator_memory_\d{1,20}.*\.csv", - r"^operator_memory_slice_\d{1,20}.*\.csv"], + r"^ge_operator_memory_slice_\d{1,20}.*\.csv", + r"^operator_memory_\d{1,20}.*\.csv", + r"^operator_memory_slice_\d{1,20}.*\.csv"], CANNDataEnum.L2_CACHE: [r"^l2_cache_\d{1,20}.*\.csv", r"^l2_cache_slice_\d{1,20}.*\.csv"], CANNDataEnum.AI_CPU: [r"^aicpu_\d{1,20}.*\.csv", r"^aicpu_slice_\d{1,20}.*\.csv"], CANNDataEnum.COMMUNICATION: [r"^communication\.json"], @@ -125,6 +125,7 @@ class CANNFileParser: logger.error("There is no kernel events in msprof timeline.") acl_to_npu_dict = {} + warning_kernel_num = 0 for flow in flow_dict.values(): start_event = flow.get("start") end_event = flow.get("end") @@ -135,11 +136,11 @@ class CANNFileParser: unique_id = f"{pid}-{tid}-{ts}" kernel_event = event_dict.get(unique_id) if not kernel_event: - logger.warning("The kernel event of unique_id(pid: %d, tid: %d, ts: %d) is not exist in msprof timeline.", - pid, tid, ts) + warning_kernel_num += 1 continue acl_to_npu_dict.setdefault(convert_us2ns(start_event.get("ts", 0)), 
[]).append(EventBean(kernel_event)) - + if warning_kernel_num: + logger.warning(f"{warning_kernel_num} kernels do not exist in the msprof timeline.") return acl_to_npu_dict def get_timeline_all_data(self) -> list: @@ -200,7 +201,8 @@ class CANNFileParser: PathManager.remove_path_safety(output_path) def _file_dispatch(self): - all_file_list = ProfilerPathManager.get_output_all_file_list_by_type(self._cann_path, self.MINDSTUDIO_PROFILER_OUTPUT) + all_file_list = ProfilerPathManager.get_output_all_file_list_by_type(self._cann_path, + self.MINDSTUDIO_PROFILER_OUTPUT) all_file_list += ProfilerPathManager.get_analyze_all_file(self._cann_path, self.ANALYZE) all_file_list += ProfilerPathManager.get_database_all_file(self._cann_path) for file_path in all_file_list: diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py index f8010197a8..b15883fd04 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_file_parser.py @@ -48,6 +48,7 @@ class FwkFileParser: return enqueue_data_list op_mark_data.sort(key=lambda x: x.time_ns) tid_op_dict = defaultdict(lambda: defaultdict(list)) + match_failed_num = 0 for op_mark in op_mark_data: if not op_mark.is_enqueue: continue @@ -56,14 +57,15 @@ class FwkFileParser: continue start_op_list = tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - self.logger.warning("Enquque data match failed, the tid: %d, origin_name: %s is not exist.", - op_mark.tid, op_mark.origin_name) + match_failed_num += 1 continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns op_mark.dur = op_mark.time_ns - start_op.time_ns enqueue_data_list.append(op_mark) start_op_list.clear() + if match_failed_num: + self.logger.warning(f"{match_failed_num} enqueue data match failed.") return enqueue_data_list def get_dequeue_data(self) -> list: @@ -74,6 +76,7 @@ class FwkFileParser: return dequeue_data_list op_mark_data.sort(key=lambda x: x.time_ns) tid_op_dict = defaultdict(lambda: defaultdict(list)) + match_failed_num = 0 for op_mark in op_mark_data: if not op_mark.is_dequeue: continue @@ -82,14 +85,15 @@ class FwkFileParser: continue start_op_list = tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - self.logger.warning("Dequque data match failed, the tid: %d, origin_name: %s is not exist.", - op_mark.tid, op_mark.origin_name) + match_failed_num += 1 continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns op_mark.dur = op_mark.time_ns - start_op.time_ns dequeue_data_list.append(op_mark) start_op_list.clear() + if match_failed_num: + self.logger.warning(f"{match_failed_num} enqueue data match failed.") return dequeue_data_list def get_task_queue_data(self) -> any: @@ -100,6 +104,7 @@ class FwkFileParser: op_mark_data.sort(key=lambda x: x.time_ns) enqueue_tid_op_dict = defaultdict(lambda: defaultdict(list)) dequeue_tid_op_dict = defaultdict(lambda: defaultdict(list)) + enqueue_match_failed_num, dequeue_match_failed_num = 0, 0 for op_mark in op_mark_data: if op_mark.is_enqueue_start: enqueue_tid_op_dict[op_mark.tid][op_mark.origin_name].append(op_mark) @@ -110,8 +115,7 @@ class FwkFileParser: if op_mark.is_enqueue_end: start_op_list = enqueue_tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - self.logger.warning("Enquque data match failed, the tid: %d, origin_name: %s is not exist.", - op_mark.tid, op_mark.origin_name) + 
enqueue_match_failed_num += 1 continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns @@ -122,14 +126,17 @@ class FwkFileParser: if op_mark.is_dequeue_end: start_op_list = dequeue_tid_op_dict.get(op_mark.tid, {}).get(op_mark.origin_name, []) if not start_op_list: - self.logger.warning("Dequque data match failed, the tid: %d, origin_name: %s is not exist.", - op_mark.tid, op_mark.origin_name) + dequeue_match_failed_num += 1 continue start_op = start_op_list.pop() op_mark.ts = start_op.time_ns op_mark.dur = op_mark.time_ns - start_op.time_ns dequeue_data_list.append(op_mark) start_op_list.clear() + if enqueue_match_failed_num: + self.logger.warning(f"{enqueue_match_failed_num} enqueue data match failed.") + if dequeue_match_failed_num: + self.logger.warning(f"{dequeue_match_failed_num} dequeue data match failed.") return enqueue_data_list, dequeue_data_list def get_torch_op_tree_node(self, only_fwk: bool = False) -> list: @@ -230,7 +237,7 @@ class FwkFileParser: bwd_op_id = node['end']['idx'] torch_op_apis[fwb_op_id][3].append(start_connection_id) torch_op_apis[bwd_op_id][3].append(start_connection_id) - + start_connection_id += 1 def get_fwk_api(self) -> dict: @@ -247,7 +254,8 @@ class FwkFileParser: for torch_op in torch_op_data: api = [torch_op.ts, torch_op.end_ns, contact_2num(pid, torch_op.tid), [], torch_op.name, torch_op.args.get(Constant.SEQUENCE_NUMBER, -1), torch_op.args.get(Constant.FORWARD_THREAD_ID), - torch_op.args.get(Constant.INPUT_DTYPES), torch_op.args.get(Constant.INPUT_SHAPES), torch_op.call_stack] + torch_op.args.get(Constant.INPUT_DTYPES), torch_op.args.get(Constant.INPUT_SHAPES), + torch_op.call_stack] if torch_op.name == "mstx_mark_op": mstx_mark_apis.append(api) else: @@ -269,7 +277,7 @@ class FwkFileParser: task_dequeues.append( [dequeue_data.ts, dequeue_data.ts + dequeue_data.dur, contact_2num(pid, dequeue_data.tid), dequeue_data.corr_id, dequeue_data.name]) - + start_connection_id = max(connection_ids) + 1 if connection_ids else 0 self.update_fwd_bwd_connection_id(fwd_bwd_dict, torch_op_apis, start_connection_id) -- Gitee From fcbcf0f23b3fb1f2015d9095f289473bb7610f16 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 27 Mar 2025 02:40:00 +0000 Subject: [PATCH 242/358] !19621 Update op_plugin commit id Merge pull request !19621 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ba18531b88..e52d7ff6f3 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ba18531b882ecc932fc290de2116a1b3d2e5a962 +Subproject commit e52d7ff6f3f840d1ae118a24b63cda9b668ecf34 -- Gitee From d5afaf7097a9b5357ecf0604d7c9ad002afbd415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Thu, 27 Mar 2025 06:37:06 +0000 Subject: [PATCH 243/358] =?UTF-8?q?!19092=20cleancodeFix=20Merge=20pull=20?= =?UTF-8?q?request=20!19092=20from=20=E5=8F=B6=E5=AD=90=E5=87=A1/v2.6.0=5F?= =?UTF-8?q?cleancode=5F0313?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/core/npu/register/FunctionLoader.cpp | 4 +- .../csrc/core/npu/register/FunctionLoader.h | 4 +- .../csrc/core/npu/register/OptionsManager.cpp | 2 +- .../csrc/core/npu/register/OptionsManager.h | 2 +- torch_npu/csrc/distributed/HCCLUtils.cpp | 2 +- torch_npu/csrc/distributed/HCCLUtils.hpp | 6 +-- torch_npu/csrc/distributed/HcclCompile.h | 24 +++++------ torch_npu/csrc/distributed/Init.cpp | 6 +-- 
.../csrc/distributed/ParallelStoreProxy.cpp | 2 +- .../csrc/distributed/ParallelTcpServer.cpp | 6 +-- .../csrc/distributed/ParallelTcpStore.cpp | 2 +- .../csrc/distributed/ProcessGroupHCCL.cpp | 42 +++++++++---------- .../csrc/distributed/ProcessGroupHCCL.hpp | 4 +- .../csrc/distributed/ProcessGroupLCCL.cpp | 4 +- .../csrc/distributed/ProcessGroupLCCL.hpp | 4 +- torch_npu/csrc/distributed/StoreClient.cpp | 4 +- torch_npu/csrc/distributed/StoreClient.hpp | 2 +- torch_npu/csrc/distributed/reducer.cpp | 16 +++---- torch_npu/csrc/distributed/reducer.hpp | 17 ++++---- .../csrc/distributed/rpc/tensorpipe_agent.cpp | 2 +- torch_npu/csrc/logging/LogContext.cpp | 2 +- torch_npu/csrc/logging/LogContext.h | 2 +- 22 files changed, 81 insertions(+), 78 deletions(-) diff --git a/torch_npu/csrc/core/npu/register/FunctionLoader.cpp b/torch_npu/csrc/core/npu/register/FunctionLoader.cpp index 28270dd4e9..6ef031ce3e 100644 --- a/torch_npu/csrc/core/npu/register/FunctionLoader.cpp +++ b/torch_npu/csrc/core/npu/register/FunctionLoader.cpp @@ -82,11 +82,11 @@ void *FunctionRegister::Get(const std::string &soName, const std::string &funcNa return nullptr; } -FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string &name, ::std::unique_ptr &ptr) +FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string &name, ::std::unique_ptr &ptr) noexcept { FunctionRegister::GetInstance()->Register(name, ptr); } -FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string &soName, const std::string &funcName) +FunctionRegisterBuilder::FunctionRegisterBuilder(const std::string &soName, const std::string &funcName) noexcept { FunctionRegister::GetInstance()->Register(soName, funcName); } diff --git a/torch_npu/csrc/core/npu/register/FunctionLoader.h b/torch_npu/csrc/core/npu/register/FunctionLoader.h index 489243b1b1..722a78a48a 100644 --- a/torch_npu/csrc/core/npu/register/FunctionLoader.h +++ b/torch_npu/csrc/core/npu/register/FunctionLoader.h @@ -72,11 +72,11 @@ public: /** ctr */ - FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr); + FunctionRegisterBuilder(const std::string& name, ::std::unique_ptr& ptr) noexcept; /** ctr */ - FunctionRegisterBuilder(const std::string& soName, const std::string& funcName); + FunctionRegisterBuilder(const std::string& soName, const std::string& funcName) noexcept; }; // class FunctionRegisterBuilder } // namespace register_function diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 0f7d84c295..4e78c1e396 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -256,7 +256,7 @@ bool OptionsManager::CheckStatusSaveEnable() return CheckStatusSaveEnable; } -std::string OptionsManager::GetStatusSavePath() +std::string OptionsManager::GetStatusSavePath() noexcept { char* status_save_val = std::getenv("TORCH_HCCL_STATUS_SAVE_PATH"); std::string status_save_path = (status_save_val != nullptr) ? 
std::string(status_save_val) : "/tmp"; diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index d5ff1562eb..6bffa5aa6c 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -112,7 +112,7 @@ public: static int64_t GetRankId(); static char *GetNslbPath(); static bool CheckStatusSaveEnable(); - static std::string GetStatusSavePath(); + static std::string GetStatusSavePath() noexcept; static uint32_t GetStatusSaveInterval(); static uint32_t GetNslbCntVal(); static bool CheckGeInitDisable(); diff --git a/torch_npu/csrc/distributed/HCCLUtils.cpp b/torch_npu/csrc/distributed/HCCLUtils.cpp index 62b008e376..a284c47770 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.cpp +++ b/torch_npu/csrc/distributed/HCCLUtils.cpp @@ -1,7 +1,7 @@ #include -#include "torch_npu/csrc/distributed/HCCLUtils.hpp" #include "torch_npu/csrc/core/npu/interface/HcclInterface.h" +#include "torch_npu/csrc/distributed/HCCLUtils.hpp" namespace c10d_npu { diff --git a/torch_npu/csrc/distributed/HCCLUtils.hpp b/torch_npu/csrc/distributed/HCCLUtils.hpp index e39c59643f..40d350d3d3 100644 --- a/torch_npu/csrc/distributed/HCCLUtils.hpp +++ b/torch_npu/csrc/distributed/HCCLUtils.hpp @@ -1,11 +1,11 @@ #pragma once +#include +#include +#include #include "torch_npu/csrc/core/npu/npu_log.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/NPUException.h" -#include -#include -#include #include #include diff --git a/torch_npu/csrc/distributed/HcclCompile.h b/torch_npu/csrc/distributed/HcclCompile.h index 8204dd9555..e6358a7b1e 100644 --- a/torch_npu/csrc/distributed/HcclCompile.h +++ b/torch_npu/csrc/distributed/HcclCompile.h @@ -32,7 +32,7 @@ extern HcclResult hcclAlltoAllV(const void *sendBuf, const void *sendCounts, con HcclDataType sendType, const void *recvBuf, const void *recvCounts, const void *rdispls, HcclDataType recvType, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclAlltoAllVFunc)( + using HcclAlltoAllVFunc = HcclResult(*)( const void *, const void *, const void *, HcclDataType, const void *, const void *, const void *, HcclDataType, HcclComm, aclrtStream); @@ -50,7 +50,7 @@ extern HcclResult hcclAllGatherV(const void *sendBuf, uint64_t sendCount, const void *recvBuf, const void *recvCounts, const void *rdispls, HcclDataType dataType, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclAllGatherVFunc)( + using HcclAllGatherVFunc = HcclResult(*)( const void *, uint64_t, const void *, const void *, const void *, HcclDataType, HcclComm, aclrtStream); @@ -67,7 +67,7 @@ extern HcclResult hcclReduceScatterV(const void *sendBuf, const void *sendCounts const void *recvBuf, uint64_t recvCount, HcclDataType dataType, HcclReduceOp op, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclReduceScatterVFunc)( + using HcclReduceScatterVFunc = HcclResult(*)( const void *, const void *, const void *, const void *, uint64_t, HcclDataType, HcclReduceOp, HcclComm, aclrtStream); @@ -83,7 +83,7 @@ extern HcclResult hcclReduceScatterV(const void *sendBuf, const void *sendCounts extern HcclResult hcclReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType sendType, HcclReduceOp op, uint32_t root, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclReduceVFunc)( + using HcclReduceVFunc = HcclResult(*)( void *, void *, uint64_t, HcclDataType, HcclReduceOp, uint32_t, HcclComm, aclrtStream); static HcclReduceVFunc func = 
nullptr; if (func == nullptr) { @@ -96,7 +96,7 @@ extern HcclResult hcclReduce(void *sendBuf, void *recvBuf, uint64_t count, HcclD HcclResult hcclGetCommAsyncError(HcclComm comm, HcclResult* asyncError) { - typedef HcclResult(*HcclGetCommAsyncErrorVFunc)(HcclComm, HcclResult*); + using HcclGetCommAsyncErrorVFunc = HcclResult(*)(HcclComm, HcclResult*); static HcclGetCommAsyncErrorVFunc func = nullptr; if (func == nullptr) { func = (HcclGetCommAsyncErrorVFunc)GET_FUNC(HcclGetCommAsyncError); @@ -109,7 +109,7 @@ HcclResult hcclGetCommAsyncError(HcclComm comm, HcclResult* asyncError) HcclResult hcclScatter(void *sendBuf, void *recvBuf, uint64_t count, HcclDataType dataType, uint32_t root, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclScatterVFunc)(void *, void *, uint64_t, HcclDataType, uint32_t, HcclComm, aclrtStream); + using HcclScatterVFunc = HcclResult(*)(void *, void *, uint64_t, HcclDataType, uint32_t, HcclComm, aclrtStream); static HcclScatterVFunc func = nullptr; if (func == nullptr) { func = (HcclScatterVFunc)GET_FUNC(HcclScatter); @@ -121,7 +121,7 @@ HcclResult hcclScatter(void *sendBuf, void *recvBuf, uint64_t count, HcclDataTyp HcclResult hcclBatchIsendIrecv(void* sendRecvInfo, uint32_t itemNum, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclBatchIsendIrecvVFunc)( + using HcclBatchIsendIrecvVFunc = HcclResult(*)( void *, uint32_t, HcclComm, aclrtStream); static HcclBatchIsendIrecvVFunc func = nullptr; if (func == nullptr) { @@ -136,7 +136,7 @@ HcclResult hcclAlltoAll(const void *sendBuf, uint64_t sendCount, HcclDataType se const void *recvBuf, uint64_t recvCount, HcclDataType recvType, HcclComm comm, aclrtStream stream) { - typedef HcclResult(*HcclAlltoAllFunc)( + using HcclAlltoAllFunc = HcclResult(*)( const void *, uint64_t, HcclDataType, const void *, uint64_t, HcclDataType, HcclComm, aclrtStream); @@ -195,7 +195,7 @@ bool hcclReduceScatterVExist() HcclResult hcclCommInitRootInfoConfig(uint32_t nRanks, const HcclRootInfo *rootInfo, uint32_t rank, HcclCommConfig* config, HcclComm *comm) { - typedef HcclResult(*HcclCommInitRootInfoConfigFunc)( + using HcclCommInitRootInfoConfigFunc = HcclResult(*)( uint32_t, const HcclRootInfo *, uint32_t, HcclCommConfig*, HcclComm *); static HcclCommInitRootInfoConfigFunc func = nullptr; if (func == nullptr) { @@ -208,7 +208,7 @@ HcclResult hcclCommInitRootInfoConfig(uint32_t nRanks, const HcclRootInfo *rootI bool isHcclFeatureSupported(HcclCommConfigCapability configParameter) { - typedef uint32_t(*HcclGetCommConfigCapabilityFunc)(); + using HcclGetCommConfigCapabilityFunc = uint32_t(*)(); static HcclGetCommConfigCapabilityFunc func = (HcclGetCommConfigCapabilityFunc) GET_FUNC( HcclGetCommConfigCapability); if (func == nullptr) { @@ -228,7 +228,7 @@ bool hcclCommInitClusterInfoConfigExist() HcclResult hcclCommInitClusterInfoConfig(const char *clusterInfo, uint32_t rank, HcclCommConfig *config, HcclComm *comm) { - typedef HcclResult(*HcclCommInitClusterInfoConfigFunc)(const char *, uint32_t, HcclCommConfig *, HcclComm *); + using HcclCommInitClusterInfoConfigFunc = HcclResult(*)(const char *, uint32_t, HcclCommConfig *, HcclComm *); static HcclCommInitClusterInfoConfigFunc func = nullptr; if (func == nullptr) { func = (HcclCommInitClusterInfoConfigFunc)GET_FUNC(HcclCommInitClusterInfoConfig) @@ -250,7 +250,7 @@ bool hcclCreateSubCommConfigExist() HcclResult hcclCreateSubCommConfig(HcclComm *comm, uint32_t rankNum, uint32_t *rankIds, uint64_t subCommId, uint32_t subCommRankId, HcclCommConfig* config, HcclComm 
*subComm) { - typedef HcclResult(*HcclCreateSubCommConfigFunc)(HcclComm *, uint32_t, uint32_t *, uint64_t, uint32_t, HcclCommConfig *, HcclComm *); + using HcclCreateSubCommConfigFunc = HcclResult(*)(HcclComm *, uint32_t, uint32_t *, uint64_t, uint32_t, HcclCommConfig *, HcclComm *); static HcclCreateSubCommConfigFunc func = nullptr; if (func == nullptr) { func = (HcclCreateSubCommConfigFunc)GET_FUNC(HcclCreateSubCommConfig) diff --git a/torch_npu/csrc/distributed/Init.cpp b/torch_npu/csrc/distributed/Init.cpp index 2b671852d0..531a1471db 100644 --- a/torch_npu/csrc/distributed/Init.cpp +++ b/torch_npu/csrc/distributed/Init.cpp @@ -21,11 +21,11 @@ #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include "torch_npu/csrc/distributed/ProcessGroupLCCL.hpp" #include "torch_npu/csrc/distributed/reducer.hpp" -#include "torch_npu/csrc/distributed/Init.h" #include "torch_npu/csrc/distributed/ParallelTcpStore.hpp" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/aten/CustomFunctions.h" #include "torch_npu/csrc/core/NPUBridge.h" +#include "torch_npu/csrc/distributed/Init.h" namespace { @@ -49,7 +49,7 @@ public: : impl_(c10::intrusive_ptr::unsafe_steal_from_new(impl)) {} ~IntrusivePtrNoGilDestructor() { if (impl_) { - if (PyGILState_Check()) { + if (PyGILState_Check() != 0) { pybind11::gil_scoped_release release; impl_.reset(); } else { @@ -95,7 +95,7 @@ using intrusive_ptr_no_gil_destructor_class_ = class BroadcastWork { public: - inline std::vector cast_tensors(at::TensorList tensors) + inline std::vector cast_tensors(at::TensorList tensors) const { static auto cast_back_to_ori_format = [](const at::Tensor &t) { return at_npu::native::custom_ops::npu_format_cast(t, torch_npu::NPUBridge::GetNpuStorageImpl(t)->npu_desc_.origin_format_); diff --git a/torch_npu/csrc/distributed/ParallelStoreProxy.cpp b/torch_npu/csrc/distributed/ParallelStoreProxy.cpp index f61b82cbe1..c7355c2c06 100644 --- a/torch_npu/csrc/distributed/ParallelStoreProxy.cpp +++ b/torch_npu/csrc/distributed/ParallelStoreProxy.cpp @@ -1,11 +1,11 @@ #include -#include "ParallelStoreProxy.hpp" #include "ParallelTcpStore.hpp" #include "StoreClient.hpp" #include "c10/util/Exception.h" #include #include "StoreMessagePacker.hpp" #include "ParallelTcpServer.hpp" +#include "ParallelStoreProxy.hpp" namespace c10d { namespace torch_npu { diff --git a/torch_npu/csrc/distributed/ParallelTcpServer.cpp b/torch_npu/csrc/distributed/ParallelTcpServer.cpp index a9121dc582..acbcb21609 100644 --- a/torch_npu/csrc/distributed/ParallelTcpServer.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpServer.cpp @@ -413,7 +413,7 @@ void ParallelTcpServer::ProcessListenEvent() noexcept void ParallelTcpServer::ProcessClientEvent(int epFd, int fd, uint32_t event, std::unordered_map &ctx) noexcept { - if (event & (EPOLLRDHUP | EPOLLHUP)) { + if ((event & (EPOLLRDHUP | EPOLLHUP)) != 0) { epoll_ctl(epFd, EPOLL_CTL_DEL, fd, nullptr); close(fd); fd = -1; @@ -427,7 +427,7 @@ void ParallelTcpServer::ProcessClientEvent(int epFd, int fd, uint32_t event, } auto setEvents = pos->second.currentEvents_; - if (event & EPOLLIN) { + if ((event & EPOLLIN) != 0) { pos->second.ReceiveData(); while (pos->second.HasNextReq()) { auto response = process_(fd, pos->second.NextRequest()); @@ -443,7 +443,7 @@ void ParallelTcpServer::ProcessClientEvent(int epFd, int fd, uint32_t event, } } - if (event & EPOLLOUT) { + if ((event & EPOLLOUT) != 0) { pos->second.FlushSendBuf(); setEvents = EPOLLIN | EPOLLRDHUP | EPOLLHUP; } diff --git 
a/torch_npu/csrc/distributed/ParallelTcpStore.cpp b/torch_npu/csrc/distributed/ParallelTcpStore.cpp index 59d07278e6..6fef4ebadd 100644 --- a/torch_npu/csrc/distributed/ParallelTcpStore.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpStore.cpp @@ -15,10 +15,10 @@ */ #include #include "ParallelTcpServer.hpp" -#include "ParallelTcpStore.hpp" #include "ParallelStoreProxy.hpp" #include "StoreClient.hpp" #include "torch_npu/csrc/core/npu/npu_log.h" +#include "ParallelTcpStore.hpp" namespace c10d { namespace torch_npu { diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 6dd9b13e11..8d84ee4963 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -4,16 +4,17 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include #include +#include #include "op_plugin/OpInterface.h" #include "third_party/acl/inc/acl/acl.h" @@ -34,14 +35,13 @@ #include "torch_npu/csrc/core/npu/interface/OpInterface.h" #include "torch_npu/csrc/distributed/HCCLUtils.hpp" #include "torch_npu/csrc/distributed/HcclCompile.h" -#include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include "torch_npu/csrc/toolkit/profiler/common/utils.h" #include "torch_npu/csrc/framework/OpHook.h" #include "torch_npu/csrc/framework/FormatHelper.h" #include "torch_npu/csrc/framework/utils/OpPreparation.h" #include "torch_npu/csrc/profiler/npu_profiler.h" #include "torch_npu/csrc/logging/LogContext.h" -#include +#include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" namespace c10d_npu { namespace { @@ -71,7 +71,7 @@ std::map unsupportedOp = { bool nslb_is_end = false; bool uce_error_flag = false; bool force_stop_error_flag = false; -char* nslb_path = c10_npu::option::OptionsManager::GetNslbPath(); +const char* nslb_path = c10_npu::option::OptionsManager::GetNslbPath(); bool status_save_enable = c10_npu::option::OptionsManager::CheckStatusSaveEnable(); std::string status_save_path = c10_npu::option::OptionsManager::GetStatusSavePath(); @@ -435,7 +435,7 @@ ProcessGroupHCCL::WorkHCCL::WorkHCCL( if (desyncDebug || (status_save_enable)) { hcclStartEvents_ = std::make_shared>(); hcclStartEvents_->reserve(devices.size()); - for (int i = 0; i < devices.size(); i++) { + for (size_t i = 0; i < devices.size(); i++) { hcclStartEvents_->emplace_back(ACL_EVENT_CAPTURE_STREAM_PROGRESS); } } @@ -602,7 +602,7 @@ bool ProcessGroupHCCL::WorkHCCL::checkTimeout(c10::optional 0) { - if (hccl_exec_timeout < dispatchTimeout_ + dispatchoffset && hccl_exec_timeout > mindispatchTimeout_ + dispatchoffset) { - dispatchTimeout_ = hccl_exec_timeout - dispatchoffset; + if (static_cast(hccl_exec_timeout) < dispatchTimeout_ + dispatchoffset && static_cast(hccl_exec_timeout) > mindispatchTimeout_ + dispatchoffset) { + dispatchTimeout_ = static_cast(hccl_exec_timeout) - dispatchoffset; }; }; ASCEND_LOGI("set dispatchTimeout_ %u s.", dispatchTimeout_); @@ -806,12 +806,12 @@ ProcessGroupHCCL::ProcessGroupHCCL( if (hccl_event_timeout < defaultExecTimeout) { TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than the default value of HCCL_EXEC_TIMEOUT:", defaultExecTimeout, "."); } - kOpWaitTimeout = hccl_event_timeout; + kOpWaitTimeout = static_cast(hccl_event_timeout); } else if (hccl_exec_timeout == 0) { kOpWaitTimeout = 0; TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than the value of HCCL_EXEC_TIMEOUT:", hccl_exec_timeout, 
", so set op wait timeout to never timeout."); } else { - kOpWaitTimeout = hccl_event_timeout; + kOpWaitTimeout = static_cast(hccl_event_timeout); if (hccl_event_timeout < hccl_exec_timeout) { TORCH_NPU_WARN_ONCE("The value of HCCL_EVENT_TIMEOUT:", hccl_event_timeout, " is less than the value of HCCL_EXEC_TIMEOUT:", hccl_exec_timeout, "."); } @@ -824,16 +824,16 @@ ProcessGroupHCCL::ProcessGroupHCCL( if (hccl_exec_timeout == 0) { kOpWaitTimeout = 0; } - if (hccl_exec_timeout > 0 && hccl_exec_timeout > kOpWaitTimeout) { - kOpWaitTimeout = hccl_exec_timeout + kOpWaitTimeoutOffset; - if (kOpWaitTimeout <= hccl_exec_timeout) { + if (hccl_exec_timeout > 0 && static_cast(hccl_exec_timeout) > kOpWaitTimeout) { + kOpWaitTimeout = static_cast(hccl_exec_timeout) + kOpWaitTimeoutOffset; + if (kOpWaitTimeout <= static_cast(hccl_exec_timeout)) { kOpWaitTimeout = UINT_MAX; } } } ASCEND_LOGI("Set op wait timeout to %u.", kOpWaitTimeout); NPU_CHECK_SUPPORTED_OR_ERROR(c10_npu::acl::AclrtSetOpWaitTimeout(kOpWaitTimeout)); - char* blockingWait = getenv(HCCL_BLOCKING_WAIT); + const char* blockingWait = getenv(HCCL_BLOCKING_WAIT); try { if (blockingWait != nullptr) { auto val = std::stoi(blockingWait); @@ -2134,7 +2134,7 @@ std::string ProcessGroupHCCL::getHcclCommName(int rankid, bool init_comm) return std::string(commName); } -std::string ProcessGroupHCCL::getHcclCommNameWithoutInit(std::vector>& hcclComms) +std::string ProcessGroupHCCL::getHcclCommNameWithoutInit(std::vector>& hcclComms) const { TORCH_CHECK(hcclComms.size() == 1, "expect hcclComms.size() = 1, but hcclComms.size() = ", hcclComms.size(), DIST_ERROR(ErrCode::VALUE)); @@ -2382,7 +2382,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collective( for (auto tensor:inputs) { dataVol += tensor.storage().nbytes(); } - char* global_rank = getenv("RANK"); + const char* global_rank = getenv("RANK"); TORCH_CHECK(global_rank != nullptr, "Unable to fetch global rank for NSLB.", DIST_ERROR(ErrCode::NOT_FOUND)); recordDataVol(opTypeToString(opType), std::to_string(dataVol), strtol(global_rank, nullptr, 10), hcclComms); } @@ -2544,7 +2544,7 @@ c10::intrusive_ptr ProcessGroupHCCL::collectiveCoalesced( for (auto tensor:inputs) { dataVol += tensor.storage().nbytes(); } - char* global_rank = getenv("RANK"); + const char* global_rank = getenv("RANK"); TORCH_CHECK(global_rank != nullptr, "Unable to fetch global rank for NSLB.", DIST_ERROR(ErrCode::NOT_FOUND)); recordDataVol(opTypeToString(opType), std::to_string(dataVol), strtol(global_rank, nullptr, 10), hcclComms); } @@ -2724,7 +2724,7 @@ c10::intrusive_ptr ProcessGroupHCCL::pointToPoint( for (auto tensor : tensors) { dataVol += tensor.storage().nbytes(); } - char* global_rank = getenv("RANK"); + const char* global_rank = getenv("RANK"); TORCH_CHECK(global_rank != nullptr, "Unable to fetch global rank for NSLB.", DIST_ERROR(ErrCode::NOT_FOUND)); recordDataVol(opTypeToString(opType), std::to_string(dataVol), strtol(global_rank, nullptr, 10), hcclComms); @@ -3229,7 +3229,7 @@ c10::intrusive_ptr ProcessGroupHCCL::_reduce_oop( } constexpr int64_t ADDRESS_ALIGNMENT_BYTE = 512; -at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) +at::Tensor ProcessGroupHCCL::byte_alignment(at::Tensor& tensors) const { at::Tensor inter_tensors = at::reshape(tensors, {1, tensors.numel()}); if (tensors.element_size() == 0) { @@ -4236,7 +4236,7 @@ c10::intrusive_ptr ProcessGroupHCCL::send(std::vector& t torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclSend", numel, hcclType, comm, streamId, -1, dst_rank), 
stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); - auto hccl_result = HcclSend(inputDataPtr, numel, hcclType, dst_rank, comm, stream.stream(false)); + auto hccl_result = HcclSend(inputDataPtr, numel, hcclType, static_cast(dst_rank), comm, stream.stream(false)); *is_dispatched = true; return hccl_result; }; @@ -4271,7 +4271,7 @@ c10::intrusive_ptr ProcessGroupHCCL::recv(std::vector& t torch_npu::profiler::MstxRange range( getMstxHcclMsg("HcclRecv", numel, hcclType, comm, streamId, src_rank, -1), stream.stream(false), torch_npu::profiler::DOMAIN_COMMUNICATION); - auto hccl_result = HcclRecv(outputDataPtr, numel, hcclType, src_rank, comm, stream.stream(false)); + auto hccl_result = HcclRecv(outputDataPtr, numel, hcclType, static_cast(src_rank), comm, stream.stream(false)); *is_dispatched = true; return hccl_result; }; diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp index c2c47c2160..43b971821e 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.hpp @@ -337,7 +337,7 @@ public: std::vector& tensors, std::vector remote_rank_list); - at::Tensor byte_alignment(at::Tensor& tensors); + at::Tensor byte_alignment(at::Tensor& tensors) const; c10::intrusive_ptr _reduce_scatter_base_uneven( at::Tensor& outputTensor, @@ -459,7 +459,7 @@ public: void abortAndClearHcclComm(c10::optional abortReason); - std::string getHcclCommNameWithoutInit(std::vector>& hcclComms); + std::string getHcclCommNameWithoutInit(std::vector>& hcclComms) const; // Return the global ranks of a PG const std::vector& groupRanks() const; diff --git a/torch_npu/csrc/distributed/ProcessGroupLCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupLCCL.cpp index 03c979088a..e2d50c6dbc 100644 --- a/torch_npu/csrc/distributed/ProcessGroupLCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupLCCL.cpp @@ -104,7 +104,7 @@ std::vector ProcessGroupLCCL::WorkLCCL::result() return *outputs_; } -void ProcessGroupLCCL::WorkLCCL::checkAndThrowException() +void ProcessGroupLCCL::WorkLCCL::checkAndThrowException() const { // Set the appropriate exception if found. checkAndSetException(); @@ -115,7 +115,7 @@ void ProcessGroupLCCL::WorkLCCL::checkAndThrowException() } } -void ProcessGroupLCCL::WorkLCCL::checkAndSetException() +void ProcessGroupLCCL::WorkLCCL::checkAndSetException() const { if (exception()) { // We already have an exception. diff --git a/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp b/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp index df4e5cd73f..a26eb8f9f9 100644 --- a/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp +++ b/torch_npu/csrc/distributed/ProcessGroupLCCL.hpp @@ -72,10 +72,10 @@ public: void synchronizeInternal(std::chrono::milliseconds timeout); // Checks for LCCL errors and sets an appropriate exception_ptr. - void checkAndSetException(); + void checkAndSetException() const; // Checks for LCCL errors and throws an appropriate exception. - void checkAndThrowException(); + void checkAndThrowException() const; // Just checks whether NPU execution has completed, without modifying // exception_ptr. 
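The hunks above replace C-style `typedef` function-pointer declarations with `using` aliases while keeping the lazy GET_FUNC resolution unchanged: each wrapper resolves its HCCL symbol once, caches it in a function-local static, and calls through it. The same patch also makes bitmask tests explicit (`(event & EPOLLIN) != 0`) and const-qualifies accessors, which are readability and static-analysis cleanups rather than behavioural changes. A minimal standalone sketch of the lazy-resolution pattern follows; the library name `libm.so.6` and the symbol `cos` are illustrative stand-ins for an HCCL entry point, not anything torch_npu actually loads.

```cpp
// Sketch only: lazy symbol resolution through a function-pointer alias,
// mirroring the typedef -> using change in the HCCL wrappers above.
#include <dlfcn.h>
#include <cstdio>

// Old form:  typedef double (*CosFunc)(double);
// New form:
using CosFunc = double (*)(double);

double callCosLazily(double x)
{
    // Resolved once on first use and cached, like the static `func` pointers above.
    static CosFunc func = nullptr;
    if (func == nullptr) {
        void *handle = dlopen("libm.so.6", RTLD_NOW);  // Linux-specific soname, stand-in only
        if (handle != nullptr) {
            func = reinterpret_cast<CosFunc>(dlsym(handle, "cos"));
        }
    }
    if (func == nullptr) {
        std::fprintf(stderr, "cos symbol not found\n");
        return 0.0;
    }
    return func(x);
}

int main()
{
    std::printf("%f\n", callCosLazily(0.0));  // expected: 1.000000
    return 0;
}
```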
diff --git a/torch_npu/csrc/distributed/StoreClient.cpp b/torch_npu/csrc/distributed/StoreClient.cpp index 8be1fd5831..c9b874f220 100644 --- a/torch_npu/csrc/distributed/StoreClient.cpp +++ b/torch_npu/csrc/distributed/StoreClient.cpp @@ -25,8 +25,8 @@ #include #include "c10/util/Logging.h" -#include "StoreClient.hpp" #include "StoreMessagePacker.hpp" +#include "StoreClient.hpp" namespace c10d { namespace torch_npu { @@ -228,7 +228,7 @@ int Client::SetReceiveTimeout(const std::chrono::milliseconds &value) const noex return ret; } -int Client::GetSocketFd() noexcept +int Client::GetSocketFd() const noexcept { return socketFd_; } diff --git a/torch_npu/csrc/distributed/StoreClient.hpp b/torch_npu/csrc/distributed/StoreClient.hpp index 869c217f0f..51972361fd 100644 --- a/torch_npu/csrc/distributed/StoreClient.hpp +++ b/torch_npu/csrc/distributed/StoreClient.hpp @@ -34,7 +34,7 @@ public: int LocalClose() noexcept; int SyncCall(const StoreMessage &request, StoreMessage &response) noexcept; int SetReceiveTimeout(const std::chrono::milliseconds &value) const noexcept; - int GetSocketFd() noexcept; + int GetSocketFd() const noexcept; private: const std::string localSocketPath_{}; const std::string host_{}; diff --git a/torch_npu/csrc/distributed/reducer.cpp b/torch_npu/csrc/distributed/reducer.cpp index cb342cf333..786ed32a76 100644 --- a/torch_npu/csrc/distributed/reducer.cpp +++ b/torch_npu/csrc/distributed/reducer.cpp @@ -30,12 +30,12 @@ #include #include -#include "torch_npu/csrc/distributed/reducer.hpp" #include "torch_npu/csrc/distributed/ProcessGroupHCCL.hpp" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/framework/utils/OpPreparation.h" #include "torch_npu/csrc/core/NPUBridge.h" #include "torch_npu/csrc/core/NPUStorageImpl.h" +#include "torch_npu/csrc/distributed/reducer.hpp" namespace c10d_npu { namespace { @@ -268,17 +268,17 @@ Reducer::~Reducer() noexcept(false) } } -bool Reducer::dynamic_graph_find_unused() +bool Reducer::dynamic_graph_find_unused() const { return !static_graph_ && find_unused_parameters_; } -bool Reducer::static_graph_first_iteration() +bool Reducer::static_graph_first_iteration() const { return static_graph_ && num_iterations_ == 1; } -bool Reducer::static_graph_after_first_iteration() +bool Reducer::static_graph_after_first_iteration() const { return static_graph_ && num_iterations_ > 1; } @@ -1148,7 +1148,8 @@ void Reducer::initialize_bucket_views( // (see Note: "Gradient Layout Contract" in initialize_buckets). void Reducer::populate_bucket_views_out( Reducer::BucketReplica& replica, - at::Tensor& tensor) { + at::Tensor& tensor) const +{ replica.bucket_views_out.clear(); for (size_t i = 0; i < replica.variables.size(); i++) { const auto& v = replica.variables[i]; @@ -1874,7 +1875,7 @@ void Reducer::set_ddp_runtime_logging_sample_rate(int sample_rate) ddp_runtime_logging_sample_rate_ = sample_rate; } -int Reducer::get_ddp_runtime_logging_sample_rate() +int Reducer::get_ddp_runtime_logging_sample_rate() const { return ddp_runtime_logging_sample_rate_; } @@ -1951,7 +1952,8 @@ struct BucketKey { const c10::Device device; // See torch/csrc/utils/hash.h for dispatch code. 
- static size_t hash(const BucketKey& key) { + static size_t hash(const BucketKey& key) + { return c10::get_hash(key.type, key.device); } }; diff --git a/torch_npu/csrc/distributed/reducer.hpp b/torch_npu/csrc/distributed/reducer.hpp index dd0c133c24..e6870c940f 100644 --- a/torch_npu/csrc/distributed/reducer.hpp +++ b/torch_npu/csrc/distributed/reducer.hpp @@ -401,7 +401,7 @@ protected: // This function is called inside `finalize_backward`, it happens only if // DDP communication hook was registered to recreate just bucket_views_out // with the result of `future_work`. - void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor); + void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor) const; // If gradient_as_bucket_view_ is false, after allreduce buckets, // copy bucket results back to grads. @@ -460,9 +460,10 @@ protected: VariableLocator() = default; - VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { - bucket_index = bucket_index_; - intra_bucket_index = intra_bucket_index_; + VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) + { + bucket_index = bucket_index_; + intra_bucket_index = intra_bucket_index_; } }; @@ -491,7 +492,7 @@ protected: void record_backward_comm_start_time(); void record_backward_comm_end_time(); - int get_ddp_runtime_logging_sample_rate(); + int get_ddp_runtime_logging_sample_rate() const; int ddp_runtime_logging_sample_rate_ = kDDPRuntimeLoggingSampleRate; bool is_multi_device_module_ = false; @@ -561,9 +562,9 @@ private: void initialize_local_used_map(); // get current cuda stream const c10::Stream get_current_stream(); - bool dynamic_graph_find_unused(); - bool static_graph_first_iteration(); - bool static_graph_after_first_iteration(); + bool dynamic_graph_find_unused() const; + bool static_graph_first_iteration() const; + bool static_graph_after_first_iteration() const; // comm_hook_ is used to access the DDP communication hook if registered. 
std::unique_ptr comm_hook_; diff --git a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp index 9f0eebfbca..a288dc6477 100644 --- a/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch_npu/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -157,7 +157,7 @@ const std::string &TensorPipeAgent::guessAddress() static const std::string uvAddress = []() { tensorpipe_npu::Error error; std::string result; - char *ifnameEnv = std::getenv(kSocketIfnameEnvVar.c_str()); + const char *ifnameEnv = std::getenv(kSocketIfnameEnvVar.c_str()); if (ifnameEnv != nullptr) { std::tie(error, result) = tensorpipe_npu::transport::uv::lookupAddrForIface(ifnameEnv); if (error) { diff --git a/torch_npu/csrc/logging/LogContext.cpp b/torch_npu/csrc/logging/LogContext.cpp index a05a059421..c39a05381e 100644 --- a/torch_npu/csrc/logging/LogContext.cpp +++ b/torch_npu/csrc/logging/LogContext.cpp @@ -54,7 +54,7 @@ void LogContext::setLogs(const std::unordered_map& qnameLevels } } -std::shared_ptr LogContext::getLogger(const std::string& name) +std::shared_ptr LogContext::getLogger(const std::string& name) noexcept { std::lock_guard lock(mutex_); auto iter = loggers_.find(name); diff --git a/torch_npu/csrc/logging/LogContext.h b/torch_npu/csrc/logging/LogContext.h index f0bdd6be57..c45d32104e 100644 --- a/torch_npu/csrc/logging/LogContext.h +++ b/torch_npu/csrc/logging/LogContext.h @@ -16,7 +16,7 @@ public: ~LogContext() = default; - std::shared_ptr getLogger(const std::string& name = ""); + std::shared_ptr getLogger(const std::string& name = "") noexcept; static LogContext& GetInstance(); void setLogs(const std::unordered_map& qnameLevels); -- Gitee From ac1fdede2d8faefdd413d5bd31238307f3fa9ea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8F=B6=E5=AD=90=E5=87=A1?= Date: Thu, 27 Mar 2025 06:37:28 +0000 Subject: [PATCH 244/358] =?UTF-8?q?!19579=20Fix=20for=20missing=20paramete?= =?UTF-8?q?r=20issue=20Merge=20pull=20request=20!19579=20from=20=E5=8F=B6?= =?UTF-8?q?=E5=AD=90=E5=87=A1/v2.6.0=5Fbugfix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/distributed/rendezvous.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/distributed/rendezvous.py b/torch_npu/distributed/rendezvous.py index 80b10f6dba..4d0148b5b3 100644 --- a/torch_npu/distributed/rendezvous.py +++ b/torch_npu/distributed/rendezvous.py @@ -143,7 +143,7 @@ class _ParallelTCPRendezvous(RendezvousHandler): multi_tenant=True, ) store = PrefixStore(self.run_id, self._store) - bootstrap_store_info = RendezvousStoreInfo.build(self.rank, store) + bootstrap_store_info = RendezvousStoreInfo(self.master_addr, self.master_port) return RendezvousInfo(store, self.rank, self.world_size, bootstrap_store_info) def is_closed(self): -- Gitee From 30391ef35b83956e826a549e58cd5ac2cdff5b72 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Thu, 27 Mar 2025 11:15:24 +0000 Subject: [PATCH 245/358] !19647 [TEMP] remove 8.1.RC1 Merge pull request !19647 from dilililiwhy/temp_remove_81rc1_260 --- README.md | 4 ---- README.zh.md | 4 ---- 2 files changed, 8 deletions(-) diff --git a/README.md b/README.md index 67e0ea5ad6..d425c4a548 100644 --- a/README.md +++ b/README.md @@ -149,10 +149,6 @@ Refer to [API of Ascend Extension for PyTorch](docs/api/torch_npu_apis.md) for m | CANN Version | Supported PyTorch Version | Supported Extension Version | Github Branch | 
|-----------------------|---------------------------|-----------------------------|-------------------| -| CANN 8.1.RC1 | 2.5.1 | 2.5.0 | v2.5.1-7.0.0 | -| | 2.4.0 | 2.4.0.post3 | v2.4.0-7.0.0 | -| | 2.3.1 | 2.3.1.post5 | v2.3.1-7.0.0 | -| | 2.1.0 | 2.1.0.post11 | v2.1.0-7.0.0 | | CANN 8.0.0.beta1 | 2.6.0 | 2.6.0rc1 | v2.6.0 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | diff --git a/README.zh.md b/README.zh.md index beb722d8aa..a96710898b 100644 --- a/README.zh.md +++ b/README.zh.md @@ -158,10 +158,6 @@ print(z) | CANN版本 | 支持的PyTorch版本 | 支持的Extension版本 | Gitee分支 | |-----------------------|--------------|------------------|-------------------| -| CANN 8.1.RC1 | 2.5.1 | 2.5.0 | v2.5.1-7.0.0 | -| | 2.4.0 | 2.4.0.post3 | v2.4.0-7.0.0 | -| | 2.3.1 | 2.3.1.post5 | v2.3.1-7.0.0 | -| | 2.1.0 | 2.1.0.post11 | v2.1.0-7.0.0 | | CANN 8.0.0.beta1 | 2.6.0 | 2.6.0rc1 | v2.6.0 | | CANN 8.0.0 | 2.4.0 | 2.4.0.post2 | v2.4.0-6.0.0 | | | 2.3.1 | 2.3.1.post4 | v2.3.1-6.0.0 | -- Gitee From d7862c5d232eee8954c57f58a14a8000d6ff245f Mon Sep 17 00:00:00 2001 From: hhz886 Date: Thu, 27 Mar 2025 12:53:52 +0000 Subject: [PATCH 246/358] !19625 [Profiler]warmup bugfix Merge pull request !19625 from hhz886/v2.6.0 --- .../_dynamic_profiler/_dynamic_profiler_config_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py index 627f220393..f92c1ec0eb 100644 --- a/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py +++ b/torch_npu/profiler/_dynamic_profiler/_dynamic_profiler_config_context.py @@ -312,7 +312,7 @@ class ConfigContext: return self._active def warmup(self) -> int: - if not isinstance(self._warmup, int) or self._warmup <= 0: + if not isinstance(self._warmup, int) or self._warmup < 0: DynamicProfilerUtils.out_log("Invalid parameter warmup, reset it to 0.", DynamicProfilerUtils.LoggerLevelEnum.WARNING) return self.DEFAULT_WARMUP -- Gitee From df183f8228e94aa81e3d3a07fecf4c9947690a01 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 27 Mar 2025 14:24:49 +0000 Subject: [PATCH 247/358] !19667 Update op_plugin commit id Merge pull request !19667 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index e52d7ff6f3..f0e9c2e71c 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit e52d7ff6f3f840d1ae118a24b63cda9b668ecf34 +Subproject commit f0e9c2e71c70e5a7f99fd30058841c8e012647f2 -- Gitee From 11019beee3c642ce53fc83c22db9cfbd19759423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8?= Date: Fri, 28 Mar 2025 01:40:05 +0000 Subject: [PATCH 248/358] =?UTF-8?q?!19664=20[bugfix]update=20torch=5Fnpu?= =?UTF-8?q?=5Fschema.json=20Merge=20pull=20request=20!19664=20from=20?= =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/torch_npu_schema.json | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index b15838b525..403e83257a 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2741,12 +2741,18 @@ "torch_npu.npu_all_gather_base_mm": { "signature": "(*args, **kwargs)" }, - "torch_npu.dynamo.torchair.ops.NpuStreamSwitch": { + 
"torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, - "torch_npu.dynamo.torchair.ops.npu_wait_tensor": { + "torch_npu.dynamo.torchair.scope.npu_wait_tensor": { "signature": "(self: torch.Tensor, dependency: torch.Tensor)" }, + "torch_npu.dynamo.torchair.scope.super_kernel": { + "signature": "(scope: str, options: str = '')" + }, + "torch_npu.dynamo.torchair.scope.limit_core_num": { + "signature": "(op_aicore_num: int, op_vectorcore_num: int)" + }, "torch_npu.distributed.run.parse_args": { "signature": "(args)" }, -- Gitee From 398f3ec9d872afa0e013f6820c0e285d8359922f Mon Sep 17 00:00:00 2001 From: huangyunlong Date: Fri, 28 Mar 2025 09:06:21 +0000 Subject: [PATCH 249/358] !19443 config streampool Merge pull request !19443 from huangyunlong/2.6stream --- torch_npu/csrc/core/npu/NPUStream.cpp | 40 ++++++++++++++++--- .../csrc/core/npu/register/OptionsManager.cpp | 15 +++++++ .../csrc/core/npu/register/OptionsManager.h | 1 + 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUStream.cpp b/torch_npu/csrc/core/npu/NPUStream.cpp index 6504f5bccd..de5856352f 100644 --- a/torch_npu/csrc/core/npu/NPUStream.cpp +++ b/torch_npu/csrc/core/npu/NPUStream.cpp @@ -103,19 +103,44 @@ std::ostream& operator<<(std::ostream& stream, StreamIdType s) return stream; } +int GetStreamsPerPoolBits() +{ + const static int StreamsPerPoolBits = []() -> int { + if (c10_npu::option::OptionsManager::GetStreamsPerDevice() == 8) { + return 3; + } + return kStreamsPerPoolBits; + }(); + return StreamsPerPoolBits; +} + +int GetStreamsPerPool() +{ + const static int StreamsPerPool = []() -> int { + if (c10_npu::option::OptionsManager::GetStreamsPerDevice() == 8) { + return 8; + } + return kStreamsPerPool; + }(); + return StreamsPerPool; +} + static inline StreamIdType streamIdType(c10::StreamId s) { - return static_cast((uint32_t)s >> kStreamsPerPoolBits); + static int StreamsPerPoolBits = GetStreamsPerPoolBits(); + return static_cast((uint32_t)s >> StreamsPerPoolBits); } static inline size_t streamIdIndex(c10::StreamId s) { - return static_cast((uint32_t)s & ((1 << kStreamsPerPoolBits) - 1)); + static int StreamsPerPoolBits = GetStreamsPerPoolBits(); + return static_cast((uint32_t)s & ((1 << StreamsPerPoolBits) - 1)); } c10::StreamId makeStreamId(StreamIdType st, size_t si) { - return static_cast((static_cast(st) << kStreamsPerPoolBits) | si); + static int StreamsPerPoolBits = GetStreamsPerPoolBits(); + return static_cast((static_cast(st) << StreamsPerPoolBits) | si); } template @@ -189,7 +214,8 @@ static void initDeviceStreamState(c10::DeviceIndex device_index) // Switches to the requested device so streams are properly associated // with it. 
NPUGuard device_guard{device_index}; - for (auto i = decltype(kStreamsPerPool){0}; i < kStreamsPerPool; ++i) { + static int StreamsPerPool = GetStreamsPerPool(); + for (auto i = decltype(StreamsPerPool){0}; i < StreamsPerPool; ++i) { auto& npu_streami = npu_streams[device_index][i]; npu_streami.device_index = device_index; @@ -232,7 +258,8 @@ static inline void check_npu(c10::DeviceIndex device_index) static uint32_t get_idx(std::atomic& counter) { auto raw_idx = counter++; - return raw_idx % kStreamsPerPool; + static int StreamsPerPool = GetStreamsPerPool(); + return raw_idx % StreamsPerPool; } static uint32_t get_sync_launch_stream_idx(std::atomic& counter) @@ -586,7 +613,8 @@ void recovery_all_npu_streams(c10::DeviceIndex device_index) secondary_streamsi.stream = nullptr; NPU_CHECK_SUPPORTED_OR_ERROR( acl::AclrtCreateStreamWithConfig(&secondary_streamsi.stream, 0, (ACL_STREAM_FAST_LAUNCH | ACL_STREAM_FAST_SYNC))); - for (auto i = decltype(kStreamsPerPool){0}; i < kStreamsPerPool; ++i) { + static int StreamsPerPool = GetStreamsPerPool(); + for (auto i = decltype(StreamsPerPool){0}; i < StreamsPerPool; ++i) { auto& npu_streami = npu_streams[device_index][i]; if (npu_streami.stream == nullptr) { continue; diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.cpp b/torch_npu/csrc/core/npu/register/OptionsManager.cpp index 4e78c1e396..6edf25a882 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.cpp +++ b/torch_npu/csrc/core/npu/register/OptionsManager.cpp @@ -492,6 +492,21 @@ uint32_t OptionsManager::GetAclOpInitMode() return acl_op_init_mode; } +uint32_t OptionsManager::GetStreamsPerDevice() +{ + const static uint32_t streams_per_device = []() -> uint32_t { + char* buf_val = std::getenv("STREAMS_PER_DEVICE"); + // Default 8 + int64_t streams_per_device = (buf_val != nullptr) ? 
strtol(buf_val, nullptr, 10) : 8; + if (streams_per_device != 8 && streams_per_device != 32) { + streams_per_device = 8; + TORCH_NPU_WARN_ONCE("STREAMS_PER_DEVICE only support 8 or 32, but get other value, so reset it to the default value 8"); + } + return static_cast(streams_per_device); + }(); + return streams_per_device; +} + char* OptionsManager::GetCpuAffinityConf() { return std::getenv("CPU_AFFINITY_CONF"); diff --git a/torch_npu/csrc/core/npu/register/OptionsManager.h b/torch_npu/csrc/core/npu/register/OptionsManager.h index 6bffa5aa6c..bffd3bf48d 100644 --- a/torch_npu/csrc/core/npu/register/OptionsManager.h +++ b/torch_npu/csrc/core/npu/register/OptionsManager.h @@ -126,6 +126,7 @@ public: static uint32_t GetP2PBufferSize(); static uint32_t GetTaskQueueEnable(); static uint32_t GetAclOpInitMode(); + static uint32_t GetStreamsPerDevice(); static char* GetCpuAffinityConf(); static bool CheckForceUncached(); static std::string GetOomSnapshotDumpPath(); -- Gitee From 3d0a6b484fa13b75fafb26ca6fd5a250ece1e60e Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Fri, 28 Mar 2025 09:17:52 +0000 Subject: [PATCH 250/358] !19710 Revert 'Pull Request !19664 : [bugfix]update torch_npu_schema.json' Merge pull request !19710 from dilililiwhy/revert-merge-19664-v2.6.0 --- test/torch_npu_schema.json | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 403e83257a..b15838b525 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2741,18 +2741,12 @@ "torch_npu.npu_all_gather_base_mm": { "signature": "(*args, **kwargs)" }, - "torch_npu.dynamo.torchair.scope.npu_stream_switch": { + "torch_npu.dynamo.torchair.ops.NpuStreamSwitch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, - "torch_npu.dynamo.torchair.scope.npu_wait_tensor": { + "torch_npu.dynamo.torchair.ops.npu_wait_tensor": { "signature": "(self: torch.Tensor, dependency: torch.Tensor)" }, - "torch_npu.dynamo.torchair.scope.super_kernel": { - "signature": "(scope: str, options: str = '')" - }, - "torch_npu.dynamo.torchair.scope.limit_core_num": { - "signature": "(op_aicore_num: int, op_vectorcore_num: int)" - }, "torch_npu.distributed.run.parse_args": { "signature": "(args)" }, -- Gitee From fbbeee4584f2bf60794e6e5ec7d71fd2427cbeb1 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Fri, 28 Mar 2025 09:54:52 +0000 Subject: [PATCH 251/358] !19708 Update op_plugin commit id Merge pull request !19708 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f0e9c2e71c..714e7b4034 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f0e9c2e71c70e5a7f99fd30058841c8e012647f2 +Subproject commit 714e7b403420450d1be437b4b69ed37593ad8877 -- Gitee From d475b1944e6a7b745b67b477d3ecab8a5ebc4739 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 29 Mar 2025 03:09:54 +0000 Subject: [PATCH 252/358] !19748 Update op_plugin commit id Merge pull request !19748 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 714e7b4034..90636901df 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 714e7b403420450d1be437b4b69ed37593ad8877 +Subproject commit 90636901dfce1c5fb39457992c2508b636d9ea0d -- Gitee From e1afc1240a9dba34ce530e0ffadda56e1bfeb320 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Sat, 29 Mar 2025 08:12:44 +0000 Subject: [PATCH 253/358] =?UTF-8?q?!19724=20fix=20paralleltcpstore=20wait?= =?UTF-8?q?=20bug=20Merge=20pull=20request=20!19724=20from=20=E7=8E=8B?= =?UTF-8?q?=E8=B6=85/v2.6.0=5Ftcpstore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/distributed/test_hccl_stream_id.py | 5 ++++- torch_npu/csrc/distributed/ParallelStoreProxy.cpp | 5 +++++ torch_npu/csrc/distributed/ParallelStoreProxy.hpp | 1 + torch_npu/csrc/distributed/ParallelTcpStore.cpp | 6 +++++- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/test/distributed/test_hccl_stream_id.py b/test/distributed/test_hccl_stream_id.py index 4d9d7b4ca1..a3039db73e 100644 --- a/test/distributed/test_hccl_stream_id.py +++ b/test/distributed/test_hccl_stream_id.py @@ -38,7 +38,10 @@ class HcclStreamIdTest(TestCase): dist_group.recv(recv_tensor, src) p2p_stream_id = _world.default_pg._get_backend(torch.device('npu'))._get_stream_id(True, src) - assert0 = ((collective_stream_id & 32) == 32) + stream_num = os.environ.get("STREAMS_PER_DEVICE", 8) + if stream_num != 32: + stream_num = 8 + assert0 = ((collective_stream_id & stream_num) == stream_num) assert1 = (collective_stream_id == p2p_stream_id) collective_stream = torch.npu.Stream(stream_id=collective_stream_id, device_type=20) p2p_stream = torch.npu.Stream(stream_id=collective_stream_id, device_type=20) diff --git a/torch_npu/csrc/distributed/ParallelStoreProxy.cpp b/torch_npu/csrc/distributed/ParallelStoreProxy.cpp index c7355c2c06..1da5fa19cb 100644 --- a/torch_npu/csrc/distributed/ParallelStoreProxy.cpp +++ b/torch_npu/csrc/distributed/ParallelStoreProxy.cpp @@ -129,5 +129,10 @@ int Proxy::LoopProcessData() noexcept return result; } + +int Proxy::SetReceiveTimeout(const std::chrono::milliseconds &value) const noexcept +{ + return tcpClient_->SetReceiveTimeout(value); +} } // torch_npu } // c10d \ No newline at end of file diff --git a/torch_npu/csrc/distributed/ParallelStoreProxy.hpp b/torch_npu/csrc/distributed/ParallelStoreProxy.hpp index 0646f6f5d7..be222710fd 100644 --- a/torch_npu/csrc/distributed/ParallelStoreProxy.hpp +++ b/torch_npu/csrc/distributed/ParallelStoreProxy.hpp @@ -20,6 +20,7 @@ public: StoreMessage HandleLocalServerMessage(const int &fd, const torch_npu::StoreMessage &message) noexcept; void WriteData(const int &fd, std::vector &buf, int64_t &unpackSize) noexcept; int LoopProcessData() noexcept; + int SetReceiveTimeout(const std::chrono::milliseconds &value) const noexcept; private: const std::string host_{}; diff --git a/torch_npu/csrc/distributed/ParallelTcpStore.cpp b/torch_npu/csrc/distributed/ParallelTcpStore.cpp index 6fef4ebadd..3f47930fba 100644 --- a/torch_npu/csrc/distributed/ParallelTcpStore.cpp +++ b/torch_npu/csrc/distributed/ParallelTcpStore.cpp @@ -474,7 +474,11 @@ void ParallelTcpStore::wait(const std::vector &keys, const std::chr { torch_npu::StoreMessage request{ torch_npu::MessageType::WAIT, 0, keys }; torch_npu::StoreMessage response; - client_->SetReceiveTimeout(timeout); + if (proxy_) { + proxy_->SetReceiveTimeout(timeout); + } else { + client_->SetReceiveTimeout(timeout); + } std::lock_guard lockGuard{ clientMutex_ }; DoWait(request, response); } -- Gitee From b35e7baa3020cb91f51f373c9339bbba7772e188 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8?= Date: Sat, 29 Mar 2025 09:23:25 +0000 Subject: [PATCH 254/358] =?UTF-8?q?!19763=20[bugfix]update=20torch=5Fnpu?= 
=?UTF-8?q?=5Fschema.json=20Merge=20pull=20request=20!19763=20from=20?= =?UTF-8?q?=E9=99=88=E5=A8=81=E4=BA=A8/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/torch_npu_schema.json | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index b15838b525..403e83257a 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -2741,12 +2741,18 @@ "torch_npu.npu_all_gather_base_mm": { "signature": "(*args, **kwargs)" }, - "torch_npu.dynamo.torchair.ops.NpuStreamSwitch": { + "torch_npu.dynamo.torchair.scope.npu_stream_switch": { "signature": "(stream_tag: str, stream_priority: int = 0)" }, - "torch_npu.dynamo.torchair.ops.npu_wait_tensor": { + "torch_npu.dynamo.torchair.scope.npu_wait_tensor": { "signature": "(self: torch.Tensor, dependency: torch.Tensor)" }, + "torch_npu.dynamo.torchair.scope.super_kernel": { + "signature": "(scope: str, options: str = '')" + }, + "torch_npu.dynamo.torchair.scope.limit_core_num": { + "signature": "(op_aicore_num: int, op_vectorcore_num: int)" + }, "torch_npu.distributed.run.parse_args": { "signature": "(args)" }, -- Gitee From 83364688645d051e199013869c988bac8a6217e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=BC=BA?= Date: Sat, 29 Mar 2025 09:38:29 +0000 Subject: [PATCH 255/358] =?UTF-8?q?!19718=20fix=20for=20cann=20compatibili?= =?UTF-8?q?ty=20Merge=20pull=20request=20!19718=20from=20=E7=8E=8B?= =?UTF-8?q?=E5=BC=BA/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/distributed/test_hccl_stream_id.py | 2 ++ torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/distributed/test_hccl_stream_id.py b/test/distributed/test_hccl_stream_id.py index a3039db73e..371791be1c 100644 --- a/test/distributed/test_hccl_stream_id.py +++ b/test/distributed/test_hccl_stream_id.py @@ -1,3 +1,4 @@ +import unittest import os from unittest.mock import patch @@ -71,6 +72,7 @@ class HcclStreamIdTest(TestCase): for p in ps: p.join() + @unittest.skip("skip this case tmp") @skipIfUnsupportMultiNPU(2) def test_dist_get_hccl_stream_id_same(self): # CI currently supports only 2 devices diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 8d84ee4963..c612e2e747 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -2266,9 +2266,11 @@ HcclCommConfig ProcessGroupHCCL::createHcclCommConfigWithOptions() HcclCommConfig config; getHcclCommConfig(&config); - // update group name in hccl comm config - std::string groupName = getGroupName(); - torch_npu::toolkit::profiler::Utils::safe_strcpy_s(config.hcclCommName, groupName.c_str(), COMM_NAME_MAX_LENGTH); + if (isHcclFeatureSupported(HcclCommConfigCapability::HCCL_COMM_CONFIG_COMM_NAME)) { + // Update group name in hccl comm config when this capability is supported. 
+ std::string groupName = getGroupName(); + torch_npu::toolkit::profiler::Utils::safe_strcpy_s(config.hcclCommName, groupName.c_str(), COMM_NAME_MAX_LENGTH); + } if (options_->hccl_config.empty()) { return config; -- Gitee From 42d50cde3f518aa86c85979964b97689520dfe71 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Sun, 30 Mar 2025 11:38:41 +0000 Subject: [PATCH 256/358] !19797 Update torchair commit id Merge pull request !19797 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index f6d3ebd3b5..034b3d7c15 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit f6d3ebd3b50835fe706805c79bdb2fb3d5bd6ba6 +Subproject commit 034b3d7c1501d9a72c28f2904abf1ec427898d08 -- Gitee From aeb499b6cf6d5e285b0f3d4eb584fca8bd1bdcf4 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 30 Mar 2025 14:39:59 +0000 Subject: [PATCH 257/358] !19768 Update op_plugin commit id Merge pull request !19768 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 90636901df..f016466352 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 90636901dfce1c5fb39457992c2508b636d9ea0d +Subproject commit f016466352c19bee47ade2624397002bfcd6c580 -- Gitee From 7d9ffe2a48eb7da3deff8106327c9178f83247aa Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 30 Mar 2025 15:39:54 +0000 Subject: [PATCH 258/358] !19822 Update op_plugin commit id Merge pull request !19822 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f016466352..bc27d0f9f2 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f016466352c19bee47ade2624397002bfcd6c580 +Subproject commit bc27d0f9f2864412df6a6d97dfba6653eb503028 -- Gitee From 38d2b3f596a328ff389ce6d9e0eb7855bdac1cb5 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sun, 30 Mar 2025 15:39:55 +0000 Subject: [PATCH 259/358] !19822 Update op_plugin commit id Merge pull request !19822 from pta-robot/v2.6.0 -- Gitee From d1082e6480dee535f7cdc2459057b89e856dde47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Mon, 31 Mar 2025 02:30:06 +0000 Subject: [PATCH 260/358] =?UTF-8?q?!19643=20fix=20logging=20print=20clutte?= =?UTF-8?q?r=20problem=20Merge=20pull=20request=20!19643=20from=20?= =?UTF-8?q?=E7=8E=8B=E8=B6=85/v2.6.0=5Floggingcerr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 8 ++++++++ torch_npu/csrc/logging/Logger.cpp | 12 +++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index c612e2e747..7da9523049 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -1517,6 +1517,7 @@ void ProcessGroupHCCL::createHCCLComm( broadcastMasterID(&hcclID, isSingleP2POp, devicesKey, p2pRank); c10_npu::OptionalNPUGuard npuGuard; + auto startTime = std::chrono::steady_clock::now(); for (size_t i = 0; i < devices.size(); ++i) { int numRanks = getSize(); int rank = getRank() * static_cast(devices.size()) + static_cast(i); @@ -1549,6 +1550,10 @@ 
void ProcessGroupHCCL::createHCCLComm( // Creates the HCCL streams streamVal.push_back(getNPUStreamByCurrentType(devices[i].index())); } + auto endTime = std::chrono::steady_clock::now(); + auto timeElapsed = std::chrono::duration_cast(endTime - startTime); + logger->info("Create hccl comm by hcclCommInitRootInfoConfig success, group id is %s, commType is %d, use %d ms.", + options_->group_id.c_str(), static_cast(commType), timeElapsed.count()); } bool ProcessGroupHCCL::createHCCLCommEx( @@ -1597,6 +1602,7 @@ bool ProcessGroupHCCL::createHCCLCommEx( auto endTime = std::chrono::steady_clock::now(); auto timeElapsed = std::chrono::duration_cast(endTime - startTime); ASCEND_LOGI("Create global hccl comm with ranktable success, take %d milliseconds", timeElapsed.count()); + logger->info("Create global hccl comm with ranktable success, take %d milliseconds", timeElapsed.count()); return true; } @@ -1652,6 +1658,8 @@ bool ProcessGroupHCCL::createHCCLCommEx( auto subTimeElapsed = std::chrono::duration_cast(subEndTime - subStartTime); ASCEND_LOGI("Create sub hccl comm by hcclCreateSubCommConfig success, group id is %s, subCommId is %llu, use %d ms.", options_->group_id.c_str(), hcclid, subTimeElapsed.count()); + logger->info("Create sub hccl comm by hcclCreateSubCommConfig success, group id is %s, subCommId is %llu, use %d ms.", + options_->group_id.c_str(), hcclid, subTimeElapsed.count()); return true; } diff --git a/torch_npu/csrc/logging/Logger.cpp b/torch_npu/csrc/logging/Logger.cpp index 385d11f6af..d21b4218d7 100644 --- a/torch_npu/csrc/logging/Logger.cpp +++ b/torch_npu/csrc/logging/Logger.cpp @@ -52,13 +52,15 @@ void Logger::log(LoggingLevel level, const char* format, va_list args) long nowMs = ts.tv_nsec / 1000000; auto rank = c10_npu::option::OptionsManager::GetRankId(); + std::ostringstream oss; if (rank != -1) { - std::cerr << "[rank:" << rank << "]:[" << timeBuffer << ":" << std::setfill('0') << std::setw(3) << nowMs << "] " << name_ << - ": [" << LoggingLevelNames[level] << "] " << buffer << std::endl; - } else { - std::cerr << "[" << timeBuffer << ":" << std::setfill('0') << std::setw(3) << nowMs << "] " << name_ << ": [" << - LoggingLevelNames[level] << "] " << buffer << std::endl; + oss << "[rank:" << rank << "]:"; } + oss << "[" << timeBuffer << ":" << std::setfill('0') << std::setw(3) << nowMs << "] " << name_ << ": [" << + LoggingLevelNames[level] << "] " << buffer << std::endl; + std::string s = oss.str(); + std::cerr.write(s.c_str(), s.size()); + std::cerr.flush(); } void Logger::debug(const char* format, ...) 
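The Logger.cpp hunk above stops streaming the rank prefix, timestamp, and message to `std::cerr` piece by piece; the whole line is now composed in an `std::ostringstream` and emitted with a single `write()` plus `flush()`, which is what keeps output from different ranks or threads from splicing into each other mid-line. The same patch also times communicator creation with `std::chrono::steady_clock` and reports the elapsed milliseconds through the logger. A standalone sketch of the single-write pattern, with made-up rank values and messages (in practice this narrows interleaving rather than formally guaranteeing atomic output):

```cpp
// Sketch only: build the full log line, then emit it with one write() + flush().
#include <iostream>
#include <sstream>
#include <string>
#include <thread>
#include <vector>

void logLine(int rank, const std::string& msg)
{
    std::ostringstream oss;
    oss << "[rank:" << rank << "] torch_npu: [INFO] " << msg << '\n';
    const std::string line = oss.str();
    // One write of the finished line; piecewise operator<< calls could interleave.
    std::cerr.write(line.c_str(), static_cast<std::streamsize>(line.size()));
    std::cerr.flush();
}

int main()
{
    std::vector<std::thread> workers;
    for (int rank = 0; rank < 4; ++rank) {
        workers.emplace_back([rank] {
            for (int i = 0; i < 3; ++i) {
                logLine(rank, "create hccl comm step " + std::to_string(i));
            }
        });
    }
    for (auto& t : workers) {
        t.join();
    }
    return 0;
}
```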
-- Gitee From 1f51005cd5aad3b45a965a5491958a0974eb08b9 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Mon, 31 Mar 2025 06:53:05 +0000 Subject: [PATCH 261/358] !19600 Cleancode Merge pull request !19600 from yuhaiyan/v2.6.0-dev1 --- torch_npu/csrc/core/npu/NPUAffinityController.cpp | 2 +- torch_npu/csrc/core/npu/NPUCachingAllocator.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUAffinityController.cpp b/torch_npu/csrc/core/npu/NPUAffinityController.cpp index ddc190e597..cb7d7a928e 100644 --- a/torch_npu/csrc/core/npu/NPUAffinityController.cpp +++ b/torch_npu/csrc/core/npu/NPUAffinityController.cpp @@ -123,7 +123,7 @@ namespace c10_npu { threadToCoreidMap[thread_type] = current_core_range; } } else { - int remaining_type_count = thread_types.size() - 1; + size_t remaining_type_count = thread_types.size() - 1; int i = 0; for (auto thread_type : thread_types) { if (thread_type == ThreadType::unknownThread) { diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index ebc64af1ac..55b0cd5b33 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -410,7 +410,7 @@ struct ExpandableSegment { prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; prop.memAttr = (segment_size_ == kExtraLargeBuffer) ? ACL_HBM_MEM_HUGE1G : ACL_HBM_MEM_HUGE; prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device_; + prop.location.id = static_cast(device_); prop.reserve = 0; auto status = c10_npu::acl::AclrtMallocPhysical(&handle, segment_size_, &prop, 0); if (status == ACL_ERROR_RT_MEMORY_ALLOCATION) { @@ -512,7 +512,7 @@ private: void forEachAllocatedRange(std::function fn) { - auto start = 0; + size_t start = 0; for (auto i : c10::irange(handles_.size())) { if (handles_.at(i) && (i == 0 || !handles_.at(i - 1))) { start = i; @@ -531,7 +531,7 @@ private: size_t segmentLeft(char *p) { auto size = p - ptr(); - return size / segment_size_; + return static_cast(size) / segment_size_; } size_t segmentRight(char *p) @@ -1025,7 +1025,7 @@ public: aclrtMemUceInfo temp_info[memUceInfo_.retSize]; size_t temp_retsize = 0; - for (int i = 0; i < memUceInfo_.retSize; ++i) { + for (size_t i = 0; i < memUceInfo_.retSize; ++i) { void *addr = info[i].addr; size_t length = info[i].len; bool found = false; @@ -1573,7 +1573,7 @@ public: pool_to_id[pair.second] = pair.first; } - size_t total_active = 0; + uint64_t total_active = 0; std::vector result; const auto all_blocks = get_all_blocks(); -- Gitee From ab453e1f6f5e0966b73fa465f8fb348241a101a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Mon, 31 Mar 2025 07:05:51 +0000 Subject: [PATCH 262/358] =?UTF-8?q?!19803=20Fix=20torch.save's=20bug=20in?= =?UTF-8?q?=20multi-thread=20Merge=20pull=20request=20!19803=20from=20?= =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87/v2.6.0=5Fsv?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/serialization.py | 119 +++++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 22 deletions(-) diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index 2a2c9a38c7..eb030806cb 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -1,4 +1,6 @@ import os +import io +import sys import pickle import re from typing import Any, Optional @@ -6,7 +8,8 @@ from typing import Any, Optional import torch from torch.serialization 
import _check_dill_version, _open_file_like, _is_zipfile, \ _open_zipfile_reader, _is_torchscript_zip, _weights_only_unpickler, \ - _legacy_load, _load, FILE_LIKE, MAP_LOCATION, DEFAULT_PROTOCOL + _legacy_load, _load, FILE_LIKE, MAP_LOCATION, DEFAULT_PROTOCOL, \ + normalize_storage_type, location_tag, _serialization_tls from torch.serialization import _default_to_weights_only, UNSAFE_MESSAGE import torch_npu @@ -275,28 +278,99 @@ def load( return _legacy_load(opened_file, "cpu", pickle_module, **pickle_load_args) -def _get_npu_save_result( - obj: object, - f: FILE_LIKE, - pickle_module: Any = pickle, - pickle_protocol: int = DEFAULT_PROTOCOL, - _use_new_zipfile_serialization: bool = True, - _disable_byteorder_record: bool = False -) -> None: - cpu_nbytes = torch.storage.UntypedStorage.nbytes +def _npu_save( + obj, + zip_file, + pickle_module, + pickle_protocol, + _disable_byteorder_record, +): + serialized_storages = {} + id_map: Dict[int, str] = {} + + # Since loading storages that view the same data with different dtypes is + # not supported, we need to keep track of the dtype associated with each + # storage data_ptr and throw an error if the dtype is ever different. + storage_dtypes: Dict[int, torch.dtype] = {} + + def persistent_id(obj): + if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + if isinstance(obj, torch.storage.TypedStorage): + storage = obj._untyped_storage + storage_dtype = obj.dtype + storage_type_str = obj._pickle_storage_type() + storage_type = getattr(torch, storage_type_str) + storage_numel = obj._size() - def npu_nbytes(self): - if self.device.type != 'cpu': - storage_tensor = torch_npu._C._tensor_construct_from_storage(self) - base_nbytes = storage_tensor.size().numel() * storage_tensor.element_size() - return base_nbytes - else: - return cpu_nbytes(self) + else: + storage = obj + storage_dtype = torch.uint8 + storage_type = normalize_storage_type(type(obj)) + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() + else: + storage_numel = storage.nbytes() + + # If storage is allocated, ensure that any other saved storages + # pointing to the same data all have the same dtype. 
If storage is + # not allocated, don't perform this check + if str(storage.device) != "meta" and storage.data_ptr() != 0: + if storage.data_ptr() in storage_dtypes: + if storage_dtype != storage_dtypes[storage.data_ptr()]: + raise RuntimeError( + "Cannot save multiple tensors or storages that " + "view the same data as different types" + ) + else: + storage_dtypes[storage.data_ptr()] = storage_dtype - torch.storage.UntypedStorage.nbytes = npu_nbytes - result = torch.serialization.save(obj, f, pickle_module, pickle_protocol, True, _disable_byteorder_record) - torch.storage.UntypedStorage.nbytes = cpu_nbytes - return result + storage_key = id_map.setdefault(storage._cdata, str(len(id_map))) + if hasattr(obj, "_fake_device") and obj._fake_device is not None: + location = str(obj._fake_device) + else: + location = location_tag(storage) + serialized_storages[storage_key] = storage + + return ("storage", storage_type, storage_key, location, storage_numel) + + return None + + # Write the pickle data for `obj` + data_buf = io.BytesIO() + + class PyTorchPickler(pickle_module.Pickler): # type: ignore[name-defined] + def persistent_id(self, obj): + return persistent_id(obj) + + pickler = PyTorchPickler(data_buf, protocol=pickle_protocol) + pickler.dump(obj) + data_value = data_buf.getvalue() + zip_file.write_record("data.pkl", data_value, len(data_value)) + + # Write byte order marker + if not _disable_byteorder_record: + if sys.byteorder not in ["little", "big"]: + raise ValueError("Unknown endianness type: " + sys.byteorder) + + zip_file.write_record("byteorder", sys.byteorder, len(sys.byteorder)) + + # Write each tensor to a file named tensor/the_tensor_key in the zip archive + for key in sorted(serialized_storages.keys()): + name = f"data/{key}" + storage = serialized_storages[key] + num_bytes = storage.nbytes() + global _serialization_tls + if _serialization_tls.skip_data: + zip_file.write_record_metadata(name, num_bytes) + else: + # given that we copy things around anyway, we might use storage.cpu() + # this means to that to get tensors serialized, you need to implement + # .cpu() on the underlying Storage + if storage.device.type != "cpu": + storage = storage.cpu() + # Now that it is on the CPU we can directly copy it into the zip file + zip_file.write_record(name, storage, num_bytes) def save( @@ -313,9 +387,10 @@ def save( "if it is necessary to use this, please convert the npu tensor to cpu tensor for saving" ) _warn_legacy_serialization(warn_massage, "save") - return _get_npu_save_result(obj, f, pickle_module, pickle_protocol, True, _disable_byteorder_record) + return torch.serialization.save(obj, f, pickle_module, pickle_protocol, True, _disable_byteorder_record) def _add_serialization_methods(): torch.save = save torch.load = load + torch.serialization._save = _npu_save -- Gitee From 8988eaa3a56d717748774e9ccfbda55fee4d22f8 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Mon, 31 Mar 2025 08:53:56 +0000 Subject: [PATCH 263/358] !19815 not makesurequeueempty when check whether under capturing Merge pull request !19815 from jiangpengfei/v2.6.0 --- torch_npu/csrc/core/npu/NPUGraph.cpp | 4 ---- torch_npu/csrc/core/npu/NPUGraphsUtils.h | 5 ++--- torch_npu/csrc/framework/OpCommand.cpp | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index 60351139d9..e259d1a724 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -127,8 +127,6 @@ void 
NPUGraph::capture_begin(MempoolId_t pool, aclmdlRICaptureMode capture_mode) // prevent potentially unsafe CUDA API calls during capture. NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureBegin(capture_stream_, capture_mode)); - c10_npu::is_stream_capturing.store(true); - aclmdlRICaptureStatus status; NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureGetInfo(stream, &status, &model_ri_)); TORCH_INTERNAL_ASSERT(status == aclmdlRICaptureStatus::ACL_MODEL_RI_CAPTURE_STATUS_ACTIVE); @@ -144,8 +142,6 @@ void NPUGraph::capture_end() aclmdlRI model_ri; NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureEnd(capture_stream_, &model_ri)); - c10_npu::is_stream_capturing.store(false); - c10_npu::NPUCachingAllocator::endAllocateToPool(capture_dev_, mempool_id_); TORCH_CHECK(model_ri == model_ri_, "Invalid end capture model id: ", model_ri); diff --git a/torch_npu/csrc/core/npu/NPUGraphsUtils.h b/torch_npu/csrc/core/npu/NPUGraphsUtils.h index 254dc42599..f14b7d01e2 100644 --- a/torch_npu/csrc/core/npu/NPUGraphsUtils.h +++ b/torch_npu/csrc/core/npu/NPUGraphsUtils.h @@ -10,8 +10,6 @@ namespace c10_npu { -static std::atomic is_stream_capturing(false); - using CaptureId_t = unsigned long long; // first is set if the instance is created by NPUGraph::capture_begin. @@ -75,8 +73,9 @@ inline CaptureStatus currentStreamCaptureStatusMayInitCtx() aclmdlRICaptureStatus is_capturing{ACL_MODEL_RI_CAPTURE_STATUS_NONE}; aclmdlRI model_ri; + auto s = c10_npu::getCurrentNPUStream(); NPU_CHECK_ERROR( - c10_npu::acl::AclmdlRICaptureGetInfo(c10_npu::getCurrentNPUStream(), &is_capturing, &model_ri)); + c10_npu::acl::AclmdlRICaptureGetInfo(s.stream(false), &is_capturing, &model_ri)); return CaptureStatus(is_capturing); } diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 01b894f0d0..9c64e376d1 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -123,7 +123,7 @@ OpCommand& OpCommand::Output(at::Tensor &output, const string &descName, void OpCommand::Run() { // Check for npu graph - if (c10_npu::is_stream_capturing.load() && aclCmd->CheckCustomHandlerNull()) { + if (aclCmd->CheckCustomHandlerNull()) { c10_npu::assertNotCapturing("Cannot run aclop operators"); } -- Gitee From 731bb1f6a3aaafcf468d23697a6e0ecf2c0191c1 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Mon, 31 Mar 2025 09:20:24 +0000 Subject: [PATCH 264/358] !19764 Fixed/skip some failed testcases Merge pull request !19764 from yuhaiyan/v2.6.0-dev2 --- test/npu/test_cann_version.py | 2 +- ..._combined_flatten_x__copy_to_contiguous.py | 20 ++-- ...t_combined_squeeze_x_copy_to_contiguous.py | 40 ++++--- .../test_combined_views_copy_to_contiguous.py | 100 +++++++++++------- .../test_single_permute_copy_to_contiguous.py | 10 +- ...t_tri_combined_views_copy_to_contiguous.py | 20 ++-- .../.pytorch-disabled-tests.json | 61 +++++++++++ torch_npu/csrc/core/npu/GetCANNInfo.cpp | 6 +- torch_npu/npu/utils.py | 2 +- 9 files changed, 182 insertions(+), 79 deletions(-) diff --git a/test/npu/test_cann_version.py b/test/npu/test_cann_version.py index 72f69e638a..3fbfea312b 100644 --- a/test/npu/test_cann_version.py +++ b/test/npu/test_cann_version.py @@ -28,7 +28,7 @@ class TestCANNversion(TestCase): self.assertTrue(result, f"The env version is {version_env}, the result from _is_gte_cann_version is False") else: with self.assertRaisesRegex(RuntimeError, - "When the version is less than \"8.1.RC1\", this function is not supported"): + "When the version 7.0.0 is less than \"8.1.RC1\", this function is not 
supported"): _is_gte_cann_version("7.0.0", "CANN") diff --git a/test/trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py b/test/trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py index 9087109e76..d3737df425 100644 --- a/test/trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py +++ b/test/trans_contiguous/test_combined_flatten_x__copy_to_contiguous.py @@ -28,15 +28,17 @@ class CombinedFlattenXCopyToContiguous(TestCase): # case 1: flatten+select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.flatten(2).select(1, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.flatten(2).select(1, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: select+flatten == can be optimized as single select(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.select(2, 1).flatten(1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.select(2, 1).flatten(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -55,16 +57,18 @@ class CombinedFlattenXCopyToContiguous(TestCase): # case 1: flatten+strideslice ==> can be optimized as slice(contiguous with offset) + select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.flatten()[2:100:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.flatten()[2:100:10].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: strideslice+flatten==> can be optimized as single strideslice # (contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 2:20:3].flatten().contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 2:20:3].flatten().contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) diff --git a/test/trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py b/test/trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py index 2e0e5ebee1..4dccc03a36 100644 --- a/test/trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py +++ 
b/test/trans_contiguous/test_combined_squeeze_x_copy_to_contiguous.py @@ -28,16 +28,18 @@ class CombinedSqueezeXCopyToContiguous(TestCase): # case 1: squeeze+permute ==> can be optimized as single permute(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.squeeze(1).transpose(0, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.squeeze(1).transpose(0, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: permute+squeeze ==> can be optimized as single permute(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.permute(1, 0, 3, 2).squeeze(0).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.permute(1, 0, 3, 2).squeeze(0).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -57,15 +59,17 @@ class CombinedSqueezeXCopyToContiguous(TestCase): # case 1: squeeze + narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.squeeze(1)[:, 1:10, :].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.squeeze(1)[:, 1:10, :].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: narrow + squeeze with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, :, :, 10:19].squeeze(1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input[:, :, :, 10:19].squeeze(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -84,15 +88,17 @@ class CombinedSqueezeXCopyToContiguous(TestCase): # case 1: squeeze+select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.squeeze().select(2, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.squeeze().select(2, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), 
cpu_out1.numpy()) # case 2: select+squeeze with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.select(2, 1).squeeze().contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input.select(2, 1).squeeze().contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -111,15 +117,17 @@ class CombinedSqueezeXCopyToContiguous(TestCase): # case 1: squeeze + strideslice ==> cannot be optimized(contiguous_h_combined should not called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.squeeze(1)[:, 20:150:3].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.squeeze(1)[:, 20:150:3].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: strideslice + squeeze ==> cannot be optimized(contiguous_h_combined should not called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, :, 10:19:3].squeeze(1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input[:, :, 10:19:3].squeeze(1).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) diff --git a/test/trans_contiguous/test_combined_views_copy_to_contiguous.py b/test/trans_contiguous/test_combined_views_copy_to_contiguous.py index 554c71f6e3..25b2a30a95 100644 --- a/test/trans_contiguous/test_combined_views_copy_to_contiguous.py +++ b/test/trans_contiguous/test_combined_views_copy_to_contiguous.py @@ -26,16 +26,18 @@ class CombinedViewsCopyToContiguous(TestCase): # case 1: permute+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.permute(1, 3, 2, 0)[:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice', 'contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice', 'contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.permute(1, 3, 2, 0)[:10].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: narrow+permute with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 1:10].permute(1, 0, 3, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice', 'contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice', 'contiguous_d_Transpose'], prof) + or 
check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 1:10].permute(1, 0, 3, 2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -52,16 +54,18 @@ class CombinedViewsCopyToContiguous(TestCase): # case 1: permute+select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.permute(1, 3, 2, 0).select(1, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice', 'contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice', 'contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input.permute(1, 3, 2, 0).select(1, 2).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) # case 2: select+permute with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.select(1, 0).permute(1, 0, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice', 'contiguous_d_Transpose'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_StridedSlice', 'contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input.select(1, 0).permute(1, 0, 2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -79,8 +83,9 @@ class CombinedViewsCopyToContiguous(TestCase): # (contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.permute(1, 3, 2, 0)[::2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.permute(1, 3, 2, 0)[::2].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) @@ -88,8 +93,9 @@ class CombinedViewsCopyToContiguous(TestCase): # (contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 1:10:3].permute(1, 3, 0, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 1:10:3].permute(1, 3, 0, 2).contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -107,14 +113,16 @@ class CombinedViewsCopyToContiguous(TestCase): # narrow at any dim + select the last dim ==> narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[:, 2:4].select(3, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error 
operators called!") cpu_out1 = cpu_input[:, 2:4].select(3, 1).contiguous() # narrow at 0 dim + select the any dim ==> common copy with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[2:4].select(2, 2).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out2 = cpu_input[2:4].select(2, 2).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -122,14 +130,16 @@ class CombinedViewsCopyToContiguous(TestCase): # select the 0 dim + narrow at the 1 dim ==> reshape + select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input.select(0, 2)[:, 1:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out3 = cpu_input.select(0, 2)[:, 1:2].contiguous() # select the 0 dim + narrow at the last dim ==> reshape + select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out4 = npu_input.select(0, 1)[:, :, 1:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out4 = cpu_input.select(0, 1)[:, :, 1:2].contiguous() self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) @@ -149,20 +159,23 @@ class CombinedViewsCopyToContiguous(TestCase): # slice at adjacent axes + strideslice at lower dim ==> cannot be optimized(contiguous_h_combined is called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[2:4, ::2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input[2:4, ::2].contiguous() # strideslice at last dim ==> cannot be optimized(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[:, 2:4, :, 1:10:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input[:, 2:4, :, 1:10:2].contiguous() # narrow at 0 dim and strideslice at last dim==> can be optimized as slice(contiguous)+select with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input[2:4, :, :, ::2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape', 
'contiguous_d_StridedSlice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Reshape', 'contiguous_d_StridedSlice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out3 = cpu_input[2:4, :, :, ::2].contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) @@ -172,14 +185,16 @@ class CombinedViewsCopyToContiguous(TestCase): # slice at adjacent axes + strideslice at higher dim ==> reshape+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out4 = npu_input[1:10:2, 1:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out4 = cpu_input[1:10:2, 1:10].contiguous() # slice at non-adjacent axes with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out5 = npu_input[::2, :, 1:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out5 = cpu_input[::2, :, 1:10].contiguous() self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) self.assertRtolEqual(npu_out5.to("cpu").numpy(), cpu_out5.numpy()) @@ -198,29 +213,33 @@ class CombinedViewsCopyToContiguous(TestCase): # select at last dim ==> cannot be optimized(contiguous_h_combined is called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input[:10:2].select(3, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out1 = cpu_input[:10:2].select(3, 1).contiguous() # select at lower dims except last dim ==> reshape+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input[1:10:2].select(2, 1).contiguous() cpu_out2 = cpu_input[1:10:2].select(2, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) # case 2: select+strideslice # strideslice at lower dims except last dim ==> reshape+narrow with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out3 = npu_input.select(0, 1)[1:10:2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_h_match', 'contiguous_d_Slice'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, 
message="Error operators called!") cpu_out3 = cpu_input.select(0, 1)[1:10:2].contiguous() # strideslice at the last dim ==> cannot be optimized(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out4 = npu_input.select(0, 1)[:, :, ::3].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, message="Error operators called!") cpu_out4 = cpu_input.select(0, 1)[:, :, ::3].contiguous() self.assertRtolEqual(npu_out3.to("cpu").numpy(), cpu_out3.numpy()) self.assertRtolEqual(npu_out4.to("cpu").numpy(), cpu_out4.numpy()) @@ -242,8 +261,9 @@ class CombinedViewsCopyToContiguous(TestCase): # Broadcast + permute all cannot be optimized(contiguous_h_combined should not be called) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.expand(item[2][1]).transpose(1, 3).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.expand(item[2][1]).transpose(1, 3).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) diff --git a/test/trans_contiguous/test_single_permute_copy_to_contiguous.py b/test/trans_contiguous/test_single_permute_copy_to_contiguous.py index 8bec6d1f16..39c81fe303 100644 --- a/test/trans_contiguous/test_single_permute_copy_to_contiguous.py +++ b/test/trans_contiguous/test_single_permute_copy_to_contiguous.py @@ -24,13 +24,15 @@ class SingleViewCopyToContiguous(TestCase): cpu_input, npu_input = create_common_tensor(item, 0, 100) with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.permute(1, 0, 2, 3).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof), - True, "contiguous_d_Transpose op is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_Transpose or aclnnInplaceCopy op is not called!") with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out2 = npu_input.permute(2, 3, 0, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof), - True, "contiguous_d_Transpose op is not called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_Transpose'], prof) + or check_operators_in_prof(['aclnnInplaceCopy'], prof), + True, "contiguous_d_Transpose or aclnnInplaceCopy op is not called!") cpu_out1 = cpu_input.permute(1, 0, 2, 3).contiguous() cpu_out2 = cpu_input.permute(2, 3, 0, 1).contiguous() diff --git a/test/trans_contiguous/test_tri_combined_views_copy_to_contiguous.py b/test/trans_contiguous/test_tri_combined_views_copy_to_contiguous.py index d970098906..9496d34e2b 100644 --- a/test/trans_contiguous/test_tri_combined_views_copy_to_contiguous.py +++ b/test/trans_contiguous/test_tri_combined_views_copy_to_contiguous.py @@ -26,8 +26,9 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): with 
torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.view(npu_input.size(0) * npu_input.size(1), npu_input.size(2), npu_input.size(3))[:, 1:10].transpose(0, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.view(cpu_input.size(0) * cpu_input.size(1), cpu_input.size(2), cpu_input.size(3))[:, 1:10].transpose(0, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) @@ -37,8 +38,9 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): npu_out2 = npu_input.permute(1, 0, 2, 3). \ view(npu_input.size(1), npu_input.size(0), npu_input.size( 2) * npu_input.size(3))[:, :, 1:10].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.permute(1, 0, 2, 3). \ view( cpu_input.size(1), @@ -66,8 +68,9 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): with torch.autograd.profiler.profile(use_device='npu') as prof: npu_out1 = npu_input.view(npu_input.size(0) * npu_input.size(1), npu_input.size(2), npu_input.size(3))[:, 1].transpose(0, 1).contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out1 = cpu_input.view(cpu_input.size(0) * cpu_input.size(1), cpu_input.size(2), cpu_input.size(3))[:, 1].transpose(0, 1).contiguous() self.assertRtolEqual(npu_out1.to("cpu").numpy(), cpu_out1.numpy()) @@ -77,8 +80,9 @@ class TestTriCombinedViewsCopyToContiguous(TestCase): npu_out2 = npu_input.permute(1, 0, 2, 3). \ view(npu_input.size(1), npu_input.size(0), npu_input.size( 2) * npu_input.size(3))[:, :, 2].contiguous() - self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']), - True, "Error operators called!") + self.assertEqual(check_operators_in_prof(['contiguous_d_AsStrided'], prof, ['contiguous_h_combined']) + or check_operators_in_prof(['aclnnInplaceCopy'], prof, ['contiguous_h_combined']), + True, message="Error operators called!") cpu_out2 = cpu_input.permute(1, 0, 2, 3). 
\ view(cpu_input.size(1), cpu_input.size(0), cpu_input.size(2) * cpu_input.size(3))[:, :, 2].contiguous() self.assertRtolEqual(npu_out2.to("cpu").numpy(), cpu_out2.numpy()) diff --git a/test/unsupported_test_cases/.pytorch-disabled-tests.json b/test/unsupported_test_cases/.pytorch-disabled-tests.json index e281b46a61..ca30713a4e 100644 --- a/test/unsupported_test_cases/.pytorch-disabled-tests.json +++ b/test/unsupported_test_cases/.pytorch-disabled-tests.json @@ -12938,6 +12938,10 @@ "test_inplace_grad_abs_npu_float64 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], "test_inplace_grad_acos_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], "test_inplace_grad_acos_npu_float64 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], + "test_inplace_grad_acosh_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", ["910A"]], + "test_inplace_grad_asinh_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", ["910A"]], + "test_inplace_grad_atanh_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", ["910A"]], + "test_inplace_grad_ldexp_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", ["910A"]], "test_inplace_grad_add_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], "test_inplace_grad_add_npu_float64 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], "test_inplace_grad_addbmm_npu_complex128 (__main__.TestBwdGradientsPRIVATEUSE1)": ["", [""]], @@ -27971,6 +27975,26 @@ "test_batched_nms (__main__.TestONNXRuntime_opset_version_20_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], "test_batched_nms (__main__.TestONNXRuntime_opset_version_20_is_script_True_keep_initializers_as_inputs_False)": ["", [""]], "test_batched_nms (__main__.TestONNXRuntime_opset_version_20_is_script_True_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_10_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_10_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_11_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_11_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_12_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_12_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_13_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_13_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_14_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_14_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_15_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_15_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_16_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 
(__main__.TestONNXRuntime_npu_opset_version_16_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_17_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_17_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_7_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_8_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_9_is_script_False_keep_initializers_as_inputs_False)": ["", [""]], + "test_apex_o2 (__main__.TestONNXRuntime_npu_opset_version_9_is_script_False_keep_initializers_as_inputs_True)": ["", [""]], "test_lstm (__main__.TestONNXRuntime_opset_version_10_is_script_True_keep_initializers_as_inputs_False)": ["", [""]], "test_lstm (__main__.TestONNXRuntime_opset_version_10_is_script_True_keep_initializers_as_inputs_True)": ["", [""]], "test_lstm (__main__.TestONNXRuntime_opset_version_11_is_script_True_keep_initializers_as_inputs_False)": ["", [""]], @@ -30750,6 +30774,43 @@ "test_custom_op_fallthrough (__main__.TestUtilityFuns_opset_19)": ["", [""]], "test_custom_op_fallthrough (__main__.TestUtilityFuns_opset_20)": ["", [""]], "test_custom_op_fallthrough (__main__.TestUtilityFuns_opset_9)": ["", [""]], + "test_check_inplace_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_cpu_gpu_parity_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_cpu_gpu_parity_nn_Tanh_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_cpu_gpu_parity_nn_Tanhshrink_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_AvgPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_AvgPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_FractionalMaxPool3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_LPPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_LPPPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_ReplicationPad2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_Tanh_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_Tanhshrink_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_ZeroPad1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_forward_nn_ZeroPad2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_AvgPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_AvgPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_LPPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_LPPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_if_train_and_eval_modes_differ_nn_Tanh_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + 
"test_if_train_and_eval_modes_differ_nn_Tanhshrink_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_AvgPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_AvgPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_CircularPad2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_CircularPad3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ConstantPad1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ConstantPad2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ConstantPad3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_FractionalMaxPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_FractionalMaxPool3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_LPPool1d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_LPPool2d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ReLU_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_Tanh_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_Tanhshrink_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], + "test_pickle_nn_ZeroPad3d_npu_float64 (__main__.TestModulePRIVATEUSE1)": ["", ["A2"]], "test_fuzz_symbolize (__main__.TestExperimentalUtils)": ["", [""]], "test_profiler_strides (__main__.TestProfiler)": ["", [""]], "test_schedule_function_count (__main__.TestProfiler)": ["", [""]], diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index 84597850ed..8a15ef2085 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -82,8 +82,12 @@ std::string GetCANNVersion(const std::string& module) bool IsGteCANNVersion(const std::string version, const std::string module) { static std::string baseVersion = "8.1.RC1"; + static std::string unsupportedModule = "DRIVER"; + if (module.compare(unsupportedModule) == 0) { + TORCH_CHECK(false, "When the module is DRIVER, this function is not supported.", PTA_ERROR(ErrCode::VALUE)); + } if (version.compare(baseVersion) < 0) { - TORCH_CHECK(false, "When the version is less than \"8.1.RC1\", this function is not supported.", PTA_ERROR(ErrCode::VALUE)); + TORCH_CHECK(false, "When the version " + version + " is less than \"8.1.RC1\", this function is not supported.", PTA_ERROR(ErrCode::VALUE)); } std::string currentVersion = GetCANNVersion(module); double current_num = VersionToNum(currentVersion); diff --git a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index 59a70856f1..3f3b8493bf 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -38,7 +38,7 @@ def _is_gte_cann_version(version, module="CANN"): compare current cann_version and version. Args: version: the features are supported or not from which cann version. - module: can be selected from [\"CANN\", \"RUNTIME\", \"COMPILER\", \"HCCL\", \"TOOLKIT\", \"OPP\", \"OPP_KERNEL\", \"DRIVER\"] + module: can be selected from [\"CANN\", \"RUNTIME\", \"COMPILER\", \"HCCL\", \"TOOLKIT\", \"OPP\", \"OPP_KERNEL\"] Returns: If current_version >= version, return True, else return False. 
-- Gitee From 6cd779b2d142bb3be44e20c50fd6be7ae5ba13ce Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Mon, 31 Mar 2025 13:30:06 +0000 Subject: [PATCH 265/358] !19846 reconstruct makesurequeueempty throwruntimeerror Merge pull request !19846 from jiangpengfei/v2.6.0 --- torch_npu/csrc/core/npu/NPUQueue.cpp | 67 ++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUQueue.cpp b/torch_npu/csrc/core/npu/NPUQueue.cpp index 8e0b28e03f..80fc7802bc 100644 --- a/torch_npu/csrc/core/npu/NPUQueue.cpp +++ b/torch_npu/csrc/core/npu/NPUQueue.cpp @@ -225,6 +225,8 @@ void Repository::ChangeStatus(RepoStatus expected, RepoStatus desired) NPUStatus Repository::MakeSureQueueEmpty(bool check_error) { + std::string error_msg; + std::string runtime_error; if (initialized == false) { ASCEND_LOGE("Task queue is not initialized, shouldn't call MakeSureQueueEmpty(). !!"); return NPU_STATUS_FAILED; @@ -271,27 +273,18 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } if (GetStatus() == RepoStatus::UCE_EXIT) { - if (check_error) { - throw std::runtime_error("UCE ERROR." + PTA_ERROR(ErrCode::ACL)); - } else { - ASCEND_LOGE("UCE ERROR happend."); - } - } else if (GetStatus() == RepoStatus::HBM_ECC_EXIT) { - if (check_error) { - std::string error_msg = c10_npu::c10_npu_get_error_message(); - throw std::runtime_error("HBM MULTI BIT ECC ERROR." + error_msg + PTA_ERROR(ErrCode::ACL)); - } else { - ASCEND_LOGE("HBM MULTI BIT ECC ERROR happend."); - } + runtime_error = "UCE ERROR." + PTA_ERROR(ErrCode::ACL); + error_msg = "UCE ERROR happend."; + } + + if (GetStatus() == RepoStatus::HBM_ECC_EXIT) { + runtime_error = "HBM MULTI BIT ECC ERROR." + std::string(c10_npu::c10_npu_get_error_message()) + PTA_ERROR(ErrCode::ACL); + error_msg = "HBM MULTI BIT ECC ERROR happend."; } if (GetStatus() == RepoStatus::STOP_EXIT) { - if (check_error) { - ASCEND_LOGE("getRepoStopFlag in EmptyQueue, throw FORCE STOP."); - throw std::runtime_error("FORCE STOP." + PTA_ERROR(ErrCode::ACL)); - } else { - ASCEND_LOGE("FORCE STOP happend."); - } + runtime_error = "FORCE STOP." + PTA_ERROR(ErrCode::ACL); + error_msg = "FORCE STOP happend."; } if (GetStatus() == RepoStatus::ERROR_EXIT) { @@ -306,26 +299,17 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } } -#ifndef BUILD_LIBTORCH - if (gilState) { - PyEval_RestoreThread(gilState); - } -#endif - - if (check_error) { - throw std::runtime_error("The Inner error is reported as above. " - "The process exits for this inner error, and " + - repo_error + ".\n" + - "Since the operator is called asynchronously, the stacktrace may be inaccurate. " - "If you want to get the accurate stacktrace, " - "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + - "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " - "resulting in performance degradation. " - "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + - PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error); - } else { - ASCEND_LOGE("Inner error happend, detail: %s", repo_error); - } + runtime_error = "The Inner error is reported as above. " + "The process exits for this inner error, and " + + repo_error + ".\n" + + "Since the operator is called asynchronously, the stacktrace may be inaccurate. " + "If you want to get the accurate stacktrace, " + "pleace set the environment variable ASCEND_LAUNCH_BLOCKING=1.\n" + + "Note: ASCEND_LAUNCH_BLOCKING=1 will force ops to run in synchronous mode, " + "resulting in performance degradation. 
" + "Please unset ASCEND_LAUNCH_BLOCKING in time after debugging." + + PTA_ERROR(ErrCode::ACL) + ".\n" + acl_error; + error_msg = "Inner error happend, detail: " + repo_error; } #ifndef BUILD_LIBTORCH @@ -335,6 +319,13 @@ NPUStatus Repository::MakeSureQueueEmpty(bool check_error) } #endif + if (!error_msg.empty()) { + ASCEND_LOGE(error_msg); + } + if (check_error && !runtime_error.empty()) { + throw std::runtime_error(runtime_error); + } + return NPU_STATUS_SUCCESS; } -- Gitee From da3260b0cf4848e7fa271490ebba8f46b5dc6b18 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 31 Mar 2025 13:39:54 +0000 Subject: [PATCH 266/358] !19858 Update op_plugin commit id Merge pull request !19858 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index bc27d0f9f2..b707593d6d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit bc27d0f9f2864412df6a6d97dfba6653eb503028 +Subproject commit b707593d6d44e3ff6bbab2dba0ea5b2337347354 -- Gitee From 2cd88c79b28f0c9fd8ceba055534532c51a47356 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Tue, 1 Apr 2025 01:29:11 +0000 Subject: [PATCH 267/358] =?UTF-8?q?!19863=20Fix=20torch.save=20patch=20for?= =?UTF-8?q?=20typed=5Fstorage=20Merge=20pull=20request=20!19863=20from=20?= =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87/v2.6.0=5Fsv?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/serialization.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index eb030806cb..4e09951f6c 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -300,7 +300,11 @@ def _npu_save( storage_dtype = obj.dtype storage_type_str = obj._pickle_storage_type() storage_type = getattr(torch, storage_type_str) - storage_numel = obj._size() + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + storage_numel = storage_tensor.size().numel() * storage_tensor.element_size() // obj._element_size() + else: + storage_numel = obj._size() else: storage = obj -- Gitee From ab8f871c5fee7d72bbfb6bea73133c9d8e3feca3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Tue, 1 Apr 2025 03:53:23 +0000 Subject: [PATCH 268/358] =?UTF-8?q?!19879=20Fix=20torch.save=20patch=202?= =?UTF-8?q?=20Merge=20pull=20request=20!19879=20from=20=E5=A7=9C=E6=80=A1?= =?UTF-8?q?=E6=96=87/v2.6.0=5Fsv?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/utils/serialization.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torch_npu/utils/serialization.py b/torch_npu/utils/serialization.py index 4e09951f6c..46b4d73bdc 100644 --- a/torch_npu/utils/serialization.py +++ b/torch_npu/utils/serialization.py @@ -363,7 +363,11 @@ def _npu_save( for key in sorted(serialized_storages.keys()): name = f"data/{key}" storage = serialized_storages[key] - num_bytes = storage.nbytes() + if storage.device.type != "cpu": + storage_tensor = torch_npu._C._tensor_construct_from_storage(storage) + num_bytes = storage_tensor.size().numel() * storage_tensor.element_size() + else: + num_bytes = storage.nbytes() global _serialization_tls if _serialization_tls.skip_data: zip_file.write_record_metadata(name, num_bytes) -- Gitee From 
3c70771ceddd51ded533791cf7d9563ea1da2581 Mon Sep 17 00:00:00 2001 From: wangjie Date: Tue, 1 Apr 2025 07:40:09 +0000 Subject: [PATCH 269/358] !19842 [PROF] Profiler python tracer fix Merge pull request !19842 from wangjie/proflier_trace_fix_260 --- torch_npu/csrc/profiler/profiler_python.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/profiler/profiler_python.cpp b/torch_npu/csrc/profiler/profiler_python.cpp index 704b65d923..45ccf8f1b2 100644 --- a/torch_npu/csrc/profiler/profiler_python.cpp +++ b/torch_npu/csrc/profiler/profiler_python.cpp @@ -343,7 +343,7 @@ void PythonTracer::stop() { TORCH_INTERNAL_ASSERT(active_.load(), "PythonTracer is not running.", PROF_ERROR(ErrCode::INTERNAL)); - pybind11::gil_scoped_acquire gil; + GilAndRestoreThread gil; for (const auto thread_state : getInterpreterThreads(interpreter_)) { if (thread_state->c_profilefunc == &PythonTracer::pyProfileFn) { PyThreadState_Swap(thread_state); @@ -573,7 +573,7 @@ void PythonTracer::recordCCall(TraceContext* ctx, PyFrameObject* frame, PyObject void PythonTracer::recordReturn(TraceContext* ctx, PyFrameObject* frame, TraceTag tag) { recordEvent(tag, EXIT_EVENT_HASH_ID); - + // record ctx to thread id map auto ctx_addr = reinterpret_cast(ctx); if (ctx_tid_map_.find(ctx_addr) == ctx_tid_map_.end()) { -- Gitee From 9b95aa9727c776cf7a12d97272a0210fa36db292 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=98=89=E8=AF=9A?= <12731429+wang-pierre-jiacheng@user.noreply.gitee.com> Date: Tue, 1 Apr 2025 07:42:34 +0000 Subject: [PATCH 270/358] =?UTF-8?q?!19826=20add=20IsGteCANNDriverVersion?= =?UTF-8?q?=20Merge=20pull=20request=20!19826=20from=20=E7=8E=8B=E5=98=89?= =?UTF-8?q?=E8=AF=9A/v2.6.0=5Fdriver=5Fver?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/GetCANNInfo.cpp | 97 ++++++++++++++++--- torch_npu/csrc/core/npu/GetCANNInfo.h | 2 + .../csrc/core/npu/NPUCachingAllocator.cpp | 5 +- 3 files changed, 88 insertions(+), 16 deletions(-) diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index 8a15ef2085..309227f619 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -6,6 +6,11 @@ #include "third_party/acl/inc/acl/acl.h" +constexpr size_t kVersionIndex1 = 1; +constexpr size_t kVersionIndex2 = 2; +constexpr size_t kVersionIndex3 = 3; +constexpr size_t kVersionIndex4 = 4; + std::unordered_map packageNameMap = { {"CANN", ACL_PKG_NAME_CANN}, {"RUNTIME", ACL_PKG_NAME_RUNTIME}, @@ -27,22 +32,22 @@ double VersionToNum(std::string versionStr) int TVersion = -1; int alphaVersion = 0; if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+)"))) { - major = stoi(results[1]); - minor = stoi(results[2]); - RCVersion = stoi(results[3]); + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).([0-9]+)"))) { - major = stoi(results[1]); - minor = stoi(results[2]); - release = stoi(results[3]); + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + release = stoi(results[kVersionIndex3]); } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).T([0-9]+)"))) { - major = stoi(results[1]); - minor = stoi(results[2]); - TVersion = stoi(results[3]); + major = stoi(results[kVersionIndex1]); + minor = 
stoi(results[kVersionIndex2]); + TVersion = stoi(results[kVersionIndex3]); } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+).alpha([0-9]+)"))) { - major = stoi(results[1]); - minor = stoi(results[2]); - RCVersion = stoi(results[3]); - alphaVersion = stoi(results[4]); + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); + alphaVersion = stoi(results[kVersionIndex4]); } else { TORCH_NPU_WARN_ONCE("Version: " + versionStr + " is invalid."); return 0.0; @@ -52,6 +57,53 @@ double VersionToNum(std::string versionStr) return num; } +double DriverVersionToNum(std::string versionStr) +{ + std::smatch results; + int major = -1; + int minor = -1; + int release = -1; + int TVersion = -1; + int RCVersion = -51; + int bVersion = 0; + // driver version check only supports pattern listed here: + // 24.1.0,24.1.RC1,24.1.rc1,24.1.RC1.B10,24.1.rc1.b10,24.1.T1 + if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+)"))) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); + } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).rc([0-9]+)"))) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); + } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).([0-9]+)"))) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + release = stoi(results[kVersionIndex3]); + } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).T([0-9]+)"))) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + TVersion = stoi(results[kVersionIndex3]); + } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+).B([0-9]+)"))) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); + bVersion = stoi(results[kVersionIndex4]); + } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).rc([0-9]+).b([0-9]+)"))) { + major = stoi(results[kVersionIndex1]); + minor = stoi(results[kVersionIndex2]); + RCVersion = stoi(results[kVersionIndex3]); + bVersion = stoi(results[kVersionIndex4]); + } else { + TORCH_NPU_WARN_ONCE("Driver Version: " + versionStr + " is invalid or not supported yet."); + return 0.0; + } + + double num = ((major + 1) * 100000000) + ((minor + 1) * 1000000) + ((release + 1) * 10000) + + ((RCVersion + 1) * 100 + 5000) + ((TVersion + 1) * 100) + bVersion; + return num; +} + std::unordered_map CANNVersionCache; std::string GetCANNVersion(const std::string& module) @@ -97,4 +149,23 @@ bool IsGteCANNVersion(const std::string version, const std::string module) } else { return false; } +} + +bool IsGteDriverVersion(const std::string driverVersion) +{ + // if cann does not support AclsysGetCANNVersion,GetCANNVersion("DRIVER") will return "". + // The result of this function will be false, even if current driver version meets the requirement. 
+ const static std::string baseCANNVersion = "8.1.RC1"; + std::string currentCANNVersion = GetCANNVersion("CANN"); + double currentCannNum = VersionToNum(currentCANNVersion); + double boundaryCannNum = VersionToNum(baseCANNVersion); + if (currentCannNum < boundaryCannNum) { + TORCH_CHECK(false, "When the cann version is less than \"8.1.RC1\", this function is not supported.", + PTA_ERROR(ErrCode::VALUE)); + } + // check driver version + std::string currentDriverVersion = GetCANNVersion("DRIVER"); + double currentDriverNum = DriverVersionToNum(currentDriverVersion); + double boundaryDriverNum = DriverVersionToNum(driverVersion); + return currentDriverNum >= boundaryDriverNum; } \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.h b/torch_npu/csrc/core/npu/GetCANNInfo.h index 8c3aa86c6b..917b7edde4 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.h +++ b/torch_npu/csrc/core/npu/GetCANNInfo.h @@ -11,4 +11,6 @@ formula: ((a+1) * 100000000) + ((b+1) * 1000000) + ((c+1) * 10000) + ((d+1) * 10 */ bool IsGteCANNVersion(const std::string version, const std::string module = "CANN"); +bool IsGteDriverVersion(const std::string driverVersion); + #endif \ No newline at end of file diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 55b0cd5b33..10a3aea7bd 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -99,7 +99,6 @@ constexpr size_t kLargePoolVirAddrSize = 10737418240; // 10 GB const std::string kMinCannVersion = "8.1.RC1"; // minimum cann version which supports 1g mem 8.1.RC1 const std::string kMinDriverVersion = "25.0.RC1"; // minimum driver version which supports 1g mem 25.0.RC1 const std::string kCannModule = "CANN"; // cann module name -const std::string kDriverModule = "DRIVER"; // driver module name using StatTypes = std::array(StatType::NUM_TYPES)>; @@ -157,12 +156,12 @@ bool IsMallocPage1GMem(bool is_small_pool) return false; } - if (!IsGteCANNVersion(kMinDriverVersion, kDriverModule)) { + if (!IsGteDriverVersion(kMinDriverVersion)) { TORCH_NPU_WARN_ONCE("The application for 1G large-page physical memory failed. " "Using the HUGE_MEM memory page allocation method may result in performance degradation. " "This warning occurs because the PYTORCH_NPU_ALLOC_CONF = page_size:1g configuration is enabled, " "but the current driver version does not support this feature. 
" - "Please upgrade the CANN package version 1-2."); + "Please upgrade the HDK(driver) package version."); return false; } return true; -- Gitee From e2bd6cf550f177d87528fa5a619abf8c8985f494 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Tue, 1 Apr 2025 07:59:16 +0000 Subject: [PATCH 271/358] !19837 Delete the @unittest.skip and fix the torch.load failure Merge pull request !19837 from yuhaiyan/v2.6.0-dev2 --- test/contrib/test_drop_path.py | 3 ++- test/distributed/test_hccl_stream_id.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/contrib/test_drop_path.py b/test/contrib/test_drop_path.py index 19182bef11..ad314e34d6 100644 --- a/test/contrib/test_drop_path.py +++ b/test/contrib/test_drop_path.py @@ -80,7 +80,8 @@ class TestDropPath(TestCase): [[np.float32, 3, [13, 5]], [np.float32, 3, [13, 5]]], ] - data = torch.load(os.path.join(os.path.dirname(os.path.abspath(__file__)), "base_data/drop_path_base_data.pth")) + data = torch.load(os.path.join(os.path.dirname(os.path.abspath(__file__)), + "base_data/drop_path_base_data.pth"), weights_only=False) base_result = data["base_result"] for index, item in enumerate(shape_format): _, mat1_npu = create_common_tensor(item[0], -10, 10) diff --git a/test/distributed/test_hccl_stream_id.py b/test/distributed/test_hccl_stream_id.py index 371791be1c..ee0bcfc36c 100644 --- a/test/distributed/test_hccl_stream_id.py +++ b/test/distributed/test_hccl_stream_id.py @@ -1,4 +1,3 @@ -import unittest import os from unittest.mock import patch @@ -40,6 +39,11 @@ class HcclStreamIdTest(TestCase): p2p_stream_id = _world.default_pg._get_backend(torch.device('npu'))._get_stream_id(True, src) stream_num = os.environ.get("STREAMS_PER_DEVICE", 8) + try: + stream_num = int(stream_num) + except Exception: + stream_num = 8 + if stream_num != 32: stream_num = 8 assert0 = ((collective_stream_id & stream_num) == stream_num) @@ -72,7 +76,6 @@ class HcclStreamIdTest(TestCase): for p in ps: p.join() - @unittest.skip("skip this case tmp") @skipIfUnsupportMultiNPU(2) def test_dist_get_hccl_stream_id_same(self): # CI currently supports only 2 devices -- Gitee From 9b9a16b73ec9bfb440946ea5d7fc7b8132ec8bf6 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 1 Apr 2025 09:09:59 +0000 Subject: [PATCH 272/358] !19907 Update op_plugin commit id Merge pull request !19907 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index b707593d6d..7383f70df5 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit b707593d6d44e3ff6bbab2dba0ea5b2337347354 +Subproject commit 7383f70df5fbc3dc4cf50b5a72648ee5e8931da9 -- Gitee From 591da9cdbfa29dc60350f301b4dfa2a77a472721 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 1 Apr 2025 09:09:59 +0000 Subject: [PATCH 273/358] !19907 Update op_plugin commit id Merge pull request !19907 from pta-robot/v2.6.0 -- Gitee From 7e276ce65bb2f7e6756308c49368544b9e07bf10 Mon Sep 17 00:00:00 2001 From: zhangqiongwen Date: Wed, 2 Apr 2025 01:46:00 +0000 Subject: [PATCH 274/358] !19895 fix reduce_scatter_with_different_shape_avg Merge pull request !19895 from zhangqiongwen/v2.6.0_avg_fix --- test/distributed/test_reduce_scatter.py | 4 ++-- test/distributed/test_reduce_scatter_base.py | 6 +++--- test/distributed/test_reduce_scatter_tensor.py | 8 ++++---- test/distributed/test_scatter.py | 2 +- torch_npu/csrc/distributed/ProcessGroupHCCL.cpp | 6 ++++++ 5 files changed, 16 insertions(+), 10 
deletions(-) diff --git a/test/distributed/test_reduce_scatter.py b/test/distributed/test_reduce_scatter.py index 40a28c0533..36431af96a 100644 --- a/test/distributed/test_reduce_scatter.py +++ b/test/distributed/test_reduce_scatter.py @@ -83,7 +83,7 @@ class HcclReduceScatterTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter) self._test_multiprocess(HcclReduceScatterTest._test_reduce_scatter, @@ -126,7 +126,7 @@ class HcclReduceScatterTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter, dist.ReduceOp.AVG) self._test_multiprocess(HcclReduceScatterTest._test_reduce_scatter, diff --git a/test/distributed/test_reduce_scatter_base.py b/test/distributed/test_reduce_scatter_base.py index 4b45a89a47..5d11345d44 100644 --- a/test/distributed/test_reduce_scatter_base.py +++ b/test/distributed/test_reduce_scatter_base.py @@ -42,7 +42,7 @@ class HcclReduceScatterBaseTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist._reduce_scatter_base) self._test_multiprocess(HcclReduceScatterBaseTest._test_reduce_scatter_base, @@ -60,7 +60,7 @@ class HcclReduceScatterBaseTest(HcclReduceScatterTestBase): for shape in shape_format: input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist._reduce_scatter_base) self._test_multiprocess(HcclReduceScatterBaseTest._test_reduce_scatter_base, @@ -77,7 +77,7 @@ class HcclReduceScatterBaseTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist._reduce_scatter_base, dist.ReduceOp.AVG) self._test_multiprocess(HcclReduceScatterBaseTest._test_reduce_scatter_base, diff --git a/test/distributed/test_reduce_scatter_tensor.py b/test/distributed/test_reduce_scatter_tensor.py index c58236ba1a..52eb58adbb 100644 --- a/test/distributed/test_reduce_scatter_tensor.py +++ b/test/distributed/test_reduce_scatter_tensor.py @@ -42,7 +42,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter_tensor) self._test_multiprocess(HcclReduceScatterTensorTest._test_reduce_scatter_tensor, @@ -74,7 +74,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = 
create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, torch_npu.distributed.reduce_scatter_tensor_uneven) self._test_multiprocess(HcclReduceScatterTensorTest._test_reduce_scatter_tensor_uneven, @@ -91,7 +91,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter_tensor, dist.ReduceOp.AVG) self._test_multiprocess(HcclReduceScatterTensorTest._test_reduce_scatter_tensor, @@ -108,7 +108,7 @@ class HcclReduceScatterTensorTest(HcclReduceScatterTestBase): shape[1] = 0 input_list = [] for _ in range(world_size): - _, input1 = create_common_tensor(shape, -10, -10) + _, input1 = create_common_tensor(shape, -10, 10) input_list.append(input1.cpu()) expected = self._construct_excepted_result(input_list, world_size, dist.reduce_scatter_tensor_uneven, dist.ReduceOp.AVG) self._test_multiprocess(HcclReduceScatterTensorTest._test_reduce_scatter_tensor_uneven, diff --git a/test/distributed/test_scatter.py b/test/distributed/test_scatter.py index 2d9303b440..eb18b91db7 100644 --- a/test/distributed/test_scatter.py +++ b/test/distributed/test_scatter.py @@ -75,7 +75,7 @@ class HcclScatterTest(TestCase): for shape in shape_format: input_list = [] for _ in range(world_size): - _, npu_input = create_common_tensor(shape, -10, -10) + _, npu_input = create_common_tensor(shape, -10, 10) input_list.append(npu_input.cpu()) expected = self._construct_expected_result(input_list, dist.scatter) self._test_multiprocess(HcclScatterTest._test_scatter, diff --git a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp index 7da9523049..99e4fc6112 100644 --- a/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp +++ b/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp @@ -3943,6 +3943,12 @@ c10::intrusive_ptr ProcessGroupHCCL::reduce_scatter( at::Tensor output_tensor_reshape = at::reshape(outputFlattened[i], outputTensors[i].sizes()); outputTensors[i].copy_(output_tensor_reshape, true); } + if (opts.reduceOp == c10d::ReduceOp::AVG) { + c10_npu::NPUStreamGuard guard(hcclStreams[0]); + for (auto& tensor : outputTensors) { + tensor.div_(getSize()); + } + } }, c10d::OpType::REDUCE_SCATTER); } else { -- Gitee From 14e1a41f29d8bc280793b09e95d5f10ae3135ac5 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 2 Apr 2025 01:54:32 +0000 Subject: [PATCH 275/358] !19915 Update torchair commit id Merge pull request !19915 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 034b3d7c15..25de07e4bb 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 034b3d7c1501d9a72c28f2904abf1ec427898d08 +Subproject commit 25de07e4bb4f0bdec418099465f6b8e28fade989 -- Gitee From a09ce7fd10b18f79c37d71b67349a1d6d53a9e75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E9=87=91=E8=88=AA?= <15990042527@163.com> Date: Wed, 2 Apr 2025 02:57:08 +0000 Subject: [PATCH 276/358] =?UTF-8?q?!19903=20Fixed=20interlocks=20caused=20?= =?UTF-8?q?by=20recursive=5Fmutex=20and=20eventfd=5Fread=20Merge=20pull=20?= 
=?UTF-8?q?request=20!19903=20from=20=E6=9D=9C=E9=87=91=E8=88=AA/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../csrc/core/npu/NPUCachingAllocator.cpp | 53 +++++++++++++++---- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 10a3aea7bd..baddbe6395 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -918,6 +918,28 @@ bool isConfig1GPageSizeEnable() return CachingAllocatorConfig::page_size_1g_enable(); } +// To prevent the deadlock situation, temporarily release the lock. +// +// Deadlock Scenario Description: +// +// 1. Main Thread: +// - Acquires the lock and performs sync to clear the taskqueue. +// - taskqueue wait a empty signal from the sub-thread. +// +// 2. Sub-thread: +// - Python function (tbe op compile) called in CANN may trigger GC that introduces a resource release operation. +// - The release operation (`free`) cannot acquire the same lock holded in main thread. +// - Unable to send a signal to the main thread. +class UnlockGuard { +public: + explicit UnlockGuard(std::unique_lock& lock) : lock_(lock) { lock_.unlock(); } + + ~UnlockGuard() { lock_.lock(); } + +private: + std::unique_lock& lock_; +}; + class DeviceCachingAllocator { private: // lock around all operations @@ -1132,13 +1154,13 @@ public: if (!block_found) { // Do garbage collection if the flag is set. if (C10_UNLIKELY(set_fraction && CachingAllocatorConfig::garbage_collection_threshold() > 0.0)) { - garbage_collect_cached_blocks(context); + garbage_collect_cached_blocks(context, lock); } // Attempt allocate block_found = alloc_block(params, false, context, lock) || // Free enough available cached blocks to satisfy alloc and retry // alloc. - (release_available_cached_blocks(params, context) && alloc_block(params, false, context, lock)); + (release_available_cached_blocks(params, context, lock) && alloc_block(params, false, context, lock)); } if (!block_found && C10_LIKELY(captures_underway.empty())) { @@ -1146,6 +1168,11 @@ public: "Get a block from the existing pool failed. Try to free cached blocks and reallocate. This error log " "can be ignored."); // Free all non-split cached blocks and retry alloc. + { + UnlockGuard guard(lock); + // Make sure taskqueue is empty, then execute release_cached_blocks + c10_npu::npuSynchronizeDevice(true); + } c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, true); block_found = (release_cached_blocks(true, context) && alloc_block(params, true, context, lock)); } @@ -1463,6 +1490,8 @@ public: void emptyCache(int device, bool check_error) { std::shared_ptr context = maybeGatherContext(RecordContext::ALL); + // Make sure event deque from taskqueue, then synchronize Event + c10_npu::npuSynchronizeDevice(check_error); std::lock_guard lock(mutex); c10_npu::NPUWorkspaceAllocator::emptyCache(device, true, check_error); release_cached_blocks(check_error, context); @@ -2067,7 +2096,8 @@ private: return freed_memory; } - void garbage_collect_cached_blocks(const std::shared_ptr &ctx) + void garbage_collect_cached_blocks(const std::shared_ptr& ctx, + std::unique_lock& lock) { // Free unused cached blocks to reclaim NPU memory. 
// Unlike release_cached_blocks(), this does not enforce synchronization and @@ -2097,7 +2127,10 @@ private: return; } - c10_npu::npuSynchronizeDevice(true); + { + UnlockGuard guard(lock); + c10_npu::npuSynchronizeDevice(true); + } // Repeat GC until we reach reclaim > target size. bool block_freed = true; @@ -2197,7 +2230,8 @@ private: } /* * Free one or more oversize blocks to the system allocator. But only enough to satisfy the target size * */ - bool release_available_cached_blocks(const AllocParams &p, const std::shared_ptr &ctx) + bool release_available_cached_blocks(const AllocParams& p, const std::shared_ptr& ctx, + std::unique_lock& lock) { if (CachingAllocatorConfig::max_split_size() == std::numeric_limits::max()) { return false; @@ -2208,7 +2242,10 @@ private: (key.size < CachingAllocatorConfig::max_split_size()) ? CachingAllocatorConfig::max_split_size() : key.size; auto it = pool.blocks.lower_bound(&key); - c10_npu::npuSynchronizeDevice(true); + { + UnlockGuard guard(lock); + c10_npu::npuSynchronizeDevice(true); + } if (it == pool.blocks.end() || (*it)->stream != p.stream()) { // No single block is large enough; free multiple oversize blocks, starting with the largest @@ -2239,11 +2276,9 @@ private: return true; } + // npuSynchronizeDevice must be executed before this function can be called bool release_cached_blocks(bool check_error, const std::shared_ptr &context) { - // Make sure event deque from taskqueue, then synchronize Event - c10_npu::npuSynchronizeDevice(check_error); - // First ensure that all blocks that can't currently be allocated due to // outstanding events are returned to the pool. synchronize_and_free_events(check_error, context); -- Gitee From 794858295a9a25e93abc69119b0a05b610744ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Wed, 2 Apr 2025 07:55:02 +0000 Subject: [PATCH 277/358] =?UTF-8?q?!19936=20Change=20acl=5Fop=5Finit=5Fmod?= =?UTF-8?q?e's=20Pos=20Merge=20pull=20request=20!19936=20from=20=E5=A7=9C?= =?UTF-8?q?=E6=80=A1=E6=96=87/v2.6.0=5Flz?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/framework/interface/EnvVariables.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch_npu/csrc/framework/interface/EnvVariables.cpp b/torch_npu/csrc/framework/interface/EnvVariables.cpp index c0892c91fe..4da3d362d1 100644 --- a/torch_npu/csrc/framework/interface/EnvVariables.cpp +++ b/torch_npu/csrc/framework/interface/EnvVariables.cpp @@ -46,11 +46,10 @@ REGISTER_OPTION_HOOK(mdldumpconfigpath, [](const std::string &val) { aclmdlSetDump(val.c_str()); }) -auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); - REGISTER_OPTION_BOOL_FUNCTION(CheckJitDisableInner, jitCompile, "enable", "disable") REGISTER_OPTION_CACHE(bool, isJitDisable, CheckJitDisableInner) REGISTER_OPTION_HOOK(jitCompile, [](const std::string &val) { + auto acl_op_init_mode = c10_npu::option::OptionsManager::GetAclOpInitMode(); if (acl_op_init_mode == 0) { NPU_CHECK_ERROR(AclSetCompileopt(aclCompileOpt::ACL_OP_JIT_COMPILE, val.c_str())); } else if (GET_OPTION_WITH_CACHE(isJitDisable) != ("disable" == val)) { -- Gitee From 132da4d03b17b99933744f5aad0395ee5c6a8210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=98=89=E8=AF=9A?= <12731429+wang-pierre-jiacheng@user.noreply.gitee.com> Date: Wed, 2 Apr 2025 08:07:14 +0000 Subject: [PATCH 278/358] =?UTF-8?q?!19946=20fix=20driver=20version=20num?= 
=?UTF-8?q?=20bug=20Merge=20pull=20request=20!19946=20from=20=E7=8E=8B?= =?UTF-8?q?=E5=98=89=E8=AF=9A/v2.6.0=5Fcompare?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/GetCANNInfo.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index 309227f619..ab3f12d5f1 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -99,8 +99,12 @@ double DriverVersionToNum(std::string versionStr) return 0.0; } - double num = ((major + 1) * 100000000) + ((minor + 1) * 1000000) + ((release + 1) * 10000) + - ((RCVersion + 1) * 100 + 5000) + ((TVersion + 1) * 100) + bVersion; + double num = ((static_cast(major) + 1.0) * 100000000) + + ((static_cast(minor) + 1.0) * 1000000) + + ((static_cast(release) + 1.0) * 10000) + + ((static_cast(RCVersion) + 1.0) * 100 + 5000) + + ((static_cast(TVersion) + 1.0) * 100) + + static_cast(bVersion); return num; } -- Gitee From 0a359ec36ce90ccbbbf17b1cd237d3a9ce19a6ee Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 2 Apr 2025 08:39:58 +0000 Subject: [PATCH 279/358] !19959 Update op_plugin commit id Merge pull request !19959 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 7383f70df5..6f320f6d4b 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 7383f70df5fbc3dc4cf50b5a72648ee5e8931da9 +Subproject commit 6f320f6d4b77fecb3e86d5e1e516f5e74674aa1b -- Gitee From f61ec4cf677baafda0305ab5a0ec860350308f52 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 3 Apr 2025 04:40:00 +0000 Subject: [PATCH 280/358] !19986 Update op_plugin commit id Merge pull request !19986 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 6f320f6d4b..0bf41f45be 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 6f320f6d4b77fecb3e86d5e1e516f5e74674aa1b +Subproject commit 0bf41f45be142847db0bad59ca72d84c1ed5e547 -- Gitee From e098e6948a56f9dab4c6859dc5be9b3d397c5141 Mon Sep 17 00:00:00 2001 From: shaoyf Date: Thu, 3 Apr 2025 07:13:16 +0000 Subject: [PATCH 281/358] !19992 [2.6.0] Update the referenced PyTorch version to v2.3.1. Merge pull request !19992 from shaoyf/26_notice --- Third_Party_Open_Source_Software_Notice | 223 +++++++++++------------- 1 file changed, 102 insertions(+), 121 deletions(-) diff --git a/Third_Party_Open_Source_Software_Notice b/Third_Party_Open_Source_Software_Notice index b808499836..ca39279142 100644 --- a/Third_Party_Open_Source_Software_Notice +++ b/Third_Party_Open_Source_Software_Notice @@ -6,118 +6,109 @@ Warranty Disclaimer THE OPEN SOURCE SOFTWARE IN THIS PRODUCT IS DISTRIBUTED IN THE HOPE THAT IT WILL BE USEFUL, BUT WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. SEE THE APPLICABLE LICENSES FOR MORE DETAILS. 
Copyright Notice and License Texts -Software: pytorch v2.1.0 +Software: pytorch v2.3.1 Copyright notice: -Copyright (c) 2016- Facebook, Inc -Copyright (c) 2014- Facebook, Inc -Copyright (c) 2011-2014 Idiap Research Institute -Copyright (c) 2012-2014 Deepmind Technologies -Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) -Copyright (c) 2011-2013 NYU -Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) -Copyright (c) 2006 Idiap Research Institute -Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) -Copyright (c) 2016-present, Facebook Inc. -Copyright (c) 2016 Facebook Inc. -Copyright (c) 2015 Google Inc. -Copyright (c) 2015 Yangqing Jia -Copyright 2019-2020 Kakao Brain -Copyright (c) 2022 Cruise LLC. -Copyright (c) 2013, 2014, 2015, the respective contributors -Copyright (c) 2015, 2016 the respective contributors -Copyright (c) 2014, The Regents of the University of California (Regents) -Copyright (c) 2014, the respective contributors -Copyright (c) 2018, Steven Moshier -Copyright (c) 2001-2002 Enthought, Inc. 2003-2019, SciPy Developers -Copyright (c) 1997-2011 by Secret Labs AB -Copyright (c) 1995-2011 by Fredrik Lundh -Copyright (c) 2010-2022 by Alex Clark and contributors -Copyright (c) 2006 The Android Open Source Project -Copyright (c) Facebook, Inc. and its affiliates -Copyright (c) Meta Platforms, Inc. and affiliates -Copyright 2004-present Facebook -Copyright (c) 2017 by Contributors -Copyright (c) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura -Copyright (c) 2022 Apple Inc. -Copyright (c) 2023 Apple Inc. -Copyright 2005 Robert Kern (robert.kern@gmail.com) -copyright 2019 The TensorFlow Authors -Copyright (c) 2018 MathInf GmbH, Thomas Viehmann -Copyright (c) 2014 Indiana University (c) -Copyright John Maddock 2006 -Copyright (c) 2012 Massachusetts Institute of Technology -Copyright (c) 2012 Giovanni Garberoglio Interdisciplinary Laboratory for Computational Science (LISC) Fondazione Bruno Kessler and University of Trento -Copyright (c) 2018 Marat Dukhan -Copyright (c) 2017-2018 Facebook Inc. -Copyright (c) 2017 Georgia Institute of Technology -Copyright 2015 Google Inc. -Copyright (c) 2011-2021, NVIDIA CORPORATION. -Copyright (c) 2022, Tri Dao -Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. -Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. -Copyright (c) 2017 The Android Open Source Project -Copyright (c) 2016-present, Facebook, Inc. -Copyright (c) 2005-2020 Rich Felker -Copyright Malte Skarupke 2017 -Copyright 2008 Google Inc. -Copyright (c) 2011 - 2012 Andrzej Krzemienski -Copyright (c) 2001-2019 Free Software Foundation, Inc. -Copyright (c) 1994 Hewlett-Packard Company -Copyright (c) 1996-1998 Silicon Graphics Computer Systems, Inc. -Copyright (c) Bjorn Fahller -Copyright Michael Park, 2015-2017 -Copyright (c) 2017-present, Facebook, Inc. -Copyright (c) 2018-present, Facebook, Inc. -Copyright (c) 2008-2015 The Khronos Group Inc. -Copyright 2016 Facebook -Copyright (c) 2016, NVIDIA CORPORATION -Copyright (c) 2008 - 2012 The Khronos Group Inc. -Copyright (c) 2008-2013 The Khronos Group Inc. -Copyright (c) 2008-2012 The Khronos Group Inc. -Copyright (c) 2016-2017, ARM Limited and Contributors -Copyright (c) 2014-2015 The Khronos Group Inc. -Copyright (c) 2015-2017 The Khronos Group Inc. -Copyright (c) Facebook Inc. 
and Microsoft Corporation -Copyright (c) 2014-2017 The Regents of the University of California (Regents) -Copyright (c) 2014-2017, the respective contributors -Copyright (c) 2017 Microsoft -Copyright 2015 The Gemmlowp Authors -Copyright (c) 2011-2019 Stephan Brumme -Copyright 2006, Google Inc. -Copyright (c) Meta Platforms, Inc. and its affiliates -Copyright (c) 2008 - 2009 NVIDIA Corporation -Copyright (c) 2007-2009 Scientific Computing and Imaging Institute, University of Utah -Copyright (c) 2006, Laurent Montel, montel@kde.org -Copyright 2013 Conrad Steenberg conrad.steenberg@gmail.com -copyright 2022, PyTorch -copyright 2023, PyTorch -Copyright (c) 2005-2022 NVIDIA Corporation Built -copyright PyTorch Contributors -Copyright (c) 2018 Alex Rogozhnikov -Copyright (c) 2016 Microsoft -Copyright (c) 2014, 2015, The Regents of the University of California (Regents) -Copyright (c) 2014, 2015, the respective contributors -Copyright (c) 2005-2017, NumPy Developers (c) Parameter containing Float -Copyright 2005, Google Inc. -Copyright 2019 Kakao Brain -Copyright 2013-2014 RAD Game -Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC -Copyright 2016 Martin Raiber -Copyright (c) 2003-2017 Josef Weidendorfer -Copyright (c) 2000-2017 Julian Seward -Copyright (c) Edward Z. Yang ezyang@mit.edu -Copyright (c) 2005-2010 ActiveState Software Inc. -Copyright (c) 2013 Eddy Petrisor -Copyright (c) 2010 ActiveState Software Inc. -Copyright (c) 2001-2014 Python Software Foundation -Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Python Software Foundation -Copyright Python Software Foundation -Copyright 2022 Cruise LLC -Copyright (c) 2014 Matthew Rocklin -Copyright (c) 2015 Melissa E. O'Neill -Copyright (c) 2019 NumPy Developers -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. -Copyright 2013 Mark Dickinson +Copyright (c) 2011-2013 NYU +Copyright (c) Microsoft Corporation +Copyright (c) 2014- Facebook, Inc +Copyright (c) 2017 The Android Open Source Project +Copyright Python Software Foundation +Copyright (c) 2018 Alex Rogozhnikov +Copyright (c) 2007-2009 Scientific Computing and Imaging Institute, University of Utah +Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved +Copyright (c) 2017 Microsoft +Copyright (c) Meta Platforms, Inc. +Copyright (c) 2022 Apple Inc. +Copyright (c) 2018-present, Facebook, Inc. +Copyright (c) Facebook Inc. and Microsoft Corporation +Copyright (c) 2005-2017, NumPy Developers. All rights reserved +Copyright (c) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, All rights reserved +Copyright (c) 2014, The Regents +Copyright (c) 2005-2010 ActiveState Software Inc. +Copyright 2005, Google Inc. All rights reserved +Copyright (c) 2022, Tri Dao +Copyright (c) 2001-2002 Enthought, Inc. 2003-2019, SciPy Developers. All rights reserved +Copyright 2008 Google Inc. All rights reserved +Copyright (c) 2003-2017 Josef Weidendorfer. All rights reserved +Copyright (c) 2014 Matthew Rocklin +Copyright (c) 2016 Microsoft +Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved +Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC +Copyright (c) 2008-2012 The Khronos Group Inc. +Copyright (c) 2016 Facebook Inc. +Copyright 2010-2014 Rich Geldreich and Tenacious Software LLC All Rights Reserved +Copyright (c) 2006, Laurent Montel, +Copyright (c) 2015 Google Inc. All rights reserved +Copyright (c) 2010-2022 by Alex Clark and contributors +Copyright 2015 Google Inc. 
All Rights Reserved +Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +(c) BNParamType +Copyright 2013-2014 RAD Game +Copyright (c) 2011-2019 Stephan Brumme. All rights reserved +Copyright (c) 2018 MathInf GmbH, Thomas Viehmann +Copyright 2019 Kakao Brain +Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Python Software Foundation All Rights Reserved +Copyright 2004-present Facebook. All Rights Reserved +Copyright (c) 2008-2013 The Khronos Group Inc. +Copyright (c) Microsoft Corporation. All rights reserved +Copyright 2006, Google Inc. All rights reserved +Copyright (c) 2014-2015 The Khronos Group Inc. +Copyright 2015 The TensorFlow Authors. All Rights Reserved +Copyright (c) 2023, Tri Dao +Copyright (c) 2011-2014 Idiap Research Institute +Copyright (c) 2016-present, Facebook Inc. All rights reserved +Copyright (c) Advanced Micro Devices, Inc. +Copyright (c) 2001-2014 Python Software Foundation All Rights Reserved +Copyright (c) Bjorn Fahller +Copyright (c) 1995-2011 by Fredrik Lundh +Copyright (c) Edward Z. Yang +Copyright (c) 2012 Massachusetts Institute of Technology +Copyright (c) 2006 Idiap Research Institute +Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved +Copyright (c) 2008 - 2012 The Khronos Group Inc. +Copyright (c) 2015 Yangqing Jia All rights reserved +Copyright 2023-present Facebook. All Rights Reserved +Copyright 2013 Conrad Steenberg +Copyright (c) 2008-2015 The Khronos Group Inc. +Copyright (c) 2014-2017 The Regents +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright Malte Skarupke 2017 +Copyright (c) Meta Platforms, Inc. and affiliates +Copyright (c) 2023, Advanced Micro Devices, Inc. +Copyright (c) 2016- Facebook, Inc +Copyright (c) 1997-2011 by Secret Labs AB +Copyright (c) 2005-2022 NVIDIA Corporation Built +Copyright (c) Facebook, Inc. +Copyright 2019-2020 Kakao Brain +Copyright (c) 2000-2017 Julian Seward. All rights reserved +Copyright (c) 2016-2017, ARM Limited and Contributors +Copyright (c) 2005-2020 Rich Felker +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) +Copyright 2016 Facebook +Copyright (c) 2012-2014 Deepmind Technologies +Copyright (c) 2012 Giovanni Garberoglio Interdisciplinary Laboratory +Copyright (c) 2024, Tri Dao +Copyright (c) Donald Stufft and individual contributors. All rights reserved +Copyright (c) 2018, Steven Moshier All rights reserved +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved +Copyright (c) 2017-present, Facebook, Inc. +Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved +Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved +Copyright (c) 2014, 2015, The Regents +Copyright (c) 2013 Eddy Petrisor +Copyright (c) 2010 ActiveState Software Inc. +Copyright (c) 2006 The Android Open Source Project +Copyright (c) 2023 Apple Inc. +Copyright 2015 The Gemmlowp Authors. All Rights Reserved +Copyright (c) 2015-2017 The Khronos Group Inc. +Copyright 2022 Cruise LLC +Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved +Copyright (c) 2022 Cruise LLC. All rights reserved +Copyright (c) 2016-present, Facebook, Inc. 
+(c) Copyright John Maddock 2006 +Copyright (c) 2014 Indiana University All rights reserved +copyright 2019 The TensorFlow Authors License: BSD 3-Clause License Copyright (c) , , @@ -129,13 +120,3 @@ Redistribution and use in source and binary forms, with or without modification, THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Written Offer -This product contains software whose rights holders license it on the terms of the GNU General Public License, version 2 (GPLv2) and/or other open source software licenses. We will provide you and any third party with the source code of the software licensed under an open source software license if you send us a written request by mail or email to the following addresses: -foss@huawei.com -detailing the name of the product and the firmware version for which you need the source code and indicating how we can contact you. - -Please note you need to make a payment before you obtain the complete Corresponding Source Code from us. For how much you will pay and how we will deliver the complete Corresponding Source Code to you, we will further discuss it by mail or email. -This offer is valid to anyone in receipt of this information. - -THIS OFFER IS VALID FOR THREE YEARS FROM THE MOMENT WE DISTRIBUTED THE PRODUCT OR FIRMWARE. 
-- Gitee From 6ba95ca41c6fdd01561ba2dc0e92c1ded66ea15e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Thu, 3 Apr 2025 07:47:50 +0000 Subject: [PATCH 282/358] =?UTF-8?q?!19782=20Remove=20expose=20for=20deprec?= =?UTF-8?q?ated=20apis.=20Merge=20pull=20request=20!19782=20from=20?= =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/deprecated_apis.json | 59 +++++ test/npu/test_public_bindings.py | 6 +- test/torch_npu_schema.json | 246 ------------------ torch_npu/contrib/__init__.py | 11 - torch_npu/contrib/function/__init__.py | 5 - torch_npu/contrib/function/iou.py | 15 +- torch_npu/contrib/module/__init__.py | 6 - torch_npu/contrib/module/activations.py | 10 + .../contrib/module/bidirectional_lstm.py | 6 + torch_npu/contrib/module/deform_conv.py | 1 - torch_npu/contrib/module/fusedcolorjitter.py | 9 +- torch_npu/csrc/aten/npu_native_functions.yaml | 3 - 12 files changed, 99 insertions(+), 278 deletions(-) create mode 100644 test/deprecated_apis.json diff --git a/test/deprecated_apis.json b/test/deprecated_apis.json new file mode 100644 index 0000000000..293a11ab56 --- /dev/null +++ b/test/deprecated_apis.json @@ -0,0 +1,59 @@ +{ + "torch_npu": [ + "npu_broadcast", + "npu_conv2d", + "npu_conv_transpose2d", + "npu_convolution", + "npu_convolution_transpose", + "npu_dtype_cast", + "npu_gru", + "npu_layer_norm_eval", + "npu_min", + "npu_mish", + "npu_ptiou", + "npu_reshape", + "npu_silu", + "npu_sort_v2" + ], + "torch_npu.contrib": [ + "BiLSTM", + "DCNv2", + "FusedColorJitter", + "Mish", + "SiLU", + "Swish", + "npu_ciou", + "npu_diou", + "npu_giou", + "npu_iou", + "npu_ptiou" + ], + "torch_npu.contrib.module": [ + "BiLSTM", + "DCNv2", + "FusedColorJitter", + "Mish", + "SiLU", + "Swish" + ], + "torch_npu.contrib.function": [ + "npu_ciou", + "npu_diou", + "npu_giou", + "npu_iou", + "npu_ptiou" + ], + "torch_npu.contrib.function.iou": [ + "npu_ciou", + "npu_diou", + "npu_giou", + "npu_iou", + "npu_ptiou" + ], + "torch_npu.contrib.module.deform_conv": [ + "DCNv2" + ], + "torch_npu.contrib.module.fusedcolorjitter": [ + "FusedColorJitter" + ] +} diff --git a/test/npu/test_public_bindings.py b/test/npu/test_public_bindings.py index 3c7c93b958..7b73204563 100644 --- a/test/npu/test_public_bindings.py +++ b/test/npu/test_public_bindings.py @@ -643,6 +643,9 @@ class TestPublicBindings(TestCase): for modname in allow_dict["being_migrated"]: if modname in allow_dict: allow_dict[allow_dict["being_migrated"][modname]] = allow_dict[modname] + with open( + os.path.join(os.path.dirname(os.path.dirname(__file__)), 'deprecated_apis.json')) as json_file: + deprecated_dict = json.load(json_file) if update_allow_dict_torchair: allow_dict.update(update_allow_dict_torchair) @@ -694,7 +697,8 @@ class TestPublicBindings(TestCase): modname.startswith("torch_npu.dynamo.torchair.ge_concrete_graph"): return - if modname in allow_dict and elem in allow_dict[modname]: + if ((modname in allow_dict and elem in allow_dict[modname]) or + (modname in deprecated_dict and elem in deprecated_dict[modname])): return if is_public: diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 403e83257a..1415732141 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -1,10 +1,4 @@ { - "torch_npu.contrib.BiLSTM": { - "signature": "(input_size, hidden_size)" - }, - "torch_npu.contrib.BiLSTM.forward": { - "signature": "(self, inputs)" - }, 
"torch_npu.contrib.ChannelShuffle": { "signature": "(in_channels, groups=2, split_shuffle=True)" }, @@ -14,30 +8,12 @@ "torch_npu.contrib.ChannelShuffle.forward": { "signature": "(self, x1, x2)" }, - "torch_npu.contrib.DCNv2": { - "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" - }, - "torch_npu.contrib.DCNv2.init_param": { - "signature": "(self)" - }, - "torch_npu.contrib.DCNv2.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.Focus": { "signature": "(c1, c2, k=1, s=1, p=None, g=1, act=True)" }, "torch_npu.contrib.Focus.forward": { "signature": "(self, x)" }, - "torch_npu.contrib.FusedColorJitter": { - "signature": "(brightness=0, contrast=0, saturation=0, hue=0)" - }, - "torch_npu.contrib.FusedColorJitter._check_input": { - "signature": "(self, value, name, center=1, bound=(0, inf), clip_first_on_zero=True)" - }, - "torch_npu.contrib.FusedColorJitter.forward": { - "signature": "(self, img)" - }, "torch_npu.contrib.LabelSmoothingCrossEntropy": { "signature": "(num_classes=1000, smooth_factor=0.0)" }, @@ -62,12 +38,6 @@ "torch_npu.contrib.LinearWeightQuant.forward": { "signature": "(self, x: torch.Tensor) -> torch.Tensor" }, - "torch_npu.contrib.Mish": { - "signature": "()" - }, - "torch_npu.contrib.Mish.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.ModulatedDeformConv": { "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" }, @@ -161,18 +131,6 @@ "torch_npu.contrib.ROIAlign.forward": { "signature": "(self, input_tensor, rois)" }, - "torch_npu.contrib.SiLU": { - "signature": "()" - }, - "torch_npu.contrib.SiLU.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.Swish": { - "signature": "()" - }, - "torch_npu.contrib.Swish.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.matmul_transpose": { "signature": "(*args, **kwargs)" }, @@ -188,12 +146,6 @@ "torch_npu.contrib.npu_bbox_coder_encode_yolo": { "signature": "(bboxes, gt_bboxes, stride)" }, - "torch_npu.contrib.npu_ciou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, - "torch_npu.contrib.npu_diou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, "torch_npu.contrib.npu_fast_condition_index_put": { "signature": "(x, condition, value)" }, @@ -203,18 +155,9 @@ "torch_npu.contrib.npu_fused_attention_with_layernorm": { "signature": "(*args, **kwargs)" }, - "torch_npu.contrib.npu_giou": { - "signature": "(boxes1, boxes2, is_permuted=True)" - }, - "torch_npu.contrib.npu_iou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.npu_multiclass_nms": { "signature": "(multi_bboxes, multi_scores, score_thr=0.05, nms_thr=0.45, max_num=50, score_factors=None)" }, - "torch_npu.contrib.npu_ptiou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.npu_single_level_responsible_flags": { "signature": "(featmap_size, gt_bboxes, stride, num_base_anchors)" }, @@ -242,12 +185,6 @@ "torch_npu.contrib.function.npu_bbox_coder_encode_yolo": { "signature": "(bboxes, gt_bboxes, stride)" }, - "torch_npu.contrib.function.npu_ciou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, - "torch_npu.contrib.function.npu_diou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, 
"torch_npu.contrib.function.npu_fast_condition_index_put": { "signature": "(x, condition, value)" }, @@ -257,18 +194,9 @@ "torch_npu.contrib.function.npu_fused_attention_with_layernorm": { "signature": "(*args, **kwargs)" }, - "torch_npu.contrib.function.npu_giou": { - "signature": "(boxes1, boxes2, is_permuted=True)" - }, - "torch_npu.contrib.function.npu_iou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.function.npu_multiclass_nms": { "signature": "(multi_bboxes, multi_scores, score_thr=0.05, nms_thr=0.45, max_num=50, score_factors=None)" }, - "torch_npu.contrib.function.npu_ptiou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.function.npu_single_level_responsible_flags": { "signature": "(featmap_size, gt_bboxes, stride, num_base_anchors)" }, @@ -302,21 +230,6 @@ "torch_npu.contrib.function.index_op.npu_fast_condition_index_put": { "signature": "(x, condition, value)" }, - "torch_npu.contrib.function.iou.npu_ciou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, - "torch_npu.contrib.function.iou.npu_diou": { - "signature": "(boxes1, boxes2, trans=True, is_cross=False, mode=0)" - }, - "torch_npu.contrib.function.iou.npu_giou": { - "signature": "(boxes1, boxes2, is_permuted=True)" - }, - "torch_npu.contrib.function.iou.npu_iou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, - "torch_npu.contrib.function.iou.npu_ptiou": { - "signature": "(boxes1, boxes2, mode='ptiou', is_normalized=False, normalized_scale=100.0)" - }, "torch_npu.contrib.function.matmul_transpose.matmul_transpose": { "signature": "(*args, **kwargs)" }, @@ -332,12 +245,6 @@ "torch_npu.contrib.function.roll.roll": { "signature": "(x, shifts, dims)" }, - "torch_npu.contrib.module.BiLSTM": { - "signature": "(input_size, hidden_size)" - }, - "torch_npu.contrib.module.BiLSTM.forward": { - "signature": "(self, inputs)" - }, "torch_npu.contrib.module.ChannelShuffle": { "signature": "(in_channels, groups=2, split_shuffle=True)" }, @@ -347,15 +254,6 @@ "torch_npu.contrib.module.ChannelShuffle.forward": { "signature": "(self, x1, x2)" }, - "torch_npu.contrib.module.DCNv2": { - "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" - }, - "torch_npu.contrib.module.DCNv2.init_param": { - "signature": "(self)" - }, - "torch_npu.contrib.module.DCNv2.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.module.DropoutWithByteMask": { "signature": "(p=0.5, inplace=False, max_seed=1023)" }, @@ -368,15 +266,6 @@ "torch_npu.contrib.module.Focus.forward": { "signature": "(self, x)" }, - "torch_npu.contrib.module.FusedColorJitter": { - "signature": "(brightness=0, contrast=0, saturation=0, hue=0)" - }, - "torch_npu.contrib.module.FusedColorJitter._check_input": { - "signature": "(self, value, name, center=1, bound=(0, inf), clip_first_on_zero=True)" - }, - "torch_npu.contrib.module.FusedColorJitter.forward": { - "signature": "(self, img)" - }, "torch_npu.contrib.module.LabelSmoothingCrossEntropy": { "signature": "(num_classes=1000, smooth_factor=0.0)" }, @@ -401,12 +290,6 @@ "torch_npu.contrib.module.LinearWeightQuant.forward": { "signature": "(self, x: torch.Tensor) -> torch.Tensor" }, - "torch_npu.contrib.module.Mish": { - "signature": "()" - }, - "torch_npu.contrib.module.Mish.forward": { - "signature": "(self, x)" - }, 
"torch_npu.contrib.module.ModulatedDeformConv": { "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" }, @@ -500,42 +383,6 @@ "torch_npu.contrib.module.ROIAlign.forward": { "signature": "(self, input_tensor, rois)" }, - "torch_npu.contrib.module.SiLU": { - "signature": "()" - }, - "torch_npu.contrib.module.SiLU.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.Swish": { - "signature": "()" - }, - "torch_npu.contrib.module.Swish.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.activations.Mish": { - "signature": "()" - }, - "torch_npu.contrib.module.activations.Mish.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.activations.SiLU": { - "signature": "()" - }, - "torch_npu.contrib.module.activations.SiLU.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.activations.Swish": { - "signature": "()" - }, - "torch_npu.contrib.module.activations.Swish.forward": { - "signature": "(self, x)" - }, - "torch_npu.contrib.module.bidirectional_lstm.BiLSTM": { - "signature": "(input_size, hidden_size)" - }, - "torch_npu.contrib.module.bidirectional_lstm.BiLSTM.forward": { - "signature": "(self, inputs)" - }, "torch_npu.contrib.module.channel_shuffle.ChannelShuffle": { "signature": "(in_channels, groups=2, split_shuffle=True)" }, @@ -575,15 +422,6 @@ "torch_npu.contrib.module.crossentropy.LabelSmoothingCrossEntropy.forward": { "signature": "(self, pred, target)" }, - "torch_npu.contrib.module.deform_conv.DCNv2": { - "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" - }, - "torch_npu.contrib.module.deform_conv.DCNv2.init_param": { - "signature": "(self)" - }, - "torch_npu.contrib.module.deform_conv.DCNv2.forward": { - "signature": "(self, x)" - }, "torch_npu.contrib.module.deform_conv.ModulatedDeformConv": { "signature": "(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, deformable_groups=1, bias=True, pack=True)" }, @@ -650,15 +488,6 @@ "torch_npu.contrib.module.focus.fast_slice": { "signature": "(x)" }, - "torch_npu.contrib.module.fusedcolorjitter.FusedColorJitter": { - "signature": "(brightness=0, contrast=0, saturation=0, hue=0)" - }, - "torch_npu.contrib.module.fusedcolorjitter.FusedColorJitter._check_input": { - "signature": "(self, value, name, center=1, bound=(0, inf), clip_first_on_zero=True)" - }, - "torch_npu.contrib.module.fusedcolorjitter.FusedColorJitter.forward": { - "signature": "(self, img)" - }, "torch_npu.contrib.module.linear_a8w8_quant.LinearA8W8Quant": { "signature": "(in_features: int, out_features: int, *, bias: bool = True, offset: bool = False, pertoken_scale: bool = False, device=None, dtype=None, output_dtype=None) -> None" }, @@ -2432,12 +2261,6 @@ "torch_npu._npu_dropout": { "signature": "(*args, **kwargs)" }, - "torch_npu.copy_memory_": { - "signature": "(*args, **kwargs)" - }, - "torch_npu.empty_with_format": { - "signature": "(*args, **kwargs)" - }, "torch_npu.fast_gelu": { "signature": "(self)" }, @@ -2450,9 +2273,6 @@ "torch_npu.npu_anti_quant": { "signature": "(x, scale, offset=None, dst_dtype=None, src_dtype=None)" }, - "torch_npu.npu_apply_adam": { - "signature": "(*args, **kwargs)" - }, "torch_npu.npu_batch_nms": { "signature": "(self, scores, score_threshold, iou_threshold, max_size_per_class, max_total_size, change_coordinate_frame=False, transpose_box=False)" }, @@ 
-2468,48 +2288,24 @@ "torch_npu.npu_bounding_box_encode": { "signature": "(anchor_box, ground_truth_box, means0, means1, means2, means3, stds0, stds1, stds2, stds3)" }, - "torch_npu.npu_broadcast": { - "signature": "(self, size, out=None)" - }, - "torch_npu.npu_ciou": { - "signature": "(self, gtboxes, trans=False, is_cross=True, mode=0, atan_sub_flag=False)" - }, "torch_npu.npu_clear_float_status": { "signature": "(*args, **kwargs)" }, "torch_npu.npu_confusion_transpose": { "signature": "(self, perm, shape, transpose_first)" }, - "torch_npu.npu_conv2d": { - "signature": "(input_, weight, bias, stride, padding, dilation, groups)" - }, "torch_npu.npu_conv3d": { "signature": "(input_, weight, bias, stride, padding, dilation, groups)" }, - "torch_npu.npu_conv_transpose2d": { - "signature": "(input_, weight, bias, padding, output_padding, stride, dilation, groups)" - }, "torch_npu.npu_convert_weight_to_int4pack": { "signature": "(*args, **kwargs)" }, - "torch_npu.npu_convolution": { - "signature": "(input_, weight, bias, stride, padding, dilation, groups)" - }, - "torch_npu.npu_convolution_transpose": { - "signature": "(input_, weight, bias, padding, output_padding, stride, dilation, groups)" - }, "torch_npu.npu_deformable_conv2d": { "signature": "(inputs, weight, offset, bias, kernel_size, stride, padding, dilation=[1, 1, 1, 1], groups=1, deformable_groups=1, modulated=True)" }, - "torch_npu.npu_diou": { - "signature": "(self, gtboxes, trans=False, is_cross=False, mode=0)" - }, "torch_npu.npu_dropout_with_add_softmax": { "signature": "(self, x1, alpha, prob, dim)" }, - "torch_npu.npu_dtype_cast": { - "signature": "(self, dtype)" - }, "torch_npu.npu_dynamic_quant": { "signature": "(input_dummy, smooth_scales=None)" }, @@ -2546,30 +2342,18 @@ "torch_npu.npu_get_float_status": { "signature": "(*args, **kwargs)" }, - "torch_npu.npu_giou": { - "signature": "(self, gtboxes, trans=False, is_cross=False, mode=0)" - }, "torch_npu.npu_grid_assign_positive": { "signature": "(self, overlaps, box_responsible_flags, max_overlaps, argmax_overlaps, gt_max_overlaps, gt_argmax_overlaps, num_gts, pos_iou_thr, min_pos_iou, gt_max_assign_all)" }, "torch_npu.npu_grouped_matmul": { "signature": "(*args, **kwargs)" }, - "torch_npu.npu_gru": { - "signature": "(inputs, hx, weight_input, weight_hidden, bias_input, bias_hidden, seq_length, has_biases, num_layers, dropout, train, bidirectional, batch_first)" - }, "torch_npu.npu_incre_flash_attention": { "signature": "(self, query, key, value, padding_mask, atten_mask, pse_shift, actual_seq_lengths, num_heads, scale_value, input_layout, num_key_value_heads)" }, "torch_npu.npu_indexing": { "signature": "(self, begin, end, strides, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0)" }, - "torch_npu.npu_iou": { - "signature": "(bboxes, gtboxes, mode=0)" - }, - "torch_npu.npu_layer_norm_eval": { - "signature": "(input_, normalized_shape, weight=None, bias=None, eps=1e-05)" - }, "torch_npu.npu_linear": { "signature": "(input_, weight, bias=None)" }, @@ -2579,18 +2363,9 @@ "torch_npu.npu_max": { "signature": "(self, dim, keepdim=False)" }, - "torch_npu.npu_min": { - "signature": "(self, dim, keepdim=False)" - }, - "torch_npu.npu_mish": { - "signature": "(self)" - }, "torch_npu.npu_multi_head_attention": { "signature": "(query, key, value, query_weight, key_weight, value_weight, attn_mask, out_proj_weight, query_bias, key_bias, value_bias, out_proj_bias, dropout_mask, attn_head_num, attn_dim_per_head, src_len, tgt_len, dropout_prob, 
softmax_use_float)" }, - "torch_npu.npu_nms_rotated": { - "signature": "(*args, **kwargs)" - }, "torch_npu.npu_nms_v4": { "signature": "(self, scores, max_output_size, iou_threshold, scores_threshold, pad_to_max_output_size=False)" }, @@ -2609,9 +2384,6 @@ "torch_npu.npu_ps_roi_pooling": { "signature": "(self, rois, spatial_scale, group_size, output_dim)" }, - "torch_npu.npu_ptiou": { - "signature": "(bboxes, gtboxes, mode=0)" - }, "torch_npu.npu_quant_matmul": { "signature": "(*args, **kwargs)" }, @@ -2621,9 +2393,6 @@ "torch_npu.npu_random_choice_with_mask": { "signature": "(*args, **kwargs)" }, - "torch_npu.npu_reshape": { - "signature": "(self, shape, can_refresh=False, out=None)" - }, "torch_npu.npu_rms_norm": { "signature": "(self, gamma, epsilon=1e-06)" }, @@ -2654,18 +2423,12 @@ "torch_npu.npu_sign_bits_unpack": { "signature": "(inputs, size, dtype)" }, - "torch_npu.npu_silu": { - "signature": "(self)" - }, "torch_npu.npu_slice": { "signature": "(self, offsets, size)" }, "torch_npu.npu_softmax_cross_entropy_with_logits": { "signature": "(self, labels)" }, - "torch_npu.npu_sort_v2": { - "signature": "(self, dim=-1, descending=False, out=None)" - }, "torch_npu.npu_swiglu": { "signature": "(*args, **kwargs)" }, @@ -2786,15 +2549,6 @@ "func: get_npu_format": { "signature": "(Tensor self) -> int" }, - "func: empty_with_format": { - "signature": "(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor" - }, - "func: empty_with_format.names": { - "signature": "(int[] size, Dimname[]? names, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor" - }, - "func: copy_memory_": { - "signature": "(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)" - }, "func: copy_": { "signature": "" }, diff --git a/torch_npu/contrib/__init__.py b/torch_npu/contrib/__init__.py index a4ba757d49..faa330ed0d 100644 --- a/torch_npu/contrib/__init__.py +++ b/torch_npu/contrib/__init__.py @@ -9,11 +9,6 @@ from .module import ChannelShuffle, Prefetcher, LabelSmoothingCrossEntropy, ROIA __all__ = [ # from function - "npu_iou", - "npu_ptiou", - "npu_giou", - "npu_diou", - "npu_ciou", "npu_multiclass_nms", "npu_batched_multiclass_nms", "npu_single_level_responsible_flags", @@ -32,13 +27,8 @@ __all__ = [ "Prefetcher", "LabelSmoothingCrossEntropy", "ROIAlign", - "DCNv2", "ModulatedDeformConv", - "Mish", - "BiLSTM", "PSROIPool", - "SiLU", - "Swish", "NpuFairseqDropout", "NpuCachedDropout", "MultiheadAttention", @@ -46,6 +36,5 @@ __all__ = [ "Focus", "LinearA8W8Quant", "LinearQuant", - "FusedColorJitter", "LinearWeightQuant", ] diff --git a/torch_npu/contrib/function/__init__.py b/torch_npu/contrib/function/__init__.py index 9df0bb7fe4..1b0e61beab 100644 --- a/torch_npu/contrib/function/__init__.py +++ b/torch_npu/contrib/function/__init__.py @@ -10,11 +10,6 @@ from .fused_attention import npu_fused_attention_with_layernorm, npu_fused_atten from .npu_functional import dropout_with_byte_mask __all__ = [ - "npu_iou", - "npu_ptiou", - "npu_giou", - "npu_diou", - "npu_ciou", "npu_multiclass_nms", "npu_batched_multiclass_nms", "npu_single_level_responsible_flags", diff --git a/torch_npu/contrib/function/iou.py b/torch_npu/contrib/function/iou.py index e79513fa12..03f584e76e 100644 --- a/torch_npu/contrib/function/iou.py +++ b/torch_npu/contrib/function/iou.py @@ -1,9 +1,12 @@ +__all__ = [] + +import warnings + import torch import torch_npu from 
torch_npu.utils._error_code import ErrCode, ops_error - -__all__ = ['npu_iou', 'npu_ptiou', 'npu_giou', 'npu_diou', 'npu_ciou'] +warnings.filterwarnings(action='once') def _box_dtype_check(box): @@ -52,6 +55,8 @@ def npu_iou(boxes1, Returns: Tensor: IoU, sized [N,M]. """ + warnings.warn("torch_npu.contrib.npu_iou is deprecated. " + "Please use torch_npu.npu_iou or torch_npu.npu_ptiou for replacement.", FutureWarning) if mode not in ["iou", "ptiou"]: raise ValueError("Expected mode in [iou, ptiou]" + ops_error(ErrCode.VALUE)) @@ -114,6 +119,8 @@ def npu_giou(boxes1, Returns: Tensor: IoU, sized [n, 1]. """ + warnings.warn("torch_npu.contrib.npu_giou is deprecated. " + "Please use torch_npu.npu_giou for replacement.", FutureWarning) if boxes1.shape != boxes2.shape: raise ValueError("Expected boxes1.shape == boxes2.shape" + ops_error(ErrCode.VALUE)) @@ -173,6 +180,8 @@ def npu_diou(boxes1, Returns: Tensor: IoU, sized [1, n]. """ + warnings.warn("torch_npu.contrib.function.npu_diou is deprecated. " + "Please use torch_npu.npu_diou for replacement.", FutureWarning) out = torch_npu.npu_diou(boxes1, boxes2, trans, is_cross, mode) @@ -224,6 +233,8 @@ def npu_ciou(boxes1, Tensor: IoU, sized [1, n]. """ + warnings.warn("torch_npu.contrib.function.npu_ciou is deprecated. " + "Please use torch_npu.npu_ciou for replacement.", FutureWarning) out = torch_npu.npu_ciou(boxes1, boxes2, trans, is_cross, mode, True) diff --git a/torch_npu/contrib/module/__init__.py b/torch_npu/contrib/module/__init__.py index 58c8bec44b..e51cd3e4b4 100644 --- a/torch_npu/contrib/module/__init__.py +++ b/torch_npu/contrib/module/__init__.py @@ -24,17 +24,11 @@ __all__ = [ "Prefetcher", "LabelSmoothingCrossEntropy", "ROIAlign", - "DCNv2", "ModulatedDeformConv", - "Mish", - "BiLSTM", "PSROIPool", - "SiLU", - "Swish", "NpuFairseqDropout", "NpuCachedDropout", "MultiheadAttention", - "FusedColorJitter", "NpuDropPath", "Focus", "LinearA8W8Quant", diff --git a/torch_npu/contrib/module/activations.py b/torch_npu/contrib/module/activations.py index 91faa93944..c5fc5bc5ea 100644 --- a/torch_npu/contrib/module/activations.py +++ b/torch_npu/contrib/module/activations.py @@ -1,7 +1,11 @@ +import warnings + import torch import torch.nn as nn import torch_npu +warnings.filterwarnings(action='once') + class Mish(nn.Module): def __init__(self): @@ -21,6 +25,9 @@ class Mish(nn.Module): >>> output = m(input_tensor) """ super(Mish, self).__init__() + + warnings.warn("torch_npu.contrib.module.Mish is deprecated. " + "Please use torch.nn.Mish for replacement.", FutureWarning) def forward(self, x): x = torch_npu.npu_mish(x) @@ -41,6 +48,9 @@ class SiLU(nn.Module): >>> output = m(input_tensor) """ super(SiLU, self).__init__() + + warnings.warn("torch_npu.contrib.module.SiLU is deprecated. " + "Please use torch.nn.SiLU for replacement.", FutureWarning) def forward(self, x): x = torch_npu.npu_silu(x) diff --git a/torch_npu/contrib/module/bidirectional_lstm.py b/torch_npu/contrib/module/bidirectional_lstm.py index 9ef9efa774..5ff78d99df 100644 --- a/torch_npu/contrib/module/bidirectional_lstm.py +++ b/torch_npu/contrib/module/bidirectional_lstm.py @@ -1,6 +1,10 @@ +import warnings + import torch import torch_npu +warnings.filterwarnings(action='once') + class BiLSTM(torch.nn.Module): r"""Applies an NPU compatible bidirectional LSTM operation to an input @@ -59,6 +63,8 @@ class BiLSTM(torch.nn.Module): def __init__(self, input_size, hidden_size): super(BiLSTM, self).__init__() + warnings.warn("torch_npu.contrib.BiLSTM is deprecated. 
" + "Please check document for replacement.", FutureWarning) self.fw_rnn = torch.nn.LSTM(input_size, hidden_size, bidirectional=False) self.bw_rnn = torch.nn.LSTM(input_size, hidden_size, bidirectional=False) diff --git a/torch_npu/contrib/module/deform_conv.py b/torch_npu/contrib/module/deform_conv.py index 106221fff2..2b20be597f 100644 --- a/torch_npu/contrib/module/deform_conv.py +++ b/torch_npu/contrib/module/deform_conv.py @@ -8,7 +8,6 @@ import torch_npu __all__ = [ "ModulatedDeformConv2dFunction", "ModulatedDeformConv", - "DCNv2" ] diff --git a/torch_npu/contrib/module/fusedcolorjitter.py b/torch_npu/contrib/module/fusedcolorjitter.py index 2e73c53355..de45694e1a 100644 --- a/torch_npu/contrib/module/fusedcolorjitter.py +++ b/torch_npu/contrib/module/fusedcolorjitter.py @@ -1,3 +1,6 @@ +__all__ = [] + +import warnings import random from math import sin, cos, pi import numbers @@ -6,9 +9,7 @@ import torch from torch_npu.utils._error_code import ErrCode, ops_error -__all__ = [ - "FusedColorJitter" -] +warnings.filterwarnings(action='once') class _FusedColorJitterApply(object): @@ -122,6 +123,8 @@ class FusedColorJitter(torch.nn.Module): def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): super().__init__() + warnings.warn("torch_npu.contrib.module.FusedColorJitter is deprecated. " + "Please use torchvision.transforms.ColorJitter for replacement.", FutureWarning) self.brightness = self._check_input(brightness, 'brightness') self.contrast = self._check_input(contrast, 'contrast') self.saturation = self._check_input(saturation, 'saturation') diff --git a/torch_npu/csrc/aten/npu_native_functions.yaml b/torch_npu/csrc/aten/npu_native_functions.yaml index edbd46750b..977e6e147c 100644 --- a/torch_npu/csrc/aten/npu_native_functions.yaml +++ b/torch_npu/csrc/aten/npu_native_functions.yaml @@ -73,17 +73,14 @@ custom: - func: empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_format - exposed: True - func: unsafe_empty_with_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2, bool keep_format=False) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_format - func: empty_with_format.names(int[] size, Dimname[]? names, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, int acl_format=2) -> Tensor dispatch: CompositeExplicitAutograd: empty_with_format - exposed: True - func: copy_memory_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) 
device_check: NoCheck - exposed: True - func: get_storage_size(Tensor self) -> int - func: npu_format_cast(Tensor self, int acl_format) -> Tensor exposed: True -- Gitee From caac02023c783e88a254f41778e95f4f9730b0bd Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Thu, 3 Apr 2025 08:04:52 +0000 Subject: [PATCH 283/358] !19975 Fix ut and prevent overflow Merge pull request !19975 from yuhaiyan/v2.6.0-dev1 --- test/npu/test_cann_version.py | 3 --- torch_npu/csrc/core/npu/GetCANNInfo.cpp | 6 +++++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/npu/test_cann_version.py b/test/npu/test_cann_version.py index 3fbfea312b..76925b9b44 100644 --- a/test/npu/test_cann_version.py +++ b/test/npu/test_cann_version.py @@ -17,9 +17,6 @@ class TestCANNversion(TestCase): or re.match("([0-9]+).([0-9]+).T([0-9]+)", version) or re.match("([0-9]+).([0-9]+).RC([0-9]+).alpha([0-9]+)", version)) self.assertTrue(is_match, f"The env version is {version_env}. The format of cann version {version} is invalid.") - else: - self.assertEqual(version, "") - def test_compare_cann_version(self): version_env = get_cann_version_from_env() diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index ab3f12d5f1..0f03168717 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -53,7 +53,11 @@ double VersionToNum(std::string versionStr) return 0.0; } - double num = ((major + 1) * 100000000) + ((minor + 1) * 1000000) + ((release + 1) * 10000) + ((RCVersion + 1) * 100 + 5000) + ((TVersion + 1) * 100) - (100 - alphaVersion); + double num = ((static_cast(major) + 1.0) * 100000000) + + ((static_cast(minor) + 1.0) * 1000000) + + ((static_cast(release) + 1.0) * 10000) + + ((static_cast(RCVersion) + 1.0) * 100 + 5000) + + ((static_cast(TVersion) + 1) * 100) - (100 - static_cast(alphaVersion)); return num; } -- Gitee From 5db2b0fd480082db6c4cea4b4043ee8ed50fcf91 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 3 Apr 2025 08:40:02 +0000 Subject: [PATCH 284/358] !20001 Update op_plugin commit id Merge pull request !20001 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 0bf41f45be..2c4c1e647f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 0bf41f45be142847db0bad59ca72d84c1ed5e547 +Subproject commit 2c4c1e647f2eba7ddbf764c6e12f1bcebebb2ca6 -- Gitee From 80c6ae4c1390fa6cf536d0c591e98e7b12ec5ab4 Mon Sep 17 00:00:00 2001 From: yuhaiyan Date: Thu, 3 Apr 2025 12:54:03 +0000 Subject: [PATCH 285/358] !20012 Prevent overflow Merge pull request !20012 from yuhaiyan/v2.6.0-dev1 --- torch_npu/csrc/core/npu/GetCANNInfo.cpp | 60 ++++++++++++------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/torch_npu/csrc/core/npu/GetCANNInfo.cpp b/torch_npu/csrc/core/npu/GetCANNInfo.cpp index 0f03168717..c009465e4e 100644 --- a/torch_npu/csrc/core/npu/GetCANNInfo.cpp +++ b/torch_npu/csrc/core/npu/GetCANNInfo.cpp @@ -22,42 +22,42 @@ std::unordered_map packageNameMap = { {"DRIVER", ACL_PKG_NAME_DRIVER} }; -double VersionToNum(std::string versionStr) +int64_t VersionToNum(std::string versionStr) { std::smatch results; - int major = -1; - int minor = -1; - int release = -1; - int RCVersion = -51; - int TVersion = -1; - int alphaVersion = 0; + int64_t major = -1; + int64_t minor = -1; + int64_t release = -1; + int64_t RCVersion = -51; + int64_t TVersion = -1; + int64_t alphaVersion = 0; 
if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+)"))) { - major = stoi(results[kVersionIndex1]); - minor = stoi(results[kVersionIndex2]); - RCVersion = stoi(results[kVersionIndex3]); + major = stoll(results[kVersionIndex1]); + minor = stoll(results[kVersionIndex2]); + RCVersion = stoll(results[kVersionIndex3]); } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).([0-9]+)"))) { - major = stoi(results[kVersionIndex1]); - minor = stoi(results[kVersionIndex2]); - release = stoi(results[kVersionIndex3]); + major = stoll(results[kVersionIndex1]); + minor = stoll(results[kVersionIndex2]); + release = stoll(results[kVersionIndex3]); } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).T([0-9]+)"))) { - major = stoi(results[kVersionIndex1]); - minor = stoi(results[kVersionIndex2]); - TVersion = stoi(results[kVersionIndex3]); + major = stoll(results[kVersionIndex1]); + minor = stoll(results[kVersionIndex2]); + TVersion = stoll(results[kVersionIndex3]); } else if (std::regex_match(versionStr, results, std::regex("([0-9]+).([0-9]+).RC([0-9]+).alpha([0-9]+)"))) { - major = stoi(results[kVersionIndex1]); - minor = stoi(results[kVersionIndex2]); - RCVersion = stoi(results[kVersionIndex3]); - alphaVersion = stoi(results[kVersionIndex4]); + major = stoll(results[kVersionIndex1]); + minor = stoll(results[kVersionIndex2]); + RCVersion = stoll(results[kVersionIndex3]); + alphaVersion = stoll(results[kVersionIndex4]); } else { TORCH_NPU_WARN_ONCE("Version: " + versionStr + " is invalid."); - return 0.0; + return 0; } - double num = ((static_cast(major) + 1.0) * 100000000) + - ((static_cast(minor) + 1.0) * 1000000) + - ((static_cast(release) + 1.0) * 10000) + - ((static_cast(RCVersion) + 1.0) * 100 + 5000) + - ((static_cast(TVersion) + 1) * 100) - (100 - static_cast(alphaVersion)); + int64_t num = ((major + 1) * 100000000) + + ((minor + 1) * 1000000) + + ((release + 1) * 10000) + + ((RCVersion + 1) * 100 + 5000) + + ((TVersion + 1) * 100) - (100 - alphaVersion); return num; } @@ -150,8 +150,8 @@ bool IsGteCANNVersion(const std::string version, const std::string module) TORCH_CHECK(false, "When the version " + version + " is less than \"8.1.RC1\", this function is not supported.", PTA_ERROR(ErrCode::VALUE)); } std::string currentVersion = GetCANNVersion(module); - double current_num = VersionToNum(currentVersion); - double boundary_num = VersionToNum(version); + int64_t current_num = VersionToNum(currentVersion); + int64_t boundary_num = VersionToNum(version); if (current_num >= boundary_num) { return true; } else { @@ -165,8 +165,8 @@ bool IsGteDriverVersion(const std::string driverVersion) // The result of this function will be false, even if current driver version meets the requirement. 
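For reference, the overflow fixes above keep the same field weighting: a version string is folded into one comparable integer, and fields a given string does not carry keep sentinel defaults so their terms collapse to constants. A minimal Python sketch of the same arithmetic (the function name and worked values are illustrative, not part of the patch):

def version_to_num(major=-1, minor=-1, release=-1, rc=-51, t=-1, alpha=0):
    # Same weights as the int64_t formula in GetCANNInfo.cpp; unmatched fields
    # keep these sentinel defaults, so their terms reduce to fixed constants.
    return ((major + 1) * 100000000 + (minor + 1) * 1000000
            + (release + 1) * 10000 + ((rc + 1) * 100 + 5000)
            + (t + 1) * 100 - (100 - alpha))

print(version_to_num(major=8, minor=1, rc=1))       # "8.1.RC1" -> 902005100
print(version_to_num(major=8, minor=1, release=0))  # "8.1.0"   -> 902009900, sorts after 8.1.RC1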
const static std::string baseCANNVersion = "8.1.RC1"; std::string currentCANNVersion = GetCANNVersion("CANN"); - double currentCannNum = VersionToNum(currentCANNVersion); - double boundaryCannNum = VersionToNum(baseCANNVersion); + int64_t currentCannNum = VersionToNum(currentCANNVersion); + int64_t boundaryCannNum = VersionToNum(baseCANNVersion); if (currentCannNum < boundaryCannNum) { TORCH_CHECK(false, "When the cann version is less than \"8.1.RC1\", this function is not supported.", PTA_ERROR(ErrCode::VALUE)); -- Gitee From 37b62bd5c39023bf75c74f377cffe7a3173aa7af Mon Sep 17 00:00:00 2001 From: PapayaMilkZ Date: Mon, 7 Apr 2025 01:11:24 +0000 Subject: [PATCH 286/358] !20024 add grouped_matmul_finalize_routing Merge pull request !20024 from PapayaMilkZ/v2.6.0 --- test/allowlist_for_publicAPI.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index eb7e1dff1c..2db698611c 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -2859,7 +2859,8 @@ "npu_moe_gating_top_k_softmax", "npu_moe_init_routing", "npu_group_norm_swish", - "npu_mrope" + "npu_mrope", + "npu_grouped_matmul_finalize_routing" ], "torch_npu.contrib": [ "npu_fused_attention_with_layernorm", -- Gitee From afeccb2d14fc8ae5c9766d0e2ad76d84ca188751 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 7 Apr 2025 02:09:27 +0000 Subject: [PATCH 287/358] !20037 Update op_plugin commit id Merge pull request !20037 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 2c4c1e647f..79059acc07 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 2c4c1e647f2eba7ddbf764c6e12f1bcebebb2ca6 +Subproject commit 79059acc07997fa2b33d1cc529ea425f82835704 -- Gitee From f9d9feb9156f4937302f1a34b5c782c0870e61f2 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Mon, 7 Apr 2025 06:29:29 +0000 Subject: [PATCH 288/358] !19970 Update torchair commit id Merge pull request !19970 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 25de07e4bb..9faf65c70e 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 25de07e4bb4f0bdec418099465f6b8e28fade989 +Subproject commit 9faf65c70e9fdd8d5e1afc4d97718c73e812a2c0 -- Gitee From 23ebfb801594e01b693a084817e1e163990afc86 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 7 Apr 2025 13:55:07 +0000 Subject: [PATCH 289/358] !20083 Update op_plugin commit id Merge pull request !20083 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 79059acc07..0c4620c1b1 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 79059acc07997fa2b33d1cc529ea425f82835704 +Subproject commit 0c4620c1b16b3be6e8531934df254bb1de22c7ea -- Gitee From d0e5d7b3699e23d2c17b5ffb15432b23308c9d69 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 8 Apr 2025 08:40:08 +0000 Subject: [PATCH 290/358] !20105 Update op_plugin commit id Merge pull request !20105 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 
0c4620c1b1..7296c1ba6a 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 0c4620c1b16b3be6e8531934df254bb1de22c7ea +Subproject commit 7296c1ba6ac85d3c9aee2acfbf98b69715439ff9 -- Gitee From 33ad23905b7160b0cf5be9b68875babec1de9392 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 8 Apr 2025 08:40:09 +0000 Subject: [PATCH 291/358] !20105 Update op_plugin commit id Merge pull request !20105 from pta-robot/v2.6.0 -- Gitee From 60be4a5fa87ef7672d0f964543dc4199c5e88a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Tue, 8 Apr 2025 11:48:53 +0000 Subject: [PATCH 292/358] =?UTF-8?q?!20074=20SetThreadAffinity=20in=20Initi?= =?UTF-8?q?alize=20Merge=20pull=20request=20!20074=20from=20=E5=A7=9C?= =?UTF-8?q?=E6=80=A1=E6=96=87/v2.6.0=5Fbc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp index 843f6606f6..42d80b0792 100644 --- a/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp +++ b/torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.cpp @@ -191,6 +191,8 @@ NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) GetAffinityInfo(); + SetThreadAffinity(device_id_); + init_flag_ = true; ASCEND_LOGD("Npu sys ctrl initialize successfully."); -- Gitee From fa4c9fed1292b240504672e23c738a777eabcf7d Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 8 Apr 2025 14:10:06 +0000 Subject: [PATCH 293/358] !20137 Update op_plugin commit id Merge pull request !20137 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 7296c1ba6a..815a2964cb 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 7296c1ba6ac85d3c9aee2acfbf98b69715439ff9 +Subproject commit 815a2964cb5055b731c15359dcdf6e6aa1056927 -- Gitee From 76e3b4cb29fbd5e672d9f8f8e7af10a719f80821 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Tue, 8 Apr 2025 14:10:42 +0000 Subject: [PATCH 294/358] !20122 not getdevice repeatedly when already set device Merge pull request !20122 from jiangpengfei/v2.6.0 --- torch_npu/csrc/core/npu/NPUFunctions.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch_npu/csrc/core/npu/NPUFunctions.cpp b/torch_npu/csrc/core/npu/NPUFunctions.cpp index d9c1944978..eb34b3c925 100644 --- a/torch_npu/csrc/core/npu/NPUFunctions.cpp +++ b/torch_npu/csrc/core/npu/NPUFunctions.cpp @@ -187,6 +187,10 @@ int ExchangeDevice(int device) bool IsContextInitialized() { + if (local_device >= 0) { + return true; + } + int32_t device = -1; aclError err = aclrtGetDevice(&device); if (err == ACL_ERROR_NONE) { -- Gitee From 19e4306f3b8d1fcaa766327743147cb4e3cef78d Mon Sep 17 00:00:00 2001 From: wangzixuan <617225691@qq.com> Date: Tue, 8 Apr 2025 14:51:02 +0000 Subject: [PATCH 295/358] !20079 feature manager add v2 func Merge pull request !20079 from wangzixuan/dev-2.6.0 --- .../framework/interface/MsProfilerInterface.cpp | 15 +++++++++++---- torch_npu/csrc/profiler/feature_mgr.cpp | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp index 4dacd70fb1..2d427b78dc 100644 --- a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp +++ 
b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp @@ -18,6 +18,7 @@ REGISTER_LIBRARY(libmsprofiler) LOAD_FUNCTION(aclprofWarmup) LOAD_FUNCTION(aclprofSetConfig) LOAD_FUNCTION(aclprofGetSupportedFeatures) +LOAD_FUNCTION(aclprofGetSupportedFeaturesV2) LOAD_FUNCTION(aclprofMarkEx) aclError AclProfilingWarmup(const aclprofConfig *profilerConfig) @@ -52,12 +53,18 @@ aclError AclprofGetSupportedFeatures(size_t* featuresSize, void** featuresData) typedef aclError(*AclprofGetSupportedFeaturesFunc)(size_t*, void**); static AclprofGetSupportedFeaturesFunc func = nullptr; if (func == nullptr) { + func = (AclprofGetSupportedFeaturesFunc)GET_FUNC(aclprofGetSupportedFeaturesV2); + } + if (func != nullptr) { + return func(featuresSize, featuresData); + } else { func = (AclprofGetSupportedFeaturesFunc)GET_FUNC(aclprofGetSupportedFeatures); - if (func == nullptr) { - return ACL_ERROR_PROF_MODULES_UNSUPPORTED; - } } - return func(featuresSize, featuresData); + + if (func != nullptr) { + return func(featuresSize, featuresData); + } + return ACL_ERROR_PROF_MODULES_UNSUPPORTED; } aclError AclProfilingMarkEx(const char *msg, size_t msgLen, aclrtStream stream) diff --git a/torch_npu/csrc/profiler/feature_mgr.cpp b/torch_npu/csrc/profiler/feature_mgr.cpp index 242407cf84..a7220f940d 100644 --- a/torch_npu/csrc/profiler/feature_mgr.cpp +++ b/torch_npu/csrc/profiler/feature_mgr.cpp @@ -8,7 +8,7 @@ namespace torch_npu { namespace profiler { namespace { -const static char* VERSION = "master\0"; +const static char* VERSION = "2.6.0\0"; static std::unordered_map NAME_TABLE = { {"ATTR", FeatureType::FEATURE_ATTR}, -- Gitee From 2dc237d6d006ba2e62df610334a2ca965f118540 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 8 Apr 2025 22:15:13 +0000 Subject: [PATCH 296/358] !20092 Update torchair commit id Merge pull request !20092 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 9faf65c70e..55cc6ded85 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 9faf65c70e9fdd8d5e1afc4d97718c73e812a2c0 +Subproject commit 55cc6ded85a78dc248fa163856135b8852e45352 -- Gitee From 37ee617bfacc814d1c76bd5d2061db04bd3d2227 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 9 Apr 2025 05:10:07 +0000 Subject: [PATCH 297/358] !20160 Update op_plugin commit id Merge pull request !20160 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 815a2964cb..8afefb8daa 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 815a2964cb5055b731c15359dcdf6e6aa1056927 +Subproject commit 8afefb8daafe36efff4c8248b01f51123001fcf3 -- Gitee From 3158aa59d938adfa9fb9740cd43e988bbc64be4d Mon Sep 17 00:00:00 2001 From: pta-robot Date: Wed, 9 Apr 2025 11:10:07 +0000 Subject: [PATCH 298/358] !20177 Update op_plugin commit id Merge pull request !20177 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 8afefb8daa..857b87b81f 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 8afefb8daafe36efff4c8248b01f51123001fcf3 +Subproject commit 857b87b81f2833796ccaa72909747b72ac0cffdb -- Gitee From 9673a4170fb0bfde8beca36a81590481d16cd72e Mon Sep 17 00:00:00 
2001 From: torchair_robot Date: Thu, 10 Apr 2025 02:24:27 +0000 Subject: [PATCH 299/358] !20183 Update torchair commit id Merge pull request !20183 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 55cc6ded85..7f5361c18d 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 55cc6ded85a78dc248fa163856135b8852e45352 +Subproject commit 7f5361c18d400b227ea4b76403e739e5b474d6b5 -- Gitee From b80ab3b9f342d27b0a8f4ba496653189d9d47715 Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Thu, 10 Apr 2025 02:30:20 +0000 Subject: [PATCH 300/358] !20188 update OWNERS. Merge pull request !20188 from jiangpengfei/v2.6.0 --- OWNERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/OWNERS b/OWNERS index 9b52a18683..7aa7ec4ab8 100644 --- a/OWNERS +++ b/OWNERS @@ -13,6 +13,7 @@ approvers: - sunboquan - wangchao147 - yanpengquan07 +- kaixin1976 reviewers: - xiaxia3 - ascendzyj @@ -47,3 +48,4 @@ reviewers: - guo-guanghao - yuhaiyan - wangchao147 +- insanecoder -- Gitee From e8e9e17fa139c43a683afe8b1e0ef70efee601cf Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Apr 2025 04:40:07 +0000 Subject: [PATCH 301/358] !20199 Update op_plugin commit id Merge pull request !20199 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 857b87b81f..646ba86530 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 857b87b81f2833796ccaa72909747b72ac0cffdb +Subproject commit 646ba86530531be83685c68257c35fdc8c5c21ea -- Gitee From 9d4be0aa59e3a94b95366860d6ca8fa929beb786 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Apr 2025 08:55:08 +0000 Subject: [PATCH 302/358] !20214 Update op_plugin commit id Merge pull request !20214 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 646ba86530..27f697e72d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 646ba86530531be83685c68257c35fdc8c5c21ea +Subproject commit 27f697e72def188ba8713ad107967a61cdc10035 -- Gitee From 453e982ecebad2ab2587801a0c9d3a6e9277f254 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Apr 2025 10:10:08 +0000 Subject: [PATCH 303/358] !20229 Update op_plugin commit id Merge pull request !20229 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 27f697e72d..5509be3469 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 27f697e72def188ba8713ad107967a61cdc10035 +Subproject commit 5509be3469c12252fab8ce955bd8296cb3a66945 -- Gitee From 67143e55dfc0033a6246b5d7a5f8b53e0c8f2b18 Mon Sep 17 00:00:00 2001 From: wangzixuan <617225691@qq.com> Date: Thu, 10 Apr 2025 13:25:50 +0000 Subject: [PATCH 304/358] !20144 feature manager func Merge pull request !20144 from wangzixuan/dev-2.6.0 --- .../csrc/framework/interface/MsProfilerInterface.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp index 2d427b78dc..8d7e6f179d 100644 --- 
a/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp +++ b/torch_npu/csrc/framework/interface/MsProfilerInterface.cpp @@ -54,11 +54,9 @@ aclError AclprofGetSupportedFeatures(size_t* featuresSize, void** featuresData) static AclprofGetSupportedFeaturesFunc func = nullptr; if (func == nullptr) { func = (AclprofGetSupportedFeaturesFunc)GET_FUNC(aclprofGetSupportedFeaturesV2); - } - if (func != nullptr) { - return func(featuresSize, featuresData); - } else { - func = (AclprofGetSupportedFeaturesFunc)GET_FUNC(aclprofGetSupportedFeatures); + if (func == nullptr) { + func = (AclprofGetSupportedFeaturesFunc)GET_FUNC(aclprofGetSupportedFeatures); + } } if (func != nullptr) { -- Gitee From 6d95fb5546af0a29b899e2d24a854d0ee4d05c96 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Thu, 10 Apr 2025 16:25:08 +0000 Subject: [PATCH 305/358] !20252 Update op_plugin commit id Merge pull request !20252 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 5509be3469..f851cda871 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 5509be3469c12252fab8ce955bd8296cb3a66945 +Subproject commit f851cda871c9bd783f85ee09845281a480f868dc -- Gitee From 79e0aa1c0e9cb302fc05db6aa60e391622ad37e8 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Thu, 10 Apr 2025 22:23:00 +0000 Subject: [PATCH 306/358] !20247 Update torchair commit id Merge pull request !20247 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 7f5361c18d..bafb906ce0 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 7f5361c18d400b227ea4b76403e739e5b474d6b5 +Subproject commit bafb906ce04416fced63effab71002863f8f21f9 -- Gitee From 569aa849fde7ec64be33e9765c0150a96d7b7e3d Mon Sep 17 00:00:00 2001 From: hhz886 Date: Fri, 11 Apr 2025 08:00:15 +0000 Subject: [PATCH 307/358] =?UTF-8?q?!20050=20=E3=80=90Profiler=E3=80=91safe?= =?UTF-8?q?=20fix=20Merge=20pull=20request=20!20050=20from=20hhz886/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/profiler/init.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/profiler/init.cpp b/torch_npu/csrc/profiler/init.cpp index b871f2d991..0d18f1a70d 100644 --- a/torch_npu/csrc/profiler/init.cpp +++ b/torch_npu/csrc/profiler/init.cpp @@ -132,8 +132,8 @@ PyObject* THNPModule_rangeStartOnHost(PyObject* _unused, PyObject* args) PyObject* THNPModule_rangeEnd(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS - mstxRangeId rangeId; - if (!PyArg_ParseTuple(args, "k", &rangeId)) { + int rangeId; + if (!PyArg_ParseTuple(args, "i", &rangeId)) { return nullptr; } mstxRangeEnd(rangeId); -- Gitee From 17fb01f3ad92bc61fd98368946e710157574e54f Mon Sep 17 00:00:00 2001 From: pta-robot Date: Sat, 12 Apr 2025 09:10:12 +0000 Subject: [PATCH 308/358] !20286 Update op_plugin commit id Merge pull request !20286 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index f851cda871..7466dc7950 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit f851cda871c9bd783f85ee09845281a480f868dc +Subproject commit 7466dc7950532eabaf24355a60e07250e06690d5 
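Combined with the earlier change to the same function, the resolution order now amounts to: prefer aclprofGetSupportedFeaturesV2, fall back to aclprofGetSupportedFeatures, and report the profiler modules as unsupported only when neither symbol resolves. A rough Python sketch of that lookup order (illustrative only; get_func stands in for GET_FUNC and is not a real API):

def resolve_features_query(get_func):
    # Prefer the V2 entry point, fall back to the original one.
    func = get_func("aclprofGetSupportedFeaturesV2")
    if func is None:
        func = get_func("aclprofGetSupportedFeatures")
    # A None result corresponds to ACL_ERROR_PROF_MODULES_UNSUPPORTED.
    return func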
-- Gitee From 65a39dbd5f37caf53ec8543145bf2226f36363d6 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Sun, 13 Apr 2025 22:08:24 +0000 Subject: [PATCH 309/358] !20278 Update torchair commit id Merge pull request !20278 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index bafb906ce0..9d6c8350f4 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit bafb906ce04416fced63effab71002863f8f21f9 +Subproject commit 9d6c8350f4cb27b8358b6250b64b3e3246fc6b44 -- Gitee From 58df34437b2620f92621353ba1c60efb61859683 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Apr 2025 04:55:12 +0000 Subject: [PATCH 310/358] !20303 Update op_plugin commit id Merge pull request !20303 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 7466dc7950..0371cfc456 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 7466dc7950532eabaf24355a60e07250e06690d5 +Subproject commit 0371cfc456ad6c34e9e8d6338ed163a845d017e9 -- Gitee From 5805d004b62ab78d4c31dbfcd08f057e608ae01e Mon Sep 17 00:00:00 2001 From: pta-robot Date: Mon, 14 Apr 2025 10:10:12 +0000 Subject: [PATCH 311/358] !20314 Update op_plugin commit id Merge pull request !20314 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 0371cfc456..ae0459ab53 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 0371cfc456ad6c34e9e8d6338ed163a845d017e9 +Subproject commit ae0459ab53cc6441c759f7a72766ec8fbe82925d -- Gitee From a5b615dfbf462f0e7789022a6e8ecb079c287290 Mon Sep 17 00:00:00 2001 From: xudaohong Date: Mon, 14 Apr 2025 11:28:41 +0000 Subject: [PATCH 312/358] !20262 [feat] aclGraph task group Merge pull request !20262 from xudaohong/v2.6.0 --- test/torch_npu_schema.json | 24 +++++++ third_party/acl/inc/acl/acl_base.h | 1 + third_party/acl/inc/acl/acl_mdl.h | 38 +++++++++++ third_party/acl/inc/acl/acl_rt.h | 1 + torch_npu/csrc/core/npu/NPUEvent.cpp | 13 ++++ torch_npu/csrc/core/npu/NPUEvent.h | 1 + torch_npu/csrc/core/npu/NPUGraph.cpp | 24 +++++++ torch_npu/csrc/core/npu/NPUGraph.h | 9 +++ .../csrc/core/npu/interface/AclInterface.cpp | 65 +++++++++++++++++-- .../csrc/core/npu/interface/AclInterface.h | 8 +++ torch_npu/csrc/npu/Event.cpp | 23 ++++++- torch_npu/csrc/npu/Graph.cpp | 22 ++++++- torch_npu/csrc/npu/Stream.cpp | 9 +++ torch_npu/csrc/npu/Stream.h | 2 + torch_npu/npu/__init__.py | 13 +++- torch_npu/npu/graphs.py | 24 +++++++ torch_npu/npu/streams.py | 65 ++++++++++++++++++- 17 files changed, 329 insertions(+), 13 deletions(-) diff --git a/test/torch_npu_schema.json b/test/torch_npu_schema.json index 1415732141..c8ea4752e5 100644 --- a/test/torch_npu_schema.json +++ b/test/torch_npu_schema.json @@ -743,6 +743,30 @@ "torch_npu.npu.Event.synchronize": { "signature": "(self)" }, + "torch_npu.npu.ExternalEvent": { + "signature": "()" + }, + "torch_npu.npu.ExternalEvent.record": { + "signature": "(self, stream=None)" + }, + "torch_npu.npu.ExternalEvent.wait": { + "signature": "(self, stream=None)" + }, + "torch_npu.npu.ExternalEvent.reset": { + "signature": "(self, stream=None)" + }, + "torch_npu.npu.graph_task_group_begin": { + "signature": "(stream)" + }, + 
"torch_npu.npu.graph_task_group_end": { + "signature": "(stream)" + }, + "torch_npu.npu.graph_task_update_begin": { + "signature": "(stream, handle)" + }, + "torch_npu.npu.graph_task_update_end": { + "signature": "(stream)" + }, "torch_npu.npu.FloatStorage": { "signature": "(*args, wrap_storage=None, dtype=None, device=None, _internal=False)" }, diff --git a/third_party/acl/inc/acl/acl_base.h b/third_party/acl/inc/acl/acl_base.h index 9780a01f8f..749e1d3c77 100755 --- a/third_party/acl/inc/acl/acl_base.h +++ b/third_party/acl/inc/acl/acl_base.h @@ -56,6 +56,7 @@ typedef void *aclrtAllocatorDesc; typedef void *aclrtAllocator; typedef void *aclrtAllocatorBlock; typedef void *aclrtAllocatorAddr; +typedef void *aclrtTaskGrp; static const int ACL_ERROR_NONE = 0; static const int ACL_SUCCESS = 0; diff --git a/third_party/acl/inc/acl/acl_mdl.h b/third_party/acl/inc/acl/acl_mdl.h index 45a36898ef..f13950ab85 100755 --- a/third_party/acl/inc/acl/acl_mdl.h +++ b/third_party/acl/inc/acl/acl_mdl.h @@ -1545,6 +1545,44 @@ ACL_FUNC_VISIBILITY aclError aclmdlRICaptureEnd(aclrtStream stream, aclmdlRI *mo */ ACL_FUNC_VISIBILITY aclError aclmdlRIDebugPrint(aclmdlRI modelRI); +/** + * @ingroup AscendCL + * @brief the start interface of the task group + * @param stream [IN] capture stream + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureTaskGrpBegin(aclrtStream stream); + +/** + * @ingroup AscendCL + * @brief the end interface of the task group + * @param stream [IN] capture stream + * @param handle [OUT] task group handle + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp *handle); + +/** + * @ingroup AscendCL + * @brief begin to update the task group specified by the handle + * @param stream [IN] specify the stream used for task update + * @param handle [IN] task group handle + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureTaskUpdateBegin(aclrtStream stream, aclrtTaskGrp handle); + +/** + * @ingroup AscendCL + * @brief end the update of the task + * @param stream [IN] specify the stream used for task update + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + */ +ACL_FUNC_VISIBILITY aclError aclmdlRICaptureTaskUpdateEnd(aclrtStream stream); + #ifdef __cplusplus } #endif diff --git a/third_party/acl/inc/acl/acl_rt.h b/third_party/acl/inc/acl/acl_rt.h index be607a8281..963c29f93a 100755 --- a/third_party/acl/inc/acl/acl_rt.h +++ b/third_party/acl/inc/acl/acl_rt.h @@ -22,6 +22,7 @@ extern "C" { #define ACL_EVENT_SYNC 0x00000001u #define ACL_EVENT_CAPTURE_STREAM_PROGRESS 0x00000002u #define ACL_EVENT_TIME_LINE 0x00000008u +#define ACL_EVENT_EXTERNAL 0x00000020u #define ACL_STREAM_FAST_LAUNCH 0x00000001u #define ACL_STREAM_FAST_SYNC 0x00000002u diff --git a/torch_npu/csrc/core/npu/NPUEvent.cpp b/torch_npu/csrc/core/npu/NPUEvent.cpp index 0e6857cf51..ec6fd9b780 100644 --- a/torch_npu/csrc/core/npu/NPUEvent.cpp +++ b/torch_npu/csrc/core/npu/NPUEvent.cpp @@ -91,6 +91,9 @@ void NPUEvent::record(const NPUStream& stream) void NPUEvent::block(const NPUStream& stream) { + if (!is_created_ && (flags_ == ACL_EVENT_EXTERNAL)) { + createEvent(stream.device_index()); + } if (is_created_) { NPUGuard guard(stream.device_index()); c10_npu::queue::LaunchWaitEventTask(event_, stream); @@ -162,6 +165,16 @@ void NPUEvent::synchronize() const } } +void NPUEvent::reset(const NPUStream& stream) const +{ + if (is_created_) { + TORCH_CHECK(flags_ == ACL_EVENT_EXTERNAL, + "API reset() only support ACL_EVENT_EXTERNAL flag event.", PTA_ERROR(ErrCode::INTERNAL)); + NPUGuard guard(stream.device_index()); + NPU_CHECK_ERROR_WITHOUT_UCE(aclrtResetEvent(event_, stream.stream())); + } +} + void NPUEvent::createEvent(c10::DeviceIndex device_index) { device_index_ = device_index; diff --git a/torch_npu/csrc/core/npu/NPUEvent.h b/torch_npu/csrc/core/npu/NPUEvent.h index 5eba816db6..cf6e34ee9c 100644 --- a/torch_npu/csrc/core/npu/NPUEvent.h +++ b/torch_npu/csrc/core/npu/NPUEvent.h @@ -49,6 +49,7 @@ struct C10_NPU_API NPUEvent { float elapsed_time(const NPUEvent& other) const; uint64_t recorded_time() const; void synchronize() const; + void reset(const NPUStream& stream) const; // npu do not support IpcEventHandle until now diff --git a/torch_npu/csrc/core/npu/NPUGraph.cpp b/torch_npu/csrc/core/npu/NPUGraph.cpp index e259d1a724..48522306d2 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.cpp +++ b/torch_npu/csrc/core/npu/NPUGraph.cpp @@ -25,6 +25,30 @@ MempoolId_t graph_pool_handle() return new_pool.id(); } +void graph_task_group_begin(c10_npu::NPUStream stream) +{ + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureTaskGrpBegin(stream)); +} + +NPUTaskGroupHandle graph_task_group_end(c10_npu::NPUStream stream) +{ + aclrtTaskGrp group; + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureTaskGrpEnd(stream, &group)); + NPUTaskGroupHandle handle; + handle.task_group = group; + return handle; +} + +void graph_task_update_begin(c10_npu::NPUStream stream, NPUTaskGroupHandle handle) +{ + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureTaskUpdateBegin(stream, handle.task_group)); +} + +void graph_task_update_end(c10_npu::NPUStream stream) +{ + NPU_CHECK_ERROR(c10_npu::acl::AclmdlRICaptureTaskUpdateEnd(stream)); +} + /** * Note [CUDA Graph Wrapper Class] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/torch_npu/csrc/core/npu/NPUGraph.h b/torch_npu/csrc/core/npu/NPUGraph.h index ccb8c29067..442ae335cc 100644 --- a/torch_npu/csrc/core/npu/NPUGraph.h +++ b/torch_npu/csrc/core/npu/NPUGraph.h @@ -14,6 +14,15 @@ namespace c10_npu { // to CUDAGraph::capture_begin TORCH_NPU_API MempoolId_t graph_pool_handle(); +struct TORCH_NPU_API NPUTaskGroupHandle { + aclrtTaskGrp task_group; +}; + 
+TORCH_NPU_API void graph_task_group_begin(c10_npu::NPUStream stream); +TORCH_NPU_API NPUTaskGroupHandle graph_task_group_end(c10_npu::NPUStream stream); +TORCH_NPU_API void graph_task_update_begin(c10_npu::NPUStream stream, NPUTaskGroupHandle handle); +TORCH_NPU_API void graph_task_update_end(c10_npu::NPUStream stream); + struct TORCH_NPU_API NPUGraph { NPUGraph(); ~NPUGraph(); diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.cpp b/torch_npu/csrc/core/npu/interface/AclInterface.cpp index 78737ed91a..ba5d1e4042 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/AclInterface.cpp @@ -76,6 +76,10 @@ LOAD_FUNCTION(aclmdlRIDebugPrint) LOAD_FUNCTION(aclmdlRIExecuteAsync) LOAD_FUNCTION(aclmdlRIDestroy) LOAD_FUNCTION(aclsysGetCANNVersion) +LOAD_FUNCTION(aclmdlRICaptureTaskGrpBegin) +LOAD_FUNCTION(aclmdlRICaptureTaskGrpEnd) +LOAD_FUNCTION(aclmdlRICaptureTaskUpdateBegin) +LOAD_FUNCTION(aclmdlRICaptureTaskUpdateEnd) aclprofStepInfoPtr init_stepinfo() { typedef aclprofStepInfoPtr(*npdInitFunc)(); @@ -202,13 +206,16 @@ aclError AclrtCreateEventWithFlag(aclrtEvent *event, uint32_t flag) // 2. There is no limit on the number of events. // 3. Only support query event record status, aclrtQueryEvent and aclrtQueryEventWaitStatus are not supported. // 4. aclrtDestroyEvent change to asynchronous destroy event. - static AclrtCreateEventWithFlagFunc func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventExWithFlag); - if (func == nullptr) { - TORCH_NPU_WARN_ONCE(func, "Failed to find function ", "aclrtCreateEventExWithFlag"); - func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventWithFlag); + static AclrtCreateEventWithFlagFunc func_ex = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventExWithFlag); + if (func_ex == nullptr) { + TORCH_NPU_WARN_ONCE(func_ex, "Failed to find function ", "aclrtCreateEventExWithFlag"); } + static AclrtCreateEventWithFlagFunc func = (AclrtCreateEventWithFlagFunc)GET_FUNC(aclrtCreateEventWithFlag); TORCH_CHECK(func, "Failed to find function ", "aclrtCreateEventWithFlag", PROF_ERROR(ErrCode::NOT_FOUND)); - return func(event, flag); + if ((flag == ACL_EVENT_EXTERNAL) || (func_ex == nullptr)) { + return func(event, flag); + } + return func_ex(event, flag); } aclError AclQueryEventWaitStatus(aclrtEvent event, aclrtEventWaitStatus *waitStatus) @@ -846,5 +853,53 @@ bool IsCaptureSupported() return is_support; } +aclError AclmdlRICaptureTaskGrpBegin(aclrtStream stream) +{ + typedef aclError (*AclmdlRICaptureTaskGrpBegin)(aclrtStream); + static AclmdlRICaptureTaskGrpBegin func = nullptr; + if (func == nullptr) { + func = (AclmdlRICaptureTaskGrpBegin) GET_FUNC(aclmdlRICaptureTaskGrpBegin); + } + + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskGrpBegin", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream); +} + +aclError AclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp *handle) +{ + typedef aclError (*AclmdlRICaptureTaskGrpEnd)(aclrtStream, aclrtTaskGrp*); + static AclmdlRICaptureTaskGrpEnd func = nullptr; + if (func == nullptr) { + func = (AclmdlRICaptureTaskGrpEnd) GET_FUNC(aclmdlRICaptureTaskGrpEnd); + } + + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskGrpEnd", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, handle); +} + +aclError AclmdlRICaptureTaskUpdateBegin(aclrtStream stream, aclrtTaskGrp handle) +{ + typedef aclError (*AclmdlRICaptureTaskUpdateBegin)(aclrtStream, aclrtTaskGrp); + static AclmdlRICaptureTaskUpdateBegin func = nullptr; + if (func == 
nullptr) { + func = (AclmdlRICaptureTaskUpdateBegin) GET_FUNC(aclmdlRICaptureTaskUpdateBegin); + } + + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskUpdateBegin", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream, handle); +} + +aclError AclmdlRICaptureTaskUpdateEnd(aclrtStream stream) +{ + typedef aclError (*AclmdlRICaptureTaskUpdateEnd)(aclmdlRI); + static AclmdlRICaptureTaskUpdateEnd func = nullptr; + if (func == nullptr) { + func = (AclmdlRICaptureTaskUpdateEnd) GET_FUNC(aclmdlRICaptureTaskUpdateEnd); + } + + TORCH_CHECK(func, "Failed to find function aclmdlRICaptureTaskUpdateEnd", PTA_ERROR(ErrCode::NOT_FOUND)); + return func(stream); +} + } // namespace acl } // namespace c10 diff --git a/torch_npu/csrc/core/npu/interface/AclInterface.h b/torch_npu/csrc/core/npu/interface/AclInterface.h index ca5c03d30e..2f34bd561e 100644 --- a/torch_npu/csrc/core/npu/interface/AclInterface.h +++ b/torch_npu/csrc/core/npu/interface/AclInterface.h @@ -199,5 +199,13 @@ aclError AclmdlRIDestroy(aclmdlRI modelRI); bool IsCaptureSupported(); +aclError AclmdlRICaptureTaskGrpBegin(aclrtStream stream); + +aclError AclmdlRICaptureTaskGrpEnd(aclrtStream stream, aclrtTaskGrp *handle); + +aclError AclmdlRICaptureTaskUpdateBegin(aclrtStream stream, aclrtTaskGrp handle); + +aclError AclmdlRICaptureTaskUpdateEnd(aclrtStream stream); + } // namespace acl } // namespace c10_npu diff --git a/torch_npu/csrc/npu/Event.cpp b/torch_npu/csrc/npu/Event.cpp index 3c92a33539..38db7ccdf2 100644 --- a/torch_npu/csrc/npu/Event.cpp +++ b/torch_npu/csrc/npu/Event.cpp @@ -17,10 +17,11 @@ static PyObject* THNPEvent_pynew(PyTypeObject *type, PyObject *args, PyObject *k unsigned char enable_timing = 0; unsigned char blocking = 0; unsigned char interprocess = 0; + unsigned char external = 0; - constexpr const char* kwlist[] = {"enable_timing", "blocking", "interprocess", nullptr}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|bbb", const_cast(kwlist), - &enable_timing, &blocking, &interprocess)) { + constexpr const char* kwlist[] = {"enable_timing", "blocking", "interprocess", "graph_external", nullptr}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|bbbb", const_cast(kwlist), + &enable_timing, &blocking, &interprocess, &external)) { return nullptr; } @@ -37,6 +38,9 @@ static PyObject* THNPEvent_pynew(PyTypeObject *type, PyObject *args, PyObject *k } else { flags = enable_timing ? 
ACL_EVENT_TIME_LINE : ACL_EVENT_DEFAULT; } + if (external) { + flags = ACL_EVENT_EXTERNAL; + } new (&self->npu_event) c10_npu::NPUEvent(flags); return (PyObject *)ptr.release(); @@ -121,6 +125,18 @@ static PyObject* THNPEvent_synchronize(THNPEvent *self, PyObject *noargs) END_HANDLE_TH_ERRORS } +static PyObject* THNPEvent_reset(THNPEvent *self, THNPStream *stream) +{ + HANDLE_TH_ERRORS + { + pybind11::gil_scoped_release no_gil; + self->npu_event.reset(stream->npu_stream); + ASCEND_LOGI("Event: reset api is successfully executed, event=%p", self->npu_event.event()); + } + Py_RETURN_NONE; + END_HANDLE_TH_ERRORS +} + static struct PyGetSetDef THNPEvent_properties[] = { {"device", (getter)THNPEvent_get_device, nullptr, nullptr, nullptr}, {"npu_event", (getter)THNPEvent_get_npu_event, nullptr, nullptr, nullptr}, @@ -134,6 +150,7 @@ static PyMethodDef THNPEvent_methods[] = { {(char*)"elapsed_time", (PyCFunction)THNPEvent_elapsed_time, METH_O, nullptr}, {(char*)"recorded_time", (PyCFunction)THNPEvent_recorded_time, METH_NOARGS, nullptr}, {(char*)"synchronize", (PyCFunction)THNPEvent_synchronize, METH_NOARGS, nullptr}, + {(char*)"reset", (PyCFunction)THNPEvent_reset, METH_O, nullptr}, {nullptr} }; diff --git a/torch_npu/csrc/npu/Graph.cpp b/torch_npu/csrc/npu/Graph.cpp index 3a471cb2aa..c8d30cfa44 100644 --- a/torch_npu/csrc/npu/Graph.cpp +++ b/torch_npu/csrc/npu/Graph.cpp @@ -7,6 +7,7 @@ #include "torch_npu/csrc/core/npu/NPUGraph.h" #include "torch_npu/csrc/core/npu/NPUGraphsUtils.h" +#include "torch_npu/csrc/npu/Stream.h" template using shared_ptr_class_ = py::class_>; @@ -16,7 +17,26 @@ void TORCH_NPU_API THNPGraph_init(PyObject* module) { // but CI linter and some builds prefer "module". auto torch_N_m = py::handle(module).cast(); - torch_N_m.def("_graph_pool_handle", &c10_npu::graph_pool_handle); + py::class_(torch_N_m, "_NPUTaskGroupHandle") + .def_readonly("task_group", &c10_npu::NPUTaskGroupHandle::task_group); + + torch_N_m.def("_graph_pool_handle", &c10_npu::graph_pool_handle) + .def("_graph_task_group_begin", [](py::object py_stream) { + auto stream = (*py_stream).ptr(); + c10_npu::graph_task_group_begin(THNPUtils_PyObject_to_NPUStream(stream)); + }) + .def("_graph_task_group_end", [](py::object py_stream) { + auto stream = (*py_stream).ptr(); + return c10_npu::graph_task_group_end(THNPUtils_PyObject_to_NPUStream(stream)); + }) + .def("_graph_task_update_begin", [](py::object py_stream, c10_npu::NPUTaskGroupHandle handle) { + auto stream = (*py_stream).ptr(); + c10_npu::graph_task_update_begin(THNPUtils_PyObject_to_NPUStream(stream), handle); + }) + .def("_graph_task_update_end", [](py::object py_stream) { + auto stream = (*py_stream).ptr(); + c10_npu::graph_task_update_end(THNPUtils_PyObject_to_NPUStream(stream)); + }); shared_ptr_class_(torch_N_m, "_NPUGraph") .def(py::init<>()) diff --git a/torch_npu/csrc/npu/Stream.cpp b/torch_npu/csrc/npu/Stream.cpp index 8059cf3447..180fede5ec 100644 --- a/torch_npu/csrc/npu/Stream.cpp +++ b/torch_npu/csrc/npu/Stream.cpp @@ -250,3 +250,12 @@ std::vector> THNPUtils_PySequence_to_NPUStream } return streams; } + +c10_npu::NPUStream THNPUtils_PyObject_to_NPUStream(PyObject* stream) +{ + TORCH_CHECK(PyObject_IsInstance(stream, THNPStreamClass), "Need torch_npu.npu.Stream argument type."); + return c10_npu::NPUStream::unpack3( + (reinterpret_cast(stream))->stream_id, + (reinterpret_cast(stream))->device_index, + static_cast((reinterpret_cast(stream))->device_type)); +} diff --git a/torch_npu/csrc/npu/Stream.h b/torch_npu/csrc/npu/Stream.h index 
f51479f2b0..f6f084bca3 100644 --- a/torch_npu/csrc/npu/Stream.h +++ b/torch_npu/csrc/npu/Stream.h @@ -21,4 +21,6 @@ inline bool THNPStream_Check(PyObject* obj) TORCH_NPU_API std::vector> THNPUtils_PySequence_to_NPUStreamList(PyObject* obj); +c10_npu::NPUStream THNPUtils_PyObject_to_NPUStream(PyObject* py_stream); + #endif // THNP_STREAM_INC diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index 12d67a0bb8..337ad40fc1 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -108,7 +108,12 @@ __all__ = [ "graph", "graph_pool_handle", "is_current_stream_capturing", - "make_graphed_callables" + "make_graphed_callables", + "ExternalEvent", + "graph_task_group_begin", + "graph_task_group_end", + "graph_task_update_begin", + "graph_task_update_end" ] from typing import Tuple, Union @@ -130,7 +135,7 @@ from .utils import (synchronize, device_count, can_device_access_peer, set_devic utilization, finalize_dump, set_dump, get_npu_overflow_flag, clear_npu_overflow_flag, mem_get_info, check_uce_in_memory, stress_detect) from ._recovery import restart_device, stop_device -from .streams import Stream, Event, SyncLaunchStream +from .streams import Stream, Event, SyncLaunchStream, ExternalEvent from .mstx import mstx from .npu_config import * # noqa: F403 from .autocast_utils import * # noqa: F403 @@ -144,6 +149,10 @@ from .graphs import ( graph_pool_handle, is_current_stream_capturing, make_graphed_callables, + graph_task_group_begin, + graph_task_group_end, + graph_task_update_begin, + graph_task_update_end, ) # init profiler diff --git a/torch_npu/npu/graphs.py b/torch_npu/npu/graphs.py index c9608906fd..dd0a34c018 100644 --- a/torch_npu/npu/graphs.py +++ b/torch_npu/npu/graphs.py @@ -13,11 +13,19 @@ if not hasattr(torch_npu._C, "_NPUStreamBase"): torch_npu._C.__dict__["_npu_isCurrentStreamCapturing"] = _dummy_type( "_npu_isCurrentStreamCapturing" ) + torch_npu._C.__dict__["_graph_task_group_begin"] = _dummy_type("_graph_task_group_begin") + torch_npu._C.__dict__["_graph_task_group_end"] = _dummy_type("_graph_task_group_end") + torch_npu._C.__dict__["_graph_task_update_begin"] = _dummy_type("_graph_task_update_begin") + torch_npu._C.__dict__["_graph_task_update_end"] = _dummy_type("_graph_task_update_end") from torch_npu._C import ( # noqa: F401 _npu_isCurrentStreamCapturing, _NPUGraph, _graph_pool_handle, + _graph_task_group_begin, + _graph_task_group_end, + _graph_task_update_begin, + _graph_task_update_end, ) @@ -41,6 +49,22 @@ def graph_pool_handle(): return _graph_pool_handle() +def graph_task_group_begin(stream): + _graph_task_group_begin(stream) + + +def graph_task_group_end(stream): + return _graph_task_group_end(stream) + + +def graph_task_update_begin(stream, handle): + _graph_task_update_begin(stream, handle) + + +def graph_task_update_end(stream): + _graph_task_update_end(stream) + + # Python shim helps Sphinx process docstrings more reliably. class NPUGraph(torch_npu._C._NPUGraph): r"""Wrapper around a NPU graph. 
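Per the aclmdlRI comments earlier in this patch, the new Python helpers bracket a group of captured tasks and later rewrite that same group through its handle. A hedged usage sketch built only from the functions this patch exports (the capture setup and kernel launches are assumed, not shown in the patch):

import torch_npu

stream = torch_npu.npu.current_stream()

# While `stream` is being captured into an NPU graph:
torch_npu.npu.graph_task_group_begin(stream)
# ... launch the ops whose tasks should form one updatable group ...
handle = torch_npu.npu.graph_task_group_end(stream)

# Later, before a replay, re-issue just that group with new parameters:
torch_npu.npu.graph_task_update_begin(stream, handle)
# ... launch the replacement ops on `stream` ...
torch_npu.npu.graph_task_update_end(stream)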
diff --git a/torch_npu/npu/streams.py b/torch_npu/npu/streams.py index b790ae2517..c4ba551f67 100644 --- a/torch_npu/npu/streams.py +++ b/torch_npu/npu/streams.py @@ -3,7 +3,7 @@ import ctypes import torch_npu import torch_npu._C -__all__ = ["Stream", "Event", "SyncLaunchStream"] +__all__ = ["Stream", "Event", "SyncLaunchStream", "ExternalEvent"] class Stream(torch_npu._C._NPUStreamBase): @@ -132,7 +132,8 @@ class Event(torch_npu._C._NPUEventBase): """ def __new__(cls, enable_timing=False, blocking=False, interprocess=False): - return super(Event, cls).__new__(cls, enable_timing=enable_timing, blocking=blocking, interprocess=interprocess) + return super(Event, cls).__new__(cls, enable_timing=enable_timing, blocking=blocking, + interprocess=interprocess, graph_external=False) def record(self, stream=None): r"""Records the event in a given stream. @@ -196,6 +197,66 @@ class Event(torch_npu._C._NPUEventBase): return '' +class ExternalEvent(torch_npu._C._NPUEventBase): + r"""Wrapper around a NPU event with graph_external=True. + + The difference from torch.npu.Event is that you can call wait() before + record(). Before reusing ExternalEvent, you need to call reset() to clear + the flag. + + Event is captured in the graph as an external event node when performing + stream capture. + + The underlying NPU events are lazily initialized when the event is first + recorded or waited. + + """ + + def __new__(cls): + return super(ExternalEvent, cls).__new__(cls, enable_timing=False, blocking=False, + interprocess=False, graph_external=True) + + def record(self, stream=None): + r"""Records the event in a given stream. + + Uses ``torch_npu.npu.current_stream()`` if no stream is specified. The + stream's device must match the event's device. + """ + if stream is None: + stream = torch_npu.npu.current_stream() + super(ExternalEvent, self).record(stream) + + def wait(self, stream=None): + r"""Makes all future work submitted to the given stream wait for this + event. + + Use ``torch_npu.npu.current_stream()`` if no stream is specified. + """ + if stream is None: + stream = torch_npu.npu.current_stream() + super(ExternalEvent, self).wait(stream) + + def reset(self, stream=None): + r"""Reset an event. + + Users need to make sure to wait for the tasks in the Stream + to complete before resetting the Event. + """ + if stream is None: + stream = torch_npu.npu.current_stream() + super(ExternalEvent, self).reset(stream) + + @property + def _as_parameter_(self): + return ctypes.c_void_p(self.npu_event) + + def __repr__(self): + if self.npu_event: + return ''.format(self._as_parameter_.value) + else: + return '' + + class SyncLaunchStream(torch_npu._C._NPUStreamBase): r"""Wrapper around a SyncLaunch NPU stream. 
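The ExternalEvent docstrings above spell out the contract: unlike Event, wait() may be issued before record(), and reset() must be called before the event is reused, after the dependent work has completed. A short hedged sketch of that pattern using only the calls added in this patch:

import torch_npu

event = torch_npu.npu.ExternalEvent()
producer = torch_npu.npu.Stream()
consumer = torch_npu.npu.Stream()

event.wait(consumer)     # legal before record(): consumer stalls until the event fires
# ... enqueue work on `producer` ...
event.record(producer)   # releases the waiting consumer stream

# Before reusing the event, wait for the dependent work, then clear the flag:
torch_npu.npu.synchronize()
event.reset(consumer)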
-- Gitee From c9a933b64ed692981096719e861d348fb5a4c0f6 Mon Sep 17 00:00:00 2001 From: dilililiwhy Date: Mon, 14 Apr 2025 11:38:20 +0000 Subject: [PATCH 313/358] !20308 Remove redundant DISABLE_RPC_FRAMEWORK=FALSE Merge pull request !20308 from dilililiwhy/build_bugfix_260 --- ci/build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/build.sh b/ci/build.sh index 269cd6e3ad..1778876a31 100644 --- a/ci/build.sh +++ b/ci/build.sh @@ -6,7 +6,6 @@ CUR_DIR=$(dirname $(readlink -f $0)) SUPPORTED_PY_VERSION=(3.9 3.10 3.11) # Default supported python version is 3.9 PY_VERSION="3.9" -export DISABLE_RPC_FRAMEWORK=FALSE # Parse arguments inside script function parse_script_args() { -- Gitee From 14bffc5e7ec6eca8c69aeef1786a21037093e858 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 15 Apr 2025 01:20:46 +0000 Subject: [PATCH 314/358] !20344 Update op_plugin commit id Merge pull request !20344 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index ae0459ab53..3f708d77b0 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit ae0459ab53cc6441c759f7a72766ec8fbe82925d +Subproject commit 3f708d77b0d95165f425f3ee6f6ee1551caed497 -- Gitee From 6a654a475b6239d714c3835bbbf667797c4fa8d0 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 15 Apr 2025 04:55:15 +0000 Subject: [PATCH 315/358] !20366 Update op_plugin commit id Merge pull request !20366 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 3f708d77b0..88e5e75ea3 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 3f708d77b0d95165f425f3ee6f6ee1551caed497 +Subproject commit 88e5e75ea3f238563423b23734b3879cc5d88924 -- Gitee From 22aade7598ff0d4a8d6f0f4c2eea397194ba82a9 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 15 Apr 2025 07:19:44 +0000 Subject: [PATCH 316/358] !20336 Update torchair commit id Merge pull request !20336 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 9d6c8350f4..39d32a9cb8 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 9d6c8350f4cb27b8358b6250b64b3e3246fc6b44 +Subproject commit 39d32a9cb87e09717240f2da3f5d48a3408f87ce -- Gitee From a5899a813beafe2fb129a0565fe7e25e8f065c71 Mon Sep 17 00:00:00 2001 From: pta-robot Date: Tue, 15 Apr 2025 08:55:15 +0000 Subject: [PATCH 317/358] !20374 Update op_plugin commit id Merge pull request !20374 from pta-robot/v2.6.0 --- third_party/op-plugin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/op-plugin b/third_party/op-plugin index 88e5e75ea3..3b79ab9a4d 160000 --- a/third_party/op-plugin +++ b/third_party/op-plugin @@ -1 +1 @@ -Subproject commit 88e5e75ea3f238563423b23734b3879cc5d88924 +Subproject commit 3b79ab9a4d043f955969b9672306f0baa51316ad -- Gitee From 749aaab9602a12c6e4369af71d784ccfbeef27ed Mon Sep 17 00:00:00 2001 From: wangzixuan <617225691@qq.com> Date: Tue, 15 Apr 2025 09:26:22 +0000 Subject: [PATCH 318/358] !20270 pta export error Merge pull request !20270 from wangzixuan/dev-2.6.0 --- .../profiler/analysis/prof_config/_parser_deps_config.py | 2 +- torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py | 2 +- 
.../analysis/prof_parse/_fwk_cann_relation_parser.py | 3 ++- .../profiler/analysis/prof_view/cann_parse/_cann_export.py | 6 +++--- .../profiler/analysis/prof_view/prof_db_parse/_db_parser.py | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py b/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py index 51cd44ba02..e61ecc3690 100644 --- a/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py +++ b/torch_npu/profiler/analysis/prof_config/_parser_deps_config.py @@ -55,7 +55,7 @@ class ParserDepsConfig: Constant.DEPS: [Constant.TREE_BUILD_PARSER]}, Constant.DB_PARSER: {Constant.MODE: ConcurrentMode.PTHREAD, Constant.DEPS: [Constant.CANN_EXPORT_PARSER, Constant.MEMORY_PREPARE, - Constant.TREE_BUILD_PARSER]}, + Constant.TREE_BUILD_PARSER, Constant.CANN_ANALYZE_PARSER]}, Constant.MEMORY_TIMELINE_PARSER: {} } diff --git a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py index a053bbdebd..c564894cfa 100644 --- a/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_cann_file_parser.py @@ -119,7 +119,7 @@ class CANNFileParser: event_dict[unique_id] = data if not flow_dict: - logger.error("There is no HostToDevice flow events in msprof timeline.") + logger.warning("There is no HostToDevice flow events in msprof timeline.") if not event_dict: logger.error("There is no kernel events in msprof timeline.") diff --git a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py index de8b466067..0f029ee7a8 100644 --- a/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py +++ b/torch_npu/profiler/analysis/prof_parse/_fwk_cann_relation_parser.py @@ -1,4 +1,5 @@ from ._fwk_file_parser import FwkFileParser +from .._profiler_config import ProfilerConfig from ..prof_bean._torch_op_node import TorchOpNode from ..prof_common_func._constant import Constant, print_error_msg from ..prof_common_func._log import ProfilerLogger @@ -47,7 +48,7 @@ class FwkCANNRelationParser: def get_kernel_dict(self) -> dict: acl_to_npu_dict = CANNFileParser(self._profiler_path).get_acl_to_npu_data() - if not acl_to_npu_dict: + if not acl_to_npu_dict and ProfilerConfig().get_level() != Constant.LEVEL_NONE: print_error_msg("Failed to get acl to npu flow events.") return acl_to_npu_dict dequeue_data_list = FwkFileParser(self._profiler_path).get_dequeue_data() diff --git a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py index cff2628575..f14676422e 100644 --- a/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py +++ b/torch_npu/profiler/analysis/prof_view/cann_parse/_cann_export.py @@ -114,10 +114,10 @@ class CANNTimelineParser(BaseParser): except InterruptedError: return Constant.FAIL, None else: - patten = r'^ascend_pytorch_profiler\.db$' if ProfilerConfig().rank_id == -1 else r'^ascend_pytorch_profiler_\d+\.db$' + patten = r'^msprof_\d+\.db$' while True: - for file in os.listdir(self._output_path): - if re.match(patten, file) and os.path.isfile(os.path.join(self._output_path, file)): + for file in os.listdir(self._cann_path): + if re.match(patten, file) and os.path.isfile(os.path.join(self._cann_path, file)): return Constant.SUCCESS, None try: time.sleep(Constant.SLEEP_TIME) diff --git 
a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py index a2c36511a1..89cc322980 100644 --- a/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py +++ b/torch_npu/profiler/analysis/prof_view/prof_db_parse/_db_parser.py @@ -45,7 +45,7 @@ class DbParser(BaseParser): AnalysisDb().init(os.path.join(self._output_path, DbConstant.DB_ANALYSIS)) parser_db_map = self.PYTORCH_DB_MAP - if ProfilerPathManager.get_cann_path(self._profiler_path) and ProfilerConfig().get_level() != "Level_none": + if ProfilerPathManager.get_cann_path(self._profiler_path) and ProfilerConfig().get_level() != Constant.LEVEL_NONE: parser_db_map = {**self.PYTORCH_DB_MAP, **self.ANALYSIS_DB_MAP} try: for name, parser in parser_db_map.items(): -- Gitee From 3c11b06e856fde5926e190b9b09c151255fc2d5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=97=AB=E9=B9=8F=E5=85=A8?= Date: Tue, 15 Apr 2025 09:34:24 +0000 Subject: [PATCH 319/358] =?UTF-8?q?!20358=20Add=20api=20transfer=20for=20C?= =?UTF-8?q?udaGraph=20Merge=20pull=20request=20!20358=20from=20=E9=97=AB?= =?UTF-8?q?=E9=B9=8F=E5=85=A8/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/contrib/transfer_to_npu.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch_npu/contrib/transfer_to_npu.py b/torch_npu/contrib/transfer_to_npu.py index 1c9d0f5a03..e2c2bde222 100644 --- a/torch_npu/contrib/transfer_to_npu.py +++ b/torch_npu/contrib/transfer_to_npu.py @@ -352,6 +352,9 @@ def _init(): if hasattr(torch.distributed, 'init_device_mesh'): _del_nccl_device_backend_map() torch.distributed.init_device_mesh = _wrapper_cuda(torch.distributed.init_device_mesh) + + # CudaGraph + torch.cuda.CudaGraph = torch.npu.NPUGraph # torch.nn.parallel.DistributedDataParallel _device_wrapper(torch.nn.parallel.DistributedDataParallel, torch_distributed_fn_white_list) -- Gitee From 7648bf44236d9461eed67bf610b4dd775722b14f Mon Sep 17 00:00:00 2001 From: jiangpengfei Date: Tue, 15 Apr 2025 11:04:05 +0000 Subject: [PATCH 320/358] !20220 add log when capturing under aclop Merge pull request !20220 from jiangpengfei/v2.6.0 --- torch_npu/csrc/core/npu/NPUGraphsUtils.h | 17 ++++++++++++++++- torch_npu/csrc/framework/OpCommand.cpp | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUGraphsUtils.h b/torch_npu/csrc/core/npu/NPUGraphsUtils.h index f14b7d01e2..1045b6c855 100644 --- a/torch_npu/csrc/core/npu/NPUGraphsUtils.h +++ b/torch_npu/csrc/core/npu/NPUGraphsUtils.h @@ -98,7 +98,22 @@ inline void assertNotCapturing(const std::string &attempt) " during NPU graph capture. If you need this call to be captured, " "please file an issue. " "Current npuStreamCaptureStatus: ", - status); + status, + PTA_ERROR(ErrCode::NOT_SUPPORT)); +} + +inline void assertNotCapturingAclop(const std::string &opName) +{ + auto status = currentStreamCaptureStatus(); + TORCH_CHECK(status == CaptureStatus::None, + "Cannot run aclop operators during NPU graph capture. Current working aclop is ", + opName, + ". If you need this call to be captured, " + "please try to set torch.npu.config.allow_internal_format = False. " + "If still fail, the operator needs aclnn implementation and please file an issue. 
" + "Current npuStreamCaptureStatus: ", + status, + PTA_ERROR(ErrCode::NOT_SUPPORT)); } } // namespace c10_npu diff --git a/torch_npu/csrc/framework/OpCommand.cpp b/torch_npu/csrc/framework/OpCommand.cpp index 9c64e376d1..f4501fbfdc 100644 --- a/torch_npu/csrc/framework/OpCommand.cpp +++ b/torch_npu/csrc/framework/OpCommand.cpp @@ -124,7 +124,7 @@ void OpCommand::Run() { // Check for npu graph if (aclCmd->CheckCustomHandlerNull()) { - c10_npu::assertNotCapturing("Cannot run aclop operators"); + c10_npu::assertNotCapturingAclop(aclCmd->GetName()); } aclCmd->SetEnginePriority(); -- Gitee From 0f7d76d984560a62fda526d083b194c83638182d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Tue, 15 Apr 2025 11:15:47 +0000 Subject: [PATCH 321/358] =?UTF-8?q?!20386=20SilentCheck=20Revert:=20set=20?= =?UTF-8?q?IsExistAclnnSilentCheckV2=20to=20false=20Merge=20pull=20request?= =?UTF-8?q?=20!20386=20from=20=E7=8E=8B=E8=B6=85/v2.6.0=5Frevert?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/core/npu/interface/OpInterface.cpp | 5 +---- torch_npu/utils/_step.py | 2 -- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/torch_npu/csrc/core/npu/interface/OpInterface.cpp b/torch_npu/csrc/core/npu/interface/OpInterface.cpp index 4026cd9978..2ba4baaebc 100644 --- a/torch_npu/csrc/core/npu/interface/OpInterface.cpp +++ b/torch_npu/csrc/core/npu/interface/OpInterface.cpp @@ -26,10 +26,7 @@ bool IsExistAclnnSilentCheck() bool IsExistAclnnSilentCheckV2() { - const static bool isExistV2 = []() -> bool { - static auto func = GET_FUNC(aclnnSilentCheckV2); - return func != nullptr; - }(); + const static bool isExistV2 = false; return isExistV2; } diff --git a/torch_npu/utils/_step.py b/torch_npu/utils/_step.py index 314e0b69c9..555cbd25c1 100644 --- a/torch_npu/utils/_step.py +++ b/torch_npu/utils/_step.py @@ -333,8 +333,6 @@ def add_perf_dump_patch(): if torch_npu._C._get_silent_check_version() == 1: warnings.warn(f"Warning: CANN version lower than 8.0.RC3 and currently does not support silent check 2.0 version or later. It will switch to 1.0 version.") asd_enable = 0 - elif torch_npu._C._get_silent_check_version() == 2: - warnings.warn(f"Warning: CANN version lower than 8.0.0 and currently does not support silent check 3.0 version. It will switch to 2.0 version. The asd_detect is {asd_enable}") else: loggerSilent.debug(f"Silent check 3.0 version will be enabled. 
The asd_detect is {asd_enable}") -- Gitee From 9deaed66b5c85d3e90acaf9c2f658ae1092cdba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A7=9C=E6=80=A1=E6=96=87?= Date: Tue, 15 Apr 2025 12:55:05 +0000 Subject: [PATCH 322/358] =?UTF-8?q?!20355=20Fix=20param=20in=20ASCEND=5FLO?= =?UTF-8?q?G=20Merge=20pull=20request=20!20355=20from=20=E5=A7=9C=E6=80=A1?= =?UTF-8?q?=E6=96=87/v2.6.0=5Flz?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/csrc/framework/LazyInitAclops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/csrc/framework/LazyInitAclops.cpp b/torch_npu/csrc/framework/LazyInitAclops.cpp index 03b0814659..5f51f9f0a5 100644 --- a/torch_npu/csrc/framework/LazyInitAclops.cpp +++ b/torch_npu/csrc/framework/LazyInitAclops.cpp @@ -126,7 +126,7 @@ void InitializeJitCompilationMode() std::string value_str = GetJitCompileMode(); if (value_str != "") { c10_npu::option::SetOption("jitCompileInit", value_str); - ASCEND_LOGI("Set jitCompileInit option to %s", value_str); + ASCEND_LOGI("Set jitCompileInit option to %s", value_str.c_str()); } else { c10_npu::option::SetOption("jitCompileInit", "disable"); ASCEND_LOGI("Set jitCompileInit option to default value: disable"); -- Gitee From c63b2447cfb4f73cc758db12f9520d1c8959164d Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Tue, 15 Apr 2025 22:35:05 +0000 Subject: [PATCH 323/358] !20421 Update torchair commit id Merge pull request !20421 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 39d32a9cb8..353f95a40c 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 39d32a9cb87e09717240f2da3f5d48a3408f87ce +Subproject commit 353f95a40c6235625d55678621d86e0ee864de50 -- Gitee From 98d867468ebfd17e50c1fb99fe41764bb63c0047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D?= <843972097@qq.com> Date: Wed, 16 Apr 2025 08:19:02 +0000 Subject: [PATCH 324/358] =?UTF-8?q?!20405=20[Feature]=20Registe=20op=5Fplu?= =?UTF-8?q?gin=20in=20logging.=20Merge=20pull=20request=20!20405=20from=20?= =?UTF-8?q?=E5=88=98=E5=98=89=E5=B7=8D/v2.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/_logging/_internal.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch_npu/_logging/_internal.py b/torch_npu/_logging/_internal.py index c27a2aa3c6..59d0fd9f14 100644 --- a/torch_npu/_logging/_internal.py +++ b/torch_npu/_logging/_internal.py @@ -36,3 +36,4 @@ def _add_logging_module(): torch._logging._internal.register_log("memory", "torch_npu.memory") torch._logging._internal.register_log("dispatch", "torch_npu.dispatch") torch._logging._internal.register_log("silent", "torch_npu.silent_check") + torch._logging._internal.register_log("op_plugin", "torch_npu.op_plugin") -- Gitee From 4e92c770dc8c2020842b2f09753c929211cebd78 Mon Sep 17 00:00:00 2001 From: torchair_robot Date: Wed, 16 Apr 2025 16:43:50 +0000 Subject: [PATCH 325/358] !20446 Update torchair commit id Merge pull request !20446 from torchair_robot/v2.6.0 --- third_party/torchair/torchair | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torchair/torchair b/third_party/torchair/torchair index 353f95a40c..798f3cdd4b 160000 --- a/third_party/torchair/torchair +++ b/third_party/torchair/torchair @@ -1 +1 @@ -Subproject commit 
353f95a40c6235625d55678621d86e0ee864de50 +Subproject commit 798f3cdd4bf1dc78751a074bfd5f1b8f0bdfcf82 -- Gitee From 771a86f297b5ed0fd86abb88ccb285bd4cbd6e1e Mon Sep 17 00:00:00 2001 From: wl1259 Date: Thu, 17 Apr 2025 21:09:05 +0800 Subject: [PATCH 326/358] commit inductor 2.6 and test --- test/_inductor/__init__.py | 3 + test/_inductor/commonutils.py | 89 + test/_inductor/conftest.py | 22 + test/_inductor/run_ut.sh | 69 + test/_inductor/test_abs.py | 41 + test/_inductor/test_add.py | 54 + test/_inductor/test_add_sum.py | 49 + test/_inductor/test_alias.py | 44 + test/_inductor/test_argmax.py | 32 + test/_inductor/test_argmax_unalign.py | 30 + test/_inductor/test_arrange.py | 52 + test/_inductor/test_attncp.py | 37 + test/_inductor/test_batch_norm.py | 61 + test/_inductor/test_broadcast.py | 58 + test/_inductor/test_cat.py | 41 + test/_inductor/test_ceil.py | 36 + test/_inductor/test_clamp.py | 117 + test/_inductor/test_clone.py | 42 + test/_inductor/test_cos.py | 53 + test/_inductor/test_device_put.py | 48 + test/_inductor/test_div.py | 43 + test/_inductor/test_embedding.py | 49 + test/_inductor/test_embedding_fallback.py | 51 + test/_inductor/test_empty.py | 68 + test/_inductor/test_eq.py | 41 + test/_inductor/test_exp.py | 48 + test/_inductor/test_expm1.py | 41 + test/_inductor/test_floor.py | 40 + test/_inductor/test_foreach_add.py | 47 + test/_inductor/test_ge.py | 35 + test/_inductor/test_geometric.py | 52 + test/_inductor/test_gt.py | 57 + test/_inductor/test_high_order_sum.py | 66 + test/_inductor/test_issue54.py | 76 + test/_inductor/test_issue57.py | 48 + test/_inductor/test_issue59.py | 47 + test/_inductor/test_issue62.py | 55 + test/_inductor/test_issue70.py | 30 + test/_inductor/test_opensora_graph1.py | 343 +++ test/_inductor/test_permute.py | 47 + test/_inductor/test_reduction_brocast_add.py | 34 + test/_inductor/test_relu.py | 34 + test/_inductor/test_renorm.py | 40 + test/_inductor/test_repeat.py | 40 + test/_inductor/test_reshape.py | 39 + test/_inductor/test_rsqrt.py | 35 + test/_inductor/test_slice.py | 55 + test/_inductor/test_split_loop.py | 38 + test/_inductor/test_sqrt.py | 44 + test/_inductor/test_sub.py | 42 + test/_inductor/test_sum.py | 75 + test/_inductor/test_sum_add.py | 49 + test/_inductor/test_var.py | 41 + test/_inductor/test_var_mean.py | 44 + test/_inductor/test_var_mean_add_mul.py | 45 + test/_inductor/test_where.py | 41 + test/_inductor/testutils.py | 41 + torch_npu/_inductor/__init__.py | 91 + torch_npu/_inductor/codegen/__init__.py | 40 + torch_npu/_inductor/codegen/_sizevars.py | 9 + torch_npu/_inductor/codegen/ir.py | 194 ++ .../_inductor/codegen/npu_kernel_features.py | 109 + torch_npu/_inductor/codegen/schduling.py | 221 ++ torch_npu/_inductor/codegen/split_tiling.py | 298 +++ torch_npu/_inductor/codegen/tile_generator.py | 135 ++ torch_npu/_inductor/codegen/triton.py | 2090 +++++++++++++++++ torch_npu/_inductor/codegen/triton_utils.py | 29 + torch_npu/_inductor/codegen/wrapper.py | 86 + torch_npu/_inductor/config.py | 44 + torch_npu/_inductor/decomposition.py | 47 + torch_npu/_inductor/dynamo_patch3.py | 11 + torch_npu/_inductor/lowering.py | 333 +++ torch_npu/_inductor/npu_choices.py | 40 + .../_inductor/npu_fusion_attention_graph.py | 231 ++ torch_npu/_inductor/npu_triton_helpers.py | 18 + torch_npu/_inductor/npu_triton_heuristics.py | 969 ++++++++ torch_npu/_inductor/runtime.py | 56 + torch_npu/_inductor/utils.py | 6 + 78 files changed, 8056 insertions(+) create mode 100644 test/_inductor/__init__.py create mode 100644 
test/_inductor/commonutils.py create mode 100644 test/_inductor/conftest.py create mode 100644 test/_inductor/run_ut.sh create mode 100644 test/_inductor/test_abs.py create mode 100644 test/_inductor/test_add.py create mode 100644 test/_inductor/test_add_sum.py create mode 100644 test/_inductor/test_alias.py create mode 100644 test/_inductor/test_argmax.py create mode 100644 test/_inductor/test_argmax_unalign.py create mode 100644 test/_inductor/test_arrange.py create mode 100644 test/_inductor/test_attncp.py create mode 100644 test/_inductor/test_batch_norm.py create mode 100644 test/_inductor/test_broadcast.py create mode 100644 test/_inductor/test_cat.py create mode 100644 test/_inductor/test_ceil.py create mode 100644 test/_inductor/test_clamp.py create mode 100644 test/_inductor/test_clone.py create mode 100644 test/_inductor/test_cos.py create mode 100644 test/_inductor/test_device_put.py create mode 100644 test/_inductor/test_div.py create mode 100644 test/_inductor/test_embedding.py create mode 100644 test/_inductor/test_embedding_fallback.py create mode 100644 test/_inductor/test_empty.py create mode 100644 test/_inductor/test_eq.py create mode 100644 test/_inductor/test_exp.py create mode 100644 test/_inductor/test_expm1.py create mode 100644 test/_inductor/test_floor.py create mode 100644 test/_inductor/test_foreach_add.py create mode 100644 test/_inductor/test_ge.py create mode 100644 test/_inductor/test_geometric.py create mode 100644 test/_inductor/test_gt.py create mode 100644 test/_inductor/test_high_order_sum.py create mode 100644 test/_inductor/test_issue54.py create mode 100644 test/_inductor/test_issue57.py create mode 100644 test/_inductor/test_issue59.py create mode 100644 test/_inductor/test_issue62.py create mode 100644 test/_inductor/test_issue70.py create mode 100644 test/_inductor/test_opensora_graph1.py create mode 100644 test/_inductor/test_permute.py create mode 100644 test/_inductor/test_reduction_brocast_add.py create mode 100644 test/_inductor/test_relu.py create mode 100644 test/_inductor/test_renorm.py create mode 100644 test/_inductor/test_repeat.py create mode 100644 test/_inductor/test_reshape.py create mode 100644 test/_inductor/test_rsqrt.py create mode 100644 test/_inductor/test_slice.py create mode 100644 test/_inductor/test_split_loop.py create mode 100644 test/_inductor/test_sqrt.py create mode 100644 test/_inductor/test_sub.py create mode 100644 test/_inductor/test_sum.py create mode 100644 test/_inductor/test_sum_add.py create mode 100644 test/_inductor/test_var.py create mode 100644 test/_inductor/test_var_mean.py create mode 100644 test/_inductor/test_var_mean_add_mul.py create mode 100644 test/_inductor/test_where.py create mode 100644 test/_inductor/testutils.py create mode 100644 torch_npu/_inductor/__init__.py create mode 100644 torch_npu/_inductor/codegen/__init__.py create mode 100644 torch_npu/_inductor/codegen/_sizevars.py create mode 100644 torch_npu/_inductor/codegen/ir.py create mode 100644 torch_npu/_inductor/codegen/npu_kernel_features.py create mode 100644 torch_npu/_inductor/codegen/schduling.py create mode 100644 torch_npu/_inductor/codegen/split_tiling.py create mode 100644 torch_npu/_inductor/codegen/tile_generator.py create mode 100644 torch_npu/_inductor/codegen/triton.py create mode 100644 torch_npu/_inductor/codegen/triton_utils.py create mode 100644 torch_npu/_inductor/codegen/wrapper.py create mode 100644 torch_npu/_inductor/config.py create mode 100644 torch_npu/_inductor/decomposition.py create mode 100644 
torch_npu/_inductor/dynamo_patch3.py create mode 100644 torch_npu/_inductor/lowering.py create mode 100644 torch_npu/_inductor/npu_choices.py create mode 100644 torch_npu/_inductor/npu_fusion_attention_graph.py create mode 100644 torch_npu/_inductor/npu_triton_helpers.py create mode 100644 torch_npu/_inductor/npu_triton_heuristics.py create mode 100644 torch_npu/_inductor/runtime.py create mode 100644 torch_npu/_inductor/utils.py diff --git a/test/_inductor/__init__.py b/test/_inductor/__init__.py new file mode 100644 index 0000000000..d2497b84ae --- /dev/null +++ b/test/_inductor/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. \ No newline at end of file diff --git a/test/_inductor/commonutils.py b/test/_inductor/commonutils.py new file mode 100644 index 0000000000..00527a41d8 --- /dev/null +++ b/test/_inductor/commonutils.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + +import subprocess +from io import StringIO +from typing import List + + +""" +对外统一调用接口 +""" +def get_available_npu_device_ids(): + npu_ids = _get_all_npu_device_ids() + sorted_npu_dict = _sort_npu_by_usage_cap(npu_ids) + return list(sorted_npu_dict.keys()) + + +""" +通过ls /dev/davinci*获取所有的npu的id +""" +def _get_all_npu_device_ids(): + ## ls /dev/davinci* + buffer = StringIO() + try: + result = subprocess.run( + ["ls /dev/davinci*"], + capture_output=True, + shell=True, + text=True, + check=True + ) + output = result.stdout + buffer.write(output) + except subprocess.CalledProcessError as e: + print(f"Error running command: {e}") + finally: + content = buffer.getvalue() + buffer.close() + + npu_ids = [] + if content is None: + return npu_ids + for line in content.splitlines(): + if not line[-1].isdigit(): + continue + idx = -1 + while line[idx].isdigit(): + idx -= 1 + id = line[idx + 1:] + npu_ids.append(id) + return npu_ids + + +""" +通过npu-smi info -t usages -i %id 获取每个卡的使用率并升序排序 +返回字典{id:[HBM Capacity(MB), HBM Usage Rate(%)]},按使用率升序,使用率相同按容量降序 +""" +def _sort_npu_by_usage_cap(npu_ids: List[str]) -> List[int]: + npu_dict = dict() + try: + for id in npu_ids: + result = subprocess.run(["npu-smi info -t usages -i " + id], + capture_output=True, + text=True, + shell=True, + check=True) + ss = result.stdout + ## [HBM Capacity(MB), HBM Usage Rate(%)] + tmp = [] + for line in ss.splitlines(): + if ":" not in line: + continue + key, val = line.split(":") + key, val = key.strip(), val.strip() + if key == "HBM Usage Rate(%)": + tmp.append(val) + if key == "HBM Capacity(MB)": + tmp.append(val) + if tmp is not None: + npu_dict[int(id)] = tmp + sorted_npu_dict = dict(sorted(npu_dict.items(), key=lambda x: (int(x[1][1]), -int(x[1][0])))) + return sorted_npu_dict + except subprocess.CalledProcessError as e: + print(f"Error running command: {e}") + + +if __name__ == '__main__': + res = get_available_npu_device_ids() + print(res) \ No newline at end of file diff --git a/test/_inductor/conftest.py b/test/_inductor/conftest.py new file mode 100644 index 0000000000..0b907d9a29 --- /dev/null +++ b/test/_inductor/conftest.py @@ -0,0 +1,22 @@ +import pytest +import os +import torch_npu._inductor +import getpass + + +def pytest_addoption(parser): + parser.addoption("--npu_indexing", action='store', default='False', + help='whether enable npu indexing or not,default is True', choices=['True', 'False']) + + +@pytest.fixture(scope="session") +def clear_cache(): + # 
os.system('rm -rf /tmp/torchinductor_' + getpass.getuser() + '/*') + # os.system('rm -rf ~/.triton/dump') + # os.system('rm -rf ~/.triton/cache') + return + + +@pytest.fixture(scope="session", autouse=True) +def set_npu_indexing(pytestconfig): + torch_npu._inductor.config.enable_npu_indexing = eval(pytestconfig.getoption("--npu_indexing")) diff --git a/test/_inductor/run_ut.sh b/test/_inductor/run_ut.sh new file mode 100644 index 0000000000..bdbe08e8df --- /dev/null +++ b/test/_inductor/run_ut.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -ex + +source /root/anaconda3/bin/activate inductor260 +pip list + +# 先编译tritonNpu +pip uninstall triton + +mkdir -p ${WORKSPACE}TritonNpu +cd ${WORKSPACE}TritonNpu +git clone https://gitee.com/ascend/triton-ascend.git -b master + +if [ -d ${WORKSPACE}TritonNpu/triton-ascend/triton ];then + rm -rf ${WORKSPACE}TritonNpu/triton-ascend/triton +fi + +if [ -d ~/.triton/dump ];then + rm -rf ~/.triton/dump +fi + +if [ -d ~/.triton/cache ];then + rm -rf ~/.triton/cache +fi + +cd ${WORKSPACE}TritonNpu/triton-ascend +git clone --depth 1 https://gitee.com/shijingchang/triton.git +#cp -r /triton_depends/triton ${WORKSPACE}TritonNpu/triton-ascend/triton +#cd ${WORKSPACE}TritonNpu/triton-ascend/triton +#git apply ${WORKSPACE}TritonNpu/triton-ascend/build/patch/triton_ebce7f.patch +#git apply ${WORKSPACE}TritonNpu/triton-ascend/build/patch/0001-AttrDescriptor-fix-and-delete-power-of-two.patch +#cd ${WORKSPACE}TritonNpu/triton-ascend +echo ${pwd} + +TRITON_PLUGIN_DIRS=${WORKSPACE}TritonNpu/triton-ascend/ascend \ +LLVM_INCLUDE_DIRS=$LLVM_SYSPATH/include \ +LLVM_LIBRARY_DIR=$LLVM_SYSPATH/lib \ +LLVM_SYSPATH=$LLVM_SYSPATH \ +TRITON_BUILD_WITH_CLANG_LLD=true \ +pip install -e ${WORKSPACE}TritonNpu/triton-ascend/triton/python --no-build-isolation -vvv + +pip list + +cd ${WORKSPACE} +echo ${PWD} +ls -al + +# run inductor ut +export PYTHONPATH=${WORKSPACE}:$PYTHONPATH +export TORCHINDUCTOR_COMPILE_THREADS=1 +export ASCEND_LAUNCH_BLOCKING=1 +export CI="" +env + +if [ -d ~/.triton/dump ];then + rm -rf ~/.triton/dump +fi + +if [ -d ~/.triton/cache ];then + rm -rf ~/.triton/cache +fi + +tree + +cd test + +pytest -svvv . --npu_indexing=True || { exit 1; } + + diff --git a/test/_inductor/test_abs.py b/test/_inductor/test_abs.py new file mode 100644 index 0000000000..8440aab1fc --- /dev/null +++ b/test/_inductor/test_abs.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
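Taken together, the pieces above form the suite's launch path: commonutils.get_available_npu_device_ids() ranks the /dev/davinci* cards by HBM usage via npu-smi, conftest.py turns the --npu_indexing pytest flag into a session-wide write to torch_npu._inductor.config.enable_npu_indexing, and run_ut.sh drives the run with pytest -svvv . --npu_indexing=True. A minimal sketch of the same wiring in a standalone script (running from test/_inductor and the torch.npu.set_device call are assumptions for illustration, not part of the patch):

import torch
import torch_npu
import torch_npu._inductor
from commonutils import get_available_npu_device_ids  # helper defined in test/_inductor/commonutils.py above

# Pick the least-used card reported by npu-smi and pin this process to it.
device_ids = get_available_npu_device_ids()
torch.npu.set_device(device_ids[0])  # assumed torch_npu device-selection call

# Equivalent of launching the suite with `pytest -svvv . --npu_indexing=True`.
torch_npu._inductor.config.enable_npu_indexing = True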
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from .testutils import OperatorType, TestUtils + + +class TestAbs(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, first_element): + result = torch.abs(first_element) + return result + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(1024, 32), (256, 8)]) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16']) + def test_pointwise_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + # print(std_result[0:8]) + # print(inductor_result[0:8]) + torch.testing.assert_close(std_result, inductor_result, atol=1e-3, rtol=1e-3) diff --git a/test/_inductor/test_add.py b/test/_inductor/test_add.py new file mode 100644 index 0000000000..8da8dff4f5 --- /dev/null +++ b/test/_inductor/test_add.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + + +class TestAdd(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, first_element, second_element): + result = first_element + second_element + return result + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float32', 'int64']) + def test_pointwise_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + + std_sum = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_sum = compiled_op_calc(first_element, second_element) + + torch.testing.assert_close(std_sum, inductor_sum) + + # should be implemented when __OPTYPE is OperatorType.REDUCTION + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape,dim', TestUtils._reduction_extest_SDbinding) + @pytest.mark.parametrize('dtype', TestUtils._test_dtypes) + @pytest.mark.skipif(__OPTYPE != OperatorType.REDUCTION, reason='not reduction operator') + def test_reduction_cases(self, shape, dim, dtype, clear_cache): + pass + +if __name__ == "__main__": + size = (1024, 1024) + test = TestAdd() + test.test_pointwise_cases(size, 'float32', None) \ No newline at end of file diff --git a/test/_inductor/test_add_sum.py b/test/_inductor/test_add_sum.py new file mode 100644 index 0000000000..3e54152be9 --- /dev/null 
+++ b/test/_inductor/test_add_sum.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestSumAdd(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.REDUCTION + + def foo(self,a, b, dim): + y = a + b + y = y.sum(dim) + return y + + # case:change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(9, 9, 31, 64)]) + @pytest.mark.parametrize('dim', [3]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + a, b = [torch.randn(shape, requires_grad=False, dtype=torch.float32, device="npu") for _ in range(2)] + r1 = self.foo(a, b, dim) + func = torch.compile(self.foo, backend="inductor", dynamic=False) + r = func(a, b, dim) + torch.testing.assert_close(r, r1, rtol=1e-3, atol=1e-3) + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(9, 10, 31, 63)]) + @pytest.mark.parametrize('dim', [0, 1]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes1(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + a, b = [torch.randn(shape, requires_grad=False, dtype=torch.float32, device="npu") for _ in range(2)] + r1 = self.foo(a, b, dim) + func = torch.compile(self.foo, backend="inductor", dynamic=False) + r = func(a, b, dim) + torch.testing.assert_close(r, r1, rtol=1e-3, atol=1e-3) diff --git a/test/_inductor/test_alias.py b/test/_inductor/test_alias.py new file mode 100644 index 0000000000..7f93f091ce --- /dev/null +++ b/test/_inductor/test_alias.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
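All of these cases share one verification recipe: build NPU inputs, take the eager result as the reference, compile the same callable with backend="inductor", and compare with torch.testing.assert_close under loose tolerances. A condensed, self-contained sketch of that recipe using the add-then-sum reduction from test_add_sum.py (shape, dim and tolerances copied from the test; the helper name add_sum is illustrative):

import torch
import torch_npu
import torch_npu._inductor

def add_sum(a, b, dim):
    return (a + b).sum(dim)

a = torch.randn(9, 9, 31, 64, dtype=torch.float32, device="npu")
b = torch.randn(9, 9, 31, 64, dtype=torch.float32, device="npu")

expected = add_sum(a, b, 3)                                    # eager NPU reference
compiled = torch.compile(add_sum, backend="inductor", dynamic=False)
actual = compiled(a, b, 3)                                     # inductor-generated kernel
torch.testing.assert_close(actual, expected, rtol=1e-3, atol=1e-3)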
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestAlias(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, input_element, dim): + x = torch.ops.aten.alias(input_element) + y = x + 1.0 + return y + + # case:change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(32, 64)]) + @pytest.mark.parametrize('dim', [0]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + print(f"input_element= {input_element}") + std_ret = self.op_calc(input_element, dim) + print(f"std_ret= {std_ret}") + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element, dim) + print(f"inductor_ret= {inductor_ret}") + rtol = 1e-1 + atol = 1e-1 + assert torch.allclose(std_ret, inductor_ret, equal_nan=True, rtol=rtol, atol=atol) + + +if __name__ == "__main__": + size = (32, 64) + test = TestAlias() + test.test_reduction_cases_shapes(size, -1, 'float32', None) diff --git a/test/_inductor/test_argmax.py b/test/_inductor/test_argmax.py new file mode 100644 index 0000000000..d8ddb3771f --- /dev/null +++ b/test/_inductor/test_argmax.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor +import pytest +from testutils import OperatorType, TestUtils + + +class TestArgmax(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + def argmax(self, a, dim): + return torch.argmax(a, dim) + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.skip(reason='not support yet') + def test_argmax(self): + shape=(512, 64) + dim = -1 + print(f"start to test argmax on shape:{shape} dim:{dim} ") + a = torch.randn(shape, requires_grad=False, dtype=torch.float32, device='npu') + + argmax_triton = torch.compile(self.argmax, backend="inductor", dynamic=False) + r = self.argmax(a, dim) + r1 = argmax_triton(a, dim) + torch.testing.assert_close(r, r1, rtol=1e-3, atol=1e-3) + + diff --git a/test/_inductor/test_argmax_unalign.py b/test/_inductor/test_argmax_unalign.py new file mode 100644 index 0000000000..66beb40305 --- /dev/null +++ b/test/_inductor/test_argmax_unalign.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+import torch +import torch_npu +import sys +sys.path.append("../..") +import torch_npu._inductor + +import pytest +# from .testutils import OperatorType, TestUtils +torch_npu._inductor.config.enable_npu_indexing = True +class TestMaxWithIndex(): + __TIME_LIMIT = 100 + def op_calc(self, input_element, dim): + return torch.argmax(input_element, dim) + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(512, 64)]) # (513, 64), (514,33) + @pytest.mark.parametrize('dim', [-1 ]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases(self, shape, dim, dtype): + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + input_element = torch.randn(size=shape, dtype=eval('torch.' + dtype), device=torch.device("npu")) * 2000 + std_argmax = self.op_calc(input_element, dim) + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_argmax = compiled_op_calc(input_element, dim) + torch.testing.assert_close(std_argmax, inductor_argmax, rtol=1e-2, atol=1e-2) +if __name__ == '__main__': + self = TestMaxWithIndex() + self.test_reduction_cases((513, 64), -1, 'float32') \ No newline at end of file diff --git a/test/_inductor/test_arrange.py b/test/_inductor/test_arrange.py new file mode 100644 index 0000000000..3fe320fdb4 --- /dev/null +++ b/test/_inductor/test_arrange.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + + +class TestArrange(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, start, end, step): + a = torch.arange(start, end, step, device=torch.device('npu')) + y = a + a + return y + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(2, )]) + @pytest.mark.parametrize('dtype', TestUtils._test_dtypes) + def test_pointwise_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + s = self._generate_tensor(shape, dtype) + start = min(s) + end = max(s) + step = (end - start) / 32 + + std_arrange = self.op_calc(start, end, step) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_arrange = compiled_op_calc(start, end, step) + + torch.testing.assert_close(std_arrange, inductor_arrange) + + # should be implemented when __OPTYPE is OperatorType.REDUCTION + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape,dim', TestUtils._reduction_extest_SDbinding) + @pytest.mark.parametrize('dtype', TestUtils._test_dtypes) + @pytest.mark.skipif(__OPTYPE != OperatorType.REDUCTION, reason='not reduction operator') + def test_reduction_cases(self, shape, dim, dtype, clear_cache): + pass diff --git a/test/_inductor/test_attncp.py b/test/_inductor/test_attncp.py new file mode 100644 index 0000000000..c2e40d3469 --- /dev/null +++ b/test/_inductor/test_attncp.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+import torch +import torch_npu +import torch_npu._inductor +import pytest +from testutils import OperatorType, TestUtils + + +class TestAttnCp(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + # @torch.compile(options={"aggressive_fusion": False}) + shape = (8, 8, 256, 128) + dim = -1 + def foo(self, a, b, c): + y = a + b + y = y.sum(self.dim) + y = y.unsqueeze(self.dim) + y = y.broadcast_to(self.shape) + b + y = c + y.permute(0, 1, 3, 2) + return y + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + def test_pointwise_cases(self): + a, b = [torch.randn(self.shape, dtype=torch.float32, device="npu") for _ in range(2)] + d = torch.randn(self.shape, dtype=torch.float32, device="npu") + c = d.permute(0, 1, 3, 2).contiguous() + func = torch.compile(self.foo, backend="inductor") + r = func(a, b, c) + r1 = self.foo(a, b, c) + torch.testing.assert_close(r, r1, rtol=1e-3, atol=1e-3) diff --git a/test/_inductor/test_batch_norm.py b/test/_inductor/test_batch_norm.py new file mode 100644 index 0000000000..d330800f8f --- /dev/null +++ b/test/_inductor/test_batch_norm.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestNativeBatchNorm(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, input_element): + # 创建权重和偏置张量 + weight = torch.ones(32).npu() + bias = torch.zeros(32).npu() + + # 创建运行均值和方差张量 + running_mean = torch.zeros(32).npu() + running_var = torch.ones(32).npu() + + + # 执行批量归一化 + output, running_mean_out, running_var_out = torch.native_batch_norm( + input=input_element, + weight=weight, + bias=bias, + running_mean=running_mean, + running_var=running_var, + training=True, + momentum=0.1, + eps=1e-05 + ) + return output, running_mean_out, running_var_out + + @pytest.mark.skip(reason="npu compiler bug") + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(16, 32, 64)]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + + print(f"input_element= {input_element}") + std_ret, std_ret2, std_ret3 = self.op_calc(input_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret, inductor_ret2, inductor_ret3 = compiled_op_calc(input_element) + rtol = 1e-1 + atol = 1e-1 + assert torch.allclose(std_ret, inductor_ret, equal_nan=True, rtol=rtol, atol=atol) + + +if __name__ == "__main__": + size = (16, 32, 64) + test = TestNativeBatchNorm() + test.test_reduction_cases_shapes(size, 'float32', None) + diff --git a/test/_inductor/test_broadcast.py b/test/_inductor/test_broadcast.py new file mode 100644 index 0000000000..85ec062ff5 --- /dev/null +++ b/test/_inductor/test_broadcast.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+import torch +import torch_npu + +import torch_npu._inductor + +import copy +import pytest +from testutils import OperatorType, TestUtils + +class TestBroadcast(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + broadcast_size = 128 + + def op_calc(self, a, b, dim, new_shape): + a = a.unsqueeze(dim) + a = a.broadcast_to(new_shape) + b = b.unsqueeze(dim) + b = b.broadcast_to(new_shape) + y = a + b + return y + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 8, 256)]) + @pytest.mark.parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16']) + def test_view_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor(shape, dtype) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + for dim in [3, 2, 1, 0]: + new_shape = list(copy.deepcopy(shape)) + new_shape.insert(dim, self.broadcast_size) + std_broadcast = self.op_calc(a, b, dim, new_shape) + inductor_broadcast = compiled_op_calc(a, b, dim, new_shape) + + torch.testing.assert_close(std_broadcast.float(), inductor_broadcast.float(), rtol=1e-3, atol=1e-3) + print(f"data validation passed") + + # should be implemented when __OPTYPE is OperatorType.REDUCTION + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape,dim', TestUtils._reduction_extest_SDbinding) + @pytest.mark.parametrize('dtype', TestUtils._test_dtypes) + @pytest.mark.skipif(__OPTYPE != OperatorType.REDUCTION, reason='not reduction operator') + def test_reduction_cases(self, shape, dim, dtype, clear_cache): + pass diff --git a/test/_inductor/test_cat.py b/test/_inductor/test_cat.py new file mode 100644 index 0000000000..715c44bf56 --- /dev/null +++ b/test/_inductor/test_cat.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
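For one concrete iteration of the dim loop in test_broadcast.py above: with shape = (8, 8, 256), broadcast_size = 128 and dim = 1, new_shape becomes [8, 128, 8, 256], so each input is unsqueezed at dim 1 and expanded to that shape before the add. A standalone sketch of just that case (the function name broadcast_add is illustrative):

import torch
import torch_npu
import torch_npu._inductor

def broadcast_add(a, b, dim, new_shape):
    a = a.unsqueeze(dim).broadcast_to(new_shape)  # (8, 8, 256) -> (8, 1, 8, 256) -> (8, 128, 8, 256)
    b = b.unsqueeze(dim).broadcast_to(new_shape)
    return a + b

a = torch.randn(8, 8, 256, device="npu")
b = torch.randn(8, 8, 256, device="npu")
compiled = torch.compile(broadcast_add, backend="inductor", dynamic=False)
out = compiled(a, b, 1, [8, 128, 8, 256])
assert out.shape == (8, 128, 8, 256)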
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestCat(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, input_element, dim): + return torch.cat([input_element, input_element], dim) + + # case:change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 16, 32, 64)]) + @pytest.mark.parametrize('dim', [-1]) + @pytest.mark.parametrize('dtype', ['bfloat16']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + std_cat = self.op_calc(input_element, dim) + # print(f"std_cat.shape= {std_cat.shape}") + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_cat = compiled_op_calc(input_element, dim) + # print(f"inductor_cat.shape= {inductor_cat.shape}") + rtol = 1e-1 + atol = 1e-1 + assert torch.allclose(std_cat, inductor_cat, equal_nan=True, rtol=rtol, atol=atol) + + +if __name__ == "__main__": + size = (8, 8, 8, 2048) + test = TestCat() + test.test_reduction_cases_shapes(size, 2, 'float32', None) diff --git a/test/_inductor/test_ceil.py b/test/_inductor/test_ceil.py new file mode 100644 index 0000000000..da2d7cc73b --- /dev/null +++ b/test/_inductor/test_ceil.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + + +class TestRelu(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + def op_calc(self, first_element): + result = torch.ceil(first_element) + return result + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float32', 'float16', 'bfloat16']) + def test_pointwise_cases(self, shape, dtype): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + torch.testing.assert_close(std_result, inductor_result) + + +if __name__ == '__main__': + TestRelu() diff --git a/test/_inductor/test_clamp.py b/test/_inductor/test_clamp.py new file mode 100644 index 0000000000..adc3bcaf12 --- /dev/null +++ b/test/_inductor/test_clamp.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
+import torch +import torch_npu +import torch_npu._inductor +import pytest +from testutils import OperatorType, TestUtils + + +class TestClamp(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, input, min=None, max=None): + return input.clamp(min, max) + + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + @pytest.mark.skip(reason='not support yet') + def test_pointwise_cases_minmax_is_tensor(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + min = self._generate_tensor(shape, dtype) + max = self._generate_tensor(shape, dtype) + + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, min=min, max=max) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min=min, max=max) + + torch.testing.assert_close(std_result, inductor_result) + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(1,)]) + @pytest.mark.parametrize('dtype', ['float32']) + @pytest.mark.skip(reason='not support yet') + def test_pointwise_cases_single_scalar(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + min = 0 + max = 100 + + first_element = 200 * torch.rand(size=shape, dtype=eval('torch.' 
+ dtype), device=torch.device("npu")) + + std_result = self.op_calc(first_element, min=min, max=max) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min=min, max=max) + torch.testing.assert_close(std_result, inductor_result) + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(1024, 32)]) + @pytest.mark.parametrize('dtype', ['int32']) + @pytest.mark.skip(reason='not support yet') + def test_pointwise_cases_minmax_is_number(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + min = 0 + max = 100 + + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, min=min, max=max) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min=min, max=max) + + torch.testing.assert_close(std_result, inductor_result) + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases_max_only(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + max = 100 + + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, min=None, max=max) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min=None, max=max) + + torch.testing.assert_close(std_result, inductor_result) + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases_min_only(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + min = 0 + + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, min=min, max=None) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, min=min, max=None) + + torch.testing.assert_close(std_result, inductor_result) +if __name__ == '__main__': + obj = TestClamp() + obj.test_pointwise_cases_single_scalar((1,), 'float32', None) \ No newline at end of file diff --git a/test/_inductor/test_clone.py b/test/_inductor/test_clone.py new file mode 100644 index 0000000000..374317523b --- /dev/null +++ b/test/_inductor/test_clone.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. 
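Of the clamp variants above, only the scalar max-only and min-only paths are actually exercised; the tensor-bound, 0-d and combined min/max scalar cases are all skipped as not yet supported. A minimal sketch of the supported path (the (1024, 32) shape and the 0/100 bounds are borrowed from the tests):

import torch
import torch_npu
import torch_npu._inductor

x = torch.randn(1024, 32, device="npu") * 100
clamp_max = torch.compile(lambda t: t.clamp(min=None, max=100), backend="inductor")
clamp_min = torch.compile(lambda t: t.clamp(min=0, max=None), backend="inductor")
torch.testing.assert_close(clamp_max(x), x.clamp(max=100))
torch.testing.assert_close(clamp_min(x), x.clamp(min=0))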
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestClone(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, input_element, dim): + return torch.clone(input_element) + + # case: change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 64, 128)]) + @pytest.mark.parametrize('dim', [0]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + std_ret = self.op_calc(input_element, dim) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element, dim) + + assert torch.allclose(std_ret, inductor_ret, equal_nan=True) + + +if __name__ == "__main__": + size = (8, 64, 128) + test = TestClone() + test.test_reduction_cases_shapes(size, 2, 'float32', None) + + + diff --git a/test/_inductor/test_cos.py b/test/_inductor/test_cos.py new file mode 100644 index 0000000000..b963eb8ce8 --- /dev/null +++ b/test/_inductor/test_cos.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + + +class TestLog(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, first_element): + result = torch.cos(first_element) + return result + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型, 将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float32', 'int64']) + @pytest.mark.skip(reason='not support yet') + def test_pointwise_cases(self, shape, dtype): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + torch.testing.assert_close(std_result, inductor_result) + + # should be implemented when __OPTYPE is OperatorType.REDUCTION + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape,dim', TestUtils._reduction_extest_SDbinding) + @pytest.mark.parametrize('dtype', TestUtils._test_dtypes) + @pytest.mark.skipif(__OPTYPE != OperatorType.REDUCTION, reason='not reduction operator') + def test_reduction_cases(self, shape, dim, dtype, clear_cache): + pass + +if __name__ == '__main__': + TestLog() + + diff --git a/test/_inductor/test_device_put.py b/test/_inductor/test_device_put.py new file mode 100644 index 0000000000..39b17ea27d --- /dev/null +++ b/test/_inductor/test_device_put.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. 
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestDevicePut(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, input_element1, input_element2): + return torch.add(input_element1, input_element2) + + # case: change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 16, 8)]) + @pytest.mark.parametrize('dtype', ['int32']) + def test_cases_shapes(self, shape, dtype, clear_cache): + low = 0 + high = 2 + dtype = eval('torch.' + dtype) + print(f"shape= {shape}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + # 指定目标设备为 NPU + npu_device = torch.device('npu:0') + input_element1_tmp = torch.randint(low, high, shape, dtype=dtype).cpu() + input_element2_tmp = torch.randint(low, high, shape, dtype=dtype).cpu() + input_element1 = torch.ops.prims.device_put(input_element1_tmp, npu_device) + input_element2 = torch.ops.prims.device_put(input_element2_tmp, npu_device) + + std_ret = self.op_calc(input_element1, input_element2) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element1, input_element2) + + assert torch.allclose(std_ret, inductor_ret, equal_nan=True) + + +if __name__ == "__main__": + size = (8, 16, 8) + test = TestDevicePut() + test.test_cases_shapes(size, 2, 'int32', None) + + + diff --git a/test/_inductor/test_div.py b/test/_inductor/test_div.py new file mode 100644 index 0000000000..318b521fe4 --- /dev/null +++ b/test/_inductor/test_div.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from .testutils import OperatorType, TestUtils + + +class TestMul(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, first_element, second_element): + result = torch.div(first_element, second_element) + return result + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型, 将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, second_element) + torch.testing.assert_close(std_result, inductor_result, equal_nan=True) + + + diff --git a/test/_inductor/test_embedding.py b/test/_inductor/test_embedding.py new file mode 100644 index 0000000000..c7732ec5c6 --- /dev/null +++ b/test/_inductor/test_embedding.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. 
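test_device_put.py stages integer tensors on the CPU and moves them with torch.ops.prims.device_put before compiling the add, i.e. the same transfer normally written as .npu() or .to('npu'). A compact equivalent of that setup (shape, dtype and value range taken from the test):

import torch
import torch_npu
import torch_npu._inductor

npu_device = torch.device("npu:0")
cpu_a = torch.randint(0, 2, (8, 16, 8), dtype=torch.int32)
cpu_b = torch.randint(0, 2, (8, 16, 8), dtype=torch.int32)
a = torch.ops.prims.device_put(cpu_a, npu_device)
b = torch.ops.prims.device_put(cpu_b, npu_device)

compiled_add = torch.compile(torch.add, backend="inductor")
torch.testing.assert_close(compiled_add(a, b), a + b)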
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +#from testutils import OperatorType, TestUtils +import torch.nn as nn + +class TestSub(): + + def op_calc(self): + embedding = nn.Embedding(16, 128).npu() + + input = torch.tensor([[14, 1, 2, 10, 0, 10, 0], + [ 9, 13, 13, 4, 7, 15, 14], + [ 8, 0, 3, 15, 4, 2, 6], + [15, 12, 13, 9, 0, 8, 1], + [ 8, 15, 4, 15, 12, 9, 3], + [ 6, 11, 12, 8, 0, 13, 8], + [ 4, 10, 1, 12, 0, 0, 4], + [ 6, 6, 15, 6, 0, 10, 15], + [ 2, 5, 14, 0, 5, 7, 9], + [13, 4, 14, 11, 11, 9, 2], + [ 1, 1, 5, 1, 1, 6, 14], + [ 3, 9, 8, 4, 13, 8, 3], + [ 4, 10, 8, 13, 6, 8, 3]], device='npu:0') + + output = embedding(input.npu()) + return output + + def test_pointwise_cases(self): + torch_npu._inductor.config.enable_npu_indexing = True + + std_sub = self.op_calc() + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_sum = compiled_op_calc() + #torch.testing.assert_close(std_sub, inductor_sum) + + +if __name__ == "__main__": + test = TestSub() + test.test_pointwise_cases() + + + + diff --git a/test/_inductor/test_embedding_fallback.py b/test/_inductor/test_embedding_fallback.py new file mode 100644 index 0000000000..4a5b492b9d --- /dev/null +++ b/test/_inductor/test_embedding_fallback.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + + +class TestRsqrt(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, slice_4, sum_23): + result = torch.ops.aten.embedding_dense_backward.default(sum_23, slice_4, 512, -1, False) + return result + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(1, 512, 128)]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_pointwise_cases(self, shape, dtype): + torch_npu._inductor.config.enable_npu_indexing = True + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = torch.randint(low=0, high=128, size=(1, 512), dtype=torch.int64).npu() + second_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, second_element) + + print(std_result) + print(inductor_result) + + torch.testing.assert_close(std_result, inductor_result, rtol=1e-1, atol=1e-1) + + +if __name__ == "__main__": + size = (1, 512, 128) + test = TestRsqrt() + test.test_pointwise_cases(size, 'float32') + + + + diff --git a/test/_inductor/test_empty.py b/test/_inductor/test_empty.py new file mode 100644 index 0000000000..9cc8fe36f5 --- /dev/null +++ b/test/_inductor/test_empty.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. 
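test_embedding.py runs the nn.Embedding gather under the inductor backend, and test_embedding_fallback.py drives aten.embedding_dense_backward directly. A small sketch that reaches both paths through autograd instead of calling the backward op by hand (the embedding size and index shape here are illustrative, not taken from a specific test):

import torch
import torch.nn as nn
import torch_npu
import torch_npu._inductor

emb = nn.Embedding(16, 128).npu()
idx = torch.randint(0, 16, (4, 7), device="npu")

step = torch.compile(lambda i: emb(i).sum(), backend="inductor")
loss = step(idx)
loss.backward()                    # gradient flows through embedding_dense_backward
print(emb.weight.grad.shape)       # torch.Size([16, 128])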
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestEmpty(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self): + x = torch.empty(8, 64, 128, dtype=torch.float32).npu() + x.uniform_(-100, 100) + return x + def op_calc_empty_permuted(self): + input_shape = (8, 64, 128) + physical_layout =(0, 1, 2) #物理布局与输入形状相同 + x = torch.empty_permuted(input_shape, physical_layout).npu() + x.uniform_(-100, 100) + return x + + # case: change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 64, 128)]) + @pytest.mark.parametrize('dim', [0]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_cases_empty(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + std_ret = self.op_calc() + # print(f"std_ret= {std_ret}") + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc() + # print(f"inductor_ret= {inductor_ret}") + + assert inductor_ret.numel() > 0 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 64, 128)]) + @pytest.mark.parametrize('dim', [0]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_cases_empty_permuted(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + std_ret = self.op_calc_empty_permuted() + # print(f"std_ret= {std_ret}") + compiled_op_calc = torch.compile(self.op_calc_empty_permuted, backend="inductor") + inductor_ret = compiled_op_calc() + # print(f"inductor_ret= {inductor_ret}") + + assert inductor_ret.numel() > 0 + + +if __name__ == "__main__": + size = (8, 64, 128) + test = TestEmpty() + test.test_reduction_cases_shapes(size, 2, 'float32', None) + + + diff --git a/test/_inductor/test_eq.py b/test/_inductor/test_eq.py new file mode 100644 index 0000000000..0b4c9d103b --- /dev/null +++ b/test/_inductor/test_eq.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. 
+import torch +import torch_npu +import torch_npu._inductor +import pytest +from testutils import OperatorType, TestUtils + + +class TestEq(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + def op_calc(self, first_element, second_element): + return torch.eq(first_element, second_element) + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16']) + def test_pointwise_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + first_element = self._generate_tensor(shape, dtype) + second_element = first_element.clone() + + # randomly change some elements in second tensor + flat_second_view = second_element.flatten() + num_elements_to_change = first_element.numel() //3 + random_indices = torch.randint(0, first_element.numel(), (num_elements_to_change,)) + flat_second_view[random_indices] = 1- flat_second_view[random_indices] + + std_result = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, second_element) + + torch.testing.assert_close(std_result, inductor_result) + + + diff --git a/test/_inductor/test_exp.py b/test/_inductor/test_exp.py new file mode 100644 index 0000000000..078f9e653d --- /dev/null +++ b/test/_inductor/test_exp.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from .testutils import OperatorType, TestUtils + + +class TestExp(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, first_element): + result = torch.exp(first_element) + return result + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型, 将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int64']) + def test_pointwise_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + # print(std_result[0:8]) + # print(inductor_result[0:8]) + # torch.testing.assert_close(std_result, inductor_result) + # 需要比较包含 NaN 值的张量, 并且希望认为两个 NaN值是相等的, 您可以使用 torch.allclose 函数, 并设置 equal_nan=True 参数 + rtol = 1e-1 + atol = 1e-1 + assert torch.allclose(std_result, inductor_result, equal_nan=True, rtol=rtol, atol=atol) + + + diff --git a/test/_inductor/test_expm1.py b/test/_inductor/test_expm1.py new file mode 100644 index 0000000000..27d8e05346 --- /dev/null +++ b/test/_inductor/test_expm1.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. 
+import torch
+import torch_npu
+import torch_npu._inductor
+
+import pytest
+from .testutils import OperatorType, TestUtils
+
+
+class TestExpm1(TestUtils):
+    __TIME_LIMIT = 100
+    __OPTYPE = OperatorType.POINTWISE
+
+    # optimized function, auto timeout after __TIME_LIMIT seconds
+
+    # @torch.compile(options={"aggressive_fusion": False})
+
+    def op_calc(self, first_element):
+        result = torch.expm1(first_element)
+        return result
+
+    # Results can be unstable when many cases run back to back; rerun failed cases individually.
+    # To cover more dtypes, replace the list after 'dtype' with ProtoTestCase._test_dtypes.
+    # The npu_indexing switch is exercised via the external option --npu_indexing=True/False.
+
+    @pytest.mark.timeout(__TIME_LIMIT)
+    @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes)
+    @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int64'])
+    def test_pointwise_cases(self, shape, dtype, clear_cache):
+        print(shape)
+        print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing))
+        first_element = self._generate_tensor(shape, dtype)
+
+        std_result = self.op_calc(first_element)
+        compiled_op_calc = torch.compile(self.op_calc, backend="inductor")
+        inductor_result = compiled_op_calc(first_element)
+
+        # compare with equal_nan=True so matching non-finite values do not fail the check
+        assert torch.allclose(std_result, inductor_result, equal_nan=True)
+
+
diff --git a/test/_inductor/test_floor.py b/test/_inductor/test_floor.py
new file mode 100644
index 0000000000..1d7d144fee
--- /dev/null
+++ b/test/_inductor/test_floor.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved.
+import torch
+import torch_npu
+import torch_npu._inductor
+
+import pytest
+from testutils import OperatorType, TestUtils
+
+
+class TestFloor(TestUtils):
+    __TIME_LIMIT = 100
+    __OPTYPE = OperatorType.POINTWISE
+
+    def op_calc(self, first_element):
+        result = torch.floor(first_element)
+        return result
+
+    @pytest.mark.timeout(__TIME_LIMIT)
+    @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes)
+    @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64'])
+    def test_pointwise_cases(self, shape, dtype):
+        print(shape)
+        print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing))
+        first_element = self._generate_tensor(shape, dtype)
+
+        std_result = self.op_calc(first_element)
+
+        compiled_op_calc = torch.compile(self.op_calc, backend="inductor")
+        inductor_result = compiled_op_calc(first_element)
+
+        torch.testing.assert_close(std_result, inductor_result)
+
+
+if __name__ == '__main__':
+    TestFloor()
+
+
+
+
diff --git a/test/_inductor/test_foreach_add.py b/test/_inductor/test_foreach_add.py
new file mode 100644
index 0000000000..66111096f6
--- /dev/null
+++ b/test/_inductor/test_foreach_add.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved.
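+# Exercises the in-place torch._foreach_add_ op on a pair of NPU tensors; the
+# case is currently skipped because compilation crashes (see the skip reason
+# below).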
+import torch
+import torch_npu
+import torch_npu._inductor
+
+import pytest
+from testutils import OperatorType, TestUtils
+
+
+class TestForeachAdd(TestUtils):
+    __TIME_LIMIT = 100
+    __OPTYPE = OperatorType.POINTWISE
+
+    def op_calc(self, first_element, second_element):
+        tensor_list = [first_element, second_element]
+
+        add_list = [first_element, second_element]
+        result = torch._foreach_add_(tensor_list, add_list)
+        return result
+
+    @pytest.mark.skip(reason='compile error, torch npu segment fault')
+    @pytest.mark.timeout(__TIME_LIMIT)
+    @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes)
+    @pytest.mark.parametrize('dtype', ['int32'])
+    def test_pointwise_cases(self, shape, dtype):
+        torch_npu._inductor.config.enable_npu_indexing = True
+        print(shape)
+        print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing))
+        first_element = self._generate_tensor(shape, dtype)
+        second_element = self._generate_tensor(shape, dtype)
+
+        std_result = self.op_calc(first_element, second_element)
+
+        compiled_op_calc = torch.compile(self.op_calc, backend="inductor")
+        inductor_result = compiled_op_calc(first_element, second_element)
+
+        torch.testing.assert_close(std_result, inductor_result, rtol=1e-1, atol=1e-1)
+
+
+if __name__ == "__main__":
+    size = (1024, 32)
+    test = TestForeachAdd()
+    test.test_pointwise_cases(size, 'float32')
+
+
+
diff --git a/test/_inductor/test_ge.py b/test/_inductor/test_ge.py
new file mode 100644
index 0000000000..4fe7a95e50
--- /dev/null
+++ b/test/_inductor/test_ge.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved.
+import torch
+import torch_npu
+import torch_npu._inductor
+import pytest
+from testutils import OperatorType, TestUtils
+
+
+class TestGe(TestUtils):
+    __TIME_LIMIT = 100
+    __OPTYPE = OperatorType.POINTWISE
+
+    def op_calc(self, first_element, second_element):
+        return torch.ge(first_element, second_element)
+
+    @pytest.mark.timeout(__TIME_LIMIT)
+    @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes)
+    @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32'])
+    def test_pointwise_cases(self, shape, dtype, clear_cache):
+        print(shape)
+        print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing))
+
+        first_element = self._generate_tensor(shape, dtype)
+        second_element = self._generate_tensor(shape, dtype)
+
+        std_result = self.op_calc(first_element, second_element)
+
+        compiled_op_calc = torch.compile(self.op_calc, backend="inductor")
+        inductor_result = compiled_op_calc(first_element, second_element)
+
+        torch.testing.assert_close(std_result, inductor_result)
+
+
+
diff --git a/test/_inductor/test_geometric.py b/test/_inductor/test_geometric.py
new file mode 100644
index 0000000000..827146f2e5
--- /dev/null
+++ b/test/_inductor/test_geometric.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved.
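+# Draws geometric-distribution samples via torch.ops.aten.geometric and only
+# sanity-checks the mean of the compiled output, since the individual draws are
+# random; the case is currently skipped (see the skip reason below).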
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestGeometric(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self): + # 创建一个形状为 (3, 3)的张量, 每个位置的概率为 0.5 + prob =torch.full((16, 16), 0.5).npu() + + #使用 aten.geometric生成几何分布的随机数 + geometric_tensor =torch.ops.aten.geometric(prob, p=0.5) + + return geometric_tensor + + # case: change shapes + @pytest.mark.skip(reason="this has problem in torch 260") + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(16, 16, 16)]) + @pytest.mark.parametrize('dim', [0]) + @pytest.mark.parametrize('dtype', ['int32']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + std_ret = self.op_calc() + std_ret_mean =torch.mean(std_ret) + print(f"std_ret_mean= {std_ret_mean}") + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc() + + inductor_ret_mean = torch.mean(inductor_ret) + print(f"inductor_ret_mean= {inductor_ret_mean}") + assert inductor_ret_mean is not None + + +if __name__ == "__main__": + size = (16, 16, 16) + test = TestGeometric() + test.test_reduction_cases_shapes(size, -1, 'float32', None) + + + diff --git a/test/_inductor/test_gt.py b/test/_inductor/test_gt.py new file mode 100644 index 0000000000..4b29d7eef7 --- /dev/null +++ b/test/_inductor/test_gt.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei TechNologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from .testutils import OperatorType, TestUtils + + +class TestGt(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, first_element, second_element): + result = torch.gt(first_element, second_element) + return result + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型, 将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32']) + def test_pointwise_cases(self, shape, dtype): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element, second_element) + + print("start test!") + torch.testing.assert_close(std_result, inductor_result) + + # should be implemented when __OPTYPE is OperatorType.REDUCTION + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape,dim', TestUtils._reduction_extest_SDbinding) + @pytest.mark.parametrize('dtype', TestUtils._test_dtypes) + @pytest.mark.skipif(__OPTYPE != OperatorType.REDUCTION, reason='not reduction operator') + def test_reduction_cases(self, shape, dim, dtype, clear_cache): + pass + +if __name__ == '__main__': + TestGt() + + + + + diff --git a/test/_inductor/test_high_order_sum.py 
b/test/_inductor/test_high_order_sum.py
new file mode 100644
index 0000000000..8b48e963e3
--- /dev/null
+++ b/test/_inductor/test_high_order_sum.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved.
+import torch.nn.functional as F
+import torch
+import torch_npu
+import torch_npu._inductor
+
+def op_sum(npu_dropout_backward_9):
+    view_337: "f32[32768, 256]" = torch.ops.aten.view.default(npu_dropout_backward_9, [32768, 256]);
+    sum_63: "f32[1, 256]" = torch.ops.aten.sum.dim_IntList(view_337, [0], True);
+    view_338: "f32[256]" = torch.ops.aten.view.default(sum_63, [256]);
+    return view_338
+
+device = 'npu'
+
+def test_high_order_sum():
+    npu_dropout_backward_9 = torch.randn((32768, 256), device=device, dtype=torch.float32)
+    ref = op_sum(npu_dropout_backward_9)
+    func = torch.compile(op_sum, backend="inductor", dynamic=False)
+    calc = func(npu_dropout_backward_9)
+
+    torch.testing.assert_close(ref, calc, rtol=1e-3, atol=1e-3)
+    torch.testing.assert_close(ref, calc, rtol=1e-3, atol=1e-3)
+
+if __name__ == "__main__":
+    npu_dropout_backward_9 = torch.randn((32768, 256), device=device, dtype=torch.float32)
+    ref = op_sum(npu_dropout_backward_9)
+    func = torch.compile(op_sum, backend="inductor", dynamic=False)
+    calc = func(npu_dropout_backward_9)
+
+    torch.testing.assert_close(ref, calc, rtol=1e-3, atol=1e-3)
+    torch.testing.assert_close(ref, calc, rtol=1e-3, atol=1e-3)
+
+    experimental_config = torch_npu.profiler._ExperimentalConfig(
+        aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization,
+        profiler_level=torch_npu.profiler.ProfilerLevel.Level1, l2_cache=False
+    )
+    with torch_npu.profiler.profile(
+            activities=[  # torch_npu.profiler.ProfilerActivity.CPU,
+                torch_npu.profiler.ProfilerActivity.NPU],
+            with_stack=False,  # optional: collect the Python call stack of torch ops, off by default
+            record_shapes=False,  # optional: record input shapes and dtypes of torch ops, off by default
+            profile_memory=False,  # optional: collect memory-related data, off by default
+            schedule=torch_npu.profiler.schedule(wait=1,
+                                                 warmup=1,
+                                                 active=10,
+                                                 repeat=1,
+                                                 skip_first=1),
+            # schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=1, skip_first=6),
+            # warmup defaults to 0; older torch_npu releases require it to be set explicitly
+            experimental_config=experimental_config,  # optional, defaults to Level0
+            # where the generated profiling files are written
+            on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./result_dir")
+            # exports data viewable in TensorBoard; worker_name can be set and defaults to {hostname}_{pid}
+    ) as prof:
+        for i in range(20):
+            # ref1 = call(args)
+            op_sum(npu_dropout_backward_9)
+            func(npu_dropout_backward_9)
+            prof.step()
+
+
+
+
+
diff --git a/test/_inductor/test_issue54.py b/test/_inductor/test_issue54.py
new file mode 100644
index 0000000000..2f532c059b
--- /dev/null
+++ b/test/_inductor/test_issue54.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
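+# Regression test for issue 54: an attention-style subgraph (addmm, view,
+# permute and bmm nodes lifted from a traced layernorm/attention graph) is run
+# eagerly and through the inductor backend, the outputs are compared, and the
+# two versions are then benchmarked against each other.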
+import torch.nn.functional as F +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from torch.nn import CrossEntropyLoss +from torch import nn +from test2.npu_indexing.utils import benchmark_test + + +class Test_issue54(): + def func_layernorm(self, add_3, primals_6, primals_7, view, primals_9, permute_1, primals_10, primals_11): + # 原网络 + permute: "f32[256, 256]" = torch.ops.aten.permute.default(primals_6, [1, 0]); + addmm: "f32[32768, 256]" = torch.ops.aten.addmm.default(primals_7, view, permute); + view_1: "f32[64, 512, 256]" = torch.ops.aten.view.default(addmm, [64, 512, 256]); + addmm_1: "f32[32768, 256]" = torch.ops.aten.addmm.default(primals_9, view, permute_1); + view_3: "f32[64, 512, 256]" = torch.ops.aten.view.default(addmm_1, [64, 512, 256]); + view_4: "f32[64, 512, 4, 64]" = torch.ops.aten.view.default(view_3, [64, 512, 4, 64]); + permute_2: "f32[64, 4, 512, 64]" = torch.ops.aten.permute.default(view_4, [0, 2, 1, 3]); + permute_3: "f32[256, 256]" = torch.ops.aten.permute.default(primals_10, [1, 0]); + addmm_2: "f32[32768, 256]" = torch.ops.aten.addmm.default(primals_11, view, permute_3); + view_6: "f32[64, 512, 256]" = torch.ops.aten.view.default(addmm_2, [64, 512, 256]); + + view_8: "f32[64, 512, 4, 64]" = torch.ops.aten.view.default(view_1, [64, 512, 4, 64]); + permute_5: "f32[64, 4, 512, 64]" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); + + permute_6: "f32[64, 4, 64, 512]" = torch.ops.aten.permute.default(permute_2, [0, 1, 3, 2]); + expand_1: "f32[64, 4, 512, 64]" = torch.ops.aten.expand.default(permute_5, [64, 4, 512, 64]) + clone: "f32[64, 4, 512, 64]" = torch.ops.aten.clone.default(expand_1, memory_format=torch.contiguous_format); + view_9: "f32[256, 512, 64]" = torch.ops.aten.view.default(clone, [256, 512, 64]); + expand_2: "f32[64, 4, 64, 512]" = torch.ops.aten.expand.default(permute_6, [64, 4, 64, 512]) + clone_1: "f32[64, 4, 64, 512]" = torch.ops.aten.clone.default(expand_2, memory_format=torch.contiguous_format); + view_10: "f32[256, 64, 512]" = torch.ops.aten.view.default(clone_1, [256, 64, 512]); + bmm: "f32[256, 512, 512]" = torch.ops.aten.bmm.default(view_9, view_10); + view_7: "f32[64, 512, 4, 64]" = torch.ops.aten.view.default(view_6, [64, 512, 4, 64]); + permute_4: "f32[64, 4, 512, 64]" = torch.ops.aten.permute.default(view_7, [0, 2, 1, 3]); + expand_4: "f32[64, 4, 512, 64]" = torch.ops.aten.expand.default(permute_4, [64, 4, 512, 64]) + clone_2: "f32[64, 4, 512, 64]" = torch.ops.aten.clone.default(expand_4, memory_format=torch.contiguous_format); + view_13: "f32[256, 512, 64]" = torch.ops.aten.view.default(clone_2, [256, 512, 64]); + + return bmm, view_13 + + def test_issue54(self): + device = 'npu' + test = Test_issue54() + # add_3, primals_6, primals_7, view, primals_9, permute_1, primals_10, primals_11 + + add_3 = torch.randn((64, 512, 256), device=device, dtype=torch.float32) + primals_6 = torch.randn((256, 256), device=device, dtype=torch.float32) + primals_7 = torch.randn((256), device=device, dtype=torch.float32) + view = torch.randn((32768, 256), device=device, dtype=torch.float32) + primals_9 = torch.randn((256), device=device, dtype=torch.float32) + permute_1 = torch.randn((256, 256), device=device, dtype=torch.float32) + primals_10 = torch.randn((256, 256), device=device, dtype=torch.float32) + primals_11 = torch.randn((256), device=device, dtype=torch.float32) + + ref = test.func_layernorm(add_3, primals_6, primals_7, view, primals_9, permute_1, primals_10, primals_11) + func = 
torch.compile(test.func_layernorm, backend="inductor", dynamic=False, + options={"unroll_reductions_threshold": 1, "aggressive_fusion": True}) + calc = func(add_3, primals_6, primals_7, view, primals_9, permute_1, primals_10, primals_11) + torch.testing.assert_close(ref[0], calc[0], rtol=1e-3, atol=1e-3) + torch.testing.assert_close(ref[1], calc[1], rtol=1e-3, atol=1e-3) + print("valid ok") + + benchmark_test(test.func_layernorm, func, + args=(add_3, primals_6, primals_7, view, primals_9, permute_1, primals_10, primals_11,), + name="test_layernorm", times=10, repeat=10, profile=False) + + +if __name__ == "__main__": + test = Test_issue54() + test.test_issue54() \ No newline at end of file diff --git a/test/_inductor/test_issue57.py b/test/_inductor/test_issue57.py new file mode 100644 index 0000000000..5ad6be8e2d --- /dev/null +++ b/test/_inductor/test_issue57.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. +import torch.nn.functional as F +import torch +import torch_npu +import torch_npu._inductor +import pytest +from test2.npu_indexing.utils import benchmark_test + + +class Test_issue57(): + def op_sum(self, view_12, embedding_1, slice_11): + # 原网络 + + permute_7 = torch.ops.aten.permute.default(embedding_1, [2, 0, 1]); + embedding_1 = None + unsqueeze_4 = torch.ops.aten.unsqueeze.default(permute_7, 0); + permute_7 = None + + add_5 = torch.ops.aten.add.Tensor(unsqueeze_4, slice_11); + slice_8 = slice_11 = None + add_6 = torch.ops.aten.add.Tensor(view_12, add_5); + view_12 = None + return add_6 + + def test_issue57(self): + device = 'npu' + test = Test_issue57() + embedding_1 = torch.randn((512, 512, 64), device=device, dtype=torch.float32) + primals_221 = torch.randn((1, 1, 1, 512), device=device, dtype=torch.float32) + view_12 = torch.randn((1, 64, 512, 512), device=device, dtype=torch.float32) + slice_11 = torch.randn((1, 1, 1, 512), device=device, dtype=torch.float32) + + ref = test.op_sum(view_12, embedding_1, primals_221) + func = torch.compile(test.op_sum, backend="inductor", dynamic=False) + calc = func(view_12, embedding_1, primals_221) + + torch.testing.assert_close(ref, calc, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(ref, calc, rtol=1e-3, atol=1e-3) + + print("valid ok") + benchmark_test(test.op_sum, func, args=(view_12, embedding_1, primals_221), + name="issue57", times=10, repeat=10, profile=False) + + +if __name__ == "__main__": + test = Test_issue57() + test.test_issue57() \ No newline at end of file diff --git a/test/_inductor/test_issue59.py b/test/_inductor/test_issue59.py new file mode 100644 index 0000000000..a1644749e4 --- /dev/null +++ b/test/_inductor/test_issue59.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
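+# Regression test for issue 59: a hand-written layernorm-style computation
+# (mean, variance, rsqrt and an affine add) compared between eager mode and the
+# inductor backend, followed by a benchmark run.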
+import torch +import torch_npu +import torch_npu._inductor +import pytest +from test2.npu_indexing.utils import benchmark_test + + +class Test_issue59(): + def layernorm_backward(self, x, y, z): + sum = torch.sum(x) + mean = sum / torch.numel(sum) + sub = x - mean + sqr = sub * sub + sum_1 = torch.sum(sqr) + mean_1 = sum_1 / torch.numel(sum_1) + 1e-05 + rsqrt = torch.rsqrt(mean_1) + mul = sub * rsqrt + mul_1 = mul * y + add = mul_1 + z + mean_2 = rsqrt / torch.numel(rsqrt) + return mul, add, mean_2 + + def test_issue59(self): + device = 'npu' + test = Test_issue59() + x = torch.randn((1, 1024), device=device, dtype=torch.float32) + y = torch.randn((1, 1024), device=device, dtype=torch.float32) + z = torch.randn((1, 1024), device=device, dtype=torch.float32) + + mul, add, mean_2 = test.layernorm_backward(x, y, z) + func = torch.compile(test.layernorm_backward, backend="inductor", dynamic=False) + mul_t, add_t, mean_2_t = func(x, y, z) + + torch.testing.assert_close(mul, mul_t, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(add, add_t, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(mean_2, mean_2_t, rtol=1e-3, atol=1e-3) + + print("valid ok") + benchmark_test(test.layernorm_backward, func, args=(x, y, z), + name="issue59", times=10, repeat=10, profile=False) + + +if __name__ == "__main__": + test = Test_issue59() + test.test_issue59() diff --git a/test/_inductor/test_issue62.py b/test/_inductor/test_issue62.py new file mode 100644 index 0000000000..075b45a7b0 --- /dev/null +++ b/test/_inductor/test_issue62.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. +import torch +import torch_npu +import triton +import triton.language as tl +import torch_npu._inductor +import pytest + + +# 实际就是 layernorm的计算过程 : torch.nn.LayerNorm(convert_element_type_25, elementwise_affine=False, eps=1e-6) +class Test_issue62(): + def op_func(self, addmm_5, add): + split = torch.ops.aten.split.Tensor(addmm_5, 1536, 1) + getitem = split[0] + getitem_1 = split[1] + getitem_2 = split[2] + getitem_3 = split[3] + getitem_4 = split[4] + getitem_5 = split[5] + + clone_1 = torch.ops.aten.clone.default(add, memory_format=torch.contiguous_format) + convert_element_type_25 = torch.ops.prims.convert_element_type.default(clone_1, torch.float32) + var_mean = torch.ops.aten.var_mean.correction(convert_element_type_25, [2], correction=0, keepdim=True) + getitem_6 = var_mean[0] + getitem_7 = var_mean[1] + add_3 = torch.ops.aten.add.Tensor(getitem_6, 1e-06) + rsqrt = torch.ops.aten.rsqrt.default(add_3) + sub = torch.ops.aten.sub.Tensor(clone_1, getitem_7) + mul_7 = torch.ops.aten.mul.Tensor(sub, rsqrt) + convert_element_type_26 = torch.ops.prims.convert_element_type.default(mul_7, torch.float16) + slice_11 = torch.ops.aten.slice.Tensor(getitem_1, 0, 0, 9223372036854775807) + unsqueeze_2 = torch.ops.aten.unsqueeze.default(slice_11, 1) + add_4 = torch.ops.aten.add.Tensor(unsqueeze_2, 1) + mul_8 = torch.ops.aten.mul.Tensor(convert_element_type_26, add_4) + slice_12 = torch.ops.aten.slice.Tensor(getitem, 0, 0, 9223372036854775807) + unsqueeze_3 = torch.ops.aten.unsqueeze.default(slice_12, 1) + add_5 = torch.ops.aten.add.Tensor(mul_8, unsqueeze_3) + return add_5 + + def test_issue62(self): + test = Test_issue62() + addmm_5 = torch.randn((2, 9216), device='npu:0', dtype=torch.float16) + add = torch.randn((2, 4096, 1536), device='npu:0', dtype=torch.float16) + + std_ret = test.op_func(addmm_5, add) + compiled_func = torch.compile(test.op_func, 
backend="inductor") + inductor_ret = compiled_func(addmm_5, add) + assert torch.allclose(std_ret, inductor_ret, atol=1e-2, rtol=1e-2), "Tensors are not close enough!" + print("valid ok") + + +if __name__ == "__main__": + test = Test_issue62() + test.test_issue62() \ No newline at end of file diff --git a/test/_inductor/test_issue70.py b/test/_inductor/test_issue70.py new file mode 100644 index 0000000000..6b8410bb1d --- /dev/null +++ b/test/_inductor/test_issue70.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor +import torch.nn as nn +import pytest + + +class Test_issue70(): + + def op_forward(self, x): + return x.mean(-1) + + + def test_issue70(self): + test = Test_issue70() + compiled_net = torch.compile(test.op_forward, backend="inductor") + + input = torch.randn((1, 1, 7168)).npu() + + output = test.op_forward(input) + output1 = compiled_net(input) + torch.testing.assert_allclose(output, output1, rtol=1e-03, atol=1e-03) + print("valid ok") + + +if __name__ == "__main__": + test = Test_issue70() + test.test_issue70() diff --git a/test/_inductor/test_opensora_graph1.py b/test/_inductor/test_opensora_graph1.py new file mode 100644 index 0000000000..5b7f35992f --- /dev/null +++ b/test/_inductor/test_opensora_graph1.py @@ -0,0 +1,343 @@ +import torch +import torch_npu +import torch_npu._inductor +import pytest +__TIME_LIMIT = 100 +from torch import device +device_npu = 'npu' + +@pytest.mark.timeout(__TIME_LIMIT) +def test_opensora_cases_model_9_inference(): + def forward(primals_1: "f32[1, 9600, 2304]"): + permute: "f32[9600, 1, 2304]" = torch.ops.aten.permute.default(primals_1, [1, 0, 2]); + return permute + primals_2 = torch.randn((1, 9600, 2304), device = device_npu, dtype=torch.float32) + ref = forward(primals_2) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_2) + assert torch.allclose(ref, calc, equal_nan=True, rtol=1e-4, atol=1e-4) + primals_3 = torch.randn((1, 512, 2304), device=device_npu, dtype=torch.float32) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_3) + ref = forward(primals_3) + assert torch.allclose(ref, calc, equal_nan=True, rtol=1e-4, atol=1e-4) + primals_4 = torch.randn((9600, 1, 2304), device=device_npu, dtype=torch.float32) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_4) + ref = forward(primals_4) + assert torch.allclose(ref, calc, equal_nan=True, rtol=1e-4, atol=1e-4) + +@pytest.mark.skip +@pytest.mark.timeout(__TIME_LIMIT) +def test_opensora_cases_model_11_inference(): + def forward(arg0_1: "f32[1, 1, 9600]", arg1_1: "f32[1, 1, 512]"): + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:119 in prepare_sparse_mask, code: video_mask = video_mask.unsqueeze(1) + unsqueeze: "f32[1, 1, 1, 9600]" = torch.ops.aten.unsqueeze.default(arg0_1, 1); + arg0_1 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:120 in prepare_sparse_mask, code: prompt_mask = prompt_mask.unsqueeze(1) + unsqueeze_1: "f32[1, 1, 1, 512]" = torch.ops.aten.unsqueeze.default(arg1_1, 1); + arg1_1 = None + # File: /root/anaconda3/envs/inductor2.3_sora/lib/python3.9/site-packages/torch/nn/functional.py:4522 in pad, code: return torch._C._nn.pad(input, pad, mode, value) + constant_pad_nd: "f32[1, 1, 1, 9600]" = 
torch.ops.aten.constant_pad_nd.default(unsqueeze, [0, 0, 0, 0], + -9980.0); + unsqueeze = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:128 in prepare_sparse_mask, code: video_mask_sparse_1d = rearrange( + view: "f32[1, 9600, 1]" = torch.ops.aten.view.default(constant_pad_nd, [1, 9600, 1]) + permute: "f32[1, 1, 9600]" = torch.ops.aten.permute.default(view, [2, 0, 1]); + view = None + view_1: "f32[1, 1, 1, 9600]" = torch.ops.aten.view.default(permute, [1, 1, 1, 9600]); + permute = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:133 in prepare_sparse_mask, code: video_mask_sparse_1d_group = rearrange( + view_2: "f32[1, 9600, 1, 1]" = torch.ops.aten.view.default(constant_pad_nd, [1, 9600, 1, 1]); + constant_pad_nd = None + permute_1: "f32[1, 1, 9600, 1]" = torch.ops.aten.permute.default(view_2, [2, 0, 1, 3]); + view_2 = None + view_3: "f32[1, 1, 1, 9600]" = torch.ops.aten.view.default(permute_1, [1, 1, 1, 9600]); + permute_1 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:139 in prepare_sparse_mask, code: prompt_mask_sparse = prompt_mask.repeat(sparse_n, 1, 1, 1) + repeat: "f32[1, 1, 1, 512]" = torch.ops.aten.repeat.default(unsqueeze_1, [1, 1, 1, 1]); + unsqueeze_1 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:142 in get_attention_mask, code: mask = mask.to(torch.bool) + npu_dtype_cast: "b8[1, 1, 1, 9600]" = torch.ops.npu.npu_dtype_cast.default(view_1, torch.bool); + view_1 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:143 in get_attention_mask, code: mask = mask.repeat(1, 1, repeat_num, 1) + repeat_1: "b8[1, 1, 9600, 9600]" = torch.ops.aten.repeat.default(npu_dtype_cast, [1, 1, 9600, 1]); + npu_dtype_cast = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:142 in get_attention_mask, code: mask = mask.to(torch.bool) + npu_dtype_cast_1: "b8[1, 1, 1, 9600]" = torch.ops.npu.npu_dtype_cast.default(view_3, torch.bool); + view_3 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:143 in get_attention_mask, code: mask = mask.repeat(1, 1, repeat_num, 1) + repeat_2: "b8[1, 1, 9600, 9600]" = torch.ops.aten.repeat.default(npu_dtype_cast_1, [1, 1, 9600, 1]); + npu_dtype_cast_1 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:142 in get_attention_mask, code: mask = mask.to(torch.bool) + npu_dtype_cast_2: "b8[1, 1, 1, 512]" = torch.ops.npu.npu_dtype_cast.default(repeat, torch.bool); + repeat = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:143 in get_attention_mask, code: mask = mask.repeat(1, 1, repeat_num, 1) + repeat_3: "b8[1, 1, 9600, 512]" = torch.ops.aten.repeat.default(npu_dtype_cast_2, [1, 1, 9600, 1]); + npu_dtype_cast_2 = None + return (repeat_1, repeat_3, repeat_2) + arg0_1 = torch.rand((1, 1, 9600), device=device_npu, dtype=torch.float32) + arg1_1 = torch.rand((1, 1, 512), device=device_npu, dtype=torch.float32) + ref = forward(arg0_1, arg1_1) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(arg0_1, arg1_1) + for i in range(len(ref)): + print(ref[i]) + assert torch.allclose(ref[i], calc[i], equal_nan=True, rtol=1e-4, atol=1e-4) + +@pytest.mark.skip +@pytest.mark.timeout(__TIME_LIMIT) +def test_opensora_cases_model_14_backward(): + def 
forward(primals_5: "f32[1, 9600, 2304]", getitem_3: "f32[1, 9600, 1]", rsqrt: "f32[1, 9600, 1]", + add_2: "f32[1, 1, 2304]", view: "f32[9600, 2304]", permute_1: "f32[32, 2304]", + tangents_1: "f32[1, 9600, 32]"): + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:384 in _get_output_for_patched_inputs, code: latents = self.norm_out(latents) + sub: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(primals_5, getitem_3); + primals_5 = getitem_3 = None + mul: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(sub, rsqrt); + sub = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:387 in _get_output_for_patched_inputs, code: latents = self.proj_out(latents) + view_2: "f32[9600, 32]" = torch.ops.aten.view.default(tangents_1, [9600, 32]); + tangents_1 = None + mm: "f32[9600, 2304]" = torch.ops.aten.mm.default(view_2, permute_1); + permute_1 = None + permute_2: "f32[32, 9600]" = torch.ops.aten.permute.default(view_2, [1, 0]) + mm_1: "f32[32, 2304]" = torch.ops.aten.mm.default(permute_2, view); + permute_2 = view = None + permute_3: "f32[2304, 32]" = torch.ops.aten.permute.default(mm_1, [1, 0]); + mm_1 = None + sum_1: "f32[1, 32]" = torch.ops.aten.sum.dim_IntList(view_2, [0], True); + view_2 = None + view_3: "f32[32]" = torch.ops.aten.view.default(sum_1, [32]); + sum_1 = None + permute_4: "f32[32, 2304]" = torch.ops.aten.permute.default(permute_3, [1, 0]); + permute_3 = None + view_4: "f32[1, 9600, 2304]" = torch.ops.aten.view.default(mm, [1, 9600, 2304]); + mm = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:386 in _get_output_for_patched_inputs, code: latents = latents * (1 + scale) + shift + sum_2: "f32[1, 1, 2304]" = torch.ops.aten.sum.dim_IntList(view_4, [1], True) + mul_2: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(view_4, mul) + mul_3: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(view_4, add_2); + view_4 = add_2 = None + sum_3: "f32[1, 1, 2304]" = torch.ops.aten.sum.dim_IntList(mul_2, [1], True); + mul_2 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:384 in _get_output_for_patched_inputs, code: latents = self.norm_out(latents) + mul_5: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul_3, 2304) + sum_4: "f32[1, 9600, 1]" = torch.ops.aten.sum.dim_IntList(mul_3, [2], True) + mul_6: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul_3, mul); + mul_3 = None + sum_5: "f32[1, 9600, 1]" = torch.ops.aten.sum.dim_IntList(mul_6, [2], True); + mul_6 = None + mul_7: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul, sum_5); + mul = sum_5 = None + sub_2: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(mul_5, sum_4); + mul_5 = sum_4 = None + sub_3: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(sub_2, mul_7); + sub_2 = mul_7 = None + div: "f32[1, 9600, 1]" = torch.ops.aten.div.Tensor(rsqrt, 2304); + rsqrt = None + mul_8: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(div, sub_3); + div = sub_3 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:383 in _get_output_for_patched_inputs, code: shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1) + cat: "f32[1, 2, 2304]" = torch.ops.aten.cat.default([sum_2, sum_3], 1); + sum_2 = sum_3 = None + sum_6: "f32[1, 1, 2304]" = torch.ops.aten.sum.dim_IntList(cat, [1], True) + squeeze_1: "f32[1, 2304]" = torch.ops.aten.squeeze.dim(sum_6, 1); + sum_6 = None + full_default: "f32[1, 2304]" = 
torch.ops.aten.full.default([1, 2304], 0, dtype=torch.float32, + layout=torch.strided, + device=device(type='npu', index=0), pin_memory=False) + slice_scatter: "f32[1, 2304]" = torch.ops.aten.slice_scatter.default(full_default, squeeze_1, 0, 0, + 9223372036854775807); + full_default = squeeze_1 = None + squeeze_2: "f32[2, 2304]" = torch.ops.aten.squeeze.dim(cat, 0); + cat = None + return [squeeze_2, permute_4, view_3, slice_scatter, mul_8] + primals_5 = torch.randn((1, 9600, 2304), device=device_npu, dtype=torch.float32) + getitem_3 = torch.randn((1, 9600, 1), device=device_npu, dtype=torch.float32) + rsqrt = torch.randn((1, 9600, 1), device=device_npu, dtype=torch.float32) + add_2 = torch.randn((1, 1, 2304), device=device_npu, dtype=torch.float32) + view = torch.randn((9600, 2304), device=device_npu, dtype=torch.float32) + permute_1 = torch.randn((32, 2304), device=device_npu, dtype=torch.float32) + tangents_1 = torch.randn((1, 9600, 32), device=device_npu, dtype=torch.float32) + ref = forward(primals_5, getitem_3, rsqrt, + add_2, view, permute_1,tangents_1) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_5, getitem_3, rsqrt, + add_2, view, permute_1,tangents_1) + for i in range(len(ref)): + # 1e-3 can not pass, should check reduction accuracy + assert torch.allclose(ref[i], calc[i], equal_nan=True, rtol=1e-4, atol=1e-4) + +@pytest.mark.timeout(__TIME_LIMIT) +def test_opensora_cases_model_14_forward(): + def forward(primals_1: "f32[2, 2304]", primals_2: "f32[32, 2304]", primals_3: "f32[32]", + primals_4: "f32[1, 2304]", primals_5: "f32[1, 9600, 2304]"): + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:383 in _get_output_for_patched_inputs, code: shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1) + unsqueeze: "f32[1, 2, 2304]" = torch.ops.aten.unsqueeze.default(primals_1, 0); + primals_1 = None + slice_1: "f32[1, 2304]" = torch.ops.aten.slice.Tensor(primals_4, 0, 0, 9223372036854775807); + primals_4 = None + unsqueeze_1: "f32[1, 1, 2304]" = torch.ops.aten.unsqueeze.default(slice_1, 1); + slice_1 = None + add: "f32[1, 2, 2304]" = torch.ops.aten.add.Tensor(unsqueeze, unsqueeze_1); + unsqueeze = unsqueeze_1 = None + split = torch.ops.aten.split.Tensor(add, 1, 1); + add = None + getitem: "f32[1, 1, 2304]" = split[0] + getitem_1: "f32[1, 1, 2304]" = split[1]; + split = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:384 in _get_output_for_patched_inputs, code: latents = self.norm_out(latents) + var_mean = torch.ops.aten.var_mean.correction(primals_5, [2], correction=0, keepdim=True) + getitem_2: "f32[1, 9600, 1]" = var_mean[0] + getitem_3: "f32[1, 9600, 1]" = var_mean[1]; + var_mean = None + add_1: "f32[1, 9600, 1]" = torch.ops.aten.add.Tensor(getitem_2, 1e-06); + getitem_2 = None + rsqrt: "f32[1, 9600, 1]" = torch.ops.aten.rsqrt.default(add_1); + add_1 = None + sub: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(primals_5, getitem_3) + mul: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(sub, rsqrt); + sub = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:386 in _get_output_for_patched_inputs, code: latents = latents * (1 + scale) + shift + add_2: "f32[1, 1, 2304]" = torch.ops.aten.add.Tensor(getitem_1, 1); + getitem_1 = None + mul_1: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul, add_2); + mul = None + add_3: "f32[1, 9600, 2304]" = 
torch.ops.aten.add.Tensor(mul_1, getitem); + mul_1 = getitem = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:387 in _get_output_for_patched_inputs, code: latents = self.proj_out(latents) + view: "f32[9600, 2304]" = torch.ops.aten.view.default(add_3, [9600, 2304]); + add_3 = None + permute: "f32[2304, 32]" = torch.ops.aten.permute.default(primals_2, [1, 0]); + primals_2 = None + addmm: "f32[9600, 32]" = torch.ops.aten.addmm.default(primals_3, view, permute); + primals_3 = None + view_1: "f32[1, 9600, 32]" = torch.ops.aten.view.default(addmm, [1, 9600, 32]); + addmm = None + # No stacktrace found for following nodes + squeeze: "f32[1, 9600, 32]" = torch.ops.aten.squeeze.dim(view_1, 1); + view_1 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:387 in _get_output_for_patched_inputs, code: latents = self.proj_out(latents) + permute_1: "f32[32, 2304]" = torch.ops.aten.permute.default(permute, [1, 0]); + permute = None + return [squeeze, primals_5, getitem_3, rsqrt, add_2, view, permute_1] + primals_1 = torch.ones((2, 2304), device=device_npu, dtype=torch.float32) + primals_2 = torch.ones((32, 2304), device=device_npu, dtype=torch.float32) + primals_3 = torch.ones((32,), device=device_npu, dtype=torch.float32) + primals_4 = torch.ones((1, 2304), device=device_npu, dtype=torch.float32) + primals_5 = torch.ones((1, 9600, 2304), device=device_npu, dtype=torch.float32) + ref = forward(primals_1, primals_2, primals_3,primals_4, primals_5) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_1, primals_2, primals_3,primals_4, primals_5) + for i in range(len(ref)): + assert torch.allclose(ref[i], calc[i], equal_nan=True, rtol=1e-4, atol=1e-4) + +@pytest.mark.timeout(__TIME_LIMIT) +def test_opensora_cases_model_15_forward(): + def forward(primals_1: "f32[1, 8, 30, 40, 1, 2, 2, 8]", primals_2: "i64[]", primals_3: "i64[]", + primals_4: "i64[]"): + permute: "f32[1, 8, 8, 1, 30, 2, 40, 2]" = torch.ops.aten.permute.default(primals_1, [0, 7, 1, 4, 2, 5, 3, 6]); + mul: "i64[]" = torch.ops.aten.mul.Tensor(primals_2, 1); + mul_1: "i64[]" = torch.ops.aten.mul.Tensor(primals_3, 2); + mul_2: "i64[]" = torch.ops.aten.mul.Tensor(primals_4, 2); + return [permute, mul, mul_1, mul_2] + + primals_1 = torch.randn((1, 8, 30, 40, 1, 2, 2, 8), device=device_npu, dtype=torch.float32) + primals_2 = torch.tensor((1), device=device_npu, dtype=torch.int64) + primals_3 = torch.tensor((1), device=device_npu, dtype=torch.int64) + primals_4 = torch.tensor((1), device=device_npu, dtype=torch.int64) + ref = forward(primals_1, primals_2, primals_3, + primals_4) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_1, primals_2, primals_3, + primals_4) + for i in range(len(ref)): + assert torch.allclose(ref[i], calc[i], equal_nan=True, rtol=1e-4, atol=1e-4) + +def find_first_mismatch(output_calc, out, rtol=1e-2, atol=1e-2): + for index in torch.cartesian_prod(*[torch.arange(s) for s in output_calc.shape]): + index = tuple(index.tolist()) + diff = torch.abs(output_calc[index] - out[index]) + rel_diff = diff / torch.abs(out[index]) if torch.abs(out[index]) > 0 else 0 + if diff > atol or rel_diff > rtol: + return index + return None + +@pytest.mark.skip +@pytest.mark.timeout(__TIME_LIMIT) +def test_opensora_cases_model_16_forward(): + def forward(primals_1: "f32[2, 2304]", primals_2: "f32[32, 2304]", primals_3: "f32[32]", primals_4: "f32[1, 2304]", 
primals_5: "f32[1, 9600, 2304]"): + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:407 in _get_output_for_patched_inputs, code: shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1) + unsqueeze: "f32[1, 2, 2304]" = torch.ops.aten.unsqueeze.default(primals_1, 0); primals_1 = None + slice_1: "f32[1, 2304]" = torch.ops.aten.slice.Tensor(primals_4, 0, 0, 9223372036854775807); primals_4 = None + unsqueeze_1: "f32[1, 1, 2304]" = torch.ops.aten.unsqueeze.default(slice_1, 1); slice_1 = None + add: "f32[1, 2, 2304]" = torch.ops.aten.add.Tensor(unsqueeze, unsqueeze_1); unsqueeze = unsqueeze_1 = None + split = torch.ops.aten.split.Tensor(add, 1, 1); add = None + getitem: "f32[1, 1, 2304]" = split[0] + getitem_1: "f32[1, 1, 2304]" = split[1]; split = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:408 in _get_output_for_patched_inputs, code: latents = self.norm_out(latents) + var_mean = torch.ops.aten.var_mean.correction(primals_5, [2], correction = 0, keepdim = True) + getitem_2: "f32[1, 9600, 1]" = var_mean[0] + getitem_3: "f32[1, 9600, 1]" = var_mean[1]; var_mean = None + add_1: "f32[1, 9600, 1]" = torch.ops.aten.add.Tensor(getitem_2, 1e-06); getitem_2 = None + rsqrt: "f32[1, 9600, 1]" = torch.ops.aten.rsqrt.default(add_1); add_1 = None + sub: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(primals_5, getitem_3) + mul: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:410 in _get_output_for_patched_inputs, code: latents = latents * (1 + scale) + shift + add_2: "f32[1, 1, 2304]" = torch.ops.aten.add.Tensor(getitem_1, 1); getitem_1 = None + mul_1: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul, add_2); mul = None + add_3: "f32[1, 9600, 2304]" = torch.ops.aten.add.Tensor(mul_1, getitem); mul_1 = getitem = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:411 in _get_output_for_patched_inputs, code: latents = self.proj_out(latents) + view: "f32[9600, 2304]" = torch.ops.aten.view.default(add_3, [9600, 2304]); add_3 = None + permute: "f32[2304, 32]" = torch.ops.aten.permute.default(primals_2, [1, 0]); primals_2 = None + addmm: "f32[9600, 32]" = torch.ops.aten.addmm.default(primals_3, view, permute); primals_3 = None + #import pdb;pdb.set_trace() + view_1: "f32[1, 9600, 32]" = torch.ops.aten.view.default(addmm, [1, 9600, 32]); + # No stacktrace found for following nodes + squeeze: "f32[1, 9600, 32]" = torch.ops.aten.squeeze.dim(view_1, 1); + # import pdb; + # pdb.set_trace() + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:418 in _get_output_for_patched_inputs, code: latents = latents.reshape( + view_2: "f32[1, 8, 30, 40, 1, 2, 2, 8]" = torch.ops.aten.view.default(squeeze, [1, 8, 30, 40, 1, 2, 2, 8]); squeeze = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:428 in _get_output_for_patched_inputs, code: latents = latents.permute(0, 7, 1, 4, 2, 5, 3, 6).contiguous() + permute_1: "f32[1, 8, 8, 1, 30, 2, 40, 2]" = torch.ops.aten.permute.default(view_2, [0, 7, 1, 4, 2, 5, 3, 6]); view_2 = None + clone: "f32[1, 8, 8, 1, 30, 2, 40, 2]" = torch.ops.aten.clone.default(permute_1); permute_1 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:429 in _get_output_for_patched_inputs, code: output = latents.reshape( + 
clone_1: "f32[1, 8, 8, 1, 30, 2, 40, 2]" = torch.ops.aten.clone.default(clone, memory_format = torch.contiguous_format); clone = None + view_3: "f32[1, 8, 8, 60, 80]" = torch.ops.aten.view.default(clone_1, [1, 8, 8, 60, 80]); clone_1 = None + # File: /home/w00685865/osl1.3/mindspeed_mm/models/predictor/dits/video_dit_sparse.py:411 in _get_output_for_patched_inputs, code: latents = self.proj_out(latents) + permute_3: "f32[32, 2304]" = torch.ops.aten.permute.default(permute, [1, 0]); permute = None + return [view_3, primals_5, getitem_3, rsqrt, add_2, view, permute_3] + + import random + import numpy as np + import os + def seed_all(seed=1234, mode=False): + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.use_deterministic_algorithms(mode) + torch_npu.npu.manual_seed_all(seed) + torch_npu.npu.manual_seed(seed) + + seed_all(True) + primals_1 = torch.randn((2, 2304), device=device_npu,dtype=torch.float32) + print(primals_1) + primals_2 = torch.randn((32, 2304), device=device_npu,dtype=torch.float32) + primals_3 = torch.randn((32,), device=device_npu,dtype=torch.float32) + primals_4 = torch.randn((1, 2304), device=device_npu,dtype=torch.float32) + primals_5 = torch.randn((1, 9600, 2304), device=device_npu,dtype=torch.float32) + + ref = forward(primals_1, primals_2, primals_3, primals_4, primals_5) + forward_calc = torch.compile(forward, backend="inductor", dynamic=False) + calc = forward_calc(primals_1, primals_2, primals_3, primals_4, primals_5) + for i in range(len(ref)): + print("i=", i) + assert torch.allclose(ref[i], calc[i], equal_nan=True, rtol=1e-3, atol=1e-3) + +if __name__ == '__main__': + test_opensora_cases_model_15_forward() + #test_opensora_cases_model_15_forward() + #test_opensora_cases_model_16_forward() diff --git a/test/_inductor/test_permute.py b/test/_inductor/test_permute.py new file mode 100644 index 0000000000..fee2819592 --- /dev/null +++ b/test/_inductor/test_permute.py @@ -0,0 +1,47 @@ +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +torch_npu._inductor.config.enable_npu_indexing = True + + +class TestPermute(TestUtils): + __TIME_LIMIT = 100 + + _permute_dims = [ + (0, 1, 2, 3), (0, 1, 3, 2), (0, 2, 1, 3), (0, 2, 3, 1), + (0, 3, 1, 2), (0, 3, 2, 1), (1, 0, 2, 3), (1, 0, 3, 2), + (1, 2, 0, 3), (1, 2, 3, 0), (1, 3, 0, 2), (1, 3, 2, 0), + (2, 0, 1, 3), (2, 0, 3, 1), (2, 1, 0, 3), (2, 1, 3, 0), + (2, 3, 0, 1), (2, 3, 1, 0), (3, 0, 1, 2), (3, 0, 2, 1), + (3, 1, 0, 2), (3, 1, 2, 0), (3, 2, 0, 1), (3, 2, 1, 0), + ] + + def op_calc(self, a, b, dim): + a = a.permute(dim) + b = b.permute(dim) + y = a + b + return y + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 8, 512, 128)]) + @pytest.mark.parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16', 'int64']) + def test_view_cases(self, shape, dtype, clear_cache): + print(f"shape={shape}") + print(f"dtype={dtype}") + print("npu_indexing={}".format(torch_npu._inductor.config.enable_npu_indexing)) + + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor(shape, dtype) + + for dim in self._permute_dims: + print(f"start to test permute on dim :{dim}") + std_permute = self.op_calc(a, b, dim) + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_permute = compiled_op_calc(a, b, dim) + + torch.testing.assert_close(std_permute, inductor_permute, rtol=1e-3, atol=1e-3) + print("data validation passed.") diff --git 
a/test/_inductor/test_reduction_brocast_add.py b/test/_inductor/test_reduction_brocast_add.py new file mode 100644 index 0000000000..29e86fdae9 --- /dev/null +++ b/test/_inductor/test_reduction_brocast_add.py @@ -0,0 +1,34 @@ +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestSumAdd(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.REDUCTION + + def foo(self,a, b, dim, shape): + y = a + b + y = y.sum(dim) + y = y.unsqueeze(dim) + y = y.broadcast_to(shape) + b + return y + + # case:change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(9, 9, 31, 63)]) + @pytest.mark.parametrize('dim', [0, 1, 2]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes1(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + a, b = [torch.randn(shape, requires_grad=False, dtype=torch.float32, device="npu") for _ in range(2)] + r1 = self.foo(a, b, dim, shape) + func = torch.compile(self.foo, backend="inductor", dynamic=False) + r = func(a, b, dim, shape) + torch.testing.assert_close(r, r1, rtol=1e-3, atol=1e-3) diff --git a/test/_inductor/test_relu.py b/test/_inductor/test_relu.py new file mode 100644 index 0000000000..3107d4d5cb --- /dev/null +++ b/test/_inductor/test_relu.py @@ -0,0 +1,34 @@ +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + + +class TestRelu(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + def op_calc(self, first_element): + result = torch.relu(first_element) + return result + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + torch.testing.assert_close(std_result, inductor_result) + + +if __name__ == '__main__': + TestRelu() diff --git a/test/_inductor/test_renorm.py b/test/_inductor/test_renorm.py new file mode 100644 index 0000000000..f2e55a833d --- /dev/null +++ b/test/_inductor/test_renorm.py @@ -0,0 +1,40 @@ +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestRenorm(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, input_element, dim): + return torch.renorm(input_element, p=2, dim=dim, maxnorm=5) + + # case:change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(32, 64)]) + @pytest.mark.parametrize('dim', [-1]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + print(f"input_element= {input_element}") + std_ret = self.op_calc(input_element, dim) + print(f"std_ret= {std_ret}") + 
compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element, dim) + print(f"inductor_ret= {inductor_ret}") + + assert torch.allclose(std_ret, inductor_ret, equal_nan=True) + + +if __name__ == "__main__": + size = (32, 64) + test = TestRenorm() + test.test_reduction_cases_shapes(size, -1, 'float32', None) + diff --git a/test/_inductor/test_repeat.py b/test/_inductor/test_repeat.py new file mode 100644 index 0000000000..d3df6a138d --- /dev/null +++ b/test/_inductor/test_repeat.py @@ -0,0 +1,40 @@ +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestRepeat(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, input_element, dim): + return input_element.repeat(dim) + + # case:change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(16, 128, 64)]) + @pytest.mark.parametrize('dim', [(1, 1, 2), (1, 2, 1), (2, 1, 1)]) #(2, 3, 4), (1, 2, 3) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + + std_ret = self.op_calc(input_element, dim) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element, dim) + + torch.testing.assert_close(std_ret, inductor_ret, rtol=1e-1, atol=1e-1) + + +if __name__ == "__main__": + size = (16, 512, 64) + dim = (2, 3, 4) + test = TestRepeat() + test.test_reduction_cases_shapes(size, dim, 'float32', None) + diff --git a/test/_inductor/test_reshape.py b/test/_inductor/test_reshape.py new file mode 100644 index 0000000000..910e0c83b4 --- /dev/null +++ b/test/_inductor/test_reshape.py @@ -0,0 +1,39 @@ +import torch +import torch_npu +import pytest +import torch_npu._inductor +from testutils import OperatorType, TestUtils + +torch_npu._inductor.config.enable_npu_indexing = True + +class TestReshape(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + B, N, S, D = (1, 12, 256, 8) + + def op_calc(self, a, b): + a = a.reshape(self.S, self.B, self.N * self.D) + b = b.reshape(self.S, self.B, self.N * self.D) + y = a + b + return y + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(1, 12, 256, 8)]) + @pytest.mark.parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16', 'int64']) + def test_view_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor(shape, dtype) + + print(f"start to test reshape on shape :{shape} ") + std_reshape = self.op_calc(a, b) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_reshape = compiled_op_calc(a, b) + + torch.testing.assert_close(std_reshape, inductor_reshape, rtol=1e-3, atol=1e-3) + + print("data validation passed") + diff --git a/test/_inductor/test_rsqrt.py b/test/_inductor/test_rsqrt.py new file mode 100644 index 0000000000..b76e1779f4 --- /dev/null +++ b/test/_inductor/test_rsqrt.py @@ -0,0 +1,35 @@ +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + + +class TestRsqrt(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = 
OperatorType.POINTWISE + + def op_calc(self, first_element): + result = torch.rsqrt(first_element) + return result + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype, 1) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + + torch.testing.assert_close(std_result, inductor_result, rtol=1e-1, atol=1e-1) + + +if __name__ == '__main__': + TestRsqrt() + diff --git a/test/_inductor/test_slice.py b/test/_inductor/test_slice.py new file mode 100644 index 0000000000..2b8e75a91b --- /dev/null +++ b/test/_inductor/test_slice.py @@ -0,0 +1,55 @@ +import torch +import torch_npu +import pytest +import torch_npu._inductor +from testutils import OperatorType, TestUtils + + +class TestSlice(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + def op_calc(self, a, b, dim, step): + if dim == 0: + target = a.shape[0] + end = target // step + a = a[:end:, ::, ::, ::] + b = b[:end:, ::, ::, ::] + elif dim == 1: + target = a.shape[1] + end = target // step + a = a[::, :end:, ::, ::] + b = b[::, :end:, ::, ::] + elif dim == 2: + target = a.shape[2] + end = target // step + a = a[::, ::, :end:, ::] + b = b[::, ::, :end:, ::] + elif dim == 3: + target = a.shape[3] + end = target // step + a = a[::, ::, ::, :end:] + b = b[::, ::, ::, :end:] + y = a + b + return y + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 8, 256, 128)]) + @pytest.mark.parametrize('dtype', ['float32', 'int32', 'float16', 'bfloat16', 'int64']) + def test_view_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor(shape, dtype) + + for dim in [3, 2, 1, 0]: + print(f"start to test slice on dim :{dim} ") + std_slice = self.op_calc(a, b, dim, min(shape)//2) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_slice = compiled_op_calc(a, b, dim, min(shape)//2) + + torch.testing.assert_close(std_slice, inductor_slice, rtol=1e-3, atol=1e-3) + + print("data validation passed") + diff --git a/test/_inductor/test_split_loop.py b/test/_inductor/test_split_loop.py new file mode 100644 index 0000000000..5682a22836 --- /dev/null +++ b/test/_inductor/test_split_loop.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
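+# Exercises loop splitting: computes gelu(a + b) where b is broadcast along the
+# middle dimension, over shapes whose dimensions are mostly not powers of two.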
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + + +class TestSplitLoop(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, a, b): + return torch.nn.functional.gelu(a + b) + + # case:change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8,86,1152),(61,89,157),(7,89,971)]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_split_loop(self, shape, dtype): + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + a = self._generate_tensor(shape, dtype) + b = self._generate_tensor((shape[0],1,shape[2]), dtype) + + std_ = self.op_calc(a, b) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_ = compiled_op_calc(a, b) + # print(f"inductor_cat.shape= {inductor_cat.shape}") + torch.testing.assert_close(std_,inductor_,atol=1e-3,rtol=1e-3) + + +if __name__ == "__main__": + size = (8,86,1152) + test = TestSplitLoop() + test.test_split_loop(size, 'float32') diff --git a/test/_inductor/test_sqrt.py b/test/_inductor/test_sqrt.py new file mode 100644 index 0000000000..201b646f9c --- /dev/null +++ b/test/_inductor/test_sqrt.py @@ -0,0 +1,44 @@ +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from .testutils import OperatorType, TestUtils + + +class TestSqrt(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, first_element): + result = torch.sqrt(first_element) + return result + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype, 1) + + std_result = self.op_calc(first_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(first_element) + # print(std_result[0:8]) + # print(inductor_result[0:8]) + # torch.testing.assert_close(std_result, inductor_result) + # 需要比较包含 NaN 值的张量,并且希望认为两个 NaN 值是相等的,您可以使用 torch.allclose 函数,并设置 equal_nan=True 参数 + rtol = 1e-1 + atol = 1e-1 + assert torch.allclose(std_result, inductor_result, equal_nan=True, rtol=rtol, atol=atol) + diff --git a/test/_inductor/test_sub.py b/test/_inductor/test_sub.py new file mode 100644 index 0000000000..dfc44938c3 --- /dev/null +++ b/test/_inductor/test_sub.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
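+# Covers elementwise subtraction over the shared pointwise demo shapes and
+# dtypes, comparing the eager and compiled results with
+# torch.testing.assert_close.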
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from .testutils import OperatorType, TestUtils + + +class TestSub(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, first_element, second_element): + result = first_element - second_element + return result + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32', 'int64']) + def test_pointwise_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + + std_sub = self.op_calc(first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_sum = compiled_op_calc(first_element, second_element) + # print(std_sub[0:8]) + # print(inductor_sum[0:8]) + torch.testing.assert_close(std_sub, inductor_sum) diff --git a/test/_inductor/test_sum.py b/test/_inductor/test_sum.py new file mode 100644 index 0000000000..6b13e88c48 --- /dev/null +++ b/test/_inductor/test_sum.py @@ -0,0 +1,75 @@ +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + + +class TestSum(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.REDUCTION + + def op_calc(self, input_element, dim): + return torch.sum(input_element, dim) + # 规约轴和非规约轴对齐用例 float32 XBLOCK_SUB>=8:shape=(8,32) + # non-persistent reduction 用例 规约轴>1024:shape=(8,8,8,2048) dim=-1 + _reduction_extest_shape4d_all = [(8, 32), (8, 8, 8, 2048)] + _reduction_extest_dim4d_low = [-1] + _reduction_extest_dim4d_all = [0, 1, 2] + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的case + # 若需测试更多数据类型,将dtype手动修改,若在一个ut中涉及多个dtype的更改,可能因为tiling固化导致失败 + # 对indexing开关情况的测试需要用外部参数--npu-indexing=True/False完成 + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', _reduction_extest_shape4d_all) + @pytest.mark.parametrize('dim', _reduction_extest_dim4d_low) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape={shape}") + print(f"dim={dim}") + print(f"dtype={dtype}") + print('npu_indexing={}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + std_sum = self.op_calc(input_element, dim) + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_sum_tmp = compiled_op_calc(input_element, dim) + if dtype == 'int32' or dtype == 'int64': + # inductor return float32,need to change int64 for assert + inductor_sum = inductor_sum_tmp.long() + elif dtype == 'float16': + # inductor return float32,need to change float16 for assert + inductor_sum = inductor_sum_tmp.half() + elif dtype == 'bfloat16': + # inductor return float32,need to change float32 for assert + std_sum = std_sum.float() + inductor_sum = inductor_sum_tmp + else: + inductor_sum = inductor_sum_tmp + + # print(f"std_sum={std_sum[0:8]}") + # print(f"inductor_sum={inductor_sum[0:8]}") + torch.testing.assert_close(std_sum, 
inductor_sum, rtol=1e-1, atol=1e-1) + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(32, 16, 64, 128)]) + @pytest.mark.parametrize('dim', _reduction_extest_dim4d_all) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_dims(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + std_sum = self.op_calc(input_element, dim) + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_sum = compiled_op_calc(input_element, dim) + + torch.testing.assert_close(std_sum, inductor_sum, rtol=1e-1, atol=1e-1) + +if __name__ == "__main__": + size = (32, 16, 64, 128) + test = TestSum() + test.test_reduction_cases_shapes(size, 2, 'float32', None) diff --git a/test/_inductor/test_sum_add.py b/test/_inductor/test_sum_add.py new file mode 100644 index 0000000000..670623d722 --- /dev/null +++ b/test/_inductor/test_sum_add.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestSumAdd(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.REDUCTION + def op_calc(self, input_element, dim, input_element2): + tmp = torch.sum(input_element, dim) + return tmp + input_element2 + + # case:change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(32, 64, 128, 2048)]) + @pytest.mark.parametrize('dim', [0, 1, 2, 3]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + if dim == -1 or dim == 3: + input_element2 = torch.full(size=(32, 64, 128), fill_value=1000.0, dtype=torch.float32, device=torch.device("npu")) + elif dim == 2: + input_element2 = torch.full(size=(32, 64, 2048), fill_value=1000.0, dtype=torch.float32, device=torch.device("npu")) + elif dim == 1: + input_element2 = torch.full(size=(32, 128, 2048), fill_value=1000.0, dtype=torch.float32, device=torch.device("npu")) + else: + input_element2 = torch.full(size=(64, 128, 2048), fill_value=1000.0, dtype=torch.float32, device=torch.device("npu")) + + std_sum = self.op_calc(input_element, dim, input_element2) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_sum = compiled_op_calc(input_element, dim, input_element2) + + torch.testing.assert_close(std_sum, inductor_sum, rtol=1e-1, atol=1e-1) + + +if __name__ == "__main__": + size = (32, 64, 128, 2048) + test = TestSumAdd() + test.test_reduction_cases_shapes(size, -1, 'float32', None) \ No newline at end of file diff --git a/test/_inductor/test_var.py b/test/_inductor/test_var.py new file mode 100644 index 0000000000..5c583452c8 --- /dev/null +++ b/test/_inductor/test_var.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
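+# Reduction test: torch.var over each dim of an (8, 64, 128) float16 tensor,
+# comparing eager and inductor-compiled results at a loose 1e-1 tolerance.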
+import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestVar(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, input_element, dim): + return torch.var(input_element, dim) + + # case:change shapes + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 64, 128)]) + @pytest.mark.parametrize('dim', [0, 1, 2]) + @pytest.mark.parametrize('dtype', ['float16']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + std_ret = self.op_calc(input_element, dim) + # print(f"std_ret= {std_ret}") + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_ret = compiled_op_calc(input_element, dim) + # print(f"inductor_ret= {inductor_ret}") + rtol = 1e-1 + atol = 1e-1 + assert torch.allclose(std_ret, inductor_ret, equal_nan=True, rtol=rtol, atol=atol) + + +if __name__ == "__main__": + size = (8, 64, 128) + test = TestVar() + test.test_reduction_cases_shapes(size, 2, 'float32', None) \ No newline at end of file diff --git a/test/_inductor/test_var_mean.py b/test/_inductor/test_var_mean.py new file mode 100644 index 0000000000..a36403daab --- /dev/null +++ b/test/_inductor/test_var_mean.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +import torch_npu._inductor + +import pytest +from testutils import OperatorType, TestUtils + +class TestVarMean(TestUtils): + __TIME_LIMIT = 100 + + def op_calc(self, input_element, dim): + return torch.var_mean(input_element, dim) + + # case:The shape must not be too large + #@pytest.mark.skip(reason="npu compiler bug") + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', [(8, 64, 128)]) + @pytest.mark.parametrize('dim', [0, 1, 2, (0, 2), (0, 1)]) + @pytest.mark.parametrize('dtype', ['float32']) + def test_reduction_cases_shapes(self, shape, dim, dtype, clear_cache): + print(f"shape= {shape}") + print(f"dim= {dim}") + print(f"dtype= {dtype}") + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + input_element = self._generate_tensor(shape, dtype) + + std_var, std_mean = self.op_calc(input_element, dim) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor", dynamic=False) + inductor_var, inductor_mean = compiled_op_calc(input_element, dim) + + rtol = 1e-1 + atol = 1e-1 + assert torch.allclose(std_var, inductor_var, equal_nan=True, rtol=rtol, atol=atol) + assert torch.allclose(std_mean, inductor_mean, equal_nan=True, rtol=rtol, atol=atol) + + +if __name__ == "__main__": + size = (8, 64, 1024) + test = TestVarMean() + test.test_reduction_cases_shapes(size, 2, 'float32', None) \ No newline at end of file diff --git a/test/_inductor/test_var_mean_add_mul.py b/test/_inductor/test_var_mean_add_mul.py new file mode 100644 index 0000000000..a20cdfe547 --- /dev/null +++ b/test/_inductor/test_var_mean_add_mul.py @@ -0,0 +1,45 @@ +import torch +import torch_npu +import torch_npu._inductor +import pytest + +__TIME_LIMIT = 100 +@pytest.mark.timeout(__TIME_LIMIT) +def test_reduction_cases_shapes(): + device = 'npu' + + def forward(add: "f32[1, 2, 2304]", primals_2: "f32[32, 2304]", primals_5: "f32[1, 9600, 2304]"): + split = torch.ops.aten.split.Tensor(add, 1, 
1); + getitem: "f32[1, 1, 2304]" = split[0] + getitem_1: "f32[1, 1, 2304]" = split[1]; + + var_mean = torch.ops.aten.var_mean.correction(primals_5, [2], correction=0, keepdim=True) + getitem_2: "f32[1, 9600, 1]" = var_mean[0] + getitem_3: "f32[1, 9600, 1]" = var_mean[1]; + add_1: "f32[1, 9600, 1]" = torch.ops.aten.add.Tensor(getitem_2, 1e-06); + rsqrt: "f32[1, 9600, 1]" = torch.ops.aten.rsqrt.default(add_1); + sub: "f32[1, 9600, 2304]" = torch.ops.aten.sub.Tensor(primals_5, getitem_3) + mul: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(sub, rsqrt); + + add_2: "f32[1, 1, 2304]" = torch.ops.aten.add.Tensor(getitem_1, 1); + mul_1: "f32[1, 9600, 2304]" = torch.ops.aten.mul.Tensor(mul, add_2); + add_3: "f32[1, 9600, 2304]" = torch.ops.aten.add.Tensor(mul_1, getitem); + + view: "f32[9600, 2304]" = torch.ops.aten.view.default(add_3, [9600, 2304]); + return [None, primals_5, getitem_3, rsqrt, add_2, view, primals_2] + + torch_npu._inductor.config.enable_npu_indexing = True + primals_2: "f32[32, 2304]" = torch.randn((32, 2304), device = device, dtype=torch.float32) + primals_5: "f32[1, 9600, 2304]" = torch.randn((1, 9600, 2304), device = device, dtype=torch.float32) + add: "f32[1, 2, 2304]" = torch.randn((1, 2, 2304), device =device, dtype=torch.float32) + + _, primals_5_ref, getitem_3_ref, rsqrt_ref, add_2_ref, view_ref, primals_2_ref = forward(add, primals_2, primals_5) + + forward = torch.compile(forward, backend="inductor", dynamic=False) + _, primals_5, getitem_3, rsqrt, add_2, view, primals_2 = forward(add, primals_2, primals_5) + + assert torch.allclose(primals_5_ref, primals_5, equal_nan=True, rtol=1e-3, atol=1e-3) + assert torch.allclose(getitem_3_ref, getitem_3, equal_nan=True, rtol=1e-3, atol=1e-3) + assert torch.allclose(rsqrt_ref, rsqrt, equal_nan=True, rtol=1e-3, atol=1e-3) + assert torch.allclose(add_2_ref, add_2, equal_nan=True, rtol=1e-3, atol=1e-3) + assert torch.allclose(primals_2_ref, primals_2, equal_nan=True, rtol=1e-3, atol=1e-3) \ No newline at end of file diff --git a/test/_inductor/test_where.py b/test/_inductor/test_where.py new file mode 100644 index 0000000000..b10b0aa3d9 --- /dev/null +++ b/test/_inductor/test_where.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. 
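+# Pointwise selection test: torch.where(condition, a, b) with a boolean mask,
+# run over the demo shapes and dtypes and checked against eager mode.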
+import torch +import torch_npu +import torch_npu._inductor +import pytest +from testutils import OperatorType, TestUtils + + +class TestWhere(TestUtils): + __TIME_LIMIT = 100 + __OPTYPE = OperatorType.POINTWISE + + # optimized function, auto timeout after __TIME_LIMIT seconds + + # @torch.compile(options={"aggressive_fusion": False}) + + def op_calc(self, condition, first_element, second_element): + return torch.where(condition, first_element, second_element) + + # 在连续测试场景下,测试结果不稳定,建议单独重测批量测试未通过的 case + # 若需测试更多数据类型,将dtype后面的list改成 ProtoTestCase._test_dtypes即可 + # 对indexing开关情况的测试需要用外部参数--npu_indexing=True/False完成 + + @pytest.mark.timeout(__TIME_LIMIT) + @pytest.mark.parametrize('shape', TestUtils._pointwise_demo_shapes) + @pytest.mark.parametrize('dtype', ['float16', 'float32', 'bfloat16', 'int32']) + def test_pointwise_cases(self, shape, dtype, clear_cache): + print(shape) + print('npu_indexing= {}'.format(torch_npu._inductor.config.enable_npu_indexing)) + + first_element = self._generate_tensor(shape, dtype) + second_element = self._generate_tensor(shape, dtype) + condition = self._generate_tensor(shape, 'bool') + + std_result = self.op_calc(condition, first_element, second_element) + + compiled_op_calc = torch.compile(self.op_calc, backend="inductor") + inductor_result = compiled_op_calc(condition, first_element, second_element) + + torch.testing.assert_close(std_result, inductor_result) \ No newline at end of file diff --git a/test/_inductor/testutils.py b/test/_inductor/testutils.py new file mode 100644 index 0000000000..3559820fc2 --- /dev/null +++ b/test/_inductor/testutils.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import torch +import torch_npu +from enum import Enum, unique +import os + + +@unique +class OperatorType(Enum): + POINTWISE = 'POINTWISE' + REDUCTION = 'REDUCTION' + + +class TestUtils: + _pointwise_test_shape2d = [(4096, 256), (1024, 32), (8, 2048), (8, 4096)] # (8, 4), (8, 8), not supported + _pointwise_test_shape3d = [(8, 8, 4), (8, 8, 8), (8, 8, 2048), (8, 8, 4096)] + _pointwise_test_shape4d = [(128, 128, 4096, 4), (128, 128, 4096, 8), + (32, 32, 1024, 1024)] # 128*128*4096*2048 is too big(512G) + _pointwise_test_shapes = _pointwise_test_shape2d + _pointwise_test_shape3d + _pointwise_test_shape4d + + _pointwise_demo_shapes = [(1024, 32), (8, 16, 256, 32)] + _reduction_extest_shape4d = [(8, 8, 8, 16384), (8, 8, 16384, 8), (8, 16384, 8, 8), (16384, 8, 8, 8)] + _reduction_extest_dim4d = [-1, -2, 1, 0] + _reduction_extest_SDbinding = list(zip(_reduction_extest_shape4d, _reduction_extest_dim4d)) + + _test_dtypes = ['float32', 'int32', 'float16', 'bfloat16', 'int64'] + + @staticmethod + def _generate_tensor(shape, dtype, floatPOSIFLAG=0): + if dtype == 'float32' or dtype == 'float16' or dtype == 'bfloat16': + if floatPOSIFLAG: + return 1000 * torch.rand(size=shape, dtype=eval('torch.' + dtype), device=torch.device("npu")) + else: + return torch.randn(size=shape, dtype=eval('torch.' + dtype), device=torch.device("npu")) * 2000 + elif dtype == 'int32' or dtype == 'int64': + return torch.randint(low=0, high=2000, size=shape, dtype=eval('torch.' 
+ dtype), device=torch.device("npu")) + elif dtype == 'bool': + return torch.randint(low=0, high=2, size=shape, device=torch.device("npu")).bool() + else: + raise ValueError('Invalid parameter \"dtype\" is found : {}'.format(dtype)) \ No newline at end of file diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py new file mode 100644 index 0000000000..4d18d683d7 --- /dev/null +++ b/torch_npu/_inductor/__init__.py @@ -0,0 +1,91 @@ + +import torch +from torch._inductor.codegen.common import register_backend_for_device, register_device_op_overrides +from . import config as npu_config +from torch_npu.utils._inductor import NPUDeviceOpOverrides +from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device +from torch_npu.npu.utils import device_count +from torch._dynamo.device_interface import register_interface_for_device, get_interface_for_device +from torch._inductor import lowering as inductor_lowering +from torch._inductor.choices import InductorChoices +from .lowering import _register_npu_inductor_fallbacks, make_reduction +from .decomposition import _register_npu_inductor_decompositons +from .utils import get_current_raw_stream +from .config import log as npulog +from .config import aggresive_autotune, num_vector_core +from .npu_choices import should_use_persistent_reduction + +npulog.info("perform torch_npu._inductor patch") + +def _inductor_register_backend_for_device(): + from .codegen.schduling import NPUTritonScheduling + from .codegen.wrapper import NPUWrapperCodeGen + register_backend_for_device('npu', NPUTritonScheduling, NPUWrapperCodeGen) + +_inductor_register_backend_for_device() + +## Override original inductor device overrides in torch_npu + +class NewNPUDeviceOpOverrides(NPUDeviceOpOverrides): + def import_get_raw_stream_as(self, name): + return f"from torch_npu._inductor import get_current_raw_stream as {name}" + +def _inductor_register_device_op_overrides(): + register_device_op_overrides('npu', NewNPUDeviceOpOverrides()) + +_inductor_register_device_op_overrides() + +## Override original dynamo device interface in torch_npu +class NewNpuInterface(NpuInterface): + + @staticmethod + def is_available() -> bool: + return device_count() > 0 + + @staticmethod + def get_compute_capability(device=None): + # npu has no concept of cc. triton-npu compiler depends on subarch instead + return torch.npu.get_device_name(device) + + @staticmethod + def exchange_device(device: int) -> int: + curr_device = current_device() + set_device(device) + return curr_device + + @staticmethod + def maybe_exchange_device(device: int) -> int: + return device + + # @staticmethod + # def get_device_properties(device=None): + # props = NpuInterface.get_device_properties(device) + # setattr(props, "multi_processor_count", num_vector_core ) + # return props + +register_interface_for_device("npu", NewNpuInterface) +register_interface_for_device("npu:0", NewNpuInterface) +device = get_interface_for_device("npu") + +from . import codegen + +inductor_lowering.make_reduction = make_reduction +_register_npu_inductor_fallbacks() +_register_npu_inductor_decompositons() + +#register fx_pass should be put behind of _register_npu_inductor_decompositons +#from .npu_indexing import fx_pass +from . import npu_fusion_attention_graph +from . 
import dynamo_patch3 + +def _replace_benchmark_all_configs(): + from torch._inductor.triton_heuristics import CachingAutotuner + from .npu_triton_heuristics import benchmark_all_configs + CachingAutotuner.benchmark_all_configs = benchmark_all_configs + +if (aggresive_autotune): + _replace_benchmark_all_configs() + import os + os.environ["TRITON_BENCH_METHOD"] = "npu" + +InductorChoices.should_use_persistent_reduction = should_use_persistent_reduction \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/__init__.py b/torch_npu/_inductor/codegen/__init__.py new file mode 100644 index 0000000000..e6eed55b00 --- /dev/null +++ b/torch_npu/_inductor/codegen/__init__.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. + +from ..config import log as npulog +npulog.info("perform npu_indexing patch") + +from torch._inductor.ir import Reduction,LoopBody +from torch._inductor.codegen.triton import TritonScheduling +from torch._inductor import sizevars +from torch._inductor.codegen.triton import TritonKernel +from torch._inductor.codegen.simd import SIMDKernel + +from torch_npu._inductor.codegen._sizevars import simplify +from torch_npu._inductor.codegen.ir import (num_splits,loopbody__call__,transform_dims_in_indexing, + substituted_dims_in_indexing) +from torch_npu._inductor.codegen.triton import is_compatible +from torch_npu._inductor.codegen.triton import group_fn, select_index_dtype +from torch_npu._inductor.codegen.schduling import create_tiling +#from ..npu_indexing.graph import run_node +#graph +#GraphLowering.run_node = run_node +#common +#ir +Reduction.num_splits = num_splits +setattr(LoopBody, 'transform_dims_in_indexing', transform_dims_in_indexing) +setattr(LoopBody, 'substituted_dims_in_indexing', substituted_dims_in_indexing) + +LoopBody.__call__=loopbody__call__ +#need to enable this to speedup attn_cp_test +#ComputedBuffer.simplify_and_reorder = simplify_and_reorder +#triton scheduling +TritonScheduling.group_fn = group_fn +TritonScheduling.select_index_dtype = select_index_dtype +TritonScheduling.create_tiling = create_tiling +#triton kernel +setattr(SIMDKernel, 'is_compatible', is_compatible ) + +#util +sizevars.SizeVarAllocator.simplify = simplify \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/_sizevars.py b/torch_npu/_inductor/codegen/_sizevars.py new file mode 100644 index 0000000000..f3c4d4f550 --- /dev/null +++ b/torch_npu/_inductor/codegen/_sizevars.py @@ -0,0 +1,9 @@ +import sympy +from sympy import Expr +from torch._inductor.utils import sympy_subs + +def simplify(self, expr: Expr): + if isinstance(expr, (tuple,list)) : + return [sympy.expand(s).xreplace(self.replacements) for s in expr] + return sympy.expand(expr).xreplace(self.replacements) + diff --git a/torch_npu/_inductor/codegen/ir.py b/torch_npu/_inductor/codegen/ir.py new file mode 100644 index 0000000000..7edccf17c0 --- /dev/null +++ b/torch_npu/_inductor/codegen/ir.py @@ -0,0 +1,194 @@ +from ..config import log +from typing import List, Tuple, Dict, Any, Optional +from torch._inductor.virtualized import V +from torch._inductor.ir import ( ReductionHint, IRNode, ModularIndexing, FloorDiv) +from torch._inductor.utils import sympy_subs,sympy_index_symbol +from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel +import sympy +import itertools + + +# NPU doesn't need to support ReductionHint.OUTER, and persistent reduction +def num_splits( + device, + dst_dtype, + src_dtype, + 
inner_fn, + ranges, + reduction_ranges, + reduction_type, + reduction_numel, + input_node: Optional[IRNode] = None, + ): + return ReductionHint.DEFAULT, 1 + +def detect_flattened_dims(kernel, index): + new_vars = {} + if not isinstance(index, (sympy.core.add.Add, ModularIndexing, FloorDiv)) : + return new_vars + + def detect_flattened_axis(expr): + def init_new_vars(var, length): + if var not in new_vars: + new_vars[var] = {length:[None,None]} + if length not in new_vars[var] : + new_vars[var][length] = [None,None] + if isinstance(expr, ModularIndexing): + var, divisor, length = expr.args + init_new_vars(var, length) + new_vars[var][length][1] = (expr, divisor, length) + elif isinstance(expr, FloorDiv): + var, divisor = expr.args + init_new_vars(var, divisor) + # over than 1 node_schedule, var may be deleted in kernel.range_tree_nodes + # it shoule be find in range_tree_nodes_removed dict + if (var in kernel.range_tree_nodes): + numel = kernel.range_tree_nodes[var].length + else: + numel = kernel.range_tree_nodes_removed[var].length + + length = expr.eval(numel, divisor) + new_vars[var][divisor][0] = (expr, divisor, length) + + else: + for x in expr.args: + detect_flattened_axis(x) + + # add + if isinstance(index, sympy.core.add.Add) : + for x in index.args: + detect_flattened_axis(x) + elif isinstance(index, (ModularIndexing, FloorDiv)): + detect_flattened_axis(index) + else : + pass + + # make sure FloorDiv, MouldarIndexing must be in-pair + for var, divisors in new_vars.items(): + if var in kernel.range_tree_nodes: + parent_axis = kernel.range_tree_nodes[var] + else: + parent_axis = kernel.range_tree_nodes_removed[var] + for divisor, pair in divisors.items(): + if not pair[0] and not pair[1] : + pass + #FloorDiv not inplace + elif not pair[0] : + _, _, length = pair[1] + expr = FloorDiv(var, length) + new_vars[var][divisor][0] = (expr, length, parent_axis.length //length ) + #ModularIndexing not inplace + elif not pair[1] : + expr = ModularIndexing(var, 1, divisor) + new_vars[var][divisor][1] = (expr, 1, divisor) + else : + pass + + return new_vars + +def rebuild_flattened_dims(indexing) : + def rebuild_flattened_dim(key, index, old_node, flatten_dim) : + for _, pair in flatten_dim.items(): + new_var_expr = sympy.Integer(0) + origin_axis_length = 0 + pair_is_valid = True + # don't create duplicated axis, e.g. y1:1024, y1 % 1024 is duplicated + expr, divisor, length = pair[1] + if not old_node.parent.duplicated_check(divisor, length) : + V.kernel.expr_substituted[expr] = old_node.symbol() + break + + for axis in pair: + expr, divisor, length = axis + # 3. try to rebuild the axis in kernel + new_node = old_node.parent.lookup(divisor, length) + # new_node = old_node + + # 4. substitute div/mod expression in indexing + index = index.subs(expr, new_node.symbol()) + indexing[key] = index + if isinstance(expr, FloorDiv): + new_var_expr = new_var_expr + new_node.symbol() * divisor + origin_axis_length = divisor * length + elif isinstance(expr, ModularIndexing): + new_var_expr = new_var_expr + new_node.symbol() + V.kernel.expr_substituted[expr] = new_node.symbol() + + if var not in V.kernel.range_tree_nodes_substituted : + V.kernel.range_tree_nodes_substituted[var] = [] + V.kernel.range_tree_nodes_substituted[var].append((origin_axis_length,new_var_expr)) + + def find_index_in_substitute(index, kernel) : + return any([index.find(key) for key in kernel.expr_substituted.keys()]) + + kernel = V.kernel + for key, index in indexing.items(): + # 1. 
try to find out flattened axis from indexing + flatten_dims = detect_flattened_dims(kernel, index) + #2. try to rebuild these flattened dims + for var, flatten_dim in flatten_dims.items(): + if (var in kernel.range_tree_nodes): + old_node = kernel.range_tree_nodes[var] + else: + old_node = kernel.range_tree_nodes_removed[var] + + rebuild_flattened_dim(key, index, old_node, flatten_dim) + + if find_index_in_substitute(index, kernel): + new_index = sympy_subs(index, kernel.expr_substituted) + indexing[key] = new_index + +def substituted_dims_in_indexing(self, indexing, kernel, range_tree_nodes_substituted) : + substituted = False + for var, candidates in range_tree_nodes_substituted.items(): + assert len(candidates) > 0, candidates + exprs = sorted(candidates, reverse=True, key=lambda x: x[0]) + # the best candidate is with the longest numel + numel = exprs[0][0] + expr = exprs[0][1] + node = kernel.range_tree_nodes[var] + if node.length != numel: + log.debug("sub nodes (expr%s, numel:%d) can not substitute parent node(%s:%d)", + expr, numel, node.symbol(), node.length) + continue + for key, index in indexing.items(): + if var in index.free_symbols: + index = index.subs(var, expr) + indexing[key] = index + substituted = True + + return substituted + +def generate_body_indexing(body, indices) : + index = list(itertools.chain.from_iterable(indices)) + assert len(index) == len(body.var_ranges), (index, body.var_ranges) + assert all(v not in body.var_ranges for v in index) + replacements = dict(zip(body.var_ranges.keys(), index)) + indexing_map = dict(zip( index, body.var_ranges.keys())) + setattr(body, 'indexing_map', indexing_map) + body.indexing = { + name: sympy_subs(expr, replacements) + for name, expr in body.indexing_exprs.items() + } + # body.indexing = { + # name: sympy_subs(expr, V.graph.sizevars.var_to_val) + # for name, expr in body.indexing.items() + # } + +def transform_dims_in_indexing(self, indices) : + if self.indexing is None : + generate_body_indexing(self, indices) + + if V.kernel is not None and isinstance(V.kernel, NPUIndexTritonKernel): + rebuild_flattened_dims(self.indexing) + +# select tiling axis, recover missing dimensions, +def loopbody__call__(self, *indices): + if self.indexing is None: + generate_body_indexing(self, indices) + result = self.root_block() + self.indexing = None + return result + + + diff --git a/torch_npu/_inductor/codegen/npu_kernel_features.py b/torch_npu/_inductor/codegen/npu_kernel_features.py new file mode 100644 index 0000000000..01020f8966 --- /dev/null +++ b/torch_npu/_inductor/codegen/npu_kernel_features.py @@ -0,0 +1,109 @@ +from torch._inductor.codegen.simd_kernel_features import SIMDKernelFeatures,NodeScheduleEntry +from torch._inductor.utils import cache_on_self +from torch.utils._ordered_set import OrderedSet +from typing import Tuple, List +import sympy +import functools +from torch._inductor.virtualized import V +from torch._inductor.codegen.simd import SIMDScheduling +import torch +from typing import Iterable + +class NumelList(Tuple): + + def numels(self): + numel = functools.reduce(lambda a, b: a * b, self) + return numel + + def __eq__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel == numel2 + + def __le__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel <= numel2 + + def __lt__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + 
return numel < numel2 + + def __ge__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel >= numel2 + + def __gt__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel > numel2 + + + def __mod__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel % numel2 + + def __truediv__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel / numel2 + def __floordiv__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel // numel2 + + def __mul__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel * numel2 + def __rmul__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel * numel2 + + def __add__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel + numel2 + def __radd__(self, other): + numel = self.numels() + numel2 = other.numels() if isinstance(other, NumelList) else other + return numel + numel2 + + def __hash__(self): + return super(NumelList, self).__hash__() + + +class NPUKernelFeatures(SIMDKernelFeatures): + def __init__( + self, + node_schedule: List[NodeScheduleEntry], + numel: sympy.Expr, + reduction_numel: sympy.Expr = sympy.S.One, + ): + super().__init__(node_schedule, numel, reduction_numel) + self.numel = NumelList(self.numel) if isinstance(self.numel, Iterable) else self.numel + self.reduction_numel = NumelList(self.reduction_numel) if isinstance(self.reduction_numel, Iterable) else self.reduction_numel + + + # @cache_on_self + # def select_index_dtype(self) -> torch.dtype: + # # Gather all used buffer names + # buffer_names: OrderedSet[str] = OrderedSet() + # for node in self.scheduler_nodes(): + # buffer_names.update(node.get_buffer_names()) + # buffer_names.update(node.used_buffer_names()) + # buffers = [V.graph.get_buffer(name) for name in buffer_names] + + # # In theory we can separately check xnumel and rnumel are <= int_max + # # but some indexers do use the full linear index so we need to be + # # conservative here. 
+ # total_numel = self.numel * self.reduction_numel + + + # if SIMDScheduling.can_use_32bit_indexing(total_numel, buffers): + # return torch.int32 + # return torch.int64 diff --git a/torch_npu/_inductor/codegen/schduling.py b/torch_npu/_inductor/codegen/schduling.py new file mode 100644 index 0000000000..d888641530 --- /dev/null +++ b/torch_npu/_inductor/codegen/schduling.py @@ -0,0 +1,221 @@ +import pdb + +from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel, flatten +from torch._inductor.codegen.triton import ( TritonScheduling, log, config) +from torch._inductor.codegen.simd import DisableReduction, EnableReduction,SIMDKernelFeatures, SIMDKernel +from torch._inductor.codegen.simd import schedule_log, scheduler +from torch._inductor.codegen.multi_kernel import MultiKernel +from typing import Union, Iterable +from torch._inductor.virtualized import ( + V, +) +from torch._inductor.codecache import code_hash +from torch._dynamo.utils import counters +import itertools, contextlib +from torch._inductor.utils import sympy_index_symbol,ModularIndexing,FloorDiv +import sympy +from .split_tiling import SplitTiling +from torch.fx.immutable_collections import immutable_dict +from typing import Dict, Sequence, List, Iterable +from .npu_kernel_features import NumelList, NPUKernelFeatures +def flatten_groups(nums): + res = [] + for i in nums: + if isinstance(i, Iterable): + for x in i : + res.append(x) + else: + res.append(i) + return res + +@classmethod +def create_tiling( + cls, pw_tiling: Sequence[sympy.Expr], reduction_tiling: Sequence[sympy.Expr] + ) -> Dict[str, sympy.Expr]: + """ + Create a tiling dict from pointwise and reduction splits. + """ + + pw_tiling = flatten_groups(pw_tiling) + pw_prefixes = ["w", "v", "t", "z", "y", "x"][-len(pw_tiling) :] + reduction_tiling = flatten_groups(reduction_tiling) + reduction_tiling = [NumelList(reduction_tiling).numels()] + reduction_prefixes = ["r"][: len(reduction_tiling)] + tiling = immutable_dict( + list(zip(pw_prefixes, pw_tiling)) + + list(zip(reduction_prefixes, reduction_tiling))) + return tiling + + + +class NPUTritonScheduling(TritonScheduling): + def __init__(self, scheduler): + super().__init__(scheduler) + self.kernel_type = NPUIndexTritonKernel + + def create_kernel_choices( + self, kernel_features: SIMDKernelFeatures, kernel_args, kernel_kwargs + ) -> List[SIMDKernel]: + + return [ + self.kernel_type( + *kernel_args, + **kernel_kwargs, + ) + ] + + # transform indexing before call codegen_node_schedule_with_kernel + def codegen_node_schedule(self, kernel_features: SIMDKernelFeatures): + node_schedule = kernel_features.node_schedule + tiling = self.select_tiling( + node_schedule, kernel_features.numel, kernel_features.reduction_numel + ) + + kernels = self.create_kernel_choices( + kernel_features, [tiling], {"features": kernel_features} + ) + kernel = kernels[0] + setattr(kernel, "node_schedule", node_schedule ) + self.decide_codegen_dims_in_kernel(node_schedule, kernel) + + for kernel in kernels: + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + + MultiKernel.merge_workspaces_inplace(kernels) + for kernel in kernels: + with V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + kernel_name = self.define_kernel(src_code, node_schedule, kernel) + log.debug("Generating kernel code with kernel_name: %s", kernel_name) + kernel.kernel_name = kernel_name + kernel.code_hash = code_hash(src_code) + del kernel + + final_kernel: Union[SIMDKernel, MultiKernel] + if len(kernels) > 1: + final_kernel = 
MultiKernel(kernels) + else: + (final_kernel,) = kernels + + with V.set_kernel_handler(final_kernel): + for node in kernel_features.scheduler_nodes(): + node.mark_run() + + self.codegen_comment(node_schedule) + final_kernel.call_kernel(final_kernel.kernel_name) + + if config.nan_asserts: + final_kernel.codegen_nan_check() + if config.warn_mix_layout: + final_kernel.warn_mix_layout(kernels[0].kernel_name) + + V.graph.removed_buffers |= final_kernel.removed_buffers + V.graph.inplaced_to_remove |= final_kernel.inplaced_to_remove + + if ( + V.graph.wrapper_code.supports_intermediate_hooks + and config.generate_intermediate_hooks + ): + # Not every node in the schedule will actually be live on output; + # we can't check dead buffers. + live_outs = kernels[0].args.live_output_buffers() + for node in kernel_features.scheduler_nodes(): + name = node.get_name() + if name not in live_outs: + continue + assert node.node is not None + origin_node = node.node.get_origin_node() + if origin_node is not None: + counters["inductor"]["intermediate_hooks"] += 1 + V.graph.wrapper_code.writeline( + f"run_intermediate_hooks({origin_node.name!r}, {name})" + ) + + self.scheduler.free_buffers() + + def codegen_node( + self, node: Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode] + ): + """ + Given a set of pre-fused nodes, generate a Triton kernel. + """ + + nodes: List[scheduler.SchedulerNode] = node.get_nodes() # type: ignore[assignment] + _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group + + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + schedule_log.debug("Schedule:\n %s", node_schedule) + + return self.codegen_node_schedule( + NPUKernelFeatures(node_schedule, numel, rnumel) + ) + + def decide_codegen_dims_in_kernel(self, node_schedule, kernel): + def current_reduction_nodes(nodes): + return itertools.takewhile(lambda n: n is not DisableReduction, nodes) + + with kernel: + # 1. transform dims: create new dims to substitute floor_divide and modular expression + stack = contextlib.ExitStack() + for i, node in enumerate(node_schedule): + if node is DisableReduction: + stack.enter_context(kernel.disable_reduction()) + elif node is EnableReduction: + stack.close() + # kernel.set_last_usage(current_reduction_nodes(node_schedule[i:])) + else: + index_vars = kernel.split_and_set_ranges(node.get_ranges()) + node._body.transform_dims_in_indexing(index_vars) + # 2. 
go through range_tree_nodes to findout, to find one axis could be substituted by others + self.additional_nodes_to_be_subs(kernel, kernel.range_tree_nodes_substituted) + # 3.do the substitution on all indexing + for node in node_schedule: + if node in (EnableReduction, DisableReduction): + continue + indexing = node._body.indexing + node._body.substituted_dims_in_indexing(indexing, kernel, kernel.range_tree_nodes_substituted) + + # 4.remove the substituted dims from kernel + for var, _ in kernel.range_tree_nodes_substituted.items(): + if (var in kernel.range_tree_nodes): + root = kernel.range_tree_nodes[var].parent + root.remove_entry(var) + # select split and tiling axis + split_tiling = SplitTiling(kernel) + split_tiling.select_tiling_axis() + # debug print index transforms + for node in node_schedule: + if node in (EnableReduction, DisableReduction): + continue + for x,y in zip( node._body.indexing_exprs.values(), node._body.indexing.values()) : + print(f"index transform:{x}->{y}") + + def additional_nodes_to_be_subs(self, kernel, node_to_be_substituted): + for node in kernel.range_tree_nodes.values(): + if node.expr != sympy_index_symbol(f"{node.parent.prefix}index") \ + or len(node.parent.var_ranges) == 1 \ + or node.symbol() in node_to_be_substituted: + continue + numel = sympy.Integer(1) + new_var_expr = sympy.Integer(0) + for k, s in node.parent.var_ranges.items(): + if k == node.symbol(): + continue + numel = numel * s + sub_node = kernel.range_tree_nodes[k] + new_var_expr = new_var_expr + sub_node.symbol() * sub_node.divisor + # if isinstance(sub_node.expr, FloorDiv): + # new_var_expr = new_var_expr + sub_node.symbol() * sub_node.divisor + # elif isinstance(sub_node.expr, ModularIndexing): + # new_var_expr = new_var_expr + sub_node.symbol() + + if numel == node.length: + node_to_be_substituted[node.symbol()] = [(node.length, new_var_expr)] + else: + log.warning("sub nodes (expr%s, numel:%d) can not make up parent node(%s:%d)", + new_var_expr, numel, node.symbol(), node.length) + + + + + diff --git a/torch_npu/_inductor/codegen/split_tiling.py b/torch_npu/_inductor/codegen/split_tiling.py new file mode 100644 index 0000000000..c3a77f1a35 --- /dev/null +++ b/torch_npu/_inductor/codegen/split_tiling.py @@ -0,0 +1,298 @@ +import pdb + +from torch._inductor.codegen.triton import TritonKernel +from torch._inductor.utils import ModularIndexing,sympy_subs +import sympy as sympy +from ..config import num_vector_core, log +from torch._inductor.virtualized import V +from torch._inductor.codegen.simd import ( EnableReduction, DisableReduction) +from torch._inductor.runtime.runtime_utils import next_power_of_2 +from .triton_utils import get_aligned_numel +from torch._inductor.loop_body import MemoryUsageType + +# split and tiling axis selector +class SplitTiling : + def __init__(self, kernel : TritonKernel) : + self.kernel = kernel + self.indexing = [] + def key(x) : + # to be higher than x and y + if x.name[0] == 'w' or x.name[0] == 'v' or x.name[0] == 'p' or x.name[0] == 't': + return "z" + x.name + # to be lower than floor_dir + elif isinstance(x.expr, ModularIndexing): + return x.name[0] + "0" + x.name[1:] + else : + return x.name + + kernel.sorted_axis = [x for x in kernel.range_tree_nodes.values()] + kernel.sorted_axis.sort(reverse=True, key=key) + for i, dim in enumerate(kernel.sorted_axis): + dim.sorted_order = i + + self.find_lowest_dimension() + self.should_outer_reduce = False + + + # Split 原则1 :先做维度合并,再切分 。通过维度合并降维降低,split和tiling轴选择策略的复杂性 。 + # Split 原则2: 
切分的数量要和AIcore的数量对齐(相同或是倍数)。每个核要分配的split的量一致。每个split形状要一致(包括维度和尺寸)。 + # Split 原则3: 对于规约类融合算子, 从非规约选择切分轴。对于非规约类融合算子, 从所有轴中选切分轴。 + # 为了tiling时刻的低维tilesize最大化,切分轴最好不是低维轴且长度大于aicore的数量 。 + # Split 原则4: 如果高维规约类融合算子,而且高维尺寸非常大( >= 64KB),低维度尺寸比较小( <= 32B), 可以选择对规约轴切分,然后在核间用atomic + # 原语做规约。 + # Split 原则5 :根据算子逻辑,优先选择一维发射。 + def select_split_axis(self): + def select_longest_dim(can_be_low_dim = True): + longest = -1 + longest_dim = None + for x in candidates: + if SplitTiling.great_than(x.length,longest) and (can_be_low_dim or not self.is_lowest_dimension(x)): + longest_dim = x + longest = x.length + return longest_dim + # point-wise : all dims , reduction: outer_reduction dim or non-reduction dims + is_reduction = lambda x : x.prefix == 'r' + candidates = [x for x in self.kernel.sorted_axis if not is_reduction(x) or self.should_outer_reduce_me(x) ] + if self.should_outer_reduce : + return self.kernel.split_axis + + # 0307 patch 5lines + if len(candidates) > 0: + longest_dim = candidates[0] + self.kernel.split_axis = longest_dim + self.kernel.split_axis.is_split_axis = True + return longest_dim + + #longest and not low dims + longest_dim = select_longest_dim( can_be_low_dim = False ) + + # longest and can be low dims + if longest_dim is None or SplitTiling.less_than(longest_dim.length , int(num_vector_core * 0.8)): + longest_dim = select_longest_dim( can_be_low_dim = True ) + if longest_dim is not None : + self.kernel.split_axis = longest_dim + self.kernel.split_axis.is_split_axis = True + elif len(self.kernel.sorted_axis) > 0: + longest_dim = self.kernel.sorted_axis[0] + self.kernel.split_axis = longest_dim + self.kernel.split_axis.is_split_axis = True + + return longest_dim + + # Tiling 原则1:切分要照顾所有load / store 中索引表达式的中的低维轴 :所有的低维轴都被切分 从而成为tiling 轴。写代码的时候对所有的tiling + # 轴通过make_range产生连续索引,从而保证load / store的连续性。 + # Tiling 原则2 :规约的tile必须要二维。 对于低维规约算子,规约轴和至少一个非规约轴要选择为tiling轴。对于高维规约,规约轴和低维轴要选择为tiling轴 + # 对于是多维规约, 所有的规约轴都要选择为tiling 轴 。 + # Tiling 原则3: 如果tiling轴是低维,在该轴上的切分的尺寸要与SIMD的BlockSize 对齐(32bytes) + # Tiling 原则4: 低维轴的tile size 越大,性能越好。这个其实autotune 的原则,放在这里只是为了更好解释用例中使用的数值 。 + + # fixme, two tiling axis might be insufficient when there're 3 or more low-dims in indexing + def select_tiling_axis(self ): + # True :self.kernel.axis2 is Not None and all reduction axis selected, False : other cases + def axis2_selection_done(axis) : + if self.kernel.total_numels <= 1 : + return True + elif self.kernel.axis2 is not None : + is_reduction = axis.prefix == "r" + if not is_reduction : + return True + reduction_axis = self.kernel.numof_reduction_axis() + return True if reduction_axis <= 1 else len(self.kernel.axis2_list) == reduction_axis + else : + return False + + if self.kernel.axis2 is not None or self.kernel.axis1 is not None: + return + # two or more reduction axises, need to flatten reduction dims to one to do 1 dim reduction . 
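+        # (more than one reduction axis forces a persistent reduction, so the
+        # flattened reduction range is materialized in a single tile)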
+ if self.kernel.numof_reduction_axis() > 1: + self.kernel.persistent_reduction = True + biggest = -1 + dims = self.kernel.sorted_axis + if self.kernel.split_axis is None : + self.select_split_axis() + + if self.kernel.split_axis is None : + return + # select tiling_axis2 then tiling_axis1, for reduction, all reduction axis will be selected as tiling_axis2 + for i in range(len(dims)-1, -1, -1) : + axis = dims[i] + numel = axis.length + if isinstance(numel, (sympy.Symbol, sympy.Expr)) and not isinstance(numel, sympy.Integer) : + numel = numel.subs(V.graph.sizevars.var_to_val) + if axis.is_split_axis : + dtype = self.kernel.get_axis_dtype(axis) + + min_aligned_numel = get_aligned_numel(dtype) + _, numel = SplitTiling.decide_nblocks_xblock(numel, len(self.kernel.sorted_axis) <=1, min_aligned_numel) + + # choose reduction axis or low-dim as axis2 + if not axis2_selection_done(axis): + axis.is_tiling_axis2 =True if SplitTiling.great_than(numel,1) else False + # axis2 must be the reduction axis in case inside_reduction + if axis.prefix == "r" : + axis.is_tiling_axis2 =True + if axis.is_tiling_axis2 and self.kernel.axis2 is None : + self.kernel.axis2 = axis.symbol() + if self.kernel.numof_reduction_axis() > 1 : + self.kernel.axis2_list.append(axis.symbol()) + self.kernel.axis2 = axis.symbol() if isinstance(axis.expr, ModularIndexing) else self.kernel.axis2 + else : + # for _higher_order_reduction, axis1 must be the lowest dimension + if self.kernel.inside_reduction and self.kernel.is_higher_order_reduction() : + self.kernel.axis1 = axis.symbol() + break + + # low-dim should be selected as another tiling axis + if self.is_lowest_dimension(axis) : + self.kernel.axis1 = axis.symbol() + break + # select the longest in other cases + if numel > biggest : + self.kernel.axis1 = axis.symbol() + biggest = numel + if self.kernel.axis1 is not None : + axis = self.kernel.range_tree_nodes[self.kernel.axis1 ] + axis.is_tiling_axis1 = True + + + log.debug(f"split_tiling numels:{self.kernel.numels} split_axis: {self.kernel.split_axis.symbol()} " + f"axis1:{self.kernel.axis1} axis2:{self.kernel.axis2} low_dims:{self.kernel.low_dims}, " + f"indexing: {self.indexing}" ) + + + + # fixme the below logic doesn't work when there're two reduction axis, but only one need outer reduction + def should_outer_reduce_me(self, x): + should_outer = self.kernel.is_higher_order_reduction(True) and SplitTiling.great_than(x.length, 32768 ) and x.is_loop + if should_outer : + self.should_outer_reduce = True + self.kernel.split_axis = x + self.kernel.split_axis.is_split_axis = True + return should_outer + + @staticmethod + def decide_nblocks_xblock(numel, no_axis2, min_aligned_numel, xblock = None): + #no_axis2 mean there's only on dims + min_xblock = min_aligned_numel if no_axis2 else 1 + + # need to keep linearity for low_dims + if xblock is None : + xblock = ( numel + num_vector_core -1 ) // num_vector_core if numel > num_vector_core else min_xblock + + # fixme, aligning is wasting cores . 
+ #if (not no_axis2 and is_low_dim) or same_axis1 : + xblock = next_power_of_2(xblock) + + nblocks = (numel + xblock -1 ) // xblock + return nblocks, xblock + + @staticmethod + def get_nblocks_before_launch(numel, xblock): + nblocks = (numel + xblock -1 ) // xblock + return nblocks, xblock + + @staticmethod + def get_nblocks_xblock_list(numel): + ret = [] + XBLOCK = numel + NBLOCKS = 1 + ret.append((NBLOCKS,XBLOCK)) + while NBLOCKS<=num_vector_core and XBLOCK>1: + XBLOCK -= 1 + NBLOCKS = (numel + XBLOCK - 1) // XBLOCK + XBLOCK = (numel + NBLOCKS - 1) // NBLOCKS + ret.append((NBLOCKS,XBLOCK)) + + return ret + + # return True when x is the low-dim in indexing + def is_lowest_dimension(self, x): + return x.sorted_order in self.kernel.low_dims + + def find_lowest_dimension(self): + def construct_low_dim() : + for index in self.indexing: + coefficients_dict = index.as_coefficients_dict() + for key, value in coefficients_dict.items(): + if not key.free_symbols: + continue + key = list(key.free_symbols)[0] + if key not in self.kernel.range_tree_nodes: + continue + + if value == sympy.Integer(1): + axis = self.kernel.range_tree_nodes[key] + self.kernel.low_dims.add(axis.sorted_order) + + # all read index should be considered + buf_names = [node.node.name for node in self.kernel.node_schedule if + node not in (EnableReduction, DisableReduction)] + for node in self.kernel.node_schedule: + if node in (EnableReduction, DisableReduction): + continue + names = [] + + for read in node._body.memory_usage[MemoryUsageType.LOAD]: + #name = node._body.indexing_exprs_name[read] + name = read.index_name + arg = read.buffer_name + read_is_inptr = False if arg[:3] != 'arg' and arg in buf_names else True + if read_is_inptr: + names.append(name) + + for key, index in node._body.indexing.items(): + if key in names and index not in self.indexing: + self.indexing.append(index) + + if self.kernel.inside_reduction : + construct_low_dim() + return + + # for non-reduction, write index should be considered + for node in self.kernel.node_schedule: + if node in (EnableReduction, DisableReduction): + continue + names = [] + for write in node._body.memory_usage[MemoryUsageType.STORE]: + names.append(write.index_name) + for write in node._body.memory_usage[MemoryUsageType.STORE_REDUCTION]: + names.append(write.index_name) + for key, index in node._body.indexing.items(): + if key in names and index not in self.indexing: + self.indexing.append(index) + + construct_low_dim() + + @staticmethod + def convert(x, y): + xnumel = x + ynumel = y + if isinstance(xnumel, (sympy.Symbol, sympy.Expr)) and not isinstance(xnumel, sympy.Integer): + xnumel = xnumel.subs(V.graph.sizevars.var_to_val) + + if isinstance(ynumel, (sympy.Symbol, sympy.Expr)) and not isinstance(ynumel, sympy.Integer): + ynumel = ynumel.subs(V.graph.sizevars.var_to_val) + + if isinstance(xnumel, sympy.Integer) and isinstance(ynumel, int): + ynumel = sympy.Integer(ynumel) + + if isinstance(ynumel, sympy.Integer) and isinstance(xnumel, int): + xnumel = sympy.Integer(xnumel) + + return (xnumel, ynumel) + + + @staticmethod + def less_than(x, y): + xnumel, ynumel = SplitTiling.convert(x, y) + return xnumel < ynumel + + @staticmethod + def great_than(x, y): + xnumel, ynumel = SplitTiling.convert(x, y) + return xnumel > ynumel + + @staticmethod + def ge_than(x, y): + xnumel, ynumel = SplitTiling.convert(x, y) + return xnumel >= ynumel diff --git a/torch_npu/_inductor/codegen/tile_generator.py b/torch_npu/_inductor/codegen/tile_generator.py new file mode 100644 index 
0000000000..481cae31b6 --- /dev/null +++ b/torch_npu/_inductor/codegen/tile_generator.py @@ -0,0 +1,135 @@ +import copy +import pdb + +import math + +from torch._inductor.runtime.triton_heuristics import Config +from torch._inductor.runtime.runtime_utils import next_power_of_2 +from .triton_utils import get_aligned_numel, byte_per_numel +# generate tiling configs +class TileGenerator: + + @staticmethod + def aligned_numel(numel): + aligned = next_power_of_2(numel) + return aligned + + @staticmethod + def get_byte_per_numel(dtype): + if dtype is None : + return 1 + return byte_per_numel[dtype] + + @staticmethod + def valid_config(config, align_numel, rnumel = 1): + + bytes = align_numel + max_numel = 16384 * 4 // bytes + + rblock = config["RBLOCK"] if "RBLOCK" in config else rnumel + xblock_sub = config["XBLOCK_SUB"] + if rblock * xblock_sub <= max_numel: + return True + + return False + + # when rblock is low dim, need to maximize rblock + @staticmethod + def descend_xblock(rnumel, xblock, configs, cfg, align_numel, aggresive = True ): + + bytes = align_numel + start_numel = 2048 // bytes if aggresive else 1024 // bytes + # include rblock is too big, need to decend rblock first + rblock = rnumel if rnumel > 0 else 1 + while (rblock > start_numel): + newcfg = copy.deepcopy(cfg) + newcfg["RBLOCK"] = rblock + if TileGenerator.valid_config(newcfg, align_numel): + configs.append(Config(newcfg, num_warps=1, num_stages=1)) + rblock = rblock // 2 + cfg["RBLOCK"] = rblock + xblock_sub = TileGenerator.aligned_numel(xblock) + + while True: + newcfg = copy.deepcopy(cfg) + newcfg["XBLOCK_SUB"] = xblock_sub + if TileGenerator.valid_config(newcfg, align_numel, rnumel=rblock): + configs.append(Config(newcfg, num_warps=1, num_stages=1)) + xblock_sub = xblock_sub // 2 + if xblock_sub * rblock <= start_numel: + break + + @staticmethod + def descend_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive = True): + bytes = align_numel + start_numel = 4096 // bytes if aggresive else 1024 // bytes + + xblock_sub = start_numel if xblock > start_numel else xblock + cfg["XBLOCK_SUB"] = xblock_sub + rblock = rnumel + while True: + newcfg = copy.deepcopy(cfg) + newcfg["RBLOCK"] = rblock + if TileGenerator.valid_config(newcfg, align_numel): + configs.append(Config(newcfg, num_warps=1, num_stages=1)) + rblock = rblock // 2 + if xblock_sub * rblock <= start_numel: + break + + @staticmethod + def descend_xblock_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive = True) : + bytes = align_numel + start_numel = 4096 // bytes if aggresive else 1024 // bytes + + # Depending on the number of bytes available to the hardware UB, + # 4096 bytes is an appropriate empirical value for an intra-core split. 
+ # Rule: xblock_sub * rblock <= start_numel + end_numel = math.floor(math.sqrt(start_numel)) + + xblock = next_power_of_2(xblock) + rnumel = next_power_of_2(rnumel) + + xblock_sub = xblock if xblock > start_numel else xblock + rblock = start_numel if rnumel > start_numel else rnumel + + rblock_is_biggerr = rblock > xblock_sub + + if xblock_sub * rblock <= start_numel : + newcfg = copy.deepcopy(cfg) + newcfg["XBLOCK_SUB"] = xblock_sub + newcfg["RBLOCK"] = rblock + if TileGenerator.valid_config(newcfg, align_numel): + configs.append(Config(newcfg, num_warps=1, num_stages=1)) + + if rblock_is_biggerr: + while rblock > xblock_sub and xblock_sub * rblock > start_numel: + newcfg = copy.deepcopy(cfg) + newcfg["RBLOCK"] = rblock + xblock_sub = xblock + if TileGenerator.valid_config(newcfg, align_numel): + configs.append(Config(newcfg, num_warps=1, num_stages=1)) + rblock = rblock // 2 + else : + while rblock < xblock_sub and xblock_sub * rblock > start_numel: + newcfg = copy.deepcopy(cfg) + newcfg["XBLOCK_SUB"] = xblock_sub + if TileGenerator.valid_config(newcfg, align_numel): + configs.append(Config(newcfg, num_warps=1, num_stages=1)) + xblock_sub = xblock_sub // 2 + + while xblock_sub * rblock > start_numel : + newcfg = copy.deepcopy(cfg) + newcfg["XBLOCK_SUB"] = xblock_sub + newcfg["RBLOCK"] = rblock + if TileGenerator.valid_config(newcfg, align_numel): + configs.append(Config(newcfg, num_warps=1, num_stages=1)) + if xblock_sub >= end_numel: + xblock_sub = xblock_sub // 2 + if rblock >= end_numel: + rblock = rblock // 2 + + @staticmethod + def nearest_power_of_2(n): + big = next_power_of_2(n) + small = big // 2 + return big if (big - n) < (n - small) else small diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py new file mode 100644 index 0000000000..468c12a56a --- /dev/null +++ b/torch_npu/_inductor/codegen/triton.py @@ -0,0 +1,2090 @@ +import pdb +import os +import torch +from torch._inductor.utils import sympy_subs +from torch._inductor.scheduler import SchedulerNode +from typing import List,Set,Iterable,Callable,Sequence +import sympy +import operator +import itertools +from torch._inductor.codegen.simd import CantSplit, DisableReduction, EnableReduction +from torch._inductor.codegen.common import free_symbol_is_type +from torch._inductor.codegen.triton import ( + IndexingOptions, + triton_reshape, + TritonCSEVariable, + OpsHandler, +) +from torch._inductor.runtime.hints import ReductionHint +from torch._inductor.codegen.triton import ( + TritonKernel, + TritonKernelOverrides, + IterationRangesRoot, + IterationRangesEntry, + CSEVariable, + gen_common_triton_imports, + BlockPtrOptions, + triton_acc_type, + constant_repr, + is_welford_reduction,FixedTritonConfig, + prefix_is_reduction, upcast_acc_dtype +) + +from torch.utils._sympy.functions import FloorDiv, Identity, ModularIndexing +from torch._inductor.utils import sympy_index_symbol,generate_assert +from torch.utils import _pytree as pytree +from torch.utils._sympy.value_ranges import ValueRanges + +from typing import Dict +from enum import Enum +import functools + +from torch._inductor import config, ir +from torch._inductor.virtualized import ( + V, + StoreMode, + ReductionType, + _ops as ops, +) + +from torch._inductor.utils import ( + Placeholder, +) +from torch._inductor.runtime.runtime_utils import next_power_of_2 + + +from torch._inductor.codegen.common import ( + IndentedBuffer, + SizeArg, + DeferredLine, +) +from torch._inductor.codegen.triton_utils import config_of, signature_of, 
signature_to_meta + +from typing import ( + Optional, + Union, + Tuple, + Any, + cast +) + +import re +from torch.utils._sympy.symbol import SymT,symbol_is_type +from torch.utils._sympy.value_ranges import bound_sympy, ValueRangeAnalysis, ValueRanges +from torch.utils._sympy.numbers import int_oo +from ..runtime import NPUDeviceProperties +import textwrap +from .npu_kernel_features import NumelList +from torch._inductor.dtype_propagation import DtypePropagationOpsHandler + +def flatten(nums): + res = [] + for i in nums: + if isinstance(i, list): + res.extend(flatten(i)) + else: + res.append(i) + return res + +class AxisDirection(Enum): + Flat = 0, + Vertical = 1, + Horizontal = 2 + +def reverse_direction(direction): + if direction == AxisDirection.Vertical : + return AxisDirection.Horizontal + elif direction == AxisDirection.Horizontal : + return AxisDirection.Vertical + else : + return AxisDirection.Flat + + +class NPUTritonKernelOverrides(TritonKernelOverrides): + @staticmethod + def exp(x): + return f"tl_math.exp({x})" + @staticmethod + def sqrt(x): + return f"tl_math.sqrt({x})" + @staticmethod + def tanh(x): + return f"tl_math.tanh({x})" + @staticmethod + def rsqrt(x): + return f"tl.rsqrt({x})" + @staticmethod + def floor(x): + return f"tl_math.floor({x})" + @staticmethod + def erf(x): + return f"tl_math.erf({x})" + @staticmethod + def ceil(x): + return f"tl_math.ceil({x})" + + +def group_fn(self, sizes): + groups = list() + for s in sizes : + if not s : + groups.append(1) + elif isinstance(s, list): + group = flatten(s) + # for x in group : + # groups.append(x) + groups.append(NumelList(tuple(group)) if isinstance(group, list) else group) + else : + groups.append(s) + return tuple(groups) + +@staticmethod +def select_index_dtype(node_schedule, numel, reduction_numel): + return "tl.int32" + + + +class IterationRangesEntryNPUIndex(IterationRangesEntry) : + def __init__( + self, + *args, **kwargs): + super().__init__(*args, **kwargs) + self.is_tiling_axis1 = False + self.is_tiling_axis2 = False + self.is_split_axis = False + self.indexing_code = IndentedBuffer() + self.sorted_order = None + self.low_dims = set() + + + def _codegen_mask(self): + if self.is_tiling_axis1 or self.is_tiling_axis2 : + upper = f"{self.name}_numel" + line = f"{self.name}_mask = {self.name} < {upper}" + self.writeline(line) + line = f"{self.name}_prime_mask = {self.name}_prime < {upper}" + self.writeline(line) + else: + pass + + def _codegen(self): + index = None + vertical = self.is_tiling_axis1 if V.kernel.numof_reduction_axis() <=1 else not isinstance(self.expr, ModularIndexing) + direction = V.kernel.get_axis_direction(vertical) + # for multiple reduce dims, don't need this + if self.is_tiling_axis1 and V.kernel.numof_reduction_axis() <= 1: + index = f"{self.name} = {self.codegen_index(direction)}" + #to be fixed, only permute need to this . + self.writeline(f"{self.name}_prime = {self.codegen_index(reverse_direction(direction))}") + + elif self.is_tiling_axis2: + index = f"{self.name} = {self.codegen_index(direction)}" + #to be fixed, only permute need to this . 
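+ # NOTE: {name}_prime is the same index emitted along the reversed tiling direction;
+ # apply_var_prime() (defined below) substitutes it into load/store indexing when the
+ # access pattern is detected as permuted.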
+ self.writeline(f"{self.name}_prime = {self.codegen_index(reverse_direction(direction))}") + if V.kernel.inside_reduction and V.kernel.current_node \ + and isinstance(V.kernel.current_node, SchedulerNode) \ + and V.kernel.current_node.node \ + and V.kernel.current_node.node.data \ + and isinstance(V.kernel.current_node.node.data, ir.Reduction): + reduction_type = V.kernel.current_node.node.data.reduction_type + if reduction_type in {"argmax", "argmin"} : + self.writeline(f"{self.parent.prefix}index = " + f"{self.codegen_index(reverse_direction(AxisDirection.Flat))}") + if index: + self.writeline(index) + self._codegen_mask() + return self.name + + def writeline(self, line): + self.indexing_code.writeline(line) + + def codegen_index(self, direction): + if self.is_tiling_axis1 and V.kernel.axis2 is None and V.kernel.persistent_reduction: + index = f"tl.arange(0, RBLOCK)" + return index + elif self.is_tiling_axis1: + if self.is_split_axis : + offset = f"{self.symbol()}_offset" + index = f"{offset} + (loop1 * XBLOCK_SUB) + base1" + else : + index = f"(loop1 * XBLOCK_SUB) + base1" + + if V.kernel.axis2 is not None and direction != AxisDirection.Flat : + index += ("[None, :]" if direction == AxisDirection.Horizontal else "[:, None]") + return index + elif self.is_tiling_axis2 : + if V.kernel.persistent_reduction : + index = f"tl.arange(0, RBLOCK_{self.symbol()})" if V.kernel.numof_reduction_axis() > 1 else "base2" + elif self.is_split_axis: + offset = f"{self.symbol()}_offset" + index = f"{offset} + (loop2 * RBLOCK) + base2" + else : + index = "loop2 * RBLOCK + base2" + + if direction != AxisDirection.Flat: + index += ("[:, None]" if direction == AxisDirection.Vertical else "[None, :]") + return index + + def codegen_header(self, code): + # generate offset index loop + lines = [] + + if self.is_split_axis and not (V.kernel.axis2 is None and V.kernel.persistent_reduction): + lines.append(f"{self.symbol()}_offset = tl.program_id(0) * XBLOCK") + + if self.is_tiling_axis1 and not (V.kernel.axis2 is None and V.kernel.persistent_reduction): + # don't create loops for multi-reductions + if V.kernel.numof_reduction_axis() <= 1 : + lines.append("base1 = tl.arange(0, XBLOCK_SUB)") + xblock = f"XBLOCK" if self.is_split_axis else f"{self.symbol()}_numel" + lines.append(f"loops1 = ({xblock} + XBLOCK_SUB - 1) // XBLOCK_SUB") + + elif self.is_tiling_axis2 and len(V.kernel.axis2_list) <=1: + lines.append("base2 = tl.arange(0, RBLOCK)") + if self.is_split_axis: + lines.append(f"loops2 = (XBLOCK + RBLOCK - 1) // RBLOCK") + else: + lines.append(f"loops2 = ({self.name}_numel + RBLOCK - 1) // RBLOCK" ) + else: + pass + + code.writelines(lines) + + def precomputed_args(self): + # for dynamic shapes, find parts of indexing expressions that have to be precomputed + precomputed_args: List[sympy.Expr] = [] + if isinstance(self.expr, (sympy.Symbol, sympy.Integer)): + return precomputed_args + + assert isinstance(self.expr, (FloorDiv, ModularIndexing)), type(self.expr) + for arg in self.expr.args[1:]: + if not isinstance(arg, (sympy.Integer, sympy.Symbol)): + symbols = arg.free_symbols + if len(symbols) > 0 and all( + symbol_is_type(s, SymT.SIZE) for s in symbols + ): + precomputed_args.append(arg) + return precomputed_args + + +class IterationRangesRootNPUIndex(IterationRangesRoot): + def __init__( + self, + name: str, + numel: sympy.Expr, + prefix: str, + index: int, + kernel: TritonKernel, + pid_cache=None, + *, + is_loop: bool, + tensor_dim: Optional[int], + grid_dim: Optional[int], + ): + super().__init__(name, 
numel, prefix, index, kernel, pid_cache, is_loop=is_loop, tensor_dim=tensor_dim, + grid_dim=grid_dim, has_zdim= False ) + + def __repr__(self): + return f"IterationRangesRootNPUIndex({self.name!r}, {self.numel}, ...)" + + def remove_entry(self, name): + if name in self.var_ranges : + del self.var_ranges[name] + if name in self.var_list: + del self.var_list[self.var_list.index(name)] + if name in V.kernel.range_tree_nodes : + V.kernel.range_tree_nodes_removed[name] = V.kernel.range_tree_nodes[name] + del V.kernel.range_tree_nodes[name] + if name in self.nodes: + del self.nodes[name] + + def duplicated_check(self, divisor, length): + """ + Lookup a given RangeTreeEntry, creating it if needed + """ + if V.graph.sizevars.statically_known_equals(divisor * length, self.numel): + expr = FloorDiv(sympy_index_symbol(f"{self.prefix}index"), divisor) + else: + expr = ModularIndexing( + sympy_index_symbol(f"{self.prefix}index"), divisor, length + ) + + return expr not in self.nodes + + + def lookup(self, divisor, length): + """ + Lookup a given RangeTreeEntry, creating it if needed + """ + if V.graph.sizevars.statically_known_equals(divisor * length, self.numel): + expr = FloorDiv(sympy_index_symbol(f"{self.prefix}index"), divisor) + else: + expr = ModularIndexing( + sympy_index_symbol(f"{self.prefix}index"), divisor, length + ) + + if expr not in self.nodes: + node = IterationRangesEntryNPUIndex( + f"{self.prefix}{next(V.kernel.iter_vars_count)}", + divisor, + length, + expr, + self, + ) + V.kernel.range_tree_nodes[node.symbol()] = node + self.var_list.append(node.symbol()) + self.var_ranges[node.symbol()] = length + self.nodes[expr] = node + + + return self.nodes[expr] + + +def is_compatible(groups: Iterable[sympy.Expr], lengths: Sequence[Sequence[sympy.Expr]]): + try: + groups = flatten(groups) + NPUIndexTritonKernel._split_iteration_ranges(groups, lengths) + return True + except CantSplit: + return False + + +class NPUIndexTritonKernel(TritonKernel): + overrides = NPUTritonKernelOverrides + + def __init__( + self, + tiling: Dict[str, sympy.Expr], + min_elem_per_thread=0, + optimize_mask=True, + fixed_config: Optional[FixedTritonConfig] = None, + **kwargs,) : + + super().__init__(tiling = tiling, + min_elem_per_thread=min_elem_per_thread, + optimize_mask=optimize_mask, + fixed_config=fixed_config, + **kwargs) + self.first_node = True + self.inside_high_order_reduction = False + # split axis + self.split_axis = None + # tiling axis + self.axis1 = None + self.axis2 = None + # incase two reduction axis + self.axis2_list = [] + self.low_dims = set() + + self.range_tree_nodes_removed: Dict[sympy.Symbol, IterationRangesEntry] = {} + self.range_tree_nodes_substituted = {} + self.expr_substituted = {} + self.sorted_axis = [] + self.prefix: IndentedBuffer = IndentedBuffer() + + def gen_triton_ext_imports(self): + imports = IndentedBuffer() + imports.splice( + """ + from torch._inductor.runtime import triton_helpers + from torch_npu._inductor import npu_triton_heuristics + from torch_npu._inductor import npu_triton_helpers + from torch_npu._inductor.runtime import NPUDeviceProperties + from torch_npu._inductor.npu_triton_helpers import libdevice, math as tl_math + import torch + """ + ) + return imports.getvalue() + + + def patch_triton_hash(self): + # remove this method once the original invocation is fixed + import hashlib + from triton.compiler.compiler import triton_key, make_backend + from triton.runtime.driver import driver + backend = make_backend(driver.active.get_current_target()) + key = 
f"{triton_key()}-{backend.hash()}" + return hashlib.sha256(key.encode("utf-8")).hexdigest() + + def numof_reduction_axis(self): + root = self.range_trees[-1] + if root is None : + return 0 + + return len(root.var_list) + + def numof_tiling_axis(self): + return (1 if self.axis1 is not None else 0) + (1 if self.axis2 is not None else 0 ) + + #do nothing in NpuTritonKernel + def codegen_range_tree(self): + pass + + + def initialize_range_tree(self, pid_cache): + #self.numels = flatten(self.numels) + self.total_numels = 0 + for k, x in self.numels.items() : + if not isinstance(x, sympy.Integer) : + x = x.subs(V.graph.sizevars.var_to_val) + self.numels[k] = x + if x > 1 : + self.total_numels +=1 + + no_r_dim = not self.inside_reduction or self.numels["r"] == 1 + prefixes = "wvtzyxr" + active_prefixes = prefixes[-len(self.numels) :] + #prefix can not be 's', 'u', 'ps' , 'i', 'z', 'q' + #prefix can not be 'p' from torch 2.6.0 + grid_dims = "xyztvw" + if self.no_x_dim: + tensor_dims = "r" + elif no_r_dim: + tensor_dims = "xyztvw" + else: + tensor_dims = "xyztvwr" + tensor_dims = "".join(p for p in tensor_dims if p in active_prefixes) + for i, prefix in enumerate(active_prefixes): + is_reduction = prefix_is_reduction(prefix) + tensor_dim = tensor_dims.find(prefix) if prefix in tensor_dims else None + grid_dim = None if is_reduction else grid_dims.find(prefix) + index = i if grid_dim is None else grid_dim + self.range_trees.append( + IterationRangesRootNPUIndex( + f"{prefix}index", + self.numels[prefix], + prefix, + index, + self, + pid_cache=pid_cache, + is_loop=is_reduction and not self.persistent_reduction, + tensor_dim=tensor_dim, + grid_dim=grid_dim + ) + ) + + # numels sent to autotune configs + def get_size_hints(self): + size_hints = [] + + if (len(self.range_tree_nodes.values()) == 0): + return size_hints + + for i, node in enumerate(self.sorted_axis): + if isinstance(node.expr, ModularIndexing): + numel_expr = node.length + else: + numel_expr = node.expr.subs({sympy_index_symbol(r.name): r.numel for r in self.range_trees}) + + numel_expr = V.graph.sizevars.symbolic_hint(numel_expr) + + size_hints.append(numel_expr) + return size_hints + + # torch251 done + def add_numel_to_call_args_and_grid(self, name, call_args, arg_types, grid): + for node in self.sorted_axis: + if isinstance(node.expr, ModularIndexing) : + numel_expr = node.length + else : + numel_expr = node.expr.subs({sympy_index_symbol(r.name): r.numel for r in self.range_trees}) + + if isinstance(numel_expr, (sympy.Integer, sympy.Symbol)): + expr = numel_expr + else: + expr = V.graph.wrapper_code.generate_node_numel_expr(name, node, numel_expr) + call_args.append(expr) + arg_types.append(type(expr)) + if node.parent.grid_dim is not None: + grid.append(expr) + + def gen_numel_args(self, signature, triton_meta_signature, argdefs ): + for node in self.sorted_axis: + arg_name = f"{node.name}_numel" + if not os.environ.get('INDUCTOR_STATIC_MODE'): + sizearg = SizeArg(arg_name, node.length) + signature.append(sizearg) + triton_meta_signature[arg_name] = signature_of( + sizearg, size_dtype=self.index_dtype + ) + argdefs.append(arg_name) + else : + argdefs.append(f"{arg_name}: tl.constexpr") + self.triton_meta["constants"][arg_name] = node.length + + + # modify triton_meta, inductor_meta , etc. 
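+ # codegen_kernel assembles the final kernel source: the common Triton imports plus the NPU
+ # extension imports, an @npu_triton_heuristics.<pointwise|reduction|persistent_reduction>_npu_index
+ # decorator built from triton_meta/inductor_meta, and the @triton.jit def whose arguments
+ # typically include the per-axis *_numel args and the XBLOCK/XBLOCK_SUB/RBLOCK constexpr tiling args.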
+ def codegen_kernel(self, name=None): + code = IndentedBuffer() + size_hints = self.get_size_hints() + heuristics = self._get_heuristic() + if name is None: + code.splice(gen_common_triton_imports()) + # Note: add extra imports for extensions + code.splice(self.gen_triton_ext_imports()) + + if config.benchmark_kernel: + code.splice(self.imports_for_benchmark_kernel()) + + argdefs, _, signature, _ = self.args.python_argdefs() + + for i, arg in enumerate(signature): + if isinstance(arg, SizeArg): + symbol = cast(sympy.Symbol, arg.expr) + if symbol in V.graph.sizevars.inv_precomputed_replacements: + signature[i] = SizeArg( + arg.name, V.graph.sizevars.inv_precomputed_replacements[symbol] + ) + + triton_meta_signature = signature_to_meta( signature, size_dtype=self.index_dtype, argdefs = argdefs ) + + triton_meta = { + "signature": triton_meta_signature, + "device": + NPUDeviceProperties.create( + V.graph.get_current_device_or_throw() + ), + "constants": {}, + # special config for NPU, specify compile target + "mix_mode": "aiv", + } + + inductor_meta = self.create_inductor_meta() + num_gb = None + if config.benchmark_kernel or config.profile_bandwidth: + num_gb = self.estimate_kernel_num_bytes() / 1e9 + inductor_meta["kernel_num_gb"] = num_gb + + self.triton_meta = triton_meta + self.gen_numel_args(signature, triton_meta_signature, argdefs) + + #add in tiling args + self.add_autotune_args(argdefs) + #for scalar codegen + if len(self.range_tree_nodes) == 0: + self.write_scalar() + else: + self.codegen_body() + + for helper in self.helper_functions: + code.writeline("") + code.splice(helper) + + + # Note: override original triton_heuristics + if self.inside_reduction: + reduction_hint = self.features.get_reduction_hint() + heuristics_line = f""" + @npu_triton_heuristics.{heuristics}( + size_hints={size_hints}, + reduction_hint={reduction_hint}, + filename=__file__, + triton_meta={triton_meta!r}, + inductor_meta={inductor_meta!r} + ) + @triton.jit + """ + else: + tile_hint = "" + if len(size_hints) == 2: + if len(signature) == 4: # input, output and 2 args + tile_hint = "tile_hint=TileHint.SQUARE," + else: + tile_hint = "tile_hint=TileHint.DEFAULT," + heuristics_line = f""" + @npu_triton_heuristics.{heuristics}( + size_hints={size_hints!r}, {tile_hint} + filename=__file__, + triton_meta={triton_meta!r}, + inductor_meta={inductor_meta!r}, + min_elem_per_thread={self.min_elem_per_thread} + ) + @triton.jit + """ + code.splice(heuristics_line) + code.writeline( + f"def {name or str(Placeholder.KERNEL_NAME)}({', '.join(argdefs)}):" + ) + with code.indent(): + self.codegen_static_numels(code) + for old, new in self.args.aliases(): + code.writeline(f"{old} = {new}") + code.splice(self.body) + + if config.benchmark_kernel: + code.splice(self.codegen_kernel_benchmark(num_gb)) + + return code.getvalue() + + + + def codegen_static_numels(self, code): + no_x_axis = self.numof_reduction_axis() > 1 + symbols = [] + if self.axis2 is not None : + symbols = list(self.axis2_list) if no_x_axis else list([self.axis2]) + elif self.persistent_reduction and self.axis1 is not None: + symbols = list([self.axis1]) + + nodes = [self.range_tree_nodes[symbol] for symbol in symbols if symbol is not None] + for node in nodes: + if node.prefix == "r" and self.persistent_reduction: + simplified_tree_numel = V.graph.sizevars.simplify(node.length) + if isinstance(simplified_tree_numel, (sympy.Integer, int)): + val = int(simplified_tree_numel) + else: + continue + val = next_power_of_2(val) + if no_x_axis : + 
code.writeline(f"RBLOCK_{node.symbol()}: tl.constexpr = {val}") + else : + code.writeline(f"RBLOCK: tl.constexpr = {val}") + + def axis2_variable(self): + if self.axis2 is not None : + return self.range_tree_nodes[self.axis2] + return None + + def is_isolated_symbol(self, input_str, symbol): + # 使用正则表达式查找独立的符号, 防止out_ptr0 匹配上r0 r0_prime + pattern1 = r'\b' + re.escape(symbol) + r'\b' + pattern2 = r'\b' + re.escape(symbol+'_prime') + r'\b' + + return bool(re.search(pattern1, input_str)) or bool(re.search(pattern2, input_str)) + + def find_axis2_in_load_store(self): + var = self.axis2_variable() + if not var : + return False + for line in self.loads._lines : + if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, var.name): + return True + for line in self.compute._lines : + if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, var.name): + return True + for line in self.post_loop_store._lines : + if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, var.name): + return True + for line in self.stores._lines : + if isinstance(line,DeferredLine) : + line = line.line + if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, var.name): + return True + return False + + def find_axis2_in_indexing(self): + var = self.axis2_variable() + if not var : + return False + if self.current_node is None : + return False + for index in self.current_node._body.indexing.values() : + if var.symbol() in index.free_symbols : + return True + return False + + def write_scalar(self): + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + self.loads.clear() + self.compute.clear() + self.stores.clear() + self.post_loop_store.clear() + self.prefix.clear() + + def is_1d_reduction(self) : + return self.numels["r"] > 1 and self.axis2 is None + + def codegen_body(self): + if not ( + self.loads + or self.stores + or self.compute + or self.post_loop_store + ): + return + + def write_pointwise() : + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + + def codegen_range(index) : + def loop_body(index, indexing_code, is_last_axis, do_indent = True ) : + if do_indent: + self.body.do_indent() + if indexing_code : + self.body.splice(indexing_code) + + if is_last_axis: + write_pointwise() + else: + codegen_range(index + 1) + + if do_indent : + self.body.do_unindent() + + if index < 0 or index >= len(self.range_tree_nodes): + return + nodes = self.sorted_axis + range = nodes[index] + is_tilling_asix1 = getattr(range, "is_tiling_axis1") + is_tilling_asix2 = getattr(range, "is_tiling_axis2") + is_last_axis = index == len(nodes) -1 + indexing_code = getattr(range, "indexing_code") + numof_axis2 = self.numof_reduction_axis() + if is_tilling_asix1: + do_indent = True + reduction_1d = self.is_1d_reduction() + if reduction_1d : + self.body.splice(self.prefix) + self.prefix.clear() + + # multi-dim reduction, i.e. 
var_mean[1,2] + if numof_axis2 > 1: + if range.is_split_axis : + offset = f"{range.name}_offset" + self.body.writeline(f"for {range.name} in range({offset}, " + f"min({offset} + XBLOCK, {range.name}_numel)):") + else : + self.body.writeline(f"for {range.name} in range({range.name}_numel):") + # 1D persistent_reduction or 1d reduction non-first-node + elif self.axis2 is None and (self.persistent_reduction or len(self.loads._lines) == 0): + do_indent = False + if len(self.loads._lines) == 0: + indexing_code = None + else : + self.body.writeline(f"for loop1 in range(loops1):") + + + if not reduction_1d and self.persistent_reduction : + self.body.do_indent() + self.body.splice(self.prefix) + self.prefix.clear() + self.body.do_unindent() + + loop_body(index, indexing_code, is_last_axis, do_indent = do_indent) + + # for 1D reduction, need to add in suffix for persist_reduction or second node of 1d reduction + if self.is_1d_reduction() or self.persistent_reduction: + self.body.splice(self.post_loop_store) + self.post_loop_store.clear() + + + elif is_tilling_asix2: + do_indent = False + need_axis2_loop = self.find_axis2_in_load_store() + if not need_axis2_loop : + indexing_code = None + if (not self.inside_reduction or not self.persistent_reduction) \ + and need_axis2_loop: + self.body.splice(self.prefix) + self.body.writeline(f"for loop2 in range(loops2):") + do_indent = True + loop_body(index, indexing_code, is_last_axis, do_indent) + self.body.splice(self.post_loop_store) + self.post_loop_store.clear() + + elif is_last_axis and range.numel == 1: #pointwise , last axis =1 + write_pointwise() + else: + if range.is_split_axis : + offset = f"{range.symbol()}_offset" + self.body.writeline(f"for {range.symbol()} in range({offset}, min({offset} + XBLOCK, {range.name}_numel)):") + else : + self.body.writeline(f"for {range.symbol()} in range({range.name}_numel):") + loop_body(index, indexing_code, is_last_axis) + + if self.first_node: + for node in self.sorted_axis: + node.codegen_header(self.body) + + + if self.first_node: + codegen_range(0) + else : + if self.axis2 is None : + codegen_range(0) + else : + axis2_order = self.range_tree_nodes[self.axis2].sorted_order + if self.persistent_reduction and self.numof_reduction_axis() > 1 : + axis2_order = axis2_order - self.numof_reduction_axis() +1 + for _ in range(axis2_order) : + self.body.do_indent() + codegen_range(axis2_order) + for _ in range(axis2_order) : + self.body.do_unindent() + + self.cse.invalidate(self.outside_loop_vars) + self.loads.clear() + self.compute.clear() + self.stores.clear() + self.post_loop_store.clear() + self.prefix.clear() + #for root in self.range_trees: + # root.cache_clear() + self.first_node = False + + # for creat constant tensor, if have two axis, constant=tl.full([1,1]) else tl.full([1]) + def triton_tensor_ndim(self): + if self.numof_reduction_axis() > 1 : + return 1 + if self.axis1 is not None and self.axis2 is not None: + ndim = 2 + else: + ndim = 1 + return ndim + + # fixme, indexing.mask_str is None , see varmean_test.py + def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): + assert self.inside_reduction + self.inside_reduction = False + indexing = self.indexing(index, block_ptr=True) + self.inside_reduction = True + var = self.args.output(name) + if isinstance(indexing, BlockPtrOptions): + self.post_loop_store.writeline( + DeferredLine( + name, + self.codegen_block_ptr_store_line( + name, + indexing, + indexing.format(var), + value, + f", boundary_check={indexing.boundary_check()!r}", + ), + 
) + ) + else: + assert isinstance(indexing, IndexingOptions) + line = f"tl.store({var} + ({indexing.index_str} ), {value}, {indexing.mask_str})" + if self.numof_reduction_axis() > 1 : + line = f"tl.store({var} + ({indexing.index_str} + tl.arange(0,1) ), {value}, {indexing.mask_str})" + self.post_loop_store.writeline( + DeferredLine( name, line ) + ) + + def apply_var_prime(self, index, line, mask): + # axis should only be replaced once + axis_list = [] + for key in index.as_coefficients_dict().keys(): + if not key.free_symbols : + continue + symbol = list(key.free_symbols)[0] + if symbol not in self.range_tree_nodes : + continue + range = self.range_tree_nodes[symbol] + if (range.is_tiling_axis1 or range.is_tiling_axis2) and (symbol not in axis_list): + line = line.replace(f"{range.name}", f"{range.name}_prime") + mask = mask.replace(f"{range.name}", f"{range.name}_prime") + axis_list.append(symbol) + return line, mask + + # apply xxx_prime var in case dim are permuted + def store( + self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + ) -> None: + + var = self.args.output(name) + original_index = index + indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None) + index_str = indexing.index_str + value_str = f"{value}" + + # need to reshape when value's dimensions > 2, e.g. (XBLOCK,1,RBLOCK) + is_permuted = self.need_permuted(index) + + mask_str = indexing.mask_str + if is_permuted: + index_str, mask_str = self.apply_var_prime(index, index_str, indexing.mask_str) + value_str = value_str.replace(f"{value}", f"{value}.permute(1,0)") + + advance_block_ptr = None + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = self.codegen_block_ptr( + name, var, indexing + ) + # block_ptr stores don't do implicit casting + line = self.codegen_block_ptr_store_line( + name, indexing, block_ptr, value, other + ) + elif mode is None: + line = f"tl.store({var} + ({index_str}), {value_str}, {mask_str})" + if len(self.axis2_list) > 1 : + line = f"tl.store({var} + ({index_str} + tl.arange(0,1) ), {value_str}, {indexing.mask_str})" + + elif mode == "atomic_add": + line = f"tl.atomic_add({var} + ({index_str}), {value_str}, {indexing.mask_str})" + else: + raise NotImplementedError(f"store mode={mode}") + + self.stores.writeline(DeferredLine(name, line)) + if advance_block_ptr: + self.stores.writeline(advance_block_ptr) + + if not self.inside_reduction: + self.outside_loop_vars.add(value) + + + @staticmethod + def _get_next_scheduler_node(node_schedule, current_node): + found_current = False if current_node else True + for node in node_schedule : + if isinstance(node, SchedulerNode) : + if not found_current and node.get_name() == current_node.get_name() : + found_current = True + continue + if found_current : + return node + return None + + #fixme, this seems not reliable, need to refactor . 
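+ # get_next_scheduler_node/get_prev_scheduler_node walk self.node_schedule forward/backward
+ # from the given node; passing None returns the first (respectively last) SchedulerNode.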
+ def get_next_scheduler_node(self, node): + return self._get_next_scheduler_node(self.node_schedule, node) + + def get_prev_scheduler_node(self, node): + return self._get_next_scheduler_node(reversed(self.node_schedule), node) + + def check_all_index_is_1d_for_dual_reduction(self): + if self.numof_reduction_axis() <= 1: + return False + + all_index_is_1d = True + for _,index in self.current_node._body.indexing.items() : + count = 0 + for symbol in index.free_symbols : + if symbol in self.axis2_list : + count = count + 1 + if count > 1 : + all_index_is_1d = False + + if not all_index_is_1d : + break + return all_index_is_1d + + + # to generate the shape of the accumulator of RBLOCK loop + def dense_size_list(self, is_permute) -> List[str]: + + sizes = [] + if self.numof_reduction_axis() > 1: + sizes = [] if self.check_all_index_is_1d_for_dual_reduction() else [f"RBLOCK_{axis}" for axis in self.axis2_list] + return sizes + if self.persistent_reduction and self.axis2 is None : + sizes = ["RBLOCK" ] + return sizes + # current computedbuffer is reduction + cb_is_reduction = self.inside_reduction if not self.current_node else isinstance(self.current_node.node.data, ir.Reduction) + + for tree in self.sorted_axis: + if tree.is_tiling_axis1 : + sizes.append("XBLOCK_SUB") + elif tree.is_tiling_axis2: + sizes.append("RBLOCK") + + if cb_is_reduction and self.inside_reduction and self.is_higher_order_reduction() or is_permute: + sizes = sizes[::-1] + + return sizes + + def dense_size_str(self, is_permute = False): + sizes = self.dense_size_list(is_permute) + if self.numof_reduction_axis() > 1: + return f"[{'* '.join(sizes)}]" + return f"[{', '.join(sizes)}]" + + def filter_masks(self, mask_vars): + for node in self.sorted_axis: + if not(node.is_tiling_axis1 or node.is_tiling_axis2): + mask_vars.discard(f"{node.name}_mask") + if len(self.axis2_list) > 1 and not node.is_tiling_axis2: + mask_vars.discard(f"{node.name}_mask") + + + # and add to shape to value + def reduction_resize(self, value): + ndims = self.triton_tensor_ndim() + if ndims == 1: + return f"triton_helpers.promote_to_tensor({value})" + is_higher_order_reduction = self.is_higher_order_reduction() + + expand_str = "1," if is_higher_order_reduction else ",1" + if is_higher_order_reduction: + return f"{value}.reshape({expand_str}XBLOCK_SUB)" + else: + return f"{value}.reshape(XBLOCK_SUB{expand_str})" + + def get_axis_direction(self, is_axis1, reversed = False ): + + if self.check_all_index_is_1d_for_dual_reduction(): + result = AxisDirection.Flat + elif not self.inside_reduction : + if self.numof_tiling_axis() > 1 : + result = AxisDirection.Vertical if is_axis1 else AxisDirection.Horizontal + else : + result = AxisDirection.Flat + else : + if is_axis1 : + result = AxisDirection.Horizontal if V.kernel.is_higher_order_reduction() else AxisDirection.Vertical + else : + result = AxisDirection.Vertical if V.kernel.is_higher_order_reduction() else AxisDirection.Horizontal + + result = reverse_direction(result) if reversed else result + return result + + def is_higher_order_reduction(self, check_prev_node = False ): + if self.numof_reduction_axis() > 1 : + return False + assert self.inside_reduction + if self.inside_high_order_reduction : + return self.inside_high_order_reduction + + node = self.current_node if self.current_node is not None else self.get_prev_scheduler_node(None) + if node is None or not isinstance(node, SchedulerNode) : + return False + + reduction = node.node.data + while check_prev_node and reduction is not None and not 
isinstance(reduction, ir.Reduction) : + node = self.get_prev_scheduler_node(node) + if node is None : + reduction = None + else : + reduction = node.node.data + + + if reduction is None or not isinstance(reduction, ir.Reduction) : + return False + if not hasattr(reduction, "reduced_idx") : + return False + + reduced_order = reduction.reduced_idx[0] + is_last_axis = all(_ < reduced_order for _ in reduction.kept_idx) + self.inside_high_order_reduction = not is_last_axis + return self.inside_high_order_reduction + def get_axis_dtype(self, axis): + dtype = None + if axis is None : + return None + for node in self.node_schedule : + if node in (EnableReduction, DisableReduction) : + continue + if axis.symbol() in node._body.indexing_map : + dtype = V.graph.get_dtype(node.node.name) + break + if dtype is None : + should_break_all = False + for node in self.node_schedule: + if should_break_all: + break + if node in (EnableReduction, DisableReduction): + continue + for key, value in node._body.indexing_map.items(): + if key in self.range_tree_nodes : + dim = self.range_tree_nodes[key] + else : + dim = self.range_tree_nodes_removed[key] + + if dim.parent == axis.parent : + dtype = V.graph.get_dtype(node.node.name) + should_break_all = True + break + return dtype + def create_inductor_meta(self): + mutated_args = set() + for mutation in self.mutations: + if mutation in self.args.input_buffers: + mutated_args.add(self.args.input_buffers[mutation]) + if ( + mutation in self.args.inplace_buffers + and mutation not in V.graph.removed_buffers + and mutation not in self.removed_buffers + ): + mutated_args.add(self.args.inplace_buffers[mutation].inner_name) + if mutation in self.args.output_buffers: + mutated_args.add(self.args.output_buffers[mutation]) + mutated_args = sorted(mutated_args) + axis1_order = self.range_tree_nodes[self.axis1].sorted_order if self.axis1 is not None else None + axis2_order = self.range_tree_nodes[self.axis2].sorted_order if self.axis2 is not None else None + split_axis_dtype = self.get_axis_dtype(self.split_axis) + inductor_meta = { + "autotune_hints": set(self.autotune_hints), + "kernel_name": str(Placeholder.DESCRIPTIVE_NAME), + "mutated_arg_names": mutated_args, + "no_x_dim": self.no_x_dim, + # Due to breaking change of triton 3.0, the original invocation is broken + "backend_hash": self.patch_triton_hash(), # torch.utils._triton.triton_hash_with_backend(), + #"high_order_reduction" : self.inside_reduction and self.is_higher_order_reduction(True) , + "split_axis_order" : self.split_axis.sorted_order if self.split_axis is not None else None, + "axis1_order" : axis1_order, + "axis2_order": axis2_order, + "low_dims" : self.low_dims, + "numof_reduction_axis": self.numof_reduction_axis(), + "split_axis_dtype":split_axis_dtype + } + return inductor_meta + def reduction_dim(self): + assert self.inside_reduction + if self.numof_reduction_axis() > 1: + return 0 + return 0 if self.is_higher_order_reduction() or len(self.sorted_axis) ==1 else 1 + def reduction_var(self): + var = self.axis2 + return var + + + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[CSEVariable, Tuple[CSEVariable, ...]], + ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: + assert self.inside_reduction + masks = {f"{node.symbol()}_mask" for node in self.sorted_axis} + self.filter_masks(masks) + masks = sorted(masks) + if self._load_mask: + masks.append(self._load_mask) + reduction_range_prefix = self.range_trees[-1].prefix + + dense_size_str = 
self.dense_size_str(False) + + if len(dense_size_str) > 2 : + value = self._map_tuple_or_scalar( + lambda v: self.cse.generate( + self.compute, f"tl.reshape({v}, {dense_size_str})", dtype=v.dtype, + ), + value, + + ) + + dim: int + root_op: str + + def final_reduction(value): + #use_helper = reduction_type in {"any", "max", "min", "prod"} + module = "tl" # use tl + if reduction_type in {"max", "min"}: + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})" #use tl.max + ) + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})") + + def final_argreduce(buffer, result_var, value, index): + buffer.splice( + f"""\ + _, {result_var}_tmp = triton_helpers.{root_op}_with_index({value}, {index}, {dim}) + {result_var} = {self.reduction_resize(f'{result_var}_tmp')} + """ + ) + def get_reduction_axis() : + return list(self.range_tree_nodes.values())[-1] + + cache_key = (src_dtype, reduction_type, value) + if cache_key in self.cse.reduction_cache: + return self.cse.reduction_cache[cache_key] + + dim = self.reduction_dim() + acc_type = triton_acc_type(src_dtype) + torch_acc_type = upcast_acc_dtype(src_dtype) + result_var: Any = self.cse.newvar(dtype=torch_acc_type) + result_var.mask_vars = {var for var in masks if var[0] != "r"} + cond = " & ".join(masks) + + + def where_cond(tval, fval): + if not cond: + return tval + return TritonKernelOverrides.where(cond, tval, fval) + + if self.persistent_reduction: + default = ir.Reduction.default_value(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(constant_repr, default) + + def _mask_value(value, default): + return self.cse.generate(self.compute, where_cond(value, default) , dtype=value.dtype) + # fixme masked_value doesn't work dual reduction + if self.numof_reduction_axis() == 1 : + if isinstance(value, tuple): + masked_value = [_mask_value(v, d) for v, d in zip(value, default)] + else: + masked_value = _mask_value(value, default) + else : + masked_value = value + + if reduction_type in {"argmax", "argmin", "max", "min"}: + reduce_axis = get_reduction_axis() + broadcast_string: str + if self.is_1d_reduction(): + broadcast_string = f"tl.broadcast_to({reduce_axis.symbol()}.reshape({reduction_range_prefix.upper()}BLOCK), {masked_value}.shape)" + elif self.is_higher_order_reduction(): + broadcast_string = f"tl.broadcast_to({reduce_axis.symbol()}.reshape({reduction_range_prefix.upper()}BLOCK,1), {masked_value}.shape)" + else: + broadcast_string = f"tl.broadcast_to({reduce_axis.symbol()}.reshape(1,{reduction_range_prefix.upper()}BLOCK), {masked_value}.shape)" + accumulator_index = str( + self.cse.generate( + self.compute, + broadcast_string, + dtype=torch.int64 + ) + ) + if reduction_type == "argmax" or reduction_type == "argmin": + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + final_argreduce( + self.compute, result_var, masked_value, accumulator_index + ) + elif reduction_type == "max" or reduction_type == "min": + result_var = self.cse.generate( + self.compute, final_reduction(masked_value), dtype=masked_value.dtype, + ) + + elif reduction_type == "welford_reduce": + assert False, "welford_reduction is not supported now.." + elif reduction_type == "welford_combine": + assert False, "welford_combine is not supported now.." 
+ else: + result_var = self.cse.generate( + self.compute, final_reduction(masked_value), dtype=masked_value.dtype, + ) + else: + accumulator = self.cse.namedvar(f"_{result_var}", dtype=torch_acc_type) + default = ir.Reduction.default_accumulator(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(constant_repr, default) + if not isinstance(default, tuple): + self.prefix.writeline( + f"{accumulator} = tl.full({self.dense_size_str()}, {default}, {acc_type})" + ) + + if reduction_type in {"argmax", "argmin"}: + accumulator_index = f"_{result_var}_index" + long_max = torch.iinfo(torch.int64).max + self.prefix.writeline( + f"{accumulator_index} = tl.full({self.dense_size_str()}, {long_max}, tl.int64)" + ) + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_index}_next = triton_helpers.{root_op}imum_with_index( + {accumulator}, {accumulator_index}, {value}, {reduction_range_prefix}index + ) + {accumulator} = {where_cond(f'{accumulator}_next', accumulator)} + {accumulator_index} = {where_cond(f'{accumulator_index}_next', accumulator_index)} + """ + ) + final_argreduce(self.post_loop_store, result_var, accumulator, accumulator_index) + elif is_welford_reduction(reduction_type): + assert False, "welford_reduction is not supported now.." + else: + combine_fn = ir.get_reduction_combine_fn(reduction_type, src_dtype) + updated = combine_fn(accumulator, value) + self.compute.writeline( + f"{accumulator} = {where_cond(updated, accumulator)}" + ) + + if src_dtype == torch.bool: + accumulator = f"{accumulator}.to(tl.int8)" + result_type = triton_compute_type(dtype) + self.post_loop_store.writeline( + f"{result_var} = {final_reduction(accumulator)}.to({result_type})" + ) + else: + self.post_loop_store.writeline( + f"{result_var} = {final_reduction(accumulator)}" + ) + + self.cse.reduction_cache[cache_key] = result_var + + if isinstance(result_var, tuple): + self.outside_loop_vars |= set(result_var) + else: + self.outside_loop_vars.add(result_var) + + return result_var + #XBLICK:split size, XBLOCK_SUB : tile1 size, RBLOCK:tile2 size + def add_autotune_args(self, argdefs): + # no tiling in this case + if self.persistent_reduction and self.axis2 is None: + return + argdefs.append(f"XBLOCK: tl.constexpr") + if self.numof_reduction_axis() <= 1 : + argdefs.append(f"XBLOCK_SUB: tl.constexpr") + if self.axis2 is not None and not self.persistent_reduction: + argdefs.append(f"RBLOCK: tl.constexpr") + + def _get_heuristic(self): + if self.persistent_reduction: + assert self.inside_reduction + return "persistent_reduction_npu_index" + elif self.inside_reduction: + return "reduction_npu_index" + return "pointwise_npu_index" + + def need_broadcast(self, index: sympy.Expr): + tiling_axis = [False, False] + for axis in index.free_symbols: + if axis not in self.range_tree_nodes : + continue + if self.range_tree_nodes[axis].is_tiling_axis1: + tiling_axis[0] = True + elif self.range_tree_nodes[axis].is_tiling_axis2: + tiling_axis[1] = True + #implict broadcast + result = (self.numof_tiling_axis() > 1 and not self.persistent_reduction) and (tiling_axis[1] ^ tiling_axis[0]) + result = result and self.find_axis2_in_indexing() + return result, tiling_axis + + def current_node_has_permute(self): + if not self.current_node : + return False + for index in self.current_node._body.indexing.values(): + if self.need_permuted(index) : + return True + return False + def need_permuted(self, index: sympy.Expr): + if self.numof_tiling_axis() <= 1 : + 
return False + + need_permute = False + tmp_list = [] + coefficients_dict = index.as_coefficients_dict() + need_permute_axis1 = False + need_permute_axis2 = False + for key,value in coefficients_dict.items(): + if not key.free_symbols : + continue + key = list(key.free_symbols)[0] + if key not in self.range_tree_nodes : + continue + axis = self.range_tree_nodes[key] + # normally, axis2 is lowest dimension, except for higher_order_reduction + if (self.inside_reduction and self.is_higher_order_reduction(True)) : + if axis.is_tiling_axis1 and value > sympy.Integer(1): + need_permute_axis1 = True + elif axis.is_tiling_axis2 and value > sympy.Integer(1) : + need_permute_axis2 = True if self.numof_reduction_axis() <= 1 else isinstance(axis.expr, ModularIndexing) + tmp_list.append(True if value > sympy.Integer(1) else False) + + # If all axes have coefficients greater than 1, + # then the stride is not 1, and in this case, return false, + # indicating that the transpose is not required. + if all(tmp_list): + return False + return need_permute_axis1 or need_permute_axis2 + + def get_reshape_dense_str(self, tiling_axis): + # there must be one tiling asis missing + assert tiling_axis[1] or tiling_axis[0] + sizes = ["XBLOCK_SUB", "1"] + if not tiling_axis[0] : + sizes = ["1", "RBLOCK"] + + if self.inside_reduction and self.is_higher_order_reduction(): + sizes = reversed(sizes) + return f"[{', '.join(sizes)}]" + + def get_reshape_str(self, tiling_axis, check_prev_node = True): + # there must be one tiling asis missing + assert tiling_axis[1] or tiling_axis[0] + sizes = ["XBLOCK_SUB", "RBLOCK"] + if not tiling_axis[0] : + sizes[0] = "1" + elif not tiling_axis[1] : + sizes[1] = "1" + if self.inside_reduction and self.is_higher_order_reduction(check_prev_node): + sizes = reversed(sizes) + + return f"[{', '.join(sizes)}]" + + def get_broadcast_dense_str(self, tiling_axis, check_prev_node = True): + # there must be one tiling asis missing + assert tiling_axis[1] or tiling_axis[0] + sizes = ["XBLOCK_SUB", "RBLOCK"] + if self.inside_reduction and self.is_higher_order_reduction(check_prev_node): + sizes = reversed(sizes) + #elif not tiling_axis[0] : + # sizes = reversed(sizes) + return f"[{', '.join(sizes)}]" + + + #broadcast, permute handling + def load(self, name: str, index: sympy.Expr): + var = self.args.input(name) + original_index = index + is_permuted = self.need_permuted(index) + store_cache = self.cse.store_cache + if name in store_cache: + broadcasted, tiling_axis = self.need_broadcast(original_index) + result_var = store_cache[name] + if broadcasted: + line = f"{result_var}.broadcast_to({self.get_broadcast_dense_str(tiling_axis, True)})" + buffer = self.compute if self.persistent_reduction else self.loads + result_var = self.cse.generate(buffer, line, dtype=result_var.dtype) + elif is_permuted: + line = f"{result_var}.permute(1,0)" + buffer = self.compute if self.persistent_reduction else self.loads + result_var = self.cse.generate(self.loads, line, dtype=result_var.dtype) + return result_var + + need_broadcast, tiling_axis = self.need_broadcast(index) + indirect_indexing = self.is_indirect_indexing(index) + indexing = self.indexing(index, block_ptr=True) + has_rindex = indexing.has_rindex() + has_tmpmask = indexing.has_tmpmask() + is_coalesced = any( + i == 1 for i in self.get_strides_of_load(original_index).values() + ) + ep = "" + if ( + (has_tmpmask or has_rindex) + and V.graph.get_dtype(name) != torch.bool + and indexing.has_mask() + ): + other = ", other=0.0" + else: + other = "" + + 
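+ # Illustrative note: based on the analysis above, the value produced by tl.load below may be
+ # broadcast to the dense [XBLOCK_SUB, RBLOCK] shape (reversed for higher-order reductions) or,
+ # for permuted accesses, indexed via the *_prime variables and then .permute(1, 0)-ed.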
advance_block_ptr = None + append_broadcast = None + dtype = V.graph.get_dtype(name) + + if V.graph.is_unspec_arg(name): + line = var + else: + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = self.codegen_block_ptr( + name, var, indexing, other + ) + line = f"tl.load({block_ptr}{other}{ep})" + # add needed size=1 dimensions + line = triton_reshape( + line, indexing.block_shape, indexing.reshape_suffix + ) + elif isinstance(original_index, sympy.Integer): + line = f"tl.load({var} + ({original_index}))" + num_size = len(self.dense_size_list(is_permuted)) + append_broadcast = "[1, 1]" if (num_size > 1) else "[1]" + else: + index_str = indexing.index_str + mask_str = indexing.mask_str + if is_permuted: + index_str, mask_str = self.apply_var_prime(index, index_str, mask_str) + line = f"tl.load({var} + ({index_str}), {mask_str}{ep}{other})" + + dtype = V.graph.get_dtype(name) + if dtype in (torch.bfloat16, ): + line += ".to(tl.float32)" + if dtype == torch.bool and torch.version.hip is None: + line += ".to(tl.int1)" + if has_tmpmask: + # Masked loads must come after the mask is computed + load_buffer = self.compute + elif ( + self.inside_reduction + and self.range_trees[-1].is_loop + and not indirect_indexing + and not has_rindex + ): + # can lift a common load outside of reduction loop + # One exception is when this is an indirect_load. + load_buffer = self.prefix + + else: + load_buffer = self.loads + + result_var = self.cse.generate(load_buffer, line, dtype = dtype) + assert isinstance(result_var, TritonCSEVariable) + result_var.mask_vars = indexing.mask_vars # type: ignore[assignment] + + if append_broadcast and append_broadcast != '[]': + line = f"tl.broadcast_to({result_var}, {append_broadcast})" + result_var = self.cse.generate(load_buffer, line, dtype = dtype) + elif need_broadcast and not indirect_indexing: + #reshape_str = self.get_reshape_str(tiling_axis) + #.reshape({reshape_str}) + line = f"{result_var}.broadcast_to({self.get_broadcast_dense_str(tiling_axis)})" + result_var = self.cse.generate(load_buffer, line, dtype = dtype) + elif is_permuted: + line = f"{result_var}.permute(1,0)" + result_var = self.cse.generate(self.loads, line, dtype = dtype) + + if advance_block_ptr: + load_buffer.writeline(advance_block_ptr) + + if not self.inside_reduction or (not indexing.has_rmask() and not has_rindex): + self.outside_loop_vars.add(result_var) + + return result_var + + # don't call symlify_indexing + def prepare_indexing( + self, + index: sympy.Expr, + ): + #index = self.simplify_indexing(index) + index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) + # if simple replacements didn't get rid of floor/ceil, try full subs + if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): + index = index.subs(V.graph.sizevars.precomputed_replacements) + + if len(index.atoms(sympy.ceiling)): + for a in index.atoms(sympy.ceiling): + # for nested exprs, atoms yields top level first (?) + # so if everything goes fine, lower level replacements will come up empty + symbols = a.free_symbols + if len(symbols) > 0 and all( + symbol_is_type(s, (SymT.SIZE, SymT.PRECOMPUTED_SIZE)) + for s in symbols + ): + replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} + index = sympy_subs(index, replacements) + + #simp_index = self.simplify_indexing(index) + simp_index = index + + # Now that we are done simplifying we can unwrap Identity so that downstream handling + # for its contained expression will work. 
previously, tl.full wrapping of sympy.Integer + # would not occur + simp_index = ( + simp_index if not isinstance(simp_index, Identity) else simp_index.args[0] + ) + + return self.codegen_indexing(simp_index) + + #1. only remove the line which asserts index var should be in "xyr" + #2. don't do simplify_indexing, which combine continuous dims + #3. removed block_ptr, removed dense mask/broadcast support + # fixme, dense_mask_vars should be generated from sorted_axis + # upgraded to torch251 + def indexing( + self, + index: sympy.Expr, + *, + copy_shape=None, + dense_indexing=False, + override_mask=None, + block_ptr=False, + ) -> Union[IndexingOptions, BlockPtrOptions]: + """ + Compute the index and mask to pass to tl.load() or tl.store() + """ + index = self.prepare_indexing(index) + index_vars = index.free_symbols + has_rindex = False + #index = self.simplify_indexing(index) + index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) + # if simple replacements didn't get rid of floor/ceil, try full subs + if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): + index = index.subs(V.graph.sizevars.precomputed_replacements) + if len(index.atoms(sympy.ceiling)): + for a in index.atoms(sympy.ceiling): + # for nested exprs, atoms yields top level first (?) + # so if everything goes fine, lower level replacements will come up empty + symbols = a.free_symbols + if len(symbols) > 0 and all( + s.name.startswith("s") or s.name.startswith("ps") for s in symbols + ): + replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} + index = sympy_subs(index, replacements) + + #index = self.simplify_indexing(index) + index_vars = index.free_symbols + has_rindex = False + + mask_vars: Set[str] = set() + for var in index_vars: + assert isinstance(var, sympy.Symbol) + has_rindex = has_rindex or var.name.startswith("r") + if override_mask: + pass + elif var.name.startswith("tmp"): + # indirect indexing + cse_var = self.cse.varname_map[var.name] + mask_vars.update(cse_var.mask_vars) + elif var.name.startswith(("s", "ps", "i")): + pass + else: + # var is one of xN, yN or rN + # assert var.name[0] in "xyr", var.name + mask_vars.add(f"{var.name}_mask") + + expand_str = None + index_str = self.index_to_str(index) + is_permute = self.need_permuted(index) + if isinstance(index, sympy.Integer): + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str(is_permute) + if (index != 0): + index_str = f"tl.full({expand_str}, {index_str}, tl.int32)" + else: + index_str = f"tl.arange(0,1)" + return IndexingOptions(index_str, set(), "None", expand_str, has_rindex, index) + + if override_mask: + mask_vars = {override_mask} + if self._load_mask: + mask_vars.add(self._load_mask) + self.filter_masks(mask_vars) + mask_str = " & ".join(sorted(map(str, mask_vars))) if mask_vars else "None" + return IndexingOptions(index_str, mask_vars, mask_str, expand_str, has_rindex, index) # type: ignore[arg-type] + + + + def codegen_indexing(self, expr: sympy.Expr): + expr = V.graph.sizevars.simplify_with_ranges(expr, self.var_ranges()) + for sym in sorted(expr.free_symbols, key=str): + if sym in self.range_tree_nodes: + # if indexing expression is complicated, we precompute it on the host side + # and send the result as a kernel argument + replacements = {} + for ps in self.range_tree_nodes[sym].precomputed_args(): # type: ignore[index] + replacements[ps] = V.graph.sizevars.lookup_precomputed_size(ps) + if len(replacements) > 0: + self.range_tree_nodes[sym].expr = sympy_subs( # type: ignore[index] 
+ self.range_tree_nodes[sym].expr, replacements # type: ignore[index] + ) + self.range_tree_nodes[sym].codegen() # type: ignore[index] + return expr + + def split_and_set_ranges(self, lengths: Sequence[Sequence[sympy.Expr]]): + groups = [rt.numel for rt in self.range_trees] + if not self.inside_reduction: + groups[-1] = sympy.S.One + + return self.map_kernel_groups_to_node_sizes(groups, lengths, self.set_ranges) + + #support split multiple ranges (instead of double) from one flatten range, triple-ranges are needed in mamba model + @staticmethod + def _split_iteration_ranges( + groups: Iterable[sympy.Expr], lengths: Sequence[Sequence[sympy.Expr]] + ): + sv = V.graph.sizevars + new_ranges: List[List[sympy.Expr]] = [[] for _ in groups] + remaining = [sv.simplify(g) for g in groups] + for i, group in enumerate(remaining) : + if isinstance(group, (list, tuple)): + remaining[i] = NumelList(group).numels() + + var_count = itertools.count() + + def add_range(i, expr): + expr = sv.simplify(expr) + if not sv.statically_known_multiple_of(remaining[i], expr): + raise CantSplit() + # guard on the last item out + remaining[i] = FloorDiv(remaining[i], expr) + new_ranges[i].append(expr) + return next(var_count) + + def make_combined(strides, index_list): + def getter(flat_vars): + expr = sympy.Integer(0) + for stride, index in zip(strides, index_list) : + expr = stride * flat_vars[index] + expr + return expr + + return getter + + def size_hints(group): + if isinstance(group, (list,tuple)) : + return sv.size_hint(NumelList(group).numels()) + return sv.size_hint(group) + def add_multiple_range(size, return_getters): + # need to break size in multiple + index_list = [] + stride_list = [] + group = current_group + remained_size = size + # Two checks: + # 1. remaining sizes to be merged + # 2. 
remained_size is already divided to 1 + while (group < len(remaining) and remaining[group] > 1) and (remained_size > 1): + group_size = remaining[group] + # size should be divisible by group_size + if not sv.statically_known_multiple_of( remained_size, group_size ): + raise CantSplit() + index_list.append(add_range(group, group_size)) + remained_size = FloorDiv(remained_size, group_size) + stride_list.append(remained_size) + group = group + 1 + if remained_size != 1 : + raise CantSplit() + return_getters.append(make_combined(stride_list, index_list)) + + return_getters_groups = [] + current_group = 0 + + for length_group in lengths: + return_getters = [] + for size in length_group: + if sv.statically_known_equals(size, 1): # type: ignore[arg-type] + return_getters.append(lambda _: sympy.Integer(0)) + continue + + while ( + current_group < len(remaining) + and size_hints(remaining[current_group]) == 1 + ): + # scroll to next group with remaining elements + current_group += 1 + size_hint = sv.size_hint(size) + if current_group >= len(remaining) : + pdb.set_trace() + if size_hint > size_hints(remaining[current_group]): + #add multiple ranges (two or more) to the list, as well as the getter funcs + add_multiple_range(size_hint, return_getters) + else: + return_getters.append( + operator.itemgetter(add_range(current_group, size_hint)) + ) + return_getters_groups.append(return_getters) + + assert all( + V.graph.sizevars.size_hint(s) == 1 for s in remaining + ), f"failed to set ranges {remaining} {lengths}" + + return new_ranges, return_getters_groups + + + + # torch260 done + # just to override load method of CSEProxy, however, CSEProxy is an inner which can not be monkey patched, + # we need to override the whole inner class + def __enter__(self): + # TODO: hoist this to top level + class CSEProxy: + self.name = "CSEProxy" + vr_analysis = ValueRangeAnalysis() + + @staticmethod + def __getattr__(name: str) -> Callable[..., CSEVariable]: # type: ignore[misc] + def inner(*args, **kwargs): + bounds = CSEProxy._bound_variable(name, *args, **kwargs) + + value = getattr(parent_handler, name)(*args, **kwargs) # type: ignore[has-type] + dtype_handler = DtypePropagationOpsHandler() + + output_idx = 0 + + def do_cse(v): + # cpp backend doesnt set current device - TODO: fix + if V.graph.current_device is not None: + device_str = V.graph.get_current_device_or_throw().type + triton_backend = ( + config.cpu_backend == "triton" + if device_str == "cpu" + else config.cuda_backend == "triton" + ) + else: + triton_backend = False + + # only triton backend tracks dtype currently + if triton_backend: + if name == "masked": + output_dtype = value.dtype + else: + output_dtype = getattr( + dtype_handler, + name, + )(*args, **kwargs) + else: + # cpp backend doesnt track dtype yet + output_dtype = None + + csevar = V.kernel.cse.generate( + V.kernel.compute, + v, + bounds=bounds, + dtype=output_dtype, + ) + + nonlocal output_idx + if ( + config.test_configs.runtime_triton_dtype_assert + and triton_backend + ): + from torch._inductor.codegen.triton import triton_type + + # we tree_map over the output, so we need to fetch corresponding dtype + if isinstance(output_dtype, (list, tuple)): + output_dtype = output_dtype[output_idx] + + V.kernel.compute.writeline( + f"tl.static_assert({csevar}.dtype == {triton_type(output_dtype)})" + ) + output_idx += 1 + + csevar.update_on_args(name, args, kwargs) + + return csevar + + return pytree.tree_map(do_cse, value) + + return inner + + @staticmethod + def _bound_variable(name, *args, 
**kwargs): + """ + If the variable comes from an FX node, we forward the bound we have already computed + Else, if the variable when codegen'ing another op, we try to compute its bounds + """ + from torch._inductor.select_algorithm import TritonTemplateKernel + + if isinstance(V.kernel, TritonTemplateKernel): + return ValueRanges.unknown() + + fx_node = V.interpreter.current_node + if fx_node.target == name and self.node_to_bounds is not None: + assert isinstance(self.node_to_bounds, dict) + return self.node_to_bounds.get(fx_node, ValueRanges.unknown()) + elif config.compute_all_bounds and hasattr(ValueRangeAnalysis, name): + # These create lots of inner strings. We would need to compute the bounds at the ops + # We will also likely not get much from computing VRs on these nodes + if any( + s in fx_node.target + for s in ("set_indirect", "reduction", "scan") + ): + return ValueRanges.unknown() + + # We assume that the inputs come from `ops.` and are not strings. If you want to generate + # intermediary strings, wrap them in CSE variables with properly initialised bounds. + + # If there is no FX bound but we know how to compute one we do so + assert not kwargs + + def arg_to_bound(x): + if isinstance(x, CSEVariable): + return x.bounds + elif isinstance(x, sympy.Expr): + return bound_sympy(x) + else: + return x + + arg_bounds = list(map(arg_to_bound, args)) + return getattr(CSEProxy.vr_analysis, name)(*arg_bounds) + return ValueRanges.unknown() + + @staticmethod + def indirect_indexing( + var: CSEVariable, + size: Union[sympy.Expr, int], + check: bool = True, + wrap_neg=True, + ): + if isinstance(size, int): + size = sympy.Integer(size) + assert isinstance(size, sympy.Expr), size + # Skip CSE since this doesn't return an expression + + if var.bounds.lower < 0: # type: ignore[operator] + if wrap_neg: + stm = ops.add(var, ops.index_expr(size, torch.long)) + # Mixed negative and non-negative + if var.bounds.upper >= 0: # type: ignore[operator] + lt = ops.lt(var, 0) + stm = ops.where(lt, stm, var) + else: + stm = var + + # Propagate bounds as we know how to compute them properly + new_bounds = ValueRanges.unknown() + if var.bounds != ValueRanges.unknown() and isinstance( + size, sympy.Number + ): + # Take the negative part of the bound and add size to it + # Then take union of that and the positive part + # This is a tighter bound than that of a generic ops.where, as we have info on the cond + neg_bounds = var.bounds & ValueRanges(-int_oo, -1) + new_bounds = ValueRanges( + neg_bounds.lower + size, neg_bounds.upper + size + ) + # We don't have a good way of representing the empty range + if var.bounds.upper >= 0: # type: ignore[operator] + pos = var.bounds & ValueRanges(0, int_oo) + new_bounds = new_bounds | pos + + var = self.cse.generate(self.compute, stm, bounds=new_bounds) + + sympy_var = parent_handler.indirect_indexing(var, size, check) + if generate_assert(check): + assert_lower = not (var.bounds.lower >= 0) + # value ranges cannot x < s when x and s are symbols + assert_upper = not isinstance(size, sympy.Number) or not ( + var.bounds.upper < size + ) + self.check_bounds(sympy_var, size, assert_lower, assert_upper) + return sympy_var + + @staticmethod + def check_bounds( + expr: sympy.Expr, size: sympy.Expr, lower: bool, upper: bool + ): + return self.check_bounds(expr, size, lower, upper) + + @staticmethod + def load(name: str, index: sympy.Expr) -> CSEVariable: + if name in self.cse.invalidated_stores: + # A load from an invalidated store requires us to + # keep the actual buffer around 
+ V.kernel.must_keep_buffers.add(name) + if free_symbol_is_type(index, SymT.TMP): + return self.indirect_load(name, index) + store_cache = self.cse.store_cache + if name in store_cache: + return self.load(name, index) + #return store_cache[name] + out = self.load(name, index) + # count load that is not in the store_cache, and also not in the + # cse cache. + if out.use_count == 1: + self.num_load += 1 + return out + + @staticmethod + def _update_store_cache(name: str, value: CSEVariable): + self.cse.store_cache[name] = value + if self.current_node and name in V.graph.name_to_buffer: + buf = self.current_node.get_output(name) + for other_name in buf.get_mutations(): + self.cse.store_cache[other_name] = value + + @staticmethod + def store( + name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + ) -> None: + self.store_buffer_names.add(name) + if mode is None: + CSEProxy._update_store_cache(name, value) + if name not in V.graph.removed_buffers: + return self.store(name, index, value, mode=mode) + return None # type: ignore[return-value] + + @staticmethod + def store_reduction(name: str, index: sympy.Expr, value: CSEVariable): + self.store_buffer_names.add(name) + CSEProxy._update_store_cache(name, value) + + if name not in V.graph.removed_buffers: + return self.store_reduction(name, index, value) + + @staticmethod + def reduction( + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[CSEVariable, Tuple[CSEVariable, ...]], + ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: + self.num_reduction += 1 + return self.reduction(dtype, src_dtype, reduction_type, value) + + @staticmethod + def scan( + dtypes: Tuple[torch.dtype, ...], + combine_fn: Callable[ + [Tuple[CSEVariable, ...], Tuple[CSEVariable, ...]], + Tuple[CSEVariable, ...], + ], + values: Tuple[CSEVariable, ...], + ) -> Tuple[CSEVariable, ...]: + return self.scan(dtypes, combine_fn, values) + + @staticmethod + def sort( + dtypes: Tuple[torch.dtype, ...], + values: Tuple[CSEVariable, ...], + stable: bool, + descending: bool, + ) -> Tuple[CSEVariable, ...]: + return self.sort(dtypes, values, stable, descending) + + @staticmethod + def bucketize( + values: CSEVariable, + boundaries: Tuple[str, sympy.Expr, sympy.Expr, sympy.Expr], + boundary_indices: CSEVariable, + indexing_dtype: torch.dtype, + right: bool, + sorter: Optional[Tuple[str, sympy.Expr]] = None, + sorter_indices: Optional[CSEVariable] = None, + ) -> CSEVariable: + """ + [Note: Inductor bucketize op] + + Inputs: + ------- + values: the values to be bucketized. + boundaries: a tuple containing + (a) the name of the boundaries tensor (which must be sorted, unless + the sorting tensor is present), + (b) the length of the tensor in the last dimension (i.e. the length of + one set of boundaries), + (c) the number of elements in the underlying storage (i.e. the length + of the flattened tensor, ignoring striding), and + (d) the stride of the tensor in the last dimension. + boundary_indices: indices into a flattened version of the boundaries + tensor, of the same size and shape as "values". Each index points to + the first element in the set of boundaries to be used for the + corresponding value. + indexing_dtype: the dtype to use when indexing into the boundaries + tensor. This must be int64 or int32. This additionally specifies the + dtype of the return value. + right: see "Details" below. 
+ sorter: an optional tuple containing + (a) the name of an optional sorting tensor, used to access unsorted + boundaries without reordering the boundaries tensor, and + (b) the stride of the tensor in the last dimension. + The values in the sorting tensor are used as indices into the *last* + dimension of the boundaries tensor, with all other indices matching. + The size of the sorting and boundaries tensors must be equivalent. + sorter_indices: must be present if the sorting array is present; see + "boundary_indices" for the equivalent definition for the boundaries + tensor. + + Output: + ------- + The buckets each value belongs in, within a given set of boundaries. 0 + indicates a position before the first boundary, and len(boundaries_set) + represents a position after the last boundary. + + Details: + -------- + Given a value and a set of boundaries, calculate the bucket that each + value belongs to. This works differently in 1-D and N-D cases. + + for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [0, 4, 4, 8], right=True + return = [[ 0, 1, 1, 1], [1, 3, 3, 4]]. + + for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [[0, 4], [4, 8]], right=True + return = [[ 0, 1, 1, 1], [0, 1, 1, 2]] + + Note that in the N-D boundaries case, the shape of "values" and + "boundaries" must match in every dimension _except_ the last. + + When right == False, bucket i refers to range (boundaries[i], boundaries[i+1]]. + When right == True, bucket i refers to range [boundaries[i], boundaries[i+1]). + + Boundaries must be non-decreasing, or a sorter must be provided which + would re-index offsets in a non-decreasing order (e.g. the second output + of torch.sort(offsets)). Otherwise, the result is undefined. + """ + return self.bucketize( + values, + boundaries, + boundary_indices, + indexing_dtype, + right, + sorter, + sorter_indices, + ) + + # Use mypy to check protocol implemented correctly + def _typecheck_CSEProxy(h: CSEProxy) -> OpsHandler[CSEVariable]: + return h + + super().__enter__() + assert self.overrides + parent_handler = self.overrides(V.get_ops_handler()) + self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) + self.exit_stack.enter_context(V.set_kernel_handler(self)) + return self diff --git a/torch_npu/_inductor/codegen/triton_utils.py b/torch_npu/_inductor/codegen/triton_utils.py new file mode 100644 index 0000000000..7e07c80dba --- /dev/null +++ b/torch_npu/_inductor/codegen/triton_utils.py @@ -0,0 +1,29 @@ + +import torch + +# wrapper npu 32 bytes align, get and pass unalign info to triton meta +# then autotune choose tiling param and send them to bishengIR +byte_per_numel = { + torch.float32: 4, # torch.float32 or torch.float + torch.float64: 8, # torch.float64 or torch.double + torch.float16: 2, # torch.float16 or torch.half + torch.bfloat16: 2, # torch.bfloat16 + torch.int32: 4, # torch.int32 or torch.int + torch.int64: 8, # torch.int64 or torch.long + torch.int16: 2, # torch.int16 or torch.short + torch.int8: 1, # torch.int8 + torch.uint8: 1, # torch.uint8 + torch.bool: 1, # torch.bool + torch.complex32: 4, # torch.complex32 (not yet available in PyTorch as of the latest stable release) + torch.complex64: 8, # torch.complex64 + torch.complex128: 16 # torch.complex128 +} + + +def get_aligned_numel( dtype): + if dtype in byte_per_numel: + return 32 // byte_per_numel[dtype] + else: + return 1 + + diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py new file mode 100644 index 0000000000..67a1dbdab4 --- /dev/null +++ 
b/torch_npu/_inductor/codegen/wrapper.py @@ -0,0 +1,86 @@ +from torch._inductor.codegen.wrapper import PythonWrapperCodegen, SymbolicCallArg,SubgraphPythonWrapperCodegen +from torch._inductor.virtualized import V +from torch._inductor.utils import ( + cache_on_self, +) +from torch._inductor.runtime import triton_heuristics +from torch._inductor import config + +class NPUWrapperCodeGen(PythonWrapperCodegen): + def __init__(self): + super().__init__() + + @staticmethod + def create( + is_subgraph: bool, subgraph_name: str, parent_wrapper: PythonWrapperCodegen + ): + if is_subgraph: + return SubgraphPythonWrapperCodegen(subgraph_name, parent_wrapper) + return NPUWrapperCodeGen() + + @cache_on_self + def write_triton_header_once(self) -> None: + import_str = f""" + import triton + import triton.language as tl + from {triton_heuristics.__name__} import ( + split_scan_grid, + grid_combo_kernels, + start_graph, + end_graph, + cooperative_reduction_grid, + ) + from torch_npu._inductor.npu_triton_heuristics import grid + """ + if config.triton.autotune_at_compile_time: + self.kernel_autotune_calls.splice(import_str) + self.kernel_autotune_calls.writeline( + V.graph.device_ops.import_get_raw_stream_as("get_raw_stream") + ) + if not V.graph.cpp_wrapper: + self.imports.splice(import_str, strip=True) + self.imports.writeline( + V.graph.device_ops.import_get_raw_stream_as("get_raw_stream") + ) + + # @cache_on_self + # def write_triton_header_once(self) -> None: + # import_str = f""" + # import triton + # import triton.language as tl + # from {triton_heuristics.__name__} import grid, split_scan_grid, grid_combo_kernels, start_graph, end_graph + # from torch_npu._inductor.npu_triton_heuristics import grid + # """ + # self.imports.splice(import_str, strip=True) + # if config.triton.autotune_at_compile_time: + # self.kernel_autotune_calls.splice(import_str) + # self.write_get_raw_stream_header_once() + + #generate numel expr for range_tree_node + def generate_node_numel_expr(self, kernel_name: str, node, numel_expr): + expr = f"{kernel_name}_{node.name}_numel" + if (expr, V.graph) not in self.kernel_numel_expr: + # declare expr once in each graph (scope) + self.kernel_numel_expr.add((expr, V.graph)) + self.writeline( + f"{self.declare}{expr} = {self.expr_printer(numel_expr)}{self.ending}" + ) + else: + self.writeline(f"{expr} = {self.expr_printer(numel_expr)}{self.ending}") + # We can get symbolic expressions here, like s0*64 + # It is fine to have them here, but we need to handle them correctly as their own type + # This is tricky to do, so we wrap in a custom type, distinct from scalars, but also from sympy* + # scalars as well. + # This is handled in `generate_args_decl` which has a correct comment of: TODO: only works for + # constant now, need type info. I agree, this needs type info, and while this is not true type info + # it suffices as a type hint for the purposes of producing the correct code for this type. 
+        return SymbolicCallArg(expr, numel_expr)
+
+    # don't free anything
+    def make_buffer_free(self, buffer):
+        #return f"del {buffer.get_name()}"
+        return ""
+
+    # don't assert
+    def codegen_input_size_asserts(self) -> None:
+        pass
diff --git a/torch_npu/_inductor/config.py b/torch_npu/_inductor/config.py
new file mode 100644
index 0000000000..fe356796b9
--- /dev/null
+++ b/torch_npu/_inductor/config.py
@@ -0,0 +1,44 @@
+import os  # noqa: C101
+import sys
+import logging
+from typing import Any, Callable, Dict, Optional, TYPE_CHECKING
+from triton.runtime.driver import driver
+from torch._inductor import config
+enable_npu_indexing = True
+
+config.triton.unique_kernel_names = True
+# avoid test_opensora_cases_model_16_forward reinterpret_tensor issue
+config.allow_buffer_reuse = False
+# inductor debug switch
+config.trace.enabled = True
+
+# npu hardware params from triton
+target = driver.active.get_current_target()
+device = driver.active.get_current_device()
+prop = driver.active.utils.get_device_properties(device)
+
+num_cube_core = prop["num_aicore"]
+num_vector_core = prop["num_aicore"]
+
+# unit byte
+npu_block = 32
+
+if ("Ascend910B" in target.arch):
+    num_vector_core = num_cube_core * 2
+
+log_level_env = os.getenv('INDUCTOR_ASCEND_LOG_LEVEL', 'INFO').upper()
+log_level_mapping = {
+    'DEBUG': logging.DEBUG,
+    'INFO': logging.INFO,
+    'WARNING': logging.WARNING,
+    'ERROR': logging.ERROR,
+    'CRITICAL': logging.CRITICAL
+}
+log_level = log_level_mapping.get(log_level_env.upper(), logging.INFO)
+logging.basicConfig(
+    level=log_level,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+log = logging.getLogger(__name__)
+
+aggresive_autotune = os.getenv("INDUCTOR_ASCEND_AGGRESSIVE_AUTOTUNE", '0').lower() in ('1', 'true')
\ No newline at end of file
diff --git a/torch_npu/_inductor/decomposition.py b/torch_npu/_inductor/decomposition.py
new file mode 100644
index 0000000000..397bd5ef27
--- /dev/null
+++ b/torch_npu/_inductor/decomposition.py
@@ -0,0 +1,47 @@
+from torch._inductor.decomposition import decompositions, pw_cast_for_opmath
+from torch._inductor.decomposition import register_decomposition
+import torch._ops
+from .lowering import _init_set
+
+aten = torch.ops.aten
+
+DECOMPOSITION_OVERLOAD_OP = [
+    aten._log_softmax,
+    aten.nll_loss_forward,
+    # aten.gelu_backward,
+    # aten.gelu,
+    aten.nll_loss_backward,
+    aten._log_softmax_backward_data,
+    aten.embedding_dense_backward
+]
+
+def _register_npu_inductor_decompositons():
+
+    overload_op_set = set()
+    _init_set(DECOMPOSITION_OVERLOAD_OP, overload_op_set)
+
+    for op in overload_op_set:
+        if (op in decompositions):
+            del decompositions[op]
+
+    @register_decomposition([aten.scatter.src])
+    @pw_cast_for_opmath
+    def scatter_src(self, input_tensor, dim, index_tensor, source_tensor):
+        assert self.device.type == "npu" and dim == 1
+        (XNUMEL, YS) = input_tensor.shape
+        index_rblock = torch.arange(YS).npu().reshape((1, YS)).repeat((XNUMEL, 1))
+
+        index_tensor_brd = index_tensor.to(torch.int32).broadcast_to(XNUMEL, YS)
+        source_tensor_brd = source_tensor.broadcast_to(XNUMEL, YS).to(torch.float32)
+        scatter1 = torch.where(index_rblock == index_tensor_brd, 1.0, 0.0) * source_tensor_brd
+        return scatter1
+
+    @register_decomposition([aten.expm1])
+    def expm1(x):
+        tensor = torch.exp(x) - torch.ones_like(x)
+        return tensor
+
+    @register_decomposition([aten.erfc])
+    def erfc(x):
+        tensor = torch.ones_like(x) - torch.erf(x)
+        return tensor
\ No newline at end of file
diff --git a/torch_npu/_inductor/dynamo_patch3.py
b/torch_npu/_inductor/dynamo_patch3.py new file mode 100644 index 0000000000..15a52f61e5 --- /dev/null +++ b/torch_npu/_inductor/dynamo_patch3.py @@ -0,0 +1,11 @@ +# Issue: Error +# x.permute(0, 2, 1, 3).contiguous().view(xxx) +# Message: +import torch +from torch.library import Library, impl +python_dispatcher_lib = Library("aten", "IMPL", "PythonDispatcher") +@impl(python_dispatcher_lib, "embedding_backward") +def embedding_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq, sparse): + if sparse: + raise RuntimeError("the current NPU does not yet support sparse tensor, when sparse is set to True") + return torch.ops.aten.embedding_dense_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq) \ No newline at end of file diff --git a/torch_npu/_inductor/lowering.py b/torch_npu/_inductor/lowering.py new file mode 100644 index 0000000000..79d37f8f6c --- /dev/null +++ b/torch_npu/_inductor/lowering.py @@ -0,0 +1,333 @@ +import sympy +from torch._inductor.ir import Reduction +from torch._inductor.utils import sympy_product +from torch._inductor import ir +from torch._inductor.ir import ExpandView, TensorBox, ops_wrapper +from torch._inductor.lowering import sum_ +from torch._inductor import lowering +from torch._prims_common import ( + is_boolean_dtype, + is_integer_dtype, + get_computation_dtype, +) +from torch._inductor.decomposition import decompositions, pw_cast_for_opmath +import torch._ops + + +def make_reduction(reduction_type: str, override_return_dtype=None): + def inner(x, axis=None, keepdims=False, *, dtype=None): + kwargs = _make_reduction_inner( + x, + axis=axis, + keepdims=keepdims, + dtype=dtype, + override_return_dtype=override_return_dtype, + ) + result = Reduction.create(reduction_type=reduction_type, input_node=x, **kwargs) + if isinstance( + result.data.data, Reduction + ): #Only realize if reduction isn't unrolled + size = x.get_size() + axis = set(_validate_reduction_axis(x, axis)) + kept_idx = [] + reduced_idx = [] + for i in range(len(size)): + if i in axis: + reduced_idx.append(i) + else: + kept_idx.append(i) + + object.__setattr__(result.data.data, "kept_idx", kept_idx) + object.__setattr__(result.data.data, "reduced_idx", reduced_idx) + + result.realize() + return result + + return inner + + +lowering.make_reduction = make_reduction + +from torch._inductor.lowering import ( + lowerings, + make_fallback, + register_lowering, + to_dtype, + # make_reduction, + # reduce_amax, + # reduce_amin, + fallback_cumsum, + _validate_reduction_axis, + div, + squeeze, + square, + sub, + fallback_handler, + is_boolean_type, + logical_and, + make_pointwise, + _make_reduction_inner, + _validate_reduction_axis, +) + +aten = torch.ops.aten +tr_c10d = torch.ops.tr_c10d +prims = torch.ops.prims + +import torch_npu + +from torch_npu import npu_dtype_cast + + +def _init_set(input_list, output_set): + for fn in input_list: + output_set.add(fn) + if isinstance(fn, torch._ops.OpOverloadPacket): + for overload in fn.overloads(): + other_fn = getattr(fn, overload) + output_set.add(other_fn) + + +GENERATE_LIST = [ + aten.mul, + aten.add, + aten.sub, + aten.div, + aten.exp, + aten.maximum, + aten.sum, + aten.select, + aten.unsqueeze, + aten.repeat, + #aten.clone, + aten.reshape, + aten.where, + aten.lt, + aten.minimum, + aten.gt, + aten.le, + aten.ceil, + aten.floor, + aten.rsqrt, + aten.abs, + aten.log, + aten.bitwise_xor, + aten.amax, + # backward + prims.convert_element_type, + aten.min, + aten.max, + aten.erf, + aten.argmax, + aten.argmin, + aten.clamp_min, + 
aten.slice,
+    aten.neg,
+    aten.cat,
+    aten.arange,
+    aten.expand,
+    aten.eq,
+    aten.where,
+    aten.scalar_tensor,
+    aten.ge,
+    aten.permute,
+    aten.sqrt,
+    aten.relu,
+    aten.clamp,
+    aten.clamp_max,
+    aten.mean,
+    # npu.npu_dtype_cast
+    npu_dtype_cast,
+    aten.select_scatter,
+    aten.slice_scatter,
+    prims.broadcast_in_dim,
+    prims.maximum,
+    aten.ne,
+    aten.sigmoid,
+    aten.sign,
+    aten.logical_and,
+    aten.logical_or,
+    aten.logical_not,
+    aten.pow,
+    aten.gelu,
+    aten.tanh,
+    aten.isnan,
+    aten.bitwise_and,
+    aten.squeeze,
+    aten.copy,
+    aten.reciprocal
+]
+
+GENERATE_LIST2 = [
+    "foreach"
+]
+
+FALLBACK_LIST = []
+
+# Remove ops already registered in lowerings before re-registering them; otherwise lowering would find the version registered in torch
+LOWERING_OVERLOAD_OP = [
+    aten.cumsum,
+    aten.mean,
+    # aten.max,
+    # aten.min,
+    # aten.mul,
+    aten.var_mean,
+    aten.var,
+
+    # TODO: workaround for ElectraModel
+    aten.embedding,
+    aten.split,
+    aten.split_with_sizes,
+    aten.nll_loss_forward,
+    aten.gather,
+    aten.cat,
+    aten.clone
+]
+
+
+def _register_npu_inductor_fallbacks():
+    gen_set = set()
+    _init_set(GENERATE_LIST, gen_set)
+    overload_op_set = set()
+    _init_set(LOWERING_OVERLOAD_OP, overload_op_set)
+
+    # Fall back any op that is not on the allowlist
+    for op in lowerings:
+        if op not in decompositions and op not in gen_set:
+            if isinstance(op, torch._ops.OpOverloadPacket) or \
+                    isinstance(op, (torch._ops.OpOverload, torch._ops.HigherOrderOperator)):
+                flag = False
+                for gens in GENERATE_LIST2:
+                    if str(op).find(gens) != -1:
+                        flag = True
+                if flag:
+                    continue
+                else:
+                    make_fallback(op)
+                    FALLBACK_LIST.append(op)
+    # Delete ops that need to be overloaded from lowerings
+    for op in overload_op_set:
+        if op in lowerings:
+            del lowerings[op]
+
+    @register_lowering(aten.mean)
+    def mean(x, axis=None, keepdim=False, *, dtype=None):
+        if dtype is not None:
+            x = to_dtype(x, dtype)
+        size = x.get_size()
+        axis = _validate_reduction_axis(x, axis)
+        # compute in higher-precision until end of mean lowering
+        output_dtype = x.get_dtype()
+        if output_dtype in (torch.float16, torch.bfloat16):
+            x = to_dtype(x, torch.float)
+        sum_result = sum_(x, axis, keepdim)
+        denom = sympy_product(size[i] for i in axis)
+        denom = ir.IndexingConstant(index=denom, dtype=x.get_dtype(), device=x.get_device())
+        denom = ExpandView.create(denom, list(sum_result.get_size()))
+        return to_dtype(div(sum_result, denom), output_dtype)
+
+    # @register_lowering(aten.mean)
+    # def mean(x, axis=None, keepdim=False, *, dtype=None):
+    #     size = x.get_size()
+    #     if dtype is not None:
+    #         x = to_dtype(x, dtype)
+    #         size = x.get_size()
+    #     axis = _validate_reduction_axis(x, axis)
+
+    #     # compute in higher-precision until end of mean lowering
+    #     output_dtype = x.get_dtype()
+    #     if output_dtype in (torch.bfloat16,):
+    #         x = to_dtype(x, torch.float)
+    #     sum_result = sum_(x, axis, keepdim)
+    #     denom = sympy_product(size[i] for i in axis)
+    #     denom = ir.IndexingConstant(denom, x.get_dtype(), x.get_device())
+    #     denom = ExpandView.create(denom, list(sum_result.get_size()))
+    #     return to_dtype(div(sum_result, denom), output_dtype)
+
+    @register_lowering(aten.cumsum)
+    def cumsum(x, axis=None, dtype=None):
+        if (
+            is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype())
+        ) and dtype is None:
+            dtype = torch.int32  # torch.int64->torch.int32
+        if len(x.get_size()) == 0:
+            assert axis in [0, -1]
+            dtype = dtype or x.get_dtype()
+            return to_dtype(x, dtype, copy=True)
+        return fallback_cumsum(x, dim=axis, dtype=dtype)
+
+    @register_lowering(npu_dtype_cast, type_promotion_kind=None)
+    def _convert_npu_type(x: TensorBox, dtype: torch.dtype):
+        return to_dtype(x,
dtype, copy=True) + + + def var_mean_sum_(x, axis, correction, keepdim, return_mean): + if correction is None: + correction = 1 + + size = x.get_size() + axis = _validate_reduction_axis(x, axis) + x_mean = mean(x, axis, keepdim=True) + if return_mean: + x_mean.realize() + + diffs = square(sub(x, x_mean)) + sum_result = sum_(diffs, axis, keepdim) + denom = sympy_product(size[i] for i in axis) + if correction: + denom = sympy.Max(denom - correction, 0) + denom = ir.IndexingConstant(index=denom, dtype=x.get_dtype(), device=x.get_device()) + denom = ExpandView.create(denom, list(sum_result.get_size())) + x_var = div(sum_result, denom) + if not return_mean: + return (x_var,) + + x_mean = x_mean if keepdim else squeeze(x_mean, axis) + return x_var, x_mean + + + def var_mean_helper_(x, *, axis, correction, keepdim, return_mean): + out_dtype = x.get_dtype() + compute_dtype = get_computation_dtype(out_dtype) + x = to_dtype(x, compute_dtype, copy=False) + kwargs = dict( + x=x, + axis=axis, + correction=correction, + keepdim=keepdim, + return_mean=return_mean, + ) + output = ( + var_mean_sum_(**kwargs) + #todo: The welford reduction branch is annotated + # if use_two_step_variance(x,axis=axis,keepdim=keepdim) + # else var_mean_welford_(**kwargs) + ) + output = tuple(to_dtype(x, out_dtype, copy=False) for x in output) + return output[0] if not return_mean else output + + @register_lowering(aten.var_mean) + def var_mean(x, axis=None, *, correction=None, keepdim=False): + return var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=True + ) + + @register_lowering([aten.var, prims.var]) + def var_(x, axis=None, *, correction=None, keepdim=False): + return var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=False + ) + + @register_lowering(aten.embedding, type_promotion_kind=None) + def embedding(weight, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False): + return fallback_handler(aten.embedding.default)(weight, indices, padding_idx=-1, scale_grad_by_freq=False, + sparse=False) + + @register_lowering(aten.cat) + def cat(inputs, dim=0): + # todo:work round for electraModel backward + return fallback_handler(aten.cat.default)(inputs, dim) + + make_fallback(aten._log_softmax) + make_fallback(aten.gather) + make_fallback(aten.nll_loss_forward) diff --git a/torch_npu/_inductor/npu_choices.py b/torch_npu/_inductor/npu_choices.py new file mode 100644 index 0000000000..a3f7a2bf47 --- /dev/null +++ b/torch_npu/_inductor/npu_choices.py @@ -0,0 +1,40 @@ +import typing +from typing import Any, Dict, List, Type, TYPE_CHECKING + +import sympy + +from torch._inductor import config + +from torch._inductor.runtime.hints import ReductionHint + +from torch._inductor.virtualized import V +from torch._inductor.codegen.simd_kernel_features import SIMDKernelFeatures +from torch._inductor.codegen.triton import TritonKernel + + + + + +@staticmethod +def should_use_persistent_reduction( + features: SIMDKernelFeatures, cooperative_reduction: bool +) -> bool: + """ + Heuristic to decide if a persistent reduction should be used. 
+ """ + if not config.triton.persistent_reductions: + return False + threshold = { + ReductionHint.INNER: 1024, + ReductionHint.DEFAULT : 1024 + }.get(features.get_reduction_hint(), 64) + if cooperative_reduction: + # The RSPLIT of cooperative reductions means each thread block is operating on fewer elements + try: + threshold *= 32 // min(V.graph.sizevars.size_hint(features.numel), 32) + except ValueError: + pass # unbacked symint + + if config.triton.multi_kernel: + threshold *= 16 + return V.graph.sizevars.statically_known_leq(features.reduction_numel, threshold) # type: ignore[arg-types] \ No newline at end of file diff --git a/torch_npu/_inductor/npu_fusion_attention_graph.py b/torch_npu/_inductor/npu_fusion_attention_graph.py new file mode 100644 index 0000000000..443f20f966 --- /dev/null +++ b/torch_npu/_inductor/npu_fusion_attention_graph.py @@ -0,0 +1,231 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import sympy +import torch +import torch_npu +import functools +import torch.nn.functional as F +from torch.autograd import Function +from torch.library import Library, impl + +npu_def = Library("npu_graph", "DEF") +npu_lib = Library("npu_graph", "IMPL", "PrivateUse1") +meta_lib = Library("npu_graph", "IMPL", "Meta") + +npu_def.define("npu_fa(Tensor query, Tensor key, Tensor value, int head_num, str input_layout, Tensor? pse=None, Tensor? padding_mask=None, Tensor? atten_mask=None, float scale=1., float keep_prob=1., int pre_tockens=2147483647, int next_tockens=2147483647, int inner_precise=0, int[]? prefix=None, int[]? actual_seq_qlen=None, int[]? actual_seq_kvlen=None, int sparse_mode=0, bool gen_mask_parallel=True, bool sync=False) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)") +npu_def.define("npu_fa_backward(Tensor query, Tensor key, Tensor value, Tensor dy, int head_num, str input_layout, *, Tensor? pse=None, Tensor? padding_mask=None, Tensor? atten_mask=None, Tensor? softmax_max=None, Tensor? softmax_sum=None, Tensor? softmax_in=None, Tensor? attention_in=None, float scale_value=1., float keep_prob=1., int pre_tockens=2147483647, int next_tockens=2147483647, int inner_precise=0, Tensor? seed=None, Tensor? offset=None, Tensor? numels=None, int[]? prefix=None, int[]? actual_seq_qlen=None, int[]? 
actual_seq_kvlen=None, int sparse_mode=0, bool gen_mask_parallel=True, bool sync=False) -> (Tensor, Tensor, Tensor, Tensor)")
+
+@impl(npu_lib, "npu_fa")
+def npu_fa(*args, **kwargs):
+    if len(args) > 8:
+        args = list(args)
+        # for scale
+        try:
+            args[8] = 1.0 / args[8]
+        except ZeroDivisionError:
+            args[8] = 1.0 / (args[8] + 1e-6)
+            print("args[8]: scale is zero and cannot be inverted; adding epsilon")
+    # kwargs['scale'] = 1 / kwargs['scale']
+    r1, r2, r3, r4, seed, offset, numel = torch_npu.npu_fusion_attention(*args, **kwargs)
+    r2.requires_grad = False
+    r3.requires_grad = False
+    r4.requires_grad = False
+    return r1, r2, r3, r4, torch.tensor([seed], requires_grad=False), torch.tensor([offset], requires_grad=False), torch.tensor([numel], requires_grad=False)
+
+
+@impl(npu_lib, "npu_fa_backward")
+def npu_fa_backward(*args, **kwargs):
+    if 'scale_value' in kwargs:
+        kwargs['scale_value'] = 1.0 / kwargs['scale_value']
+    return torch_npu.npu_fusion_attention_grad(*args, **kwargs)
+
+
+@impl(meta_lib, "npu_fa")
+def npu_fa(query, key, value, head_num, input_layout, pse=None, padding_mask=None,
+           atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647,
+           inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False):
+    B = query.size(0)
+    N = head_num
+    S1 = query.size(2)
+    S2 = key.size(2)
+
+    if input_layout == "BSH":
+        B = query.size(0)
+        S1 = query.size(1)
+        S2 = key.size(1)
+
+    if input_layout == "SBH":
+        B = query.size(1)
+        S1 = query.size(0)
+        S2 = key.size(0)
+
+    attention_score = torch.empty_like(query, dtype=query.dtype, device='meta').contiguous()
+    softmax_max = torch.empty([B, head_num, S1, 8], dtype=torch.float32, device='meta')
+    softmax_sum = torch.empty([B, head_num, S1, 8], dtype=torch.float32, device='meta')
+    softmax_out = torch.empty([0], dtype=query.dtype, device='meta')
+    return (torch.empty_like(attention_score),
+            torch.empty_like(softmax_max),
+            torch.empty_like(softmax_sum),
+            torch.empty_like(softmax_out),
+            torch.tensor([0], device='meta', requires_grad=False),
+            torch.tensor([0], device='meta', requires_grad=False),
+            torch.tensor([0], device='meta', requires_grad=False))
+
+@impl(meta_lib, "npu_fa_backward")
+def npu_fa_backward(query, key, value, dy, head_num, input_layout, *, pse=None, padding_mask=None, atten_mask=None,
+                    softmax_max=None, softmax_sum=None, softmax_in=None, attention_in=None, scale_value=1.0,
+                    keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, inner_precise=0, seed=0, offset=0,
+                    numels=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False):
+
+    dq = torch.empty_like(query, dtype=query.dtype, device='meta').contiguous()
+    dk = torch.empty_like(key, dtype=query.dtype, device='meta').contiguous()
+    dv = torch.empty_like(value, dtype=query.dtype, device='meta').contiguous()
+    dpse = torch.empty([0], dtype=query.dtype, device='meta').contiguous()
+    return (torch.empty_like(dq), torch.empty_like(dk), torch.empty_like(dv), torch.empty_like(dpse) if pse else None)
+
+class NpuGraphAttentionFunction(Function):
+    @staticmethod
+    def forward(ctx, query, key, value, head_num, input_layout, pse=None, padding_mask=None, atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False):
+        # Forward pass logic
+        # Assumes a forward implementation `npu_fusion_attention_forward` is available
+        result0, result1, result2,
result3, result4, result5, result6 = torch.ops.npu_graph.npu_fa(
+            query, key, value, head_num, input_layout, pse=pse, padding_mask=padding_mask, atten_mask=atten_mask, scale=scale, keep_prob=keep_prob, pre_tockens=pre_tockens, next_tockens=next_tockens, inner_precise=inner_precise, prefix=prefix, actual_seq_qlen=actual_seq_qlen, actual_seq_kvlen=actual_seq_kvlen, sparse_mode=sparse_mode, gen_mask_parallel=gen_mask_parallel, sync=sync
+        )
+        # Save intermediate results for use in the backward pass
+        ctx.save_for_backward(query, key, value, pse, padding_mask, atten_mask, result1, result2, result3, result0, result4, result5, result6)
+        ctx.head_num = head_num
+        ctx.input_layout = input_layout
+        ctx.scale = scale
+        ctx.keep_prob = keep_prob
+        ctx.pre_tockens = pre_tockens
+        ctx.next_tockens = next_tockens
+        ctx.inner_precise = inner_precise
+        ctx.prefix = prefix
+        ctx.actual_seq_qlen = actual_seq_qlen
+        ctx.actual_seq_kvlen = actual_seq_kvlen
+        ctx.sparse_mode = sparse_mode
+        ctx.gen_mask_parallel = gen_mask_parallel
+        ctx.sync = sync
+
+        return result0, result1, result2, result3, result4, result5, result6
+
+    @staticmethod
+    def backward(ctx, grad_result0, grad_result1, grad_result2, grad_result3, grad_result4, grad_result5, grad_result6):
+        # Fetch the saved intermediate results
+        query, key, value, pse, padding_mask, atten_mask, result1, result2, result3, result0, result4, result5, result6 = ctx.saved_tensors
+        # Backward pass logic
+        # Assumes a backward implementation `npu_fusion_attention_backward` is available
+        grad_query, grad_key, grad_value, grad_pse = torch.ops.npu_graph.npu_fa_backward(
+            query, key, value, grad_result0, ctx.head_num, ctx.input_layout, pse=pse, padding_mask=padding_mask, atten_mask=atten_mask, softmax_max=result1, softmax_sum=result2, softmax_in=result3, attention_in=result0, scale_value=ctx.scale, keep_prob=ctx.keep_prob, pre_tockens=ctx.pre_tockens, next_tockens=ctx.next_tockens, inner_precise=ctx.inner_precise, seed=result4, offset=result5, numels=result6, prefix=ctx.prefix, actual_seq_qlen=ctx.actual_seq_qlen, actual_seq_kvlen=ctx.actual_seq_kvlen, sparse_mode=ctx.sparse_mode, gen_mask_parallel=ctx.gen_mask_parallel, sync=ctx.sync
+        )
+        return (grad_query, grad_key, grad_value, None, None, grad_pse, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None)
+
+def npu_fusion_attention_graph(query, key, value, head_num, input_layout, pse=None, padding_mask=None,
+                               atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647,
+                               inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False):
+    #import pdb;pdb.set_trace()
+    return NpuGraphAttentionFunction.apply(query, key, value, head_num, input_layout, pse, padding_mask,
+                                           atten_mask, scale, keep_prob, pre_tockens, next_tockens,
+                                           inner_precise, prefix, actual_seq_qlen, actual_seq_kvlen, sparse_mode, gen_mask_parallel, sync)
+torch_npu.npu_fusion_attention_graph = npu_fusion_attention_graph
+
+def register_fx_pass():
+    TOKEN_MAX = 2147483647
+    from torch._inductor.pattern_matcher import register_replacement, fwd_only, joint_fwd_bwd
+    from torch._inductor.fx_passes.joint_graph import patterns
+    from torch._dynamo.utils import counters
+    from torch._inductor.fx_passes.fuse_attention import partialize_and_update_signature
+    def _npu_fusion_attention_graph_pattern_1(query, key, value, inv_scale_factor, dropout_p):
+        q = query.permute(0, 2, 1, 3)
+        k = key.permute(0, 2, 1, 3)
+        v = value.permute(0, 2, 1, 3)
+        return torch.nn.functional.dropout(
+            torch.matmul(q,
k.transpose(-2, -1)).div(inv_scale_factor).softmax(dim=-1), + p=dropout_p, + ).matmul(v) + + + def _npu_fusion_attention_graph_replacement_1(query, key, value, inv_scale_factor, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + head_num = query.size(2) + input_layout = "BNSD" + return torch_npu.npu_fusion_attention_graph( + query.transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + head_num, + input_layout, + None, + atten_mask = None, + scale = inv_scale_factor, + keep_prob = 1.0 - dropout_p, + )[0] + + def _get_sfdp_patterns(): + device = 'npu' + g_inp = functools.partial( + torch.empty, (2, 4, 8, 16), device=device, requires_grad=True + ) + c_inp = functools.partial(torch.tensor, 2.0, device=device) + d = {"dropout_p": 0.113377} + candidates = [] + for dtype in [torch.float]: + g = functools.partial(g_inp, dtype=dtype) + c = functools.partial(c_inp, dtype=dtype) + candidates.append(( + _npu_fusion_attention_graph_pattern_1, + _npu_fusion_attention_graph_replacement_1, + [g(), g(), g(), c()], + d, + )) + + for pattern, replacement, args, workaround in candidates: + # XXX: when adding a new pattern, re-run `gen_attention_patterns` so the pattern + # gets serialized to a python file and does not require tracing at runtime. + assert isinstance(workaround, dict) + name = pattern.__name__ + + if dtype != torch.float: + name += "_half" + + if args[0].size(0) == 1: + name += "_bs1" + + training_name = name + "_training" + yield training_name, { + "search_fn": pattern, + "replace_fn": replacement, + "example_inputs": args, + "trace_fn": joint_fwd_bwd, + "pass_dicts": patterns, + "scalar_workaround": workaround, + } + + if workaround: + assert len(workaround) == 1 and "dropout_p" in workaround + # functools.partial insufficient because we look at signature downstream + pattern = partialize_and_update_signature(pattern, dropout_p=0.0) + replacement = partialize_and_update_signature( + replacement, dropout_p=0.0 + ) + workaround = {} + + inference_name = name + "_inference" + yield inference_name, { + "search_fn": pattern, + "replace_fn": replacement, + "example_inputs": args, + "trace_fn": fwd_only, + "pass_dicts": patterns, + "scalar_workaround": workaround, + } + + for key, register_replacement_kwargs in _get_sfdp_patterns(): + register_replacement( + **register_replacement_kwargs, + ) + +register_fx_pass() + + + diff --git a/torch_npu/_inductor/npu_triton_helpers.py b/torch_npu/_inductor/npu_triton_helpers.py new file mode 100644 index 0000000000..f4e22275da --- /dev/null +++ b/torch_npu/_inductor/npu_triton_helpers.py @@ -0,0 +1,18 @@ +import triton +import triton.language as tl + +import triton.language.extra.ascend.libdevice as libdevice +libdevice = tl.extra.ascend.libdevice +math = tl.math + +from torch._inductor.runtime import triton_helpers +@triton.jit +def maximum(a, b): + return tl.maximum(a, b) + +@triton.jit +def minimum(a, b): + return tl.minimum(a, b) + +triton_helpers.maximum = maximum +triton_helpers.minimum = minimum diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py new file mode 100644 index 0000000000..e4cf47c7ef --- /dev/null +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -0,0 +1,969 @@ +# This file is based on triton_heuristics with heuristics designed for NPU +import os +import functools +import copy +import pdb +from typing import Any, Callable, List, Optional +from .config import log +import logging +from .config import aggresive_autotune + +import torch +import re + +from 
torch._inductor import config +import hashlib +from .codegen.tile_generator import TileGenerator +from .codegen.triton_utils import get_aligned_numel + +from torch._inductor.runtime.triton_heuristics import ( + CachingAutotuner, + HeuristicType, + unique_configs, + hash_configs, + Config, + ASTSource, + _find_names, + get_first_attr, + collected_calls, +) +import json +from torch._inductor.runtime.benchmarking import benchmarker +from torch._inductor.runtime.autotune_cache import AutotuneCache + + +from torch._inductor.runtime.runtime_utils import ( + create_bandwidth_info_str, + get_num_bytes, + +) +from .codegen.split_tiling import SplitTiling +import triton +from triton.compiler import CompiledKernel + +try: + from triton.backends.compiler import GPUTarget + from triton.runtime.autotuner import OutOfResources + import torch.autograd.profiler as autograd_profiler +except ImportError: + GPUTarget = None + OutOfResources = None + autograd_profiler = None + +from .utils import get_current_raw_stream + + +# torch-261 +class NPUCachingAutotuner(CachingAutotuner): + def __init__( + self, + fn, + triton_meta, # passed directly to triton + configs, + save_cache_hook, + mutated_arg_names: List[str], # see [Note: clone mutated buffers] + optimize_mem, + heuristic_type, + size_hints=None, + inductor_meta=None, # metadata not relevant to triton + custom_kernel=False, # whether the kernel is inductor-generated or custom + filename: Optional[str] = None, + reset_to_zero_arg_names: Optional[List[str]] = None, + ): + super().__init__(fn, triton_meta, configs, save_cache_hook, mutated_arg_names, optimize_mem, heuristic_type, + size_hints, inductor_meta, custom_kernel, filename, reset_to_zero_arg_names) + + self.exceptions = [] + + def precompile(self, warm_cache_only=False): + # FIXME + # xpu_graph changed TORCHINDUCTOR_CACHE_DIR. + # When TORCHINDUCTOR_COMPILE_THREADS > 1, multiprocessing's fork method + # does not propagate TORCHINDUCTOR_CACHE_DIR into the child threads. + # However, after all the child threads finished, the main thread reaches + # here and inherits xpu_graph's TORCHINDUCTOR_CACHE_DIR. Then the main + # thread finds the cache dir does not have any compiled kernel. It will + # compile all kernels one by one. + # So we directly replace TORCHINDUCTOR_CACHE_DIR with the standard cache dir. + if ("xpu_graph" in os.getenv("TORCHINDUCTOR_CACHE_DIR", "")): + import re + import getpass + import tempfile + sanitized_username = re.sub(r'[\\/:*?"<>|]', "_", getpass.getuser()) + cache_dir = os.path.join( + tempfile.gettempdir(), + "torchinductor_" + sanitized_username, + ) + os.environ["TORCHINDUCTOR_CACHE_DIR"] = cache_dir + os.environ["TRITON_CACHE_DIR"] = os.path.join(cache_dir, "triton", "0") + with self.lock: + if self.launchers: + return + self.launchers = [] + compiled_binaries = [] + if not self.configs: + raise RuntimeError("No triton configs are available") + for c in self.configs: + try: + compiled_binary, launcher = self._precompile_config( + c, warm_cache_only + ) + except Exception as e: + log.debug(f"[thread {os.getpid()}][InductorNPU.precompile] Exception = {e}, kernel = {self.fn.__name__} config = {c}") + # Skip the config if the compilation fails + continue + if launcher is not None : + self.launchers.append(launcher) + compiled_binaries.append(compiled_binary) + + if len(self.launchers) == 0: + raise RuntimeError( + "No valid triton configs. 
Report a fatal compilation error" + ) + + self.configs = None + + + + def _precompile_config(self, cfg: Config, warm_cache_only: bool): + """Ahead of time compile a given autotuner config.""" + compile_meta = copy.deepcopy(self.triton_meta) + + for k, v in cfg.kwargs.items(): + if k not in self.fn.arg_names : + continue + compile_meta["constants"][k] = v + + compile_meta["num_warps"] = cfg.num_warps + compile_meta["num_stages"] = cfg.num_stages + + compile_meta["debug"] = ( + os.getenv("INDUCTOR_ASCEND_DEBUG", 'false').lower() in ('true', '1') and + config.assert_indirect_indexing and torch.version.hip is None + ) + + # device type will be "hip" rather than "cuda" here + compile_meta["device_type"] = self.device_props.type + compile_meta["cc"] = self.device_props.cc + + if ASTSource: + compile_args = ( + ASTSource( + self.fn, + compile_meta["signature"], + compile_meta["constants"], + #compile_meta["configs"][0], + ), + ) + + cc_str = str(compile_meta["cc"]) + if "gfx10" in cc_str or "gfx11" in cc_str: + rocm_warp_size = 32 + else: + rocm_warp_size = 64 + + if GPUTarget: + target = GPUTarget( + compile_meta["device_type"], + compile_meta["cc"], + rocm_warp_size if torch.version.hip else 32, + ) + else: + target = ( + (compile_meta["device_type"], compile_meta["cc"]) + if not torch.version.hip + else [ + compile_meta["device_type"], + compile_meta["cc"], + rocm_warp_size, + ] + ) + + options = { + "num_warps": compile_meta["num_warps"], + "num_stages": compile_meta["num_stages"], + "debug": compile_meta["debug"], + } + if self.device_props.type == "hip": + if "waves_per_eu" in compile_meta: + options["waves_per_eu"] = compile_meta["waves_per_eu"] + if "matrix_instr_nonkdim" in compile_meta: + options["matrix_instr_nonkdim"] = compile_meta[ + "matrix_instr_nonkdim" + ] + compile_kwargs = { + "target": target, + "options": options, + } + else: + compile_args = (self.fn,) + compile_kwargs = compile_meta + if warm_cache_only: + return ( + triton.compile(*compile_args, **compile_kwargs), + None, + ) + + # importing from torch is safe now that precompile has returned + from torch._dynamo.device_interface import DeviceGuard + + device_interface = self.get_device_interface() + + # load binary to the correct device + with DeviceGuard(device_interface, compile_meta["device"]): # type: ignore[attr-defined] + # need to initialize context + device_interface.synchronize(device_interface.current_device()) + + try: + + binary = triton.compile(*compile_args, **compile_kwargs) + binary._init_handles() + + except Exception: + log.exception( + "Triton compilation failed: %s\n%s\nmetadata: %s", + self.inductor_meta.get("kernel_name", "triton_"), + self.fn.src, + compile_meta, + ) + return None, None + #raise + + # except Exception as e: + # self.exceptions.append(e) + # # compile failed don't need raise error for npu + # return None, None + + + + + call_args = [ + arg + for i, arg in enumerate(self.fn.arg_names) + if i not in self.fn.constexprs + ] + def_args = [name for name in self.fn.arg_names if name not in cfg.kwargs] + + binary_shared = ( + binary.shared if hasattr(binary, "shared") else binary.metadata.shared + ) + + scope = { + "grid_meta": cfg.kwargs, + "bin": binary, + "launch_enter_hook": CompiledKernel.launch_enter_hook, + "launch_exit_hook": CompiledKernel.launch_exit_hook, + "metadata": binary.packed_metadata + if hasattr(binary, "packed_metadata") + else binary.metadata, + "shared": binary_shared, + } + + scope["num_warps"] = ( + binary.num_warps + if hasattr(binary, "num_warps") + else 
binary.metadata.num_warps + ) + + scope["cta_args"] = ( + (binary.num_ctas, *get_first_attr(binary, "cluster_dims", "clusterDims")) + if hasattr(binary, "num_ctas") + else ( + (binary.metadata.num_ctas, *binary.metadata.cluster_dims) + if hasattr(binary, "metadata") + else () + ) + ) + + scope["function"] = get_first_attr(binary, "function", "cu_function") + def get_launch_args_without_kernel_launch_metadata( + grid, + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin, + launch_enter_hook, + launch_exit_hook, + num_warps, + shared, + cta_args, + args, + ): + """ + Construct launch args before CompiledKernel.launch_metadata is added. + """ + return ( + grid_0, + grid_1, + grid_2, + num_warps, + *cta_args, + shared, + stream, + function, + launch_enter_hook, + launch_exit_hook, + metadata, + ) + + # Getting the kernel launch args is extremely perf-sensitive. Evaluating + # `bin.launch_metadata` is relatively expensive, and returns None unless a + # `launch_enter_hook` is installed. So if we don't have that hook installed, + # we want to burn None in to the launch args with zero overhead. + # See https://github.com/pytorch/pytorch/issues/123597 + if binary.launch_enter_hook: + + def get_launch_args_with_kernel_launch_metadata( + grid, + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin, + launch_enter_hook, + launch_exit_hook, + num_warps, + shared, + cta_args, + args, + ): + """ + Construct launch args after CompiledKernel.launch_metadata is added + by https://github.com/openai/triton/pull/3492 . + """ + return ( + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin.launch_metadata(grid, stream, *args), + launch_enter_hook, + launch_exit_hook, + ) + + else: + + def get_launch_args_with_kernel_launch_metadata( + grid, + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + bin, + launch_enter_hook, + launch_exit_hook, + num_warps, + shared, + cta_args, + args, + ): + """ + Construct launch args after CompiledKernel.launch_metadata is added + by https://github.com/openai/triton/pull/3492 . 
+ """ + return ( + grid_0, + grid_1, + grid_2, + stream, + function, + metadata, + None, + launch_enter_hook, + launch_exit_hook, + ) + + scope["get_launch_args"] = ( + get_launch_args_with_kernel_launch_metadata + if hasattr(binary, "launch_metadata") + else get_launch_args_without_kernel_launch_metadata + ) + + scope["runner"] = get_first_attr(binary, "run", "c_wrapper") + + exec( + f""" + def launcher({', '.join(def_args)}, grid, stream): + if callable(grid): + grid_0, grid_1, grid_2 = grid(grid_meta) + else: + grid_0, grid_1, grid_2 = grid + + args = {', '.join(call_args)}, + launch_args = get_launch_args( + grid, grid_0, grid_1, grid_2, stream, function, + metadata, bin, launch_enter_hook, launch_exit_hook, + num_warps, shared, cta_args, args + ) + runner(*launch_args, *args) + return bin + """.lstrip(), + scope, + ) + + launcher = scope["launcher"] + launcher.config = cfg + launcher.n_regs = getattr(binary, "n_regs", None) + launcher.n_spills = getattr(binary, "n_spills", None) + launcher.shared = binary_shared + # launcher.store_cubin = self.inductor_meta.get("store_cubin", False) + launcher.store_cubin = True + # store this global variable to avoid the high overhead of reading it when calling run + if launcher.store_cubin: + launcher.fn = self.fn + launcher.bin = binary + + return binary, launcher + + def save_gpu_kernel(self, grid, stream, launcher): + self.save_npu_kernel(grid, stream, launcher) + + def save_npu_kernel(self, grid, stream, launcher): + if callable(grid): + grid_x, grid_y, grid_z = grid(launcher.config.kwargs) + else: + grid_x, grid_y, grid_z = grid + + key = self.inductor_meta.get("kernel_name", None) # unique kernel name + assert key is not None, "kernel_name can not be None" + params = { + "mangled_name": ( + launcher.bin.metadata.name + if hasattr(launcher.bin.metadata, "name") + else launcher.bin.metadata["name"] + ), + "grid_x": grid_x, + "grid_y": grid_y, + "grid_z": grid_z, + # "x_block": launcher.config.kwargs.get("XBLOCK", 1), + # "y_block": launcher.config.kwargs.get("YBLOCK", None), + # "z_block": launcher.config.kwargs.get("ZBLOCK", None), + # "r_block": launcher.config.kwargs.get("RBLOCK", None), + "num_warps": ( + launcher.bin.num_warps + if hasattr(launcher.bin, "num_warps") + else launcher.bin.metadata.num_warps + ), + "shared_mem": ( + launcher.bin.shared + if hasattr(launcher.bin, "shared") + else launcher.bin.metadata.shared + ), + "stream": stream, + # User defined triton kernels will have arbitrary kwarg names + "meta": launcher.config.kwargs, + } + from torch._inductor.codecache import CudaKernelParamCache + + bin_type = "npubin" + binary = launcher.bin.asm[bin_type] # npubin type = npubin + CudaKernelParamCache.set(key, params, binary, bin_type='cubin') # CudaKernelParam + + self.cuda_kernel_saved = True + + def bench(self, launcher, *args, grid, with_profiler=False, **kwargs): + """Measure the performance of a given launcher""" + + if not self.custom_kernel and launcher.n_spills > self.inductor_meta.get( + "spill_threshold", 16 + ): + log.debug( + "Skip config %s because of register spilling: %d", + launcher.config, + launcher.n_spills, + ) + return float("inf") + + device_interface = self.get_device_interface() + stream = device_interface.get_raw_stream(device_interface.current_device()) + + def kernel_call(): + cloned_args, cloned_kwargs = self.clone_args(*args, **kwargs) + launcher( + *cloned_args, + **cloned_kwargs, + grid=grid, + stream=stream, + ) + + if with_profiler: + from torch._inductor.utils import do_bench_using_profiling + + 
return do_bench_using_profiling(kernel_call, warmup=10, rep=40) + # remove fast_flush=True for high version triton + return benchmarker.benchmark_gpu(kernel_call, rep=40) + + + +class NPUDebugAutotuner(NPUCachingAutotuner): + def __init__(self, *args, regex_filter="", **kwargs): + self.regex_filter = regex_filter + super().__init__(*args, **kwargs) + self.cached = None + + def run(self, *args, grid, stream): + possible_names = _find_names(self) + kernel_name = f"{max(possible_names, key=len)}" + if not re.match(self.regex_filter, kernel_name): + return + super().run(*args, grid=grid, stream=stream) + (launcher,) = self.launchers + + if self.cached is None: + ms = self.bench(launcher, *args, grid=grid) + num_in_out_ptrs = len( + [ + arg_name + for arg_name in self.fn.arg_names + if arg_name.startswith("in_out_ptr") + ] + ) + num_gb = get_num_bytes(*args, num_in_out_args=num_in_out_ptrs) / 1e9 + gb_per_s = num_gb / (ms / 1e3) + self.cached = (ms, num_gb, gb_per_s, kernel_name) + else: + ms, num_gb, gb_per_s, kernel_name = self.cached + collected_calls.append((ms, num_gb, gb_per_s, kernel_name)) + print( + create_bandwidth_info_str(ms, num_gb, gb_per_s, suffix=f" \t {kernel_name}") + ) + +#torch-260 +def cached_autotune( + size_hints: Optional[List[int]], + configs: List[Config], + triton_meta, + heuristic_type, + filename=None, + inductor_meta=None, + custom_kernel=False, +): + """ + A copy of triton.autotune that calls our subclass. Our subclass + has additional debugging, error handling, and on-disk caching. + """ + configs = unique_configs(configs) + assert len(configs) == 1 or filename + inductor_meta = {} if inductor_meta is None else inductor_meta + + disabled = inductor_meta.get("force_disable_caches", False) + + # on disk caching logic and/or remote caching + autotune_cache = None + if ( + not disabled + and filename is not None + and (len(configs) > 1 or inductor_meta.get("coordinate_descent_tuning")) + and not os.environ.get("TRITON_INTERPRET", "0") == "1" + ): + configs_hash = hash_configs(configs) + + autotune_cache = AutotuneCache.create(inductor_meta, filename, configs_hash) + if autotune_cache: + if best_config := autotune_cache.read_best(inductor_meta, configs): + configs = [best_config] + + else: + if disabled: + log.debug("autotune caching is disabled by config.force_disable_caches") + + mutated_arg_names = inductor_meta.pop("mutated_arg_names", ()) + optimize_mem = inductor_meta.pop("optimize_mem", True) + + if "restore_value" in triton_meta: + mutated_arg_names += triton_meta.pop("restore_value") + + reset_to_zero_arg_names: List[str] = [] + if "reset_to_zero" in triton_meta: + reset_to_zero_arg_names.extend(triton_meta.pop("reset_to_zero")) + + def decorator(fn): + # Remove XBLOCK from config if it's not a function argument. + # This way, coordinate descent tuning will not try to tune it. + # + # Context: When TritonKernel.no_x_dim is True, we hardcode XBLOCK to 1. 
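+        # For the NPU configs below this only strips an XBLOCK that the kernel
+        # signature no longer takes; NBLOCKS/XBLOCK_SUB entries are left untouched.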
+ import inspect + + if "XBLOCK" not in inspect.signature(fn.fn).parameters: + for tconfig in configs: + if "XBLOCK" in tconfig.kwargs: + assert tconfig.kwargs["XBLOCK"] == 1 + tconfig.kwargs.pop("XBLOCK") + + if inductor_meta.get("profile_bandwidth"): + return NPUDebugAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + regex_filter=inductor_meta["profile_bandwidth_regex"], + with_profiler=inductor_meta[ + "profile_bandwidth_with_do_bench_using_profiling" + ], + configs=configs, + save_cache_hook=autotune_cache and autotune_cache.save, + mutated_arg_names=mutated_arg_names, + reset_to_zero_arg_names=reset_to_zero_arg_names, + optimize_mem=optimize_mem, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + filename=filename, + with_bandwidth_info=True, + ) + return NPUCachingAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + configs=configs, + save_cache_hook=autotune_cache and autotune_cache.save, + mutated_arg_names=mutated_arg_names, + reset_to_zero_arg_names=reset_to_zero_arg_names, + optimize_mem=optimize_mem, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + filename=filename, + ) + + return decorator + + +###################################################### +## Main entry points for triton kernel invocation ## +## adapts original heuristics for NPU arch, and ## +## redirect to NPUCaching autotuner ## +###################################################### + +def grid(*numels): + def grid_fn(meta): + split_axis_order = meta["split_axis_order"] + + if split_axis_order is not None and split_axis_order < len(numels) : + numel = numels[split_axis_order] if split_axis_order is not None else 1 + xblock = meta["XBLOCK"] + NBLOCKS, _ = SplitTiling.get_nblocks_before_launch(numel, xblock) + else: + NBLOCKS = 1 + + log.debug("launch grid(%s), NBLOCKS:%d, meta:%s", numels, NBLOCKS, meta) + return ( + NBLOCKS, + 1, + 1, + ) + + return grid_fn + +# split:sizeof split, xblock:axis1 length, rblock:axis2 length +def triton_config_npu_index( + size_hints, + inductor_meta, + triton_meta=None, + reduction = False, + persistent_reduction = False, + +) -> List[Config]: + num_warps = 1 + num_stages = 1 + configs = [] + log.info("[InductorNPU] processing kernel %s", inductor_meta['kernel_name']) + split_axis_order = inductor_meta["split_axis_order"] + axis1_order = inductor_meta["axis1_order"] + axis2_order = inductor_meta["axis2_order"] + low_dims = inductor_meta["low_dims"] + split_axis_dtype = inductor_meta["split_axis_dtype"] + split_numel = size_hints[split_axis_order] if split_axis_order is not None else 1 + is_low_dim = True if split_axis_order is not None and split_axis_order in low_dims else False + + min_aligned_numel = get_aligned_numel(split_axis_dtype) + + grid_list = [] + if (aggresive_autotune): + grid_list = SplitTiling.get_nblocks_xblock_list(split_numel) + else: + nblocks, split = SplitTiling.decide_nblocks_xblock(split_numel, axis2_order is None, min_aligned_numel) + grid_list.append((nblocks, split)) + + for nblocks, split in grid_list: + log.debug("generating tiling : size_hints:%s split_axis_order:%s, axis1_order:%s, axis2_order:%s, " + "low_dims:%s nblocks %s, split:%s persistent_reduction:%s split_axis_dtype:%s", size_hints, + split_axis_order, axis1_order, axis2_order, low_dims, nblocks, split, + persistent_reduction, split_axis_dtype) + # xblock is a range, don't auto_tune + xnumel = split if split_axis_order == axis1_order else size_hints[axis1_order] 
+ rblock = 1 + if axis2_order is not None : + rblock = split if split_axis_order == axis2_order else size_hints[axis2_order] + + xblock_sub = xnumel + cfg = {"NBLOCKS": nblocks, "XBLOCK": split, "XBLOCK_SUB": xblock_sub} + # forward to grid() + cfg["split_axis_order"] = split_axis_order + cfg["axis2_order"] = axis2_order if not(axis2_order is None) else -1 + cfg["is_low_dim"] = is_low_dim + cfg["min_aligned_numel"] = min_aligned_numel + is_1d_reduction = reduction and axis2_order is None + if persistent_reduction : + numof_reduction_axis = inductor_meta["numof_reduction_axis"] + if numof_reduction_axis > 1 : + del cfg["XBLOCK_SUB"] + configs.append(Config(cfg, num_warps=1, num_stages=1)) + elif axis2_order is None : + del cfg["XBLOCK"] + del cfg["XBLOCK_SUB"] + cfg["NBLOCKS"] = 1 + configs.append(Config(cfg, num_warps=1, num_stages=1)) + else : + TileGenerator.descend_xblock(rnumel = rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + elif is_1d_reduction: + cfg["NBLOCKS"] = 1 + cfg["XBLOCK"] = split_numel + cfg["XBLOCK_SUB"] = split_numel + TileGenerator.descend_xblock(rnumel = rblock, xblock=split_numel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + # both of the two axis are low dims + elif axis1_order in low_dims and axis2_order in low_dims : + cfg["RBLOCK"] = rblock + TileGenerator.descend_xblock_rblock(rnumel = rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + elif axis2_order is None and axis1_order is not None: + TileGenerator.descend_xblock(rnumel=0, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + # need to maximize xblock_sub + elif axis1_order in low_dims: + cfg["RBLOCK"] = rblock + TileGenerator.descend_rblock(rnumel = rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + elif axis2_order in low_dims: + cfg["RBLOCK"] = rblock + TileGenerator.descend_xblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + elif len(low_dims) == 0: + cfg["RBLOCK"] = rblock + if (axis1_order is not None) and (axis2_order is not None): + TileGenerator.descend_xblock_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel, aggresive = False ) + elif axis1_order is not None: + TileGenerator.descend_xblock(rnumel=0, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel,aggresive = False ) + else: + TileGenerator.descend_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel,aggresive = False ) + else: + cfg["RBLOCK"] = rblock + tmp = Config(cfg, num_warps=num_warps, num_stages=num_stages) + configs.append(tmp) + + for cfg in configs : + log.debug("generated tiling configs %s", cfg.kwargs) + + return configs + +def pointwise_npu_index( + size_hints, + triton_meta, + tile_hint=None, + filename=None, + min_elem_per_thread=0, + inductor_meta=None, +): + + inductor_meta = {} if inductor_meta is None else inductor_meta + triton_config_with_settings = functools.partial( + triton_config_npu_index + ) + return cached_autotune( + size_hints, + triton_config_with_settings(size_hints, inductor_meta = inductor_meta), + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + +def reduction_npu_index( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + + """args to @triton.heuristics()""" + inductor_meta = {} if inductor_meta is None else 
inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + assert triton_meta is not None + contiguous_config = triton_config_npu_index(size_hints, inductor_meta = inductor_meta, reduction = True) + return cached_autotune( + size_hints, + [ + *contiguous_config, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.REDUCTION, + ) + +def persistent_reduction_npu_index( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + configs = triton_config_npu_index(size_hints, inductor_meta = inductor_meta, reduction=True, + persistent_reduction = True ) + + + return cached_autotune( + size_hints, + configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.PERSISTENT_REDUCTION, + ) + +# fixme , need to add npu_indexing tiling +def foreach(triton_meta, num_warps, filename=None, inductor_meta=None): + """ + Compile a triton foreach kernel + """ + return cached_autotune( + None, + [triton.Config({}, num_stages=1, num_warps=num_warps)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.TEMPLATE, + filename=filename, + ) + +from torch._dynamo.utils import dynamo_timed +@dynamo_timed +def benchmark_all_configs(self, *args, grid, **kwargs): + print(f"candidate launcher count = {len(self.launchers)}") + + tilling_kernel_list = [] + + def kernel_call(launcher): + def call_kernel(): + if launcher.config.pre_hook is not None: + launcher.config.pre_hook( + {**dict(zip(self.arg_names, args)), **launcher.config.kwargs} + ) + cloned_args, cloned_kwargs = self.clone_args(*args, **kwargs) + launcher( + *cloned_args, + **cloned_kwargs, + grid=grid, + stream=stream, + ) + return call_kernel + + for launcher in self.launchers: + if not self.custom_kernel and launcher.n_spills > config.triton.spill_threshold: + log.debug( + "Skip config %s because of register spilling: %d", + launcher.config, + launcher.n_spills, + ) + return float("inf") + + stream = self.gpu_device.get_raw_stream( # type: ignore[call-arg] + self.gpu_device.current_device() + ) + tilling_kernel_list.append(kernel_call(launcher)) + + def do_batch_benchmark(tilling_kernel_list): + + def delete_file(base_path): + import shutil + import os + if os.path.exists(base_path): + shutil.rmtree(base_path) + + import torch + import torch_npu + import hashlib + from datetime import datetime + + stream = torch.npu.current_stream() + experimental_config = torch_npu.profiler._ExperimentalConfig( + aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, + profiler_level=torch_npu.profiler.ProfilerLevel.Level1, + l2_cache=False, + data_simplification=False + ) + + md5_hash = hashlib.md5() + md5_hash = hashlib.md5(datetime.now().strftime('%Y-%m-%d').encode('utf-8')).hexdigest() + + torch_path="./profile_result/"+md5_hash + rep=1 + with torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.NPU + ], + schedule=torch_npu.profiler.schedule(wait=0, warmup=1, active=rep, repeat=1, skip_first=1), + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(torch_path), + record_shapes=False, + profile_memory=False, + with_stack=False, + with_flops=False, + with_modules=False, + experimental_config=experimental_config) as prof: + stream.synchronize() + for i in range(rep+3): + for fn in tilling_kernel_list: + fn() + 
prof.step() + stream.synchronize() + + import pandas as pd + for root, dirs, files in os.walk(torch_path): + for file in files: + if file != 'kernel_details.csv': + continue + target_file = os.path.join(root, file) + df = pd.read_csv(target_file) + triton_rows = df[df['Name'].str.startswith('triton', na=False)] + ret = triton_rows['Duration(us)'].astype(float).tolist() + delete_file(torch_path) + return ret + + delete_file(torch_path) + return [] + + try: + timinglist = do_batch_benchmark(tilling_kernel_list) + assert len(timinglist) == len(self.launchers) + timings = {launcher: timing for launcher, timing in zip(self.launchers, timinglist)} + except Exception as e: + print("some cases in batch benchmark has error! Logging Exception as:") + print(e) + print("switched to single bench...") + timings = { + launcher: self.bench(launcher, *args, **kwargs) + for launcher in self.launchers + } + + for k, v in timings.items(): + self.coordesc_tuner.cache_benchmark_result(k.config, v) + + if log.isEnabledFor(logging.DEBUG): + log.debug("Benchmark all input configs for %s, get:", self.fn.__name__) + for k, v in timings.items(): + log.debug( + "%s: %f, nreg %d, nspill %d, #shared-mem %s", + k.config, + v, + k.n_regs, + k.n_spills, + k.shared, + ) + print(f"final valid tillings count = {len(timings)}") + return timings \ No newline at end of file diff --git a/torch_npu/_inductor/runtime.py b/torch_npu/_inductor/runtime.py new file mode 100644 index 0000000000..ff0ff13242 --- /dev/null +++ b/torch_npu/_inductor/runtime.py @@ -0,0 +1,56 @@ +from torch._inductor.runtime.hints import DeviceProperties +from typing import Optional +import functools +from .config import num_vector_core + +class NPUDeviceProperties(DeviceProperties): + + # @classmethod + # @functools.lru_cache(None) + # def create(cls, device) -> DeviceProperties: + # import torch + # from torch._dynamo.device_interface import get_interface_for_device + + # device_type = device.type if torch.version.hip is None else "hip" + # device_interface = get_interface_for_device(device) + + # return cls( + # type=device_type, + # index=device.index, + # cc=device_interface.get_compute_capability(device), + # ) + + + @classmethod + @functools.lru_cache(None) + def create(cls, device) -> DeviceProperties: + import torch + from torch._dynamo.device_interface import get_interface_for_device + + device_type = device.type + + if torch.version.hip and device_type == "cuda": + device_type = "hip" + + device_interface = get_interface_for_device(device) + props = device_interface.get_device_properties(device) + + try: + multi_processor_count = num_vector_core #props.multi_processor_count + except AttributeError: + if device_type == "xpu": + multi_processor_count = props.gpu_subslice_count + else: + raise + return cls( + type=device_type, + index=device.index, + multi_processor_count=multi_processor_count, + cc=device_interface.get_compute_capability(device), + major=getattr(props, "major", None), + regs_per_multiprocessor=getattr(props, "regs_per_multiprocessor", None), + max_threads_per_multi_processor=getattr( + props, "max_threads_per_multi_processor", None + ), + warp_size=getattr(props, "warp_size", 32 if device_type != "cpu" else None), + ) diff --git a/torch_npu/_inductor/utils.py b/torch_npu/_inductor/utils.py new file mode 100644 index 0000000000..cec57c9267 --- /dev/null +++ b/torch_npu/_inductor/utils.py @@ -0,0 +1,6 @@ +import torch +import torch_npu + +# Not good implementation, but no other way +def get_current_raw_stream(device): + return 
torch.npu.current_stream(device).npu_stream \ No newline at end of file -- Gitee From f028e830f0cb794ddbfbbe8e2b50ad9c902c472c Mon Sep 17 00:00:00 2001 From: wl1259 Date: Fri, 18 Apr 2025 15:26:01 +0800 Subject: [PATCH 327/358] =?UTF-8?q?=E6=8F=90=E4=BA=A4=EF=BC=8C=E7=AC=AC?= =?UTF-8?q?=E4=B8=80=E6=AC=A1clean=20code?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/_inductor/__init__.py | 12 ++++++++---- torch_npu/_inductor/decomposition.py | 2 ++ torch_npu/_inductor/dynamo_patch3.py | 7 +++---- torch_npu/_inductor/npu_choices.py | 4 +--- .../_inductor/npu_fusion_attention_graph.py | 13 ++++++++++--- torch_npu/_inductor/npu_triton_helpers.py | 4 +++- torch_npu/_inductor/npu_triton_heuristics.py | 18 ++++++++---------- torch_npu/_inductor/runtime.py | 19 +++---------------- torch_npu/_inductor/utils.py | 1 + 9 files changed, 39 insertions(+), 41 deletions(-) diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index 4d18d683d7..71f637cdfd 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -1,22 +1,26 @@ import torch from torch._inductor.codegen.common import register_backend_for_device, register_device_op_overrides -from . import config as npu_config -from torch_npu.utils._inductor import NPUDeviceOpOverrides -from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device -from torch_npu.npu.utils import device_count from torch._dynamo.device_interface import register_interface_for_device, get_interface_for_device from torch._inductor import lowering as inductor_lowering from torch._inductor.choices import InductorChoices + +from torch_npu.utils._inductor import NPUDeviceOpOverrides +from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device +from torch_npu.npu.utils import device_count + from .lowering import _register_npu_inductor_fallbacks, make_reduction from .decomposition import _register_npu_inductor_decompositons from .utils import get_current_raw_stream from .config import log as npulog from .config import aggresive_autotune, num_vector_core from .npu_choices import should_use_persistent_reduction +from . 
import config as npu_config + npulog.info("perform torch_npu._inductor patch") + def _inductor_register_backend_for_device(): from .codegen.schduling import NPUTritonScheduling from .codegen.wrapper import NPUWrapperCodeGen diff --git a/torch_npu/_inductor/decomposition.py b/torch_npu/_inductor/decomposition.py index 397bd5ef27..b13da90092 100644 --- a/torch_npu/_inductor/decomposition.py +++ b/torch_npu/_inductor/decomposition.py @@ -3,6 +3,7 @@ from torch._inductor.decomposition import register_decomposition import torch._ops from .lowering import _init_set + aten = torch.ops.aten DECOMPOSITION_OVERLOAD_OP = [ @@ -15,6 +16,7 @@ DECOMPOSITION_OVERLOAD_OP = [ aten.embedding_dense_backward ] + def _register_npu_inductor_decompositons(): overload_op_set = set() diff --git a/torch_npu/_inductor/dynamo_patch3.py b/torch_npu/_inductor/dynamo_patch3.py index 15a52f61e5..6584c99e7a 100644 --- a/torch_npu/_inductor/dynamo_patch3.py +++ b/torch_npu/_inductor/dynamo_patch3.py @@ -1,11 +1,10 @@ -# Issue: Error -# x.permute(0, 2, 1, 3).contiguous().view(xxx) -# Message: import torch from torch.library import Library, impl python_dispatcher_lib = Library("aten", "IMPL", "PythonDispatcher") + + @impl(python_dispatcher_lib, "embedding_backward") def embedding_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq, sparse): if sparse: raise RuntimeError("the current NPU does not yet support sparse tensor, when sparse is set to True") - return torch.ops.aten.embedding_dense_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq) \ No newline at end of file + return torch.ops.aten.embedding_dense_backward(grad, indices, num_weights, padding_idx, scale_grad_by_freq) \ No newline at end of file diff --git a/torch_npu/_inductor/npu_choices.py b/torch_npu/_inductor/npu_choices.py index a3f7a2bf47..02a90b0ac3 100644 --- a/torch_npu/_inductor/npu_choices.py +++ b/torch_npu/_inductor/npu_choices.py @@ -4,9 +4,7 @@ from typing import Any, Dict, List, Type, TYPE_CHECKING import sympy from torch._inductor import config - -from torch._inductor.runtime.hints import ReductionHint - +from torch._inductor.runtime.hints import ReductionHint from torch._inductor.virtualized import V from torch._inductor.codegen.simd_kernel_features import SIMDKernelFeatures from torch._inductor.codegen.triton import TritonKernel diff --git a/torch_npu/_inductor/npu_fusion_attention_graph.py b/torch_npu/_inductor/npu_fusion_attention_graph.py index 443f20f966..c8949487af 100644 --- a/torch_npu/_inductor/npu_fusion_attention_graph.py +++ b/torch_npu/_inductor/npu_fusion_attention_graph.py @@ -1,12 +1,14 @@ # -*- coding: utf-8 -*- # Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. +import functools import sympy import torch -import torch_npu -import functools -import torch.nn.functional as F from torch.autograd import Function from torch.library import Library, impl +import torch.nn.functional as F +import torch_npu + + npu_def = Library("npu_graph", "DEF") npu_lib = Library("npu_graph", "IMPL", "PrivateUse1") @@ -15,6 +17,7 @@ meta_lib = Library("npu_graph", "IMPL", "Meta") npu_def.define("npu_fa(Tensor query, Tensor key, Tensor value, int head_num, str input_layout, Tensor? pse=None, Tensor? padding_mask=None, Tensor? atten_mask=None, float scale=1., float keep_prob=1., int pre_tockens=2147483647, int next_tockens=2147483647, int inner_precise=0, int[]? prefix=None, int[]? actual_seq_qlen=None, int[]? 
actual_seq_kvlen=None, int sparse_mode=0, bool gen_mask_parallel=True, bool sync=False) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)") npu_def.define("npu_fa_backward(Tensor query, Tensor key, Tensor value, Tensor dy, int head_num, str input_layout, *, Tensor? pse=None, Tensor? padding_mask=None, Tensor? atten_mask=None, Tensor? softmax_max=None, Tensor? softmax_sum=None, Tensor? softmax_in=None, Tensor? attention_in=None, float scale_value=1., float keep_prob=1., int pre_tockens=2147483647, int next_tockens=2147483647, int inner_precise=0, Tensor? seed=None, Tensor? offset=None, Tensor? numels=None, int[]? prefix=None, int[]? actual_seq_qlen=None, int[]? actual_seq_kvlen=None, int sparse_mode=0, bool gen_mask_parallel=True, bool sync=False) -> (Tensor, Tensor, Tensor, Tensor)") + @impl(npu_lib, "npu_fa") def npu_fa(*args, **kwargs): if len(args) > 8: @@ -71,6 +74,7 @@ def npu_fa(query, key, value, head_num, input_layout, pse=None, padding_mask=Non torch.tensor([0],device='meta',requires_grad=False), torch.tensor([0],device='meta',requires_grad=False)) + @impl(meta_lib, "npu_fa_backward") def npu_fa_backward(query, key, value, dy, head_num, input_layout, *, pse=None, padding_mask=None, atten_mask=None, softmax_max=None, softmax_sum=None, softmax_in=None, attention_in=None, scale_value=1.0, @@ -83,6 +87,7 @@ def npu_fa_backward(query, key, value, dy, head_num, input_layout, *, pse=None, dpse = torch.empty([0], dtype=query.dtype, device='meta').contiguous() return (torch.empty_like(dq), torch.empty_like(dk), torch.empty_like(dv), torch.empty_like(dpse) if pse else None) + class NpuGraphAttentionFunction(Function): @staticmethod def forward(ctx, query, key, value, head_num, input_layout, pse=None, padding_mask=None, atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False): @@ -120,6 +125,7 @@ class NpuGraphAttentionFunction(Function): ) return (grad_query, grad_key, grad_value, None, None, grad_pse, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None) + def npu_fusion_attention_graph(query, key, value, head_num, input_layout, pse=None, padding_mask=None, atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, gen_mask_parallel=True, sync=False): @@ -129,6 +135,7 @@ def npu_fusion_attention_graph(query, key, value, head_num, input_layout, pse=No inner_precise, prefix, actual_seq_qlen, actual_seq_kvlen, sparse_mode, gen_mask_parallel, sync) torch_npu.npu_fusion_attention_graph = npu_fusion_attention_graph + def register_fx_pass(): TOKEN_MAX = 2147483647 from torch._inductor.pattern_matcher import register_replacement, fwd_only, joint_fwd_bwd diff --git a/torch_npu/_inductor/npu_triton_helpers.py b/torch_npu/_inductor/npu_triton_helpers.py index f4e22275da..5f60f5dd49 100644 --- a/torch_npu/_inductor/npu_triton_helpers.py +++ b/torch_npu/_inductor/npu_triton_helpers.py @@ -2,14 +2,16 @@ import triton import triton.language as tl import triton.language.extra.ascend.libdevice as libdevice +from torch._inductor.runtime import triton_helpers libdevice = tl.extra.ascend.libdevice math = tl.math -from torch._inductor.runtime import triton_helpers + @triton.jit def maximum(a, b): return tl.maximum(a, b) + @triton.jit def minimum(a, b): 
return tl.minimum(a, b) diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index e4cf47c7ef..98566c0eea 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -2,19 +2,13 @@ import os import functools import copy -import pdb from typing import Any, Callable, List, Optional -from .config import log import logging -from .config import aggresive_autotune - -import torch import re +import hashlib +import torch from torch._inductor import config -import hashlib -from .codegen.tile_generator import TileGenerator -from .codegen.triton_utils import get_aligned_numel from torch._inductor.runtime.triton_heuristics import ( CachingAutotuner, @@ -37,7 +31,7 @@ from torch._inductor.runtime.runtime_utils import ( get_num_bytes, ) -from .codegen.split_tiling import SplitTiling + import triton from triton.compiler import CompiledKernel @@ -50,8 +44,12 @@ except ImportError: OutOfResources = None autograd_profiler = None +from .codegen.split_tiling import SplitTiling from .utils import get_current_raw_stream - +from .codegen.tile_generator import TileGenerator +from .codegen.triton_utils import get_aligned_numel +from .config import aggresive_autotune +from .config import log # torch-261 class NPUCachingAutotuner(CachingAutotuner): diff --git a/torch_npu/_inductor/runtime.py b/torch_npu/_inductor/runtime.py index ff0ff13242..dbcd70db66 100644 --- a/torch_npu/_inductor/runtime.py +++ b/torch_npu/_inductor/runtime.py @@ -1,24 +1,11 @@ -from torch._inductor.runtime.hints import DeviceProperties from typing import Optional import functools -from .config import num_vector_core - -class NPUDeviceProperties(DeviceProperties): - # @classmethod - # @functools.lru_cache(None) - # def create(cls, device) -> DeviceProperties: - # import torch - # from torch._dynamo.device_interface import get_interface_for_device +from torch._inductor.runtime.hints import DeviceProperties +from .config import num_vector_core - # device_type = device.type if torch.version.hip is None else "hip" - # device_interface = get_interface_for_device(device) - # return cls( - # type=device_type, - # index=device.index, - # cc=device_interface.get_compute_capability(device), - # ) +class NPUDeviceProperties(DeviceProperties): @classmethod diff --git a/torch_npu/_inductor/utils.py b/torch_npu/_inductor/utils.py index cec57c9267..697059f7a8 100644 --- a/torch_npu/_inductor/utils.py +++ b/torch_npu/_inductor/utils.py @@ -1,6 +1,7 @@ import torch import torch_npu + # Not good implementation, but no other way def get_current_raw_stream(device): return torch.npu.current_stream(device).npu_stream \ No newline at end of file -- Gitee From 36add631441279382fa27db2abfc76dfbee235e3 Mon Sep 17 00:00:00 2001 From: wl1259 Date: Fri, 18 Apr 2025 16:59:12 +0800 Subject: [PATCH 328/358] second clean code --- torch_npu/_inductor/__init__.py | 37 ++++----- torch_npu/_inductor/decomposition.py | 1 - torch_npu/_inductor/lowering.py | 82 +++++++------------ torch_npu/_inductor/npu_choices.py | 2 +- .../_inductor/npu_fusion_attention_graph.py | 24 +++--- torch_npu/_inductor/npu_triton_heuristics.py | 38 ++++----- torch_npu/_inductor/runtime.py | 2 +- 7 files changed, 76 insertions(+), 110 deletions(-) diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index 71f637cdfd..d15589aa09 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -16,7 +16,10 @@ from .config import log as npulog from .config 
import aggresive_autotune, num_vector_core from .npu_choices import should_use_persistent_reduction from . import config as npu_config - +#register fx_pass should be put behind of _register_npu_inductor_decompositons +from . import codegen +from . import npu_fusion_attention_graph +from . import dynamo_patch3 npulog.info("perform torch_npu._inductor patch") @@ -28,17 +31,20 @@ def _inductor_register_backend_for_device(): _inductor_register_backend_for_device() -## Override original inductor device overrides in torch_npu +## Override original inductor device overrides in torch_npu class NewNPUDeviceOpOverrides(NPUDeviceOpOverrides): def import_get_raw_stream_as(self, name): return f"from torch_npu._inductor import get_current_raw_stream as {name}" + + def _inductor_register_device_op_overrides(): register_device_op_overrides('npu', NewNPUDeviceOpOverrides()) _inductor_register_device_op_overrides() + ## Override original dynamo device interface in torch_npu class NewNpuInterface(NpuInterface): @@ -47,46 +53,37 @@ class NewNpuInterface(NpuInterface): return device_count() > 0 @staticmethod - def get_compute_capability(device=None): + def get_compute_capability(mydevice=None): # npu has no concept of cc. triton-npu compiler depends on subarch instead - return torch.npu.get_device_name(device) + return torch.npu.get_device_name(mydevice) @staticmethod - def exchange_device(device: int) -> int: + def exchange_device(device_id: int) -> int: curr_device = current_device() - set_device(device) + set_device(device_id) return curr_device @staticmethod - def maybe_exchange_device(device: int) -> int: - return device - - # @staticmethod - # def get_device_properties(device=None): - # props = NpuInterface.get_device_properties(device) - # setattr(props, "multi_processor_count", num_vector_core ) - # return props + def maybe_exchange_device(device_id: int) -> int: + return device_id + register_interface_for_device("npu", NewNpuInterface) register_interface_for_device("npu:0", NewNpuInterface) device = get_interface_for_device("npu") -from . import codegen + inductor_lowering.make_reduction = make_reduction _register_npu_inductor_fallbacks() _register_npu_inductor_decompositons() -#register fx_pass should be put behind of _register_npu_inductor_decompositons -#from .npu_indexing import fx_pass -from . import npu_fusion_attention_graph -from . 
import dynamo_patch3 - def _replace_benchmark_all_configs(): from torch._inductor.triton_heuristics import CachingAutotuner from .npu_triton_heuristics import benchmark_all_configs CachingAutotuner.benchmark_all_configs = benchmark_all_configs + if (aggresive_autotune): _replace_benchmark_all_configs() import os diff --git a/torch_npu/_inductor/decomposition.py b/torch_npu/_inductor/decomposition.py index b13da90092..af5ecbf311 100644 --- a/torch_npu/_inductor/decomposition.py +++ b/torch_npu/_inductor/decomposition.py @@ -29,7 +29,6 @@ def _register_npu_inductor_decompositons(): @register_decomposition([aten.scatter.src]) @pw_cast_for_opmath def scatter_src(self, input_tensor, dim, index_tensor, source_tensor): - assert self.device.type == "npu" and dim == 1 (XNUMEL, YS) = input_tensor.shape index_rblock = torch.arange(YS).npu().reshape((1, YS)).repeat((XNUMEL, 1)) diff --git a/torch_npu/_inductor/lowering.py b/torch_npu/_inductor/lowering.py index 79d37f8f6c..15ac45b9b1 100644 --- a/torch_npu/_inductor/lowering.py +++ b/torch_npu/_inductor/lowering.py @@ -13,6 +13,30 @@ from torch._prims_common import ( from torch._inductor.decomposition import decompositions, pw_cast_for_opmath import torch._ops +from torch._inductor.lowering import ( + lowerings, + make_fallback, + register_lowering, + to_dtype, + # make_reduction, + # reduce_amax, + # reduce_amin, + fallback_cumsum, + _validate_reduction_axis, + div, + squeeze, + square, + sub, + fallback_handler, + is_boolean_type, + logical_and, + make_pointwise, + _make_reduction_inner, + _validate_reduction_axis, +) +import torch_npu +from torch_npu import npu_dtype_cast + def make_reduction(reduction_type: str, override_return_dtype=None): def inner(x, axis=None, keepdims=False, *, dtype=None): @@ -26,7 +50,8 @@ def make_reduction(reduction_type: str, override_return_dtype=None): result = Reduction.create(reduction_type=reduction_type, input_node=x, **kwargs) if isinstance( result.data.data, Reduction - ): #Only realize if reduction isn't unrolled + ): + #Only realize if reduction isn't unrolled size = x.get_size() axis = set(_validate_reduction_axis(x, axis)) kept_idx = [] @@ -48,36 +73,11 @@ def make_reduction(reduction_type: str, override_return_dtype=None): lowering.make_reduction = make_reduction -from torch._inductor.lowering import ( - lowerings, - make_fallback, - register_lowering, - to_dtype, - # make_reduction, - # reduce_amax, - # reduce_amin, - fallback_cumsum, - _validate_reduction_axis, - div, - squeeze, - square, - sub, - fallback_handler, - is_boolean_type, - logical_and, - make_pointwise, - _make_reduction_inner, - _validate_reduction_axis, -) aten = torch.ops.aten tr_c10d = torch.ops.tr_c10d prims = torch.ops.prims -import torch_npu - -from torch_npu import npu_dtype_cast - def _init_set(input_list, output_set): for fn in input_list: @@ -174,7 +174,6 @@ LOWERING_OVERLOAD_OP = [ aten.var_mean, aten.var, - # todo: work round for electraModel aten.embedding, aten.split, aten.split_with_sizes, @@ -225,33 +224,18 @@ def _register_npu_inductor_fallbacks(): denom = ir.IndexingConstant(index=denom, dtype=x.get_dtype(), device=x.get_device()) denom = ExpandView.create(denom, list(sum_result.get_size())) return to_dtype(div(sum_result, denom), output_dtype) - - # @register_lowering(aten.mean) - # def mean(x, axis=None, keepdim=False, *, dtype=None): - # size = x.get_size() - # if dtype is not None: - # x = to_dtype(x, dtype) - # size = x.get_size() - # axis = _validate_reduction_axis(x, axis) - - # # compute in higher-precision until end 
of mean lowering - # output_dtype = x.get_dtype() - # if output_dtype in (torch.bfloat16,): - # x = to_dtype(x, torch.float) - # sum_result = sum_(x, axis, keepdim) - # denom = sympy_product(size[i] for i in axis) - # denom = ir.IndexingConstant(denom, x.get_dtype(), x.get_device()) - # denom = ExpandView.create(denom, list(sum_result.get_size())) - # return to_dtype(div(sum_result, denom), output_dtype) + @register_lowering(aten.cumsum) def cumsum(x, axis=None, dtype=None): if ( is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) ) and dtype is None: - dtype = torch.int32 # torch.int64->torch.int32 + # torch.int64->torch.int32 + dtype = torch.int32 if len(x.get_size()) == 0: - assert axis in [0, -1] + if axis not in [0, -1]: + raise ValueError("axis must be 0 or -1") dtype = dtype or x.get_dtype() return to_dtype(x, dtype, copy=True) return fallback_cumsum(x, dim=axis, dtype=dtype) @@ -299,9 +283,6 @@ def _register_npu_inductor_fallbacks(): ) output = ( var_mean_sum_(**kwargs) - #todo: The welford reduction branch is annotated - # if use_two_step_variance(x,axis=axis,keepdim=keepdim) - # else var_mean_welford_(**kwargs) ) output = tuple(to_dtype(x, out_dtype, copy=False) for x in output) return output[0] if not return_mean else output @@ -325,7 +306,6 @@ def _register_npu_inductor_fallbacks(): @register_lowering(aten.cat) def cat(inputs, dim=0): - # todo:work round for electraModel backward return fallback_handler(aten.cat.default)(inputs, dim) make_fallback(aten._log_softmax) diff --git a/torch_npu/_inductor/npu_choices.py b/torch_npu/_inductor/npu_choices.py index 02a90b0ac3..ff0c11b4ff 100644 --- a/torch_npu/_inductor/npu_choices.py +++ b/torch_npu/_inductor/npu_choices.py @@ -24,7 +24,7 @@ def should_use_persistent_reduction( return False threshold = { ReductionHint.INNER: 1024, - ReductionHint.DEFAULT : 1024 + ReductionHint.DEFAULT: 1024 }.get(features.get_reduction_hint(), 64) if cooperative_reduction: # The RSPLIT of cooperative reductions means each thread block is operating on fewer elements diff --git a/torch_npu/_inductor/npu_fusion_attention_graph.py b/torch_npu/_inductor/npu_fusion_attention_graph.py index c8949487af..9e796252c2 100644 --- a/torch_npu/_inductor/npu_fusion_attention_graph.py +++ b/torch_npu/_inductor/npu_fusion_attention_graph.py @@ -28,7 +28,6 @@ def npu_fa(*args, **kwargs): except IndexError: args[8] = 1.0 / (args[8] + 1e-6) print("args[8]: zero can not be divided") - # kwargs['scale'] = 1 / kwargs['scale'] r1, r2, r3, r4, seed, offset, numel = torch_npu.npu_fusion_attention(*args, **kwargs) r2.requires_grad = False r3.requires_grad = False @@ -70,9 +69,9 @@ def npu_fa(query, key, value, head_num, input_layout, pse=None, padding_mask=Non torch.empty_like(softmax_max), torch.empty_like(softmax_sum), torch.empty_like(softmax_out), - torch.tensor([0],device='meta',requires_grad=False), - torch.tensor([0],device='meta',requires_grad=False), - torch.tensor([0],device='meta',requires_grad=False)) + torch.tensor([0], device='meta',requires_grad=False), + torch.tensor([0], device='meta',requires_grad=False), + torch.tensor([0], device='meta',requires_grad=False)) @impl(meta_lib, "npu_fa_backward") @@ -129,7 +128,6 @@ class NpuGraphAttentionFunction(Function): def npu_fusion_attention_graph(query, key, value, head_num, input_layout, pse=None, padding_mask=None, atten_mask=None, scale=1.0, keep_prob=1.0, pre_tockens=2147483647, next_tockens=2147483647, inner_precise=0, prefix=None, actual_seq_qlen=None, actual_seq_kvlen=None, sparse_mode=0, 
gen_mask_parallel=True, sync=False): - #import pdb;pdb.set_trace() return NpuGraphAttentionFunction.apply(query, key, value, head_num, input_layout, pse, padding_mask, atten_mask, scale, keep_prob, pre_tockens, next_tockens, inner_precise, prefix, actual_seq_qlen, actual_seq_kvlen, sparse_mode, gen_mask_parallel, sync) @@ -142,6 +140,7 @@ def register_fx_pass(): from torch._inductor.fx_passes.joint_graph import patterns from torch._dynamo.utils import counters from torch._inductor.fx_passes.fuse_attention import partialize_and_update_signature + def _npu_fusion_attention_graph_pattern_1(query, key, value, inv_scale_factor, dropout_p): q = query.permute(0, 2, 1, 3) k = key.permute(0, 2, 1, 3) @@ -163,9 +162,9 @@ def register_fx_pass(): head_num, input_layout, None, - atten_mask = None, - scale = inv_scale_factor, - keep_prob = 1.0 - dropout_p, + atten_mask=None, + scale=inv_scale_factor, + keep_prob=1.0 - dropout_p, )[0] def _get_sfdp_patterns(): @@ -187,9 +186,9 @@ def register_fx_pass(): )) for pattern, replacement, args, workaround in candidates: - # XXX: when adding a new pattern, re-run `gen_attention_patterns` so the pattern # gets serialized to a python file and does not require tracing at runtime. - assert isinstance(workaround, dict) + if not isinstance(workaround, dict): + raise ValueError("workaround not dict") name = pattern.__name__ if dtype != torch.float: @@ -209,7 +208,8 @@ def register_fx_pass(): } if workaround: - assert len(workaround) == 1 and "dropout_p" in workaround + if not (len(workaround) == 1 and "dropout_p" in workaround): + raise ValueError("not (len(workaround) == 1 and dropout_p in workaround)") # functools.partial insufficient because we look at signature downstream pattern = partialize_and_update_signature(pattern, dropout_p=0.0) replacement = partialize_and_update_signature( @@ -227,7 +227,7 @@ def register_fx_pass(): "scalar_workaround": workaround, } - for key, register_replacement_kwargs in _get_sfdp_patterns(): + for _, register_replacement_kwargs in _get_sfdp_patterns(): register_replacement( **register_replacement_kwargs, ) diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index 98566c0eea..10c95a836d 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -6,6 +6,7 @@ from typing import Any, Callable, List, Optional import logging import re import hashlib +import json import torch from torch._inductor import config @@ -21,7 +22,6 @@ from torch._inductor.runtime.triton_heuristics import ( get_first_attr, collected_calls, ) -import json from torch._inductor.runtime.benchmarking import benchmarker from torch._inductor.runtime.autotune_cache import AutotuneCache @@ -51,6 +51,7 @@ from .codegen.triton_utils import get_aligned_numel from .config import aggresive_autotune from .config import log + # torch-261 class NPUCachingAutotuner(CachingAutotuner): def __init__( @@ -74,7 +75,6 @@ class NPUCachingAutotuner(CachingAutotuner): self.exceptions = [] def precompile(self, warm_cache_only=False): - # FIXME # xpu_graph changed TORCHINDUCTOR_CACHE_DIR. # When TORCHINDUCTOR_COMPILE_THREADS > 1, multiprocessing's fork method # does not propagate TORCHINDUCTOR_CACHE_DIR into the child threads. @@ -84,7 +84,6 @@ class NPUCachingAutotuner(CachingAutotuner): # compile all kernels one by one. # So we directly replace TORCHINDUCTOR_CACHE_DIR with the standard cache dir. 
if ("xpu_graph" in os.getenv("TORCHINDUCTOR_CACHE_DIR", "")): - import re import getpass import tempfile sanitized_username = re.sub(r'[\\/:*?"<>|]', "_", getpass.getuser()) @@ -110,7 +109,7 @@ class NPUCachingAutotuner(CachingAutotuner): log.debug(f"[thread {os.getpid()}][InductorNPU.precompile] Exception = {e}, kernel = {self.fn.__name__} config = {c}") # Skip the config if the compilation fails continue - if launcher is not None : + if launcher is not None: self.launchers.append(launcher) compiled_binaries.append(compiled_binary) @@ -128,7 +127,7 @@ class NPUCachingAutotuner(CachingAutotuner): compile_meta = copy.deepcopy(self.triton_meta) for k, v in cfg.kwargs.items(): - if k not in self.fn.arg_names : + if k not in self.fn.arg_names: continue compile_meta["constants"][k] = v @@ -150,7 +149,6 @@ class NPUCachingAutotuner(CachingAutotuner): self.fn, compile_meta["signature"], compile_meta["constants"], - #compile_meta["configs"][0], ), ) @@ -225,15 +223,6 @@ class NPUCachingAutotuner(CachingAutotuner): compile_meta, ) return None, None - #raise - - # except Exception as e: - # self.exceptions.append(e) - # # compile failed don't need raise error for npu - # return None, None - - - call_args = [ arg @@ -274,6 +263,7 @@ class NPUCachingAutotuner(CachingAutotuner): ) scope["function"] = get_first_attr(binary, "function", "cu_function") + def get_launch_args_without_kernel_launch_metadata( grid, grid_0, @@ -594,7 +584,8 @@ def cached_autotune( if "XBLOCK" not in inspect.signature(fn.fn).parameters: for tconfig in configs: if "XBLOCK" in tconfig.kwargs: - assert tconfig.kwargs["XBLOCK"] == 1 + if tconfig.kwargs["XBLOCK"] != 1: + raise ValueError("tconfig.kwargs[XBLOCK] != 1") tconfig.kwargs.pop("XBLOCK") if inductor_meta.get("profile_bandwidth"): @@ -645,7 +636,7 @@ def grid(*numels): def grid_fn(meta): split_axis_order = meta["split_axis_order"] - if split_axis_order is not None and split_axis_order < len(numels) : + if split_axis_order is not None and split_axis_order < len(numels): numel = numels[split_axis_order] if split_axis_order is not None else 1 xblock = meta["XBLOCK"] NBLOCKS, _ = SplitTiling.get_nblocks_before_launch(numel, xblock) @@ -661,6 +652,7 @@ def grid(*numels): return grid_fn + # split:sizeof split, xblock:axis1 length, rblock:axis2 length def triton_config_npu_index( size_hints, @@ -743,11 +735,11 @@ def triton_config_npu_index( elif len(low_dims) == 0: cfg["RBLOCK"] = rblock if (axis1_order is not None) and (axis2_order is not None): - TileGenerator.descend_xblock_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel, aggresive = False ) + TileGenerator.descend_xblock_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel, aggresive=False) elif axis1_order is not None: - TileGenerator.descend_xblock(rnumel=0, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel,aggresive = False ) + TileGenerator.descend_xblock(rnumel=0, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel, aggresive=False) else: - TileGenerator.descend_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel,aggresive = False ) + TileGenerator.descend_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel, aggresive=False) else: cfg["RBLOCK"] = rblock tmp = Config(cfg, num_warps=num_warps, num_stages=num_stages) @@ -880,13 +872,10 @@ def benchmark_all_configs(self, *args, grid, **kwargs): def delete_file(base_path): 
import shutil - import os if os.path.exists(base_path): shutil.rmtree(base_path) - import torch import torch_npu - import hashlib from datetime import datetime stream = torch.npu.current_stream() @@ -938,7 +927,8 @@ def benchmark_all_configs(self, *args, grid, **kwargs): try: timinglist = do_batch_benchmark(tilling_kernel_list) - assert len(timinglist) == len(self.launchers) + if not len(timinglist) == len(self.launchers): + raise RuntimeError("not len(timinglist) == len(self.launchers)") timings = {launcher: timing for launcher, timing in zip(self.launchers, timinglist)} except Exception as e: print("some cases in batch benchmark has error! Logging Exception as:") diff --git a/torch_npu/_inductor/runtime.py b/torch_npu/_inductor/runtime.py index dbcd70db66..ae00e99043 100644 --- a/torch_npu/_inductor/runtime.py +++ b/torch_npu/_inductor/runtime.py @@ -23,7 +23,7 @@ class NPUDeviceProperties(DeviceProperties): props = device_interface.get_device_properties(device) try: - multi_processor_count = num_vector_core #props.multi_processor_count + multi_processor_count = num_vector_core except AttributeError: if device_type == "xpu": multi_processor_count = props.gpu_subslice_count -- Gitee From c1af668c753a478f25ddb4d4f68db0661318569c Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 09:28:33 +0800 Subject: [PATCH 329/358] 3 clean code commit --- torch_npu/_inductor/codegen/__init__.py | 17 ++++--- torch_npu/_inductor/codegen/_sizevars.py | 5 +- torch_npu/_inductor/codegen/ir.py | 44 ++++++++++------- .../_inductor/codegen/npu_kernel_features.py | 38 +++++---------- torch_npu/_inductor/codegen/schduling.py | 35 +++++++------- torch_npu/_inductor/codegen/split_tiling.py | 48 ++++++++++--------- torch_npu/_inductor/codegen/tile_generator.py | 46 +++++++++--------- torch_npu/_inductor/codegen/triton_utils.py | 2 +- torch_npu/_inductor/codegen/wrapper.py | 17 +------ 9 files changed, 118 insertions(+), 134 deletions(-) diff --git a/torch_npu/_inductor/codegen/__init__.py b/torch_npu/_inductor/codegen/__init__.py index e6eed55b00..3027bc9d28 100644 --- a/torch_npu/_inductor/codegen/__init__.py +++ b/torch_npu/_inductor/codegen/__init__.py @@ -2,31 +2,34 @@ # -*- coding: utf-8 -*- # Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. 
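+# This package wires the NPU codegen into inductor by monkey-patching the stock classes:
+# Reduction.num_splits, LoopBody.__call__ / transform_dims_in_indexing,
+# TritonScheduling.group_fn / select_index_dtype / create_tiling,
+# SIMDKernel.is_compatible and SizeVarAllocator.simplify are all rebound further down
+# in this file to the NPU-aware implementations.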
-from ..config import log as npulog -npulog.info("perform npu_indexing patch") -from torch._inductor.ir import Reduction,LoopBody + +from torch._inductor.ir import Reduction, LoopBody from torch._inductor.codegen.triton import TritonScheduling from torch._inductor import sizevars from torch._inductor.codegen.triton import TritonKernel from torch._inductor.codegen.simd import SIMDKernel from torch_npu._inductor.codegen._sizevars import simplify -from torch_npu._inductor.codegen.ir import (num_splits,loopbody__call__,transform_dims_in_indexing, - substituted_dims_in_indexing) +from torch_npu._inductor.codegen.ir import (num_splits,loopbody__call__,transform_dims_in_indexing, substituted_dims_in_indexing) from torch_npu._inductor.codegen.triton import is_compatible from torch_npu._inductor.codegen.triton import group_fn, select_index_dtype from torch_npu._inductor.codegen.schduling import create_tiling + +from ..config import log as npulog +npulog.info("perform npu_indexing patch") #from ..npu_indexing.graph import run_node #graph #GraphLowering.run_node = run_node #common #ir + + Reduction.num_splits = num_splits setattr(LoopBody, 'transform_dims_in_indexing', transform_dims_in_indexing) setattr(LoopBody, 'substituted_dims_in_indexing', substituted_dims_in_indexing) -LoopBody.__call__=loopbody__call__ +LoopBody.__call__ = loopbody__call__ #need to enable this to speedup attn_cp_test #ComputedBuffer.simplify_and_reorder = simplify_and_reorder #triton scheduling @@ -34,7 +37,7 @@ TritonScheduling.group_fn = group_fn TritonScheduling.select_index_dtype = select_index_dtype TritonScheduling.create_tiling = create_tiling #triton kernel -setattr(SIMDKernel, 'is_compatible', is_compatible ) +setattr(SIMDKernel, 'is_compatible', is_compatible) #util sizevars.SizeVarAllocator.simplify = simplify \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/_sizevars.py b/torch_npu/_inductor/codegen/_sizevars.py index f3c4d4f550..37196fcad4 100644 --- a/torch_npu/_inductor/codegen/_sizevars.py +++ b/torch_npu/_inductor/codegen/_sizevars.py @@ -2,8 +2,9 @@ import sympy from sympy import Expr from torch._inductor.utils import sympy_subs + def simplify(self, expr: Expr): - if isinstance(expr, (tuple,list)) : - return [sympy.expand(s).xreplace(self.replacements) for s in expr] + if isinstance(expr, (tuple,list)): + return [sympy.expand(s).xreplace(self.replacements) for s in expr] return sympy.expand(expr).xreplace(self.replacements) diff --git a/torch_npu/_inductor/codegen/ir.py b/torch_npu/_inductor/codegen/ir.py index 7edccf17c0..6a9baf5b8f 100644 --- a/torch_npu/_inductor/codegen/ir.py +++ b/torch_npu/_inductor/codegen/ir.py @@ -1,12 +1,14 @@ -from ..config import log + from typing import List, Tuple, Dict, Any, Optional -from torch._inductor.virtualized import V -from torch._inductor.ir import ( ReductionHint, IRNode, ModularIndexing, FloorDiv) -from torch._inductor.utils import sympy_subs,sympy_index_symbol -from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel import sympy import itertools +from torch._inductor.virtualized import V +from torch._inductor.ir import (ReductionHint, IRNode, ModularIndexing, FloorDiv) +from torch._inductor.utils import sympy_subs, sympy_index_symbol +from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel + +from ..config import log # NPU doesn't need to support ReductionHint.OUTER, and persistent reduction def num_splits( @@ -22,17 +24,18 @@ def num_splits( ): return ReductionHint.DEFAULT, 1 + def detect_flattened_dims(kernel, index): 
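+    # Collect, per iteration variable, the ModularIndexing / FloorDiv factors appearing
+    # in `index`; the pairing pass further down fills in the missing half so every
+    # FloorDiv has a matching ModularIndexing (and vice versa).  The returned map records,
+    # for each flattened variable, the sub-axes detected (or synthesized) for it.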
new_vars = {} - if not isinstance(index, (sympy.core.add.Add, ModularIndexing, FloorDiv)) : + if not isinstance(index, (sympy.core.add.Add, ModularIndexing, FloorDiv)): return new_vars def detect_flattened_axis(expr): def init_new_vars(var, length): if var not in new_vars: - new_vars[var] = {length:[None,None]} - if length not in new_vars[var] : - new_vars[var][length] = [None,None] + new_vars[var] = {length: [None, None]} + if length not in new_vars[var]: + new_vars[var][length] = [None, None] if isinstance(expr, ModularIndexing): var, divisor, length = expr.args init_new_vars(var, length) @@ -55,12 +58,12 @@ def detect_flattened_dims(kernel, index): detect_flattened_axis(x) # add - if isinstance(index, sympy.core.add.Add) : + if isinstance(index, sympy.core.add.Add): for x in index.args: detect_flattened_axis(x) elif isinstance(index, (ModularIndexing, FloorDiv)): detect_flattened_axis(index) - else : + else: pass # make sure FloorDiv, MouldarIndexing must be in-pair @@ -70,13 +73,13 @@ def detect_flattened_dims(kernel, index): else: parent_axis = kernel.range_tree_nodes_removed[var] for divisor, pair in divisors.items(): - if not pair[0] and not pair[1] : + if not pair[0] and not pair[1]: pass #FloorDiv not inplace - elif not pair[0] : + elif not pair[0]: _, _, length = pair[1] expr = FloorDiv(var, length) - new_vars[var][divisor][0] = (expr, length, parent_axis.length //length ) + new_vars[var][divisor][0] = (expr, length, parent_axis.length // length) #ModularIndexing not inplace elif not pair[1] : expr = ModularIndexing(var, 1, divisor) @@ -138,10 +141,12 @@ def rebuild_flattened_dims(indexing) : new_index = sympy_subs(index, kernel.expr_substituted) indexing[key] = new_index -def substituted_dims_in_indexing(self, indexing, kernel, range_tree_nodes_substituted) : + +def substituted_dims_in_indexing(self, indexing, kernel, range_tree_nodes_substituted): substituted = False for var, candidates in range_tree_nodes_substituted.items(): - assert len(candidates) > 0, candidates + if not (len(candidates) > 0): + raise RuntimeError("assert len(candidates) > 0, candidates") exprs = sorted(candidates, reverse=True, key=lambda x: x[0]) # the best candidate is with the longest numel numel = exprs[0][0] @@ -159,7 +164,8 @@ def substituted_dims_in_indexing(self, indexing, kernel, range_tree_nodes_substi return substituted -def generate_body_indexing(body, indices) : + +def generate_body_indexing(body, indices): index = list(itertools.chain.from_iterable(indices)) assert len(index) == len(body.var_ranges), (index, body.var_ranges) assert all(v not in body.var_ranges for v in index) @@ -175,13 +181,15 @@ def generate_body_indexing(body, indices) : # for name, expr in body.indexing.items() # } + def transform_dims_in_indexing(self, indices) : - if self.indexing is None : + if self.indexing is None: generate_body_indexing(self, indices) if V.kernel is not None and isinstance(V.kernel, NPUIndexTritonKernel): rebuild_flattened_dims(self.indexing) + # select tiling axis, recover missing dimensions, def loopbody__call__(self, *indices): if self.indexing is None: diff --git a/torch_npu/_inductor/codegen/npu_kernel_features.py b/torch_npu/_inductor/codegen/npu_kernel_features.py index 01020f8966..0f86e3d81a 100644 --- a/torch_npu/_inductor/codegen/npu_kernel_features.py +++ b/torch_npu/_inductor/codegen/npu_kernel_features.py @@ -1,13 +1,16 @@ -from torch._inductor.codegen.simd_kernel_features import SIMDKernelFeatures,NodeScheduleEntry -from torch._inductor.utils import cache_on_self -from 
torch.utils._ordered_set import OrderedSet -from typing import Tuple, List import sympy import functools + +from typing import Tuple, List +from typing import Iterable + +import torch +from torch._inductor.codegen.simd_kernel_features import SIMDKernelFeatures, NodeScheduleEntry +from torch._inductor.utils import cache_on_self +from torch.utils._ordered_set import OrderedSet from torch._inductor.virtualized import V from torch._inductor.codegen.simd import SIMDScheduling -import torch -from typing import Iterable + class NumelList(Tuple): @@ -50,6 +53,7 @@ class NumelList(Tuple): numel = self.numels() numel2 = other.numels() if isinstance(other, NumelList) else other return numel / numel2 + def __floordiv__(self, other): numel = self.numels() numel2 = other.numels() if isinstance(other, NumelList) else other @@ -59,6 +63,7 @@ class NumelList(Tuple): numel = self.numels() numel2 = other.numels() if isinstance(other, NumelList) else other return numel * numel2 + def __rmul__(self, other): numel = self.numels() numel2 = other.numels() if isinstance(other, NumelList) else other @@ -68,6 +73,7 @@ class NumelList(Tuple): numel = self.numels() numel2 = other.numels() if isinstance(other, NumelList) else other return numel + numel2 + def __radd__(self, other): numel = self.numels() numel2 = other.numels() if isinstance(other, NumelList) else other @@ -87,23 +93,3 @@ class NPUKernelFeatures(SIMDKernelFeatures): super().__init__(node_schedule, numel, reduction_numel) self.numel = NumelList(self.numel) if isinstance(self.numel, Iterable) else self.numel self.reduction_numel = NumelList(self.reduction_numel) if isinstance(self.reduction_numel, Iterable) else self.reduction_numel - - - # @cache_on_self - # def select_index_dtype(self) -> torch.dtype: - # # Gather all used buffer names - # buffer_names: OrderedSet[str] = OrderedSet() - # for node in self.scheduler_nodes(): - # buffer_names.update(node.get_buffer_names()) - # buffer_names.update(node.used_buffer_names()) - # buffers = [V.graph.get_buffer(name) for name in buffer_names] - - # # In theory we can separately check xnumel and rnumel are <= int_max - # # but some indexers do use the full linear index so we need to be - # # conservative here. 
- # total_numel = self.numel * self.reduction_numel - - - # if SIMDScheduling.can_use_32bit_indexing(total_numel, buffers): - # return torch.int32 - # return torch.int64 diff --git a/torch_npu/_inductor/codegen/schduling.py b/torch_npu/_inductor/codegen/schduling.py index d888641530..a4d92f197f 100644 --- a/torch_npu/_inductor/codegen/schduling.py +++ b/torch_npu/_inductor/codegen/schduling.py @@ -1,23 +1,24 @@ -import pdb +import itertools +import contextlib +import sympy +from typing import Union, Iterable +from typing import Dict, Sequence, List, Iterable -from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel, flatten -from torch._inductor.codegen.triton import ( TritonScheduling, log, config) +from torch.fx.immutable_collections import immutable_dict +from torch._inductor.codegen.triton import (TritonScheduling, log, config) from torch._inductor.codegen.simd import DisableReduction, EnableReduction,SIMDKernelFeatures, SIMDKernel from torch._inductor.codegen.simd import schedule_log, scheduler from torch._inductor.codegen.multi_kernel import MultiKernel -from typing import Union, Iterable -from torch._inductor.virtualized import ( - V, -) +from torch._inductor.virtualized import (V,) from torch._inductor.codecache import code_hash from torch._dynamo.utils import counters -import itertools, contextlib from torch._inductor.utils import sympy_index_symbol,ModularIndexing,FloorDiv -import sympy + +from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel, flatten from .split_tiling import SplitTiling -from torch.fx.immutable_collections import immutable_dict -from typing import Dict, Sequence, List, Iterable from .npu_kernel_features import NumelList, NPUKernelFeatures + + def flatten_groups(nums): res = [] for i in nums: @@ -185,10 +186,10 @@ class NPUTritonScheduling(TritonScheduling): split_tiling.select_tiling_axis() # debug print index transforms for node in node_schedule: - if node in (EnableReduction, DisableReduction): - continue - for x,y in zip( node._body.indexing_exprs.values(), node._body.indexing.values()) : - print(f"index transform:{x}->{y}") + if node in (EnableReduction, DisableReduction): + continue + for x,y in zip( node._body.indexing_exprs.values(), node._body.indexing.values()) : + print(f"index transform:{x}->{y}") def additional_nodes_to_be_subs(self, kernel, node_to_be_substituted): for node in kernel.range_tree_nodes.values(): @@ -204,10 +205,6 @@ class NPUTritonScheduling(TritonScheduling): numel = numel * s sub_node = kernel.range_tree_nodes[k] new_var_expr = new_var_expr + sub_node.symbol() * sub_node.divisor - # if isinstance(sub_node.expr, FloorDiv): - # new_var_expr = new_var_expr + sub_node.symbol() * sub_node.divisor - # elif isinstance(sub_node.expr, ModularIndexing): - # new_var_expr = new_var_expr + sub_node.symbol() if numel == node.length: node_to_be_substituted[node.symbol()] = [(node.length, new_var_expr)] diff --git a/torch_npu/_inductor/codegen/split_tiling.py b/torch_npu/_inductor/codegen/split_tiling.py index c3a77f1a35..52300c22a8 100644 --- a/torch_npu/_inductor/codegen/split_tiling.py +++ b/torch_npu/_inductor/codegen/split_tiling.py @@ -1,21 +1,23 @@ -import pdb +import sympy as sympy from torch._inductor.codegen.triton import TritonKernel -from torch._inductor.utils import ModularIndexing,sympy_subs -import sympy as sympy -from ..config import num_vector_core, log +from torch._inductor.utils import ModularIndexing, sympy_subs from torch._inductor.virtualized import V -from torch._inductor.codegen.simd import ( 
EnableReduction, DisableReduction) -from torch._inductor.runtime.runtime_utils import next_power_of_2 -from .triton_utils import get_aligned_numel +from torch._inductor.codegen.simd import (EnableReduction, DisableReduction) +from torch._inductor.runtime.runtime_utils import next_power_of_2 from torch._inductor.loop_body import MemoryUsageType +from .triton_utils import get_aligned_numel +from ..config import num_vector_core, log + + # split and tiling axis selector -class SplitTiling : - def __init__(self, kernel : TritonKernel) : +class SplitTiling: + def __init__(self, kernel: TritonKernel) : self.kernel = kernel self.indexing = [] - def key(x) : + + def key(x): # to be higher than x and y if x.name[0] == 'w' or x.name[0] == 'v' or x.name[0] == 'p' or x.name[0] == 't': return "z" + x.name @@ -41,19 +43,21 @@ class SplitTiling : # Split 原则4: 如果高维规约类融合算子,而且高维尺寸非常大( >= 64KB),低维度尺寸比较小( <= 32B), 可以选择对规约轴切分,然后在核间用atomic # 原语做规约。 # Split 原则5 :根据算子逻辑,优先选择一维发射。 + def select_split_axis(self): - def select_longest_dim(can_be_low_dim = True): + + def select_longest_dim(can_be_low_dim=True): longest = -1 longest_dim = None for x in candidates: - if SplitTiling.great_than(x.length,longest) and (can_be_low_dim or not self.is_lowest_dimension(x)): + if SplitTiling.great_than(x.length, longest) and (can_be_low_dim or not self.is_lowest_dimension(x)): longest_dim = x longest = x.length return longest_dim # point-wise : all dims , reduction: outer_reduction dim or non-reduction dims - is_reduction = lambda x : x.prefix == 'r' - candidates = [x for x in self.kernel.sorted_axis if not is_reduction(x) or self.should_outer_reduce_me(x) ] - if self.should_outer_reduce : + is_reduction = lambda x: x.prefix == 'r' + candidates = [x for x in self.kernel.sorted_axis if not is_reduction(x) or self.should_outer_reduce_me(x)] + if self.should_outer_reduce: return self.kernel.split_axis # 0307 patch 5lines @@ -98,7 +102,7 @@ class SplitTiling : return True reduction_axis = self.kernel.numof_reduction_axis() return True if reduction_axis <= 1 else len(self.kernel.axis2_list) == reduction_axis - else : + else: return False if self.kernel.axis2 is not None or self.kernel.axis1 is not None: @@ -114,10 +118,10 @@ class SplitTiling : if self.kernel.split_axis is None : return # select tiling_axis2 then tiling_axis1, for reduction, all reduction axis will be selected as tiling_axis2 - for i in range(len(dims)-1, -1, -1) : + for i in range(len(dims)-1, -1, -1): axis = dims[i] numel = axis.length - if isinstance(numel, (sympy.Symbol, sympy.Expr)) and not isinstance(numel, sympy.Integer) : + if isinstance(numel, (sympy.Symbol, sympy.Expr)) and not isinstance(numel, sympy.Integer): numel = numel.subs(V.graph.sizevars.var_to_val) if axis.is_split_axis : dtype = self.kernel.get_axis_dtype(axis) @@ -161,7 +165,7 @@ class SplitTiling : - # fixme the below logic doesn't work when there're two reduction axis, but only one need outer reduction + def should_outer_reduce_me(self, x): should_outer = self.kernel.is_higher_order_reduction(True) and SplitTiling.great_than(x.length, 32768 ) and x.is_loop if should_outer : @@ -179,8 +183,6 @@ class SplitTiling : if xblock is None : xblock = ( numel + num_vector_core -1 ) // num_vector_core if numel > num_vector_core else min_xblock - # fixme, aligning is wasting cores . 
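+        # xblock is the per-core chunk of the split axis (ceil(numel / num_vector_core)
+        # when numel exceeds the core count), rounded up to the next power of two; the
+        # resulting nblocks = ceil(numel / xblock) can therefore be smaller than
+        # num_vector_core.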
- #if (not no_axis2 and is_low_dim) or same_axis1 : xblock = next_power_of_2(xblock) nblocks = (numel + xblock -1 ) // xblock @@ -273,10 +275,10 @@ class SplitTiling : if isinstance(ynumel, (sympy.Symbol, sympy.Expr)) and not isinstance(ynumel, sympy.Integer): ynumel = ynumel.subs(V.graph.sizevars.var_to_val) - if isinstance(xnumel, sympy.Integer) and isinstance(ynumel, int): + if isinstance(xnumel, sympy.Integer) and isinstance(ynumel, int): ynumel = sympy.Integer(ynumel) - if isinstance(ynumel, sympy.Integer) and isinstance(xnumel, int): + if isinstance(ynumel, sympy.Integer) and isinstance(xnumel, int): xnumel = sympy.Integer(xnumel) return (xnumel, ynumel) diff --git a/torch_npu/_inductor/codegen/tile_generator.py b/torch_npu/_inductor/codegen/tile_generator.py index 481cae31b6..d6a6d5c17a 100644 --- a/torch_npu/_inductor/codegen/tile_generator.py +++ b/torch_npu/_inductor/codegen/tile_generator.py @@ -1,11 +1,11 @@ import copy -import pdb - import math from torch._inductor.runtime.triton_heuristics import Config from torch._inductor.runtime.runtime_utils import next_power_of_2 from .triton_utils import get_aligned_numel, byte_per_numel + + # generate tiling configs class TileGenerator: @@ -16,15 +16,15 @@ class TileGenerator: @staticmethod def get_byte_per_numel(dtype): - if dtype is None : + if dtype is None: return 1 return byte_per_numel[dtype] @staticmethod - def valid_config(config, align_numel, rnumel = 1): + def valid_config(config, align_numel, rnumel=1): - bytes = align_numel - max_numel = 16384 * 4 // bytes + count_bytes = align_numel + max_numel = 16384 * 4 // count_bytes rblock = config["RBLOCK"] if "RBLOCK" in config else rnumel xblock_sub = config["XBLOCK_SUB"] @@ -35,10 +35,10 @@ class TileGenerator: # when rblock is low dim, need to maximize rblock @staticmethod - def descend_xblock(rnumel, xblock, configs, cfg, align_numel, aggresive = True ): + def descend_xblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True): - bytes = align_numel - start_numel = 2048 // bytes if aggresive else 1024 // bytes + count_bytes = align_numel + start_numel = 2048 // bytes if aggresive else 1024 // count_bytes # include rblock is too big, need to decend rblock first rblock = rnumel if rnumel > 0 else 1 while (rblock > start_numel): @@ -51,18 +51,18 @@ class TileGenerator: xblock_sub = TileGenerator.aligned_numel(xblock) while True: - newcfg = copy.deepcopy(cfg) - newcfg["XBLOCK_SUB"] = xblock_sub - if TileGenerator.valid_config(newcfg, align_numel, rnumel=rblock): - configs.append(Config(newcfg, num_warps=1, num_stages=1)) - xblock_sub = xblock_sub // 2 - if xblock_sub * rblock <= start_numel: - break + newcfg = copy.deepcopy(cfg) + newcfg["XBLOCK_SUB"] = xblock_sub + if TileGenerator.valid_config(newcfg, align_numel, rnumel=rblock): + configs.append(Config(newcfg, num_warps=1, num_stages=1)) + xblock_sub = xblock_sub // 2 + if xblock_sub * rblock <= start_numel: + break @staticmethod - def descend_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive = True): - bytes = align_numel - start_numel = 4096 // bytes if aggresive else 1024 // bytes + def descend_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True): + count_bytes = align_numel + start_numel = 4096 // bytes if aggresive else 1024 // count_bytes xblock_sub = start_numel if xblock > start_numel else xblock cfg["XBLOCK_SUB"] = xblock_sub @@ -71,15 +71,15 @@ class TileGenerator: newcfg = copy.deepcopy(cfg) newcfg["RBLOCK"] = rblock if TileGenerator.valid_config(newcfg, align_numel): - 
configs.append(Config(newcfg, num_warps=1, num_stages=1)) + configs.append(Config(newcfg, num_warps=1, num_stages=1)) rblock = rblock // 2 if xblock_sub * rblock <= start_numel: break @staticmethod def descend_xblock_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive = True) : - bytes = align_numel - start_numel = 4096 // bytes if aggresive else 1024 // bytes + count_bytes = align_numel + start_numel = 4096 // bytes if aggresive else 1024 // count_bytes # Depending on the number of bytes available to the hardware UB, # 4096 bytes is an appropriate empirical value for an intra-core split. @@ -109,7 +109,7 @@ class TileGenerator: if TileGenerator.valid_config(newcfg, align_numel): configs.append(Config(newcfg, num_warps=1, num_stages=1)) rblock = rblock // 2 - else : + else: while rblock < xblock_sub and xblock_sub * rblock > start_numel: newcfg = copy.deepcopy(cfg) newcfg["XBLOCK_SUB"] = xblock_sub diff --git a/torch_npu/_inductor/codegen/triton_utils.py b/torch_npu/_inductor/codegen/triton_utils.py index 7e07c80dba..5acd971ba0 100644 --- a/torch_npu/_inductor/codegen/triton_utils.py +++ b/torch_npu/_inductor/codegen/triton_utils.py @@ -20,7 +20,7 @@ byte_per_numel = { } -def get_aligned_numel( dtype): +def get_aligned_numel(dtype): if dtype in byte_per_numel: return 32 // byte_per_numel[dtype] else: diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py index 67a1dbdab4..8daeafbf63 100644 --- a/torch_npu/_inductor/codegen/wrapper.py +++ b/torch_npu/_inductor/codegen/wrapper.py @@ -1,4 +1,4 @@ -from torch._inductor.codegen.wrapper import PythonWrapperCodegen, SymbolicCallArg,SubgraphPythonWrapperCodegen +from torch._inductor.codegen.wrapper import PythonWrapperCodegen, SymbolicCallArg, SubgraphPythonWrapperCodegen from torch._inductor.virtualized import V from torch._inductor.utils import ( cache_on_self, @@ -6,6 +6,7 @@ from torch._inductor.utils import ( from torch._inductor.runtime import triton_heuristics from torch._inductor import config + class NPUWrapperCodeGen(PythonWrapperCodegen): def __init__(self): super().__init__() @@ -43,19 +44,6 @@ class NPUWrapperCodeGen(PythonWrapperCodegen): V.graph.device_ops.import_get_raw_stream_as("get_raw_stream") ) - # @cache_on_self - # def write_triton_header_once(self) -> None: - # import_str = f""" - # import triton - # import triton.language as tl - # from {triton_heuristics.__name__} import grid, split_scan_grid, grid_combo_kernels, start_graph, end_graph - # from torch_npu._inductor.npu_triton_heuristics import grid - # """ - # self.imports.splice(import_str, strip=True) - # if config.triton.autotune_at_compile_time: - # self.kernel_autotune_calls.splice(import_str) - # self.write_get_raw_stream_header_once() - #generate numel expr for range_tree_node def generate_node_numel_expr(self, kernel_name: str, node, numel_expr): expr = f"{kernel_name}_{node.name}_numel" @@ -78,7 +66,6 @@ class NPUWrapperCodeGen(PythonWrapperCodegen): # don't free anything def make_buffer_free(self, buffer): - #return f"del {buffer.get_name()}" return "" # don't assert -- Gitee From 3f549302b26cafbb18f848f40bdbca22b29575a9 Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 10:50:03 +0800 Subject: [PATCH 330/358] 4 clean code commit --- torch_npu/_inductor/codegen/tile_generator.py | 6 +- torch_npu/_inductor/codegen/triton.py | 455 +++++++++--------- 2 files changed, 240 insertions(+), 221 deletions(-) diff --git a/torch_npu/_inductor/codegen/tile_generator.py b/torch_npu/_inductor/codegen/tile_generator.py 
index d6a6d5c17a..2acb301087 100644 --- a/torch_npu/_inductor/codegen/tile_generator.py +++ b/torch_npu/_inductor/codegen/tile_generator.py @@ -38,7 +38,7 @@ class TileGenerator: def descend_xblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True): count_bytes = align_numel - start_numel = 2048 // bytes if aggresive else 1024 // count_bytes + start_numel = 2048 // count_bytes if aggresive else 1024 // count_bytes # include rblock is too big, need to decend rblock first rblock = rnumel if rnumel > 0 else 1 while (rblock > start_numel): @@ -62,7 +62,7 @@ class TileGenerator: @staticmethod def descend_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True): count_bytes = align_numel - start_numel = 4096 // bytes if aggresive else 1024 // count_bytes + start_numel = 4096 // count_bytes if aggresive else 1024 // count_bytes xblock_sub = start_numel if xblock > start_numel else xblock cfg["XBLOCK_SUB"] = xblock_sub @@ -79,7 +79,7 @@ class TileGenerator: @staticmethod def descend_xblock_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive = True) : count_bytes = align_numel - start_numel = 4096 // bytes if aggresive else 1024 // count_bytes + start_numel = 4096 // count_bytes if aggresive else 1024 // count_bytes # Depending on the number of bytes available to the hardware UB, # 4096 bytes is an appropriate empirical value for an intra-core split. diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index 468c12a56a..ffc4ebe96e 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -1,12 +1,25 @@ -import pdb import os -import torch -from torch._inductor.utils import sympy_subs -from torch._inductor.scheduler import SchedulerNode -from typing import List,Set,Iterable,Callable,Sequence +from typing import List, Set, Iterable, Callable,Sequence +from typing import Dict import sympy import operator import itertools +from enum import Enum +import functools +from typing import ( + Optional, + Union, + Tuple, + Any, + cast +) +import re +import textwrap + +import torch +from torch._inductor.utils import sympy_subs +from torch._inductor.scheduler import SchedulerNode + from torch._inductor.codegen.simd import CantSplit, DisableReduction, EnableReduction from torch._inductor.codegen.common import free_symbol_is_type from torch._inductor.codegen.triton import ( @@ -26,19 +39,14 @@ from torch._inductor.codegen.triton import ( BlockPtrOptions, triton_acc_type, constant_repr, - is_welford_reduction,FixedTritonConfig, + is_welford_reduction, FixedTritonConfig, prefix_is_reduction, upcast_acc_dtype ) from torch.utils._sympy.functions import FloorDiv, Identity, ModularIndexing -from torch._inductor.utils import sympy_index_symbol,generate_assert +from torch._inductor.utils import sympy_index_symbol, generate_assert from torch.utils import _pytree as pytree from torch.utils._sympy.value_ranges import ValueRanges - -from typing import Dict -from enum import Enum -import functools - from torch._inductor import config, ir from torch._inductor.virtualized import ( V, @@ -51,31 +59,20 @@ from torch._inductor.utils import ( Placeholder, ) from torch._inductor.runtime.runtime_utils import next_power_of_2 - - from torch._inductor.codegen.common import ( IndentedBuffer, SizeArg, DeferredLine, ) from torch._inductor.codegen.triton_utils import config_of, signature_of, signature_to_meta - -from typing import ( - Optional, - Union, - Tuple, - Any, - cast -) - -import re -from torch.utils._sympy.symbol import 
SymT,symbol_is_type +from torch.utils._sympy.symbol import SymT, symbol_is_type from torch.utils._sympy.value_ranges import bound_sympy, ValueRangeAnalysis, ValueRanges from torch.utils._sympy.numbers import int_oo +from torch._inductor.dtype_propagation import DtypePropagationOpsHandler + from ..runtime import NPUDeviceProperties -import textwrap from .npu_kernel_features import NumelList -from torch._inductor.dtype_propagation import DtypePropagationOpsHandler + def flatten(nums): res = [] @@ -86,39 +83,48 @@ def flatten(nums): res.append(i) return res + class AxisDirection(Enum): Flat = 0, Vertical = 1, Horizontal = 2 + def reverse_direction(direction): - if direction == AxisDirection.Vertical : - return AxisDirection.Horizontal - elif direction == AxisDirection.Horizontal : - return AxisDirection.Vertical - else : - return AxisDirection.Flat + if direction == AxisDirection.Vertical: + return AxisDirection.Horizontal + elif direction == AxisDirection.Horizontal: + return AxisDirection.Vertical + else: + return AxisDirection.Flat class NPUTritonKernelOverrides(TritonKernelOverrides): + @staticmethod def exp(x): return f"tl_math.exp({x})" + @staticmethod def sqrt(x): return f"tl_math.sqrt({x})" + @staticmethod def tanh(x): return f"tl_math.tanh({x})" + @staticmethod def rsqrt(x): return f"tl.rsqrt({x})" + @staticmethod def floor(x): return f"tl_math.floor({x})" + @staticmethod def erf(x): return f"tl_math.erf({x})" + @staticmethod def ceil(x): return f"tl_math.ceil({x})" @@ -126,25 +132,24 @@ class NPUTritonKernelOverrides(TritonKernelOverrides): def group_fn(self, sizes): groups = list() - for s in sizes : - if not s : - groups.append(1) - elif isinstance(s, list): - group = flatten(s) - # for x in group : - # groups.append(x) - groups.append(NumelList(tuple(group)) if isinstance(group, list) else group) - else : - groups.append(s) + for s in sizes: + if not s: + groups.append(1) + elif isinstance(s, list): + group = flatten(s) + groups.append(NumelList(tuple(group)) if isinstance(group, list) else group) + else: + groups.append(s) return tuple(groups) + @staticmethod def select_index_dtype(node_schedule, numel, reduction_numel): return "tl.int32" -class IterationRangesEntryNPUIndex(IterationRangesEntry) : +class IterationRangesEntryNPUIndex(IterationRangesEntry): def __init__( self, *args, **kwargs): @@ -158,7 +163,7 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry) : def _codegen_mask(self): - if self.is_tiling_axis1 or self.is_tiling_axis2 : + if self.is_tiling_axis1 or self.is_tiling_axis2: upper = f"{self.name}_numel" line = f"{self.name}_mask = {self.name} < {upper}" self.writeline(line) @@ -169,10 +174,10 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry) : def _codegen(self): index = None - vertical = self.is_tiling_axis1 if V.kernel.numof_reduction_axis() <=1 else not isinstance(self.expr, ModularIndexing) + vertical = self.is_tiling_axis1 if V.kernel.numof_reduction_axis() <= 1 else not isinstance(self.expr, ModularIndexing) direction = V.kernel.get_axis_direction(vertical) # for multiple reduce dims, don't need this - if self.is_tiling_axis1 and V.kernel.numof_reduction_axis() <= 1: + if self.is_tiling_axis1 and V.kernel.numof_reduction_axis() <= 1: index = f"{self.name} = {self.codegen_index(direction)}" #to be fixed, only permute need to this . 
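As a quick illustration of the helpers above: flatten collapses nested size lists and group_fn wraps each flattened group in a NumelList. The sketch below paraphrases group_fn with plain tuples; the shapes are made-up examples, not output from a real kernel.

def sketch_group(sizes):
    # Paraphrase of group_fn above, using plain tuples instead of NumelList.
    groups = []
    for s in sizes:
        if not s:
            groups.append(1)            # empty group counts as numel 1
        elif isinstance(s, list):
            groups.append(tuple(s))     # the real code flattens nested lists first, then wraps in NumelList
        else:
            groups.append(s)            # plain ints pass through unchanged
    return tuple(groups)


# sketch_group([[8, 128], 64, []]) -> ((8, 128), 64, 1)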
self.writeline(f"{self.name}_prime = {self.codegen_index(reverse_direction(direction))}") @@ -187,7 +192,7 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry) : and V.kernel.current_node.node.data \ and isinstance(V.kernel.current_node.node.data, ir.Reduction): reduction_type = V.kernel.current_node.node.data.reduction_type - if reduction_type in {"argmax", "argmin"} : + if reduction_type in {"argmax", "argmin"}: self.writeline(f"{self.parent.prefix}index = " f"{self.codegen_index(reverse_direction(AxisDirection.Flat))}") if index: @@ -203,22 +208,22 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry) : index = f"tl.arange(0, RBLOCK)" return index elif self.is_tiling_axis1: - if self.is_split_axis : + if self.is_split_axis: offset = f"{self.symbol()}_offset" index = f"{offset} + (loop1 * XBLOCK_SUB) + base1" else : index = f"(loop1 * XBLOCK_SUB) + base1" - if V.kernel.axis2 is not None and direction != AxisDirection.Flat : + if V.kernel.axis2 is not None and direction != AxisDirection.Flat: index += ("[None, :]" if direction == AxisDirection.Horizontal else "[:, None]") return index - elif self.is_tiling_axis2 : - if V.kernel.persistent_reduction : + elif self.is_tiling_axis2: + if V.kernel.persistent_reduction: index = f"tl.arange(0, RBLOCK_{self.symbol()})" if V.kernel.numof_reduction_axis() > 1 else "base2" elif self.is_split_axis: offset = f"{self.symbol()}_offset" index = f"{offset} + (loop2 * RBLOCK) + base2" - else : + else: index = "loop2 * RBLOCK + base2" if direction != AxisDirection.Flat: @@ -234,17 +239,17 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry) : if self.is_tiling_axis1 and not (V.kernel.axis2 is None and V.kernel.persistent_reduction): # don't create loops for multi-reductions - if V.kernel.numof_reduction_axis() <= 1 : + if V.kernel.numof_reduction_axis() <= 1: lines.append("base1 = tl.arange(0, XBLOCK_SUB)") xblock = f"XBLOCK" if self.is_split_axis else f"{self.symbol()}_numel" lines.append(f"loops1 = ({xblock} + XBLOCK_SUB - 1) // XBLOCK_SUB") - elif self.is_tiling_axis2 and len(V.kernel.axis2_list) <=1: + elif self.is_tiling_axis2 and len(V.kernel.axis2_list) <= 1: lines.append("base2 = tl.arange(0, RBLOCK)") if self.is_split_axis: lines.append(f"loops2 = (XBLOCK + RBLOCK - 1) // RBLOCK") else: - lines.append(f"loops2 = ({self.name}_numel + RBLOCK - 1) // RBLOCK" ) + lines.append(f"loops2 = ({self.name}_numel + RBLOCK - 1) // RBLOCK") else: pass @@ -256,7 +261,8 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry) : if isinstance(self.expr, (sympy.Symbol, sympy.Integer)): return precomputed_args - assert isinstance(self.expr, (FloorDiv, ModularIndexing)), type(self.expr) + if not isinstance(self.expr, (FloorDiv, ModularIndexing)): + raise RuntimeError("assert isinstance(self.expr, (FloorDiv, ModularIndexing)), type(self.expr)") for arg in self.expr.args[1:]: if not isinstance(arg, (sympy.Integer, sympy.Symbol)): symbols = arg.free_symbols @@ -282,17 +288,17 @@ class IterationRangesRootNPUIndex(IterationRangesRoot): grid_dim: Optional[int], ): super().__init__(name, numel, prefix, index, kernel, pid_cache, is_loop=is_loop, tensor_dim=tensor_dim, - grid_dim=grid_dim, has_zdim= False ) + grid_dim=grid_dim, has_zdim=False) def __repr__(self): return f"IterationRangesRootNPUIndex({self.name!r}, {self.numel}, ...)" def remove_entry(self, name): - if name in self.var_ranges : + if name in self.var_ranges: del self.var_ranges[name] if name in self.var_list: del self.var_list[self.var_list.index(name)] - if name in 
V.kernel.range_tree_nodes : + if name in V.kernel.range_tree_nodes: V.kernel.range_tree_nodes_removed[name] = V.kernel.range_tree_nodes[name] del V.kernel.range_tree_nodes[name] if name in self.nodes: @@ -358,9 +364,9 @@ class NPUIndexTritonKernel(TritonKernel): min_elem_per_thread=0, optimize_mask=True, fixed_config: Optional[FixedTritonConfig] = None, - **kwargs,) : + **kwargs,): - super().__init__(tiling = tiling, + super().__init__(tiling=tiling, min_elem_per_thread=min_elem_per_thread, optimize_mask=optimize_mask, fixed_config=fixed_config, @@ -374,7 +380,7 @@ class NPUIndexTritonKernel(TritonKernel): self.axis2 = None # incase two reduction axis self.axis2_list = [] - self.low_dims = set() + self.low_dims = set() self.range_tree_nodes_removed: Dict[sympy.Symbol, IterationRangesEntry] = {} self.range_tree_nodes_substituted = {} @@ -408,13 +414,13 @@ class NPUIndexTritonKernel(TritonKernel): def numof_reduction_axis(self): root = self.range_trees[-1] - if root is None : + if root is None: return 0 return len(root.var_list) def numof_tiling_axis(self): - return (1 if self.axis1 is not None else 0) + (1 if self.axis2 is not None else 0 ) + return (1 if self.axis1 is not None else 0) + (1 if self.axis2 is not None else 0) #do nothing in NpuTritonKernel def codegen_range_tree(self): @@ -424,16 +430,16 @@ class NPUIndexTritonKernel(TritonKernel): def initialize_range_tree(self, pid_cache): #self.numels = flatten(self.numels) self.total_numels = 0 - for k, x in self.numels.items() : - if not isinstance(x, sympy.Integer) : - x = x.subs(V.graph.sizevars.var_to_val) - self.numels[k] = x - if x > 1 : - self.total_numels +=1 + for k, x in self.numels.items(): + if not isinstance(x, sympy.Integer): + x = x.subs(V.graph.sizevars.var_to_val) + self.numels[k] = x + if x > 1: + self.total_numels += 1 no_r_dim = not self.inside_reduction or self.numels["r"] == 1 prefixes = "wvtzyxr" - active_prefixes = prefixes[-len(self.numels) :] + active_prefixes = prefixes[-len(self.numels): ] #prefix can not be 's', 'u', 'ps' , 'i', 'z', 'q' #prefix can not be 'p' from torch 2.6.0 grid_dims = "xyztvw" @@ -470,7 +476,7 @@ class NPUIndexTritonKernel(TritonKernel): if (len(self.range_tree_nodes.values()) == 0): return size_hints - for i, node in enumerate(self.sorted_axis): + for _, node in enumerate(self.sorted_axis): if isinstance(node.expr, ModularIndexing): numel_expr = node.length else: @@ -482,11 +488,11 @@ class NPUIndexTritonKernel(TritonKernel): return size_hints # torch251 done - def add_numel_to_call_args_and_grid(self, name, call_args, arg_types, grid): + def add_numel_to_call_args_and_grid(self, name, call_args, arg_types, grid): for node in self.sorted_axis: - if isinstance(node.expr, ModularIndexing) : + if isinstance(node.expr, ModularIndexing): numel_expr = node.length - else : + else: numel_expr = node.expr.subs({sympy_index_symbol(r.name): r.numel for r in self.range_trees}) if isinstance(numel_expr, (sympy.Integer, sympy.Symbol)): @@ -498,7 +504,7 @@ class NPUIndexTritonKernel(TritonKernel): if node.parent.grid_dim is not None: grid.append(expr) - def gen_numel_args(self, signature, triton_meta_signature, argdefs ): + def gen_numel_args(self, signature, triton_meta_signature, argdefs): for node in self.sorted_axis: arg_name = f"{node.name}_numel" if not os.environ.get('INDUCTOR_STATIC_MODE'): @@ -508,12 +514,11 @@ class NPUIndexTritonKernel(TritonKernel): sizearg, size_dtype=self.index_dtype ) argdefs.append(arg_name) - else : + else: argdefs.append(f"{arg_name}: tl.constexpr") 
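The INDUCTOR_STATIC_MODE branch of gen_numel_args decides whether each axis numel becomes a runtime kernel argument or a compile-time tl.constexpr constant recorded in triton_meta["constants"]. A small paraphrase of that decision; the argument name below is hypothetical.

import os


def sketch_numel_argdef(arg_name):
    # Mirrors gen_numel_args: runtime SizeArg vs. constexpr baked into the kernel signature.
    if not os.environ.get('INDUCTOR_STATIC_MODE'):
        return arg_name                  # dynamic: the numel is passed at kernel launch
    return f"{arg_name}: tl.constexpr"   # static: the numel is fixed at compile time


# sketch_numel_argdef("x0_numel") -> "x0_numel" or "x0_numel: tl.constexpr"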
self.triton_meta["constants"][arg_name] = node.length - # modify triton_meta, inductor_meta , etc. def codegen_kernel(self, name=None): code = IndentedBuffer() size_hints = self.get_size_hints() @@ -536,7 +541,7 @@ class NPUIndexTritonKernel(TritonKernel): arg.name, V.graph.sizevars.inv_precomputed_replacements[symbol] ) - triton_meta_signature = signature_to_meta( signature, size_dtype=self.index_dtype, argdefs = argdefs ) + triton_meta_signature = signature_to_meta( signature, size_dtype=self.index_dtype, argdefs=argdefs) triton_meta = { "signature": triton_meta_signature, @@ -619,9 +624,9 @@ class NPUIndexTritonKernel(TritonKernel): def codegen_static_numels(self, code): - no_x_axis = self.numof_reduction_axis() > 1 + no_x_axis = self.numof_reduction_axis() > 1 symbols = [] - if self.axis2 is not None : + if self.axis2 is not None: symbols = list(self.axis2_list) if no_x_axis else list([self.axis2]) elif self.persistent_reduction and self.axis1 is not None: symbols = list([self.axis1]) @@ -635,38 +640,38 @@ class NPUIndexTritonKernel(TritonKernel): else: continue val = next_power_of_2(val) - if no_x_axis : + if no_x_axis: code.writeline(f"RBLOCK_{node.symbol()}: tl.constexpr = {val}") - else : + else: code.writeline(f"RBLOCK: tl.constexpr = {val}") def axis2_variable(self): - if self.axis2 is not None : + if self.axis2 is not None: return self.range_tree_nodes[self.axis2] return None def is_isolated_symbol(self, input_str, symbol): # 使用正则表达式查找独立的符号, 防止out_ptr0 匹配上r0 r0_prime pattern1 = r'\b' + re.escape(symbol) + r'\b' - pattern2 = r'\b' + re.escape(symbol+'_prime') + r'\b' + pattern2 = r'\b' + re.escape(symbol + '_prime') + r'\b' return bool(re.search(pattern1, input_str)) or bool(re.search(pattern2, input_str)) def find_axis2_in_load_store(self): var = self.axis2_variable() - if not var : + if not var: return False - for line in self.loads._lines : + for line in self.loads._lines: if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, var.name): return True - for line in self.compute._lines : + for line in self.compute._lines: if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, var.name): return True - for line in self.post_loop_store._lines : + for line in self.post_loop_store._lines: if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, var.name): return True - for line in self.stores._lines : - if isinstance(line,DeferredLine) : + for line in self.stores._lines: + if isinstance(line,DeferredLine): line = line.line if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, var.name): return True @@ -674,12 +679,12 @@ class NPUIndexTritonKernel(TritonKernel): def find_axis2_in_indexing(self): var = self.axis2_variable() - if not var : + if not var: return False - if self.current_node is None : + if self.current_node is None: return False - for index in self.current_node._body.indexing.values() : - if var.symbol() in index.free_symbols : + for index in self.current_node._body.indexing.values(): + if var.symbol() in index.free_symbols: return True return False @@ -694,7 +699,7 @@ class NPUIndexTritonKernel(TritonKernel): self.post_loop_store.clear() self.prefix.clear() - def is_1d_reduction(self) : + def is_1d_reduction(self): return self.numels["r"] > 1 and self.axis2 is None def codegen_body(self): @@ -706,14 +711,15 @@ class NPUIndexTritonKernel(TritonKernel): ): return - def write_pointwise() : + def write_pointwise(): self.body.splice(self.indexing_code) self.body.splice(self.loads) self.body.splice(self.compute) self.body.splice(self.stores) - 
def codegen_range(index) : - def loop_body(index, indexing_code, is_last_axis, do_indent = True ) : + def codegen_range(index): + + def loop_body(index, indexing_code, is_last_axis, do_indent=True) : if do_indent: self.body.do_indent() if indexing_code : @@ -730,11 +736,11 @@ class NPUIndexTritonKernel(TritonKernel): if index < 0 or index >= len(self.range_tree_nodes): return nodes = self.sorted_axis - range = nodes[index] - is_tilling_asix1 = getattr(range, "is_tiling_axis1") - is_tilling_asix2 = getattr(range, "is_tiling_axis2") - is_last_axis = index == len(nodes) -1 - indexing_code = getattr(range, "indexing_code") + range_node = nodes[index] + is_tilling_asix1 = getattr(range_node, "is_tiling_axis1") + is_tilling_asix2 = getattr(range_node, "is_tiling_axis2") + is_last_axis = index == len(nodes) - 1 + indexing_code = getattr(range_node, "indexing_code") numof_axis2 = self.numof_reduction_axis() if is_tilling_asix1: do_indent = True @@ -745,12 +751,12 @@ class NPUIndexTritonKernel(TritonKernel): # multi-dim reduction, i.e. var_mean[1,2] if numof_axis2 > 1: - if range.is_split_axis : - offset = f"{range.name}_offset" - self.body.writeline(f"for {range.name} in range({offset}, " - f"min({offset} + XBLOCK, {range.name}_numel)):") + if range_node.is_split_axis: + offset = f"{range_node.name}_offset" + self.body.writeline(f"for {range_node.name} in range({offset}, " + f"min({offset} + XBLOCK, {range_node.name}_numel)):") else : - self.body.writeline(f"for {range.name} in range({range.name}_numel):") + self.body.writeline(f"for {range_node.name} in range({range_node.name}_numel):") # 1D persistent_reduction or 1d reduction non-first-node elif self.axis2 is None and (self.persistent_reduction or len(self.loads._lines) == 0): do_indent = False @@ -760,7 +766,7 @@ class NPUIndexTritonKernel(TritonKernel): self.body.writeline(f"for loop1 in range(loops1):") - if not reduction_1d and self.persistent_reduction : + if not reduction_1d and self.persistent_reduction: self.body.do_indent() self.body.splice(self.prefix) self.prefix.clear() @@ -777,7 +783,7 @@ class NPUIndexTritonKernel(TritonKernel): elif is_tilling_asix2: do_indent = False need_axis2_loop = self.find_axis2_in_load_store() - if not need_axis2_loop : + if not need_axis2_loop: indexing_code = None if (not self.inside_reduction or not self.persistent_reduction) \ and need_axis2_loop: @@ -788,14 +794,15 @@ class NPUIndexTritonKernel(TritonKernel): self.body.splice(self.post_loop_store) self.post_loop_store.clear() - elif is_last_axis and range.numel == 1: #pointwise , last axis =1 + elif is_last_axis and range_node.numel == 1: + #pointwise , last axis =1 write_pointwise() else: - if range.is_split_axis : - offset = f"{range.symbol()}_offset" - self.body.writeline(f"for {range.symbol()} in range({offset}, min({offset} + XBLOCK, {range.name}_numel)):") + if range_node.is_split_axis: + offset = f"{range_node.symbol()}_offset" + self.body.writeline(f"for {range_node.symbol()} in range({offset}, min({offset} + XBLOCK, {range_node.name}_numel)):") else : - self.body.writeline(f"for {range.symbol()} in range({range.name}_numel):") + self.body.writeline(f"for {range_node.symbol()} in range({range_node.name}_numel):") loop_body(index, indexing_code, is_last_axis) if self.first_node: @@ -806,16 +813,16 @@ class NPUIndexTritonKernel(TritonKernel): if self.first_node: codegen_range(0) else : - if self.axis2 is None : + if self.axis2 is None: codegen_range(0) else : axis2_order = self.range_tree_nodes[self.axis2].sorted_order - if 
self.persistent_reduction and self.numof_reduction_axis() > 1 : + if self.persistent_reduction and self.numof_reduction_axis() > 1: axis2_order = axis2_order - self.numof_reduction_axis() +1 - for _ in range(axis2_order) : + for _ in range(axis2_order): self.body.do_indent() codegen_range(axis2_order) - for _ in range(axis2_order) : + for _ in range(axis2_order): self.body.do_unindent() self.cse.invalidate(self.outside_loop_vars) @@ -824,13 +831,11 @@ class NPUIndexTritonKernel(TritonKernel): self.stores.clear() self.post_loop_store.clear() self.prefix.clear() - #for root in self.range_trees: - # root.cache_clear() self.first_node = False # for creat constant tensor, if have two axis, constant=tl.full([1,1]) else tl.full([1]) def triton_tensor_ndim(self): - if self.numof_reduction_axis() > 1 : + if self.numof_reduction_axis() > 1: return 1 if self.axis1 is not None and self.axis2 is not None: ndim = 2 @@ -838,9 +843,10 @@ class NPUIndexTritonKernel(TritonKernel): ndim = 1 return ndim - # fixme, indexing.mask_str is None , see varmean_test.py def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): - assert self.inside_reduction + if not self.inside_reduction: + raise RuntimeError("assert self.inside_reduction") + self.inside_reduction = False indexing = self.indexing(index, block_ptr=True) self.inside_reduction = True @@ -859,33 +865,34 @@ class NPUIndexTritonKernel(TritonKernel): ) ) else: - assert isinstance(indexing, IndexingOptions) + if not isinstance(indexing, IndexingOptions): + raise RuntimeError("assert isinstance(indexing, IndexingOptions)") line = f"tl.store({var} + ({indexing.index_str} ), {value}, {indexing.mask_str})" - if self.numof_reduction_axis() > 1 : + if self.numof_reduction_axis() > 1: line = f"tl.store({var} + ({indexing.index_str} + tl.arange(0,1) ), {value}, {indexing.mask_str})" self.post_loop_store.writeline( - DeferredLine( name, line ) + DeferredLine(name, line) ) def apply_var_prime(self, index, line, mask): # axis should only be replaced once axis_list = [] for key in index.as_coefficients_dict().keys(): - if not key.free_symbols : + if not key.free_symbols: continue symbol = list(key.free_symbols)[0] - if symbol not in self.range_tree_nodes : + if symbol not in self.range_tree_nodes: continue - range = self.range_tree_nodes[symbol] - if (range.is_tiling_axis1 or range.is_tiling_axis2) and (symbol not in axis_list): - line = line.replace(f"{range.name}", f"{range.name}_prime") - mask = mask.replace(f"{range.name}", f"{range.name}_prime") + range_node = self.range_tree_nodes[symbol] + if (range_node.is_tiling_axis1 or range_node.is_tiling_axis2) and (symbol not in axis_list): + line = line.replace(f"{range_node.name}", f"{range_node.name}_prime") + mask = mask.replace(f"{range_node.name}", f"{range_node.name}_prime") axis_list.append(symbol) return line, mask # apply xxx_prime var in case dim are permuted def store( - self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode=None ) -> None: var = self.args.output(name) @@ -913,7 +920,7 @@ class NPUIndexTritonKernel(TritonKernel): ) elif mode is None: line = f"tl.store({var} + ({index_str}), {value_str}, {mask_str})" - if len(self.axis2_list) > 1 : + if len(self.axis2_list) > 1: line = f"tl.store({var} + ({index_str} + tl.arange(0,1) ), {value_str}, {indexing.mask_str})" elif mode == "atomic_add": @@ -932,12 +939,12 @@ class NPUIndexTritonKernel(TritonKernel): @staticmethod def 
_get_next_scheduler_node(node_schedule, current_node): found_current = False if current_node else True - for node in node_schedule : - if isinstance(node, SchedulerNode) : - if not found_current and node.get_name() == current_node.get_name() : + for node in node_schedule: + if isinstance(node, SchedulerNode): + if not found_current and node.get_name() == current_node.get_name(): found_current = True continue - if found_current : + if found_current: return node return None @@ -952,35 +959,36 @@ class NPUIndexTritonKernel(TritonKernel): if self.numof_reduction_axis() <= 1: return False - all_index_is_1d = True - for _,index in self.current_node._body.indexing.items() : + all_index_is_1d = True + for _, index in self.current_node._body.indexing.items(): count = 0 - for symbol in index.free_symbols : - if symbol in self.axis2_list : + for symbol in index.free_symbols: + if symbol in self.axis2_list: count = count + 1 - if count > 1 : + if count > 1: all_index_is_1d = False - if not all_index_is_1d : + if not all_index_is_1d: break return all_index_is_1d # to generate the shape of the accumulator of RBLOCK loop + def dense_size_list(self, is_permute) -> List[str]: sizes = [] if self.numof_reduction_axis() > 1: sizes = [] if self.check_all_index_is_1d_for_dual_reduction() else [f"RBLOCK_{axis}" for axis in self.axis2_list] return sizes - if self.persistent_reduction and self.axis2 is None : + if self.persistent_reduction and self.axis2 is None: sizes = ["RBLOCK" ] return sizes # current computedbuffer is reduction cb_is_reduction = self.inside_reduction if not self.current_node else isinstance(self.current_node.node.data, ir.Reduction) for tree in self.sorted_axis: - if tree.is_tiling_axis1 : + if tree.is_tiling_axis1: sizes.append("XBLOCK_SUB") elif tree.is_tiling_axis2: sizes.append("RBLOCK") @@ -1035,63 +1043,65 @@ class NPUIndexTritonKernel(TritonKernel): result = reverse_direction(result) if reversed else result return result - def is_higher_order_reduction(self, check_prev_node = False ): - if self.numof_reduction_axis() > 1 : + def is_higher_order_reduction(self, check_prev_node=False): + if self.numof_reduction_axis() > 1: return False assert self.inside_reduction - if self.inside_high_order_reduction : + if self.inside_high_order_reduction: return self.inside_high_order_reduction node = self.current_node if self.current_node is not None else self.get_prev_scheduler_node(None) - if node is None or not isinstance(node, SchedulerNode) : + if node is None or not isinstance(node, SchedulerNode): return False reduction = node.node.data - while check_prev_node and reduction is not None and not isinstance(reduction, ir.Reduction) : + while check_prev_node and reduction is not None and not isinstance(reduction, ir.Reduction): node = self.get_prev_scheduler_node(node) - if node is None : + if node is None: reduction = None - else : + else: reduction = node.node.data - if reduction is None or not isinstance(reduction, ir.Reduction) : + if reduction is None or not isinstance(reduction, ir.Reduction): return False - if not hasattr(reduction, "reduced_idx") : + if not hasattr(reduction, "reduced_idx"): return False reduced_order = reduction.reduced_idx[0] is_last_axis = all(_ < reduced_order for _ in reduction.kept_idx) self.inside_high_order_reduction = not is_last_axis return self.inside_high_order_reduction + def get_axis_dtype(self, axis): dtype = None - if axis is None : + if axis is None: return None - for node in self.node_schedule : - if node in (EnableReduction, DisableReduction) : + for 
node in self.node_schedule: + if node in (EnableReduction, DisableReduction): continue - if axis.symbol() in node._body.indexing_map : + if axis.symbol() in node._body.indexing_map: dtype = V.graph.get_dtype(node.node.name) break - if dtype is None : + if dtype is None: should_break_all = False for node in self.node_schedule: if should_break_all: break if node in (EnableReduction, DisableReduction): continue - for key, value in node._body.indexing_map.items(): - if key in self.range_tree_nodes : + for key, _ in node._body.indexing_map.items(): + if key in self.range_tree_nodes: dim = self.range_tree_nodes[key] - else : + else: dim = self.range_tree_nodes_removed[key] - if dim.parent == axis.parent : + if dim.parent == axis.parent: dtype = V.graph.get_dtype(node.node.name) should_break_all = True break return dtype + def create_inductor_meta(self): mutated_args = set() for mutation in self.mutations: @@ -1116,25 +1126,26 @@ class NPUIndexTritonKernel(TritonKernel): "no_x_dim": self.no_x_dim, # Due to breaking change of triton 3.0, the original invocation is broken "backend_hash": self.patch_triton_hash(), # torch.utils._triton.triton_hash_with_backend(), - #"high_order_reduction" : self.inside_reduction and self.is_higher_order_reduction(True) , - "split_axis_order" : self.split_axis.sorted_order if self.split_axis is not None else None, - "axis1_order" : axis1_order, + "split_axis_order": self.split_axis.sorted_order if self.split_axis is not None else None, + "axis1_order": axis1_order, "axis2_order": axis2_order, - "low_dims" : self.low_dims, + "low_dims": self.low_dims, "numof_reduction_axis": self.numof_reduction_axis(), - "split_axis_dtype":split_axis_dtype + "split_axis_dtype": split_axis_dtype } return inductor_meta + def reduction_dim(self): - assert self.inside_reduction + if not self.inside_reduction: + raise RuntimeError("assert self.inside_reduction") if self.numof_reduction_axis() > 1: return 0 - return 0 if self.is_higher_order_reduction() or len(self.sorted_axis) ==1 else 1 + return 0 if self.is_higher_order_reduction() or len(self.sorted_axis) == 1 else 1 + def reduction_var(self): var = self.axis2 return var - def reduction( self, dtype: torch.dtype, @@ -1142,7 +1153,8 @@ class NPUIndexTritonKernel(TritonKernel): reduction_type: ReductionType, value: Union[CSEVariable, Tuple[CSEVariable, ...]], ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: - assert self.inside_reduction + if not self.inside_reduction: + raise RuntimeError("assert self.inside_reduction") masks = {f"{node.symbol()}_mask" for node in self.sorted_axis} self.filter_masks(masks) masks = sorted(masks) @@ -1152,7 +1164,7 @@ class NPUIndexTritonKernel(TritonKernel): dense_size_str = self.dense_size_str(False) - if len(dense_size_str) > 2 : + if len(dense_size_str) > 2: value = self._map_tuple_or_scalar( lambda v: self.cse.generate( self.compute, f"tl.reshape({v}, {dense_size_str})", dtype=v.dtype, @@ -1165,10 +1177,11 @@ class NPUIndexTritonKernel(TritonKernel): root_op: str def final_reduction(value): - #use_helper = reduction_type in {"any", "max", "min", "prod"} - module = "tl" # use tl + module = "tl" + # use tl + # use tl.max if reduction_type in {"max", "min"}: - return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})" #use tl.max + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})" ) return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})") @@ -1179,7 +1192,8 @@ class NPUIndexTritonKernel(TritonKernel): {result_var} = 
{self.reduction_resize(f'{result_var}_tmp')} """ ) - def get_reduction_axis() : + + def get_reduction_axis(): return list(self.range_tree_nodes.values())[-1] cache_key = (src_dtype, reduction_type, value) @@ -1193,7 +1207,6 @@ class NPUIndexTritonKernel(TritonKernel): result_var.mask_vars = {var for var in masks if var[0] != "r"} cond = " & ".join(masks) - def where_cond(tval, fval): if not cond: return tval @@ -1204,9 +1217,9 @@ class NPUIndexTritonKernel(TritonKernel): default = self._map_tuple_or_scalar(constant_repr, default) def _mask_value(value, default): - return self.cse.generate(self.compute, where_cond(value, default) , dtype=value.dtype) - # fixme masked_value doesn't work dual reduction - if self.numof_reduction_axis() == 1 : + return self.cse.generate(self.compute, where_cond(value, default), dtype=value.dtype) + + if self.numof_reduction_axis() == 1: if isinstance(value, tuple): masked_value = [_mask_value(v, d) for v, d in zip(value, default)] else: @@ -1309,7 +1322,7 @@ class NPUIndexTritonKernel(TritonKernel): if self.persistent_reduction and self.axis2 is None: return argdefs.append(f"XBLOCK: tl.constexpr") - if self.numof_reduction_axis() <= 1 : + if self.numof_reduction_axis() <= 1: argdefs.append(f"XBLOCK_SUB: tl.constexpr") if self.axis2 is not None and not self.persistent_reduction: argdefs.append(f"RBLOCK: tl.constexpr") @@ -1325,26 +1338,26 @@ class NPUIndexTritonKernel(TritonKernel): def need_broadcast(self, index: sympy.Expr): tiling_axis = [False, False] for axis in index.free_symbols: - if axis not in self.range_tree_nodes : + if axis not in self.range_tree_nodes: continue if self.range_tree_nodes[axis].is_tiling_axis1: tiling_axis[0] = True elif self.range_tree_nodes[axis].is_tiling_axis2: tiling_axis[1] = True #implict broadcast - result = (self.numof_tiling_axis() > 1 and not self.persistent_reduction) and (tiling_axis[1] ^ tiling_axis[0]) + result = (self.numof_tiling_axis() > 1 and not self.persistent_reduction) and (tiling_axis[1] ^ tiling_axis[0]) result = result and self.find_axis2_in_indexing() return result, tiling_axis def current_node_has_permute(self): - if not self.current_node : + if not self.current_node: return False for index in self.current_node._body.indexing.values(): - if self.need_permuted(index) : + if self.need_permuted(index): return True return False def need_permuted(self, index: sympy.Expr): - if self.numof_tiling_axis() <= 1 : + if self.numof_tiling_axis() <= 1: return False need_permute = False @@ -1352,18 +1365,18 @@ class NPUIndexTritonKernel(TritonKernel): coefficients_dict = index.as_coefficients_dict() need_permute_axis1 = False need_permute_axis2 = False - for key,value in coefficients_dict.items(): - if not key.free_symbols : + for key, value in coefficients_dict.items(): + if not key.free_symbols: continue key = list(key.free_symbols)[0] - if key not in self.range_tree_nodes : + if key not in self.range_tree_nodes: continue axis = self.range_tree_nodes[key] # normally, axis2 is lowest dimension, except for higher_order_reduction - if (self.inside_reduction and self.is_higher_order_reduction(True)) : + if (self.inside_reduction and self.is_higher_order_reduction(True)): if axis.is_tiling_axis1 and value > sympy.Integer(1): need_permute_axis1 = True - elif axis.is_tiling_axis2 and value > sympy.Integer(1) : + elif axis.is_tiling_axis2 and value > sympy.Integer(1): need_permute_axis2 = True if self.numof_reduction_axis() <= 1 else isinstance(axis.expr, ModularIndexing) tmp_list.append(True if value > sympy.Integer(1) else 
False) @@ -1376,40 +1389,45 @@ class NPUIndexTritonKernel(TritonKernel): def get_reshape_dense_str(self, tiling_axis): # there must be one tiling asis missing - assert tiling_axis[1] or tiling_axis[0] + if not (tiling_axis[1] or tiling_axis[0]): + raise RuntimeError("assert tiling_axis[1] or tiling_axis[0]") + sizes = ["XBLOCK_SUB", "1"] - if not tiling_axis[0] : + if not tiling_axis[0]: sizes = ["1", "RBLOCK"] if self.inside_reduction and self.is_higher_order_reduction(): sizes = reversed(sizes) return f"[{', '.join(sizes)}]" - def get_reshape_str(self, tiling_axis, check_prev_node = True): + def get_reshape_str(self, tiling_axis, check_prev_node=True): # there must be one tiling asis missing - assert tiling_axis[1] or tiling_axis[0] + if not (tiling_axis[1] or tiling_axis[0]): + raise RuntimeError("assert tiling_axis[1] or tiling_axis[0]") + sizes = ["XBLOCK_SUB", "RBLOCK"] - if not tiling_axis[0] : + if not tiling_axis[0]: sizes[0] = "1" - elif not tiling_axis[1] : + elif not tiling_axis[1]: sizes[1] = "1" if self.inside_reduction and self.is_higher_order_reduction(check_prev_node): sizes = reversed(sizes) return f"[{', '.join(sizes)}]" - def get_broadcast_dense_str(self, tiling_axis, check_prev_node = True): + def get_broadcast_dense_str(self, tiling_axis, check_prev_node=True): # there must be one tiling asis missing - assert tiling_axis[1] or tiling_axis[0] + if not (tiling_axis[1] or tiling_axis[0]): + raise RuntimeError("assert tiling_axis[1] or tiling_axis[0]") + sizes = ["XBLOCK_SUB", "RBLOCK"] if self.inside_reduction and self.is_higher_order_reduction(check_prev_node): sizes = reversed(sizes) - #elif not tiling_axis[0] : - # sizes = reversed(sizes) return f"[{', '.join(sizes)}]" #broadcast, permute handling + def load(self, name: str, index: sympy.Expr): var = self.args.input(name) original_index = index @@ -1474,7 +1492,7 @@ class NPUIndexTritonKernel(TritonKernel): line = f"tl.load({var} + ({index_str}), {mask_str}{ep}{other})" dtype = V.graph.get_dtype(name) - if dtype in (torch.bfloat16, ): + if dtype in (torch.bfloat16, ): line += ".to(tl.float32)" if dtype == torch.bool and torch.version.hip is None: line += ".to(tl.int1)" @@ -1494,21 +1512,20 @@ class NPUIndexTritonKernel(TritonKernel): else: load_buffer = self.loads - result_var = self.cse.generate(load_buffer, line, dtype = dtype) - assert isinstance(result_var, TritonCSEVariable) + result_var = self.cse.generate(load_buffer, line, dtype=dtype) + if not (isinstance(result_var, TritonCSEVariable)): + raise RuntimeError("assert isinstance(result_var, TritonCSEVariable)") result_var.mask_vars = indexing.mask_vars # type: ignore[assignment] if append_broadcast and append_broadcast != '[]': line = f"tl.broadcast_to({result_var}, {append_broadcast})" - result_var = self.cse.generate(load_buffer, line, dtype = dtype) + result_var = self.cse.generate(load_buffer, line, dtype=dtype) elif need_broadcast and not indirect_indexing: - #reshape_str = self.get_reshape_str(tiling_axis) - #.reshape({reshape_str}) line = f"{result_var}.broadcast_to({self.get_broadcast_dense_str(tiling_axis)})" - result_var = self.cse.generate(load_buffer, line, dtype = dtype) + result_var = self.cse.generate(load_buffer, line, dtype=dtype) elif is_permuted: line = f"{result_var}.permute(1,0)" - result_var = self.cse.generate(self.loads, line, dtype = dtype) + result_var = self.cse.generate(self.loads, line, dtype=dtype) if advance_block_ptr: load_buffer.writeline(advance_block_ptr) @@ -1851,7 +1868,8 @@ class NPUIndexTritonKernel(TritonKernel): # 
intermediary strings, wrap them in CSE variables with properly initialised bounds. # If there is no FX bound but we know how to compute one we do so - assert not kwargs + if (kwargs): + raise RuntimeError("assert not kwargs") def arg_to_bound(x): if isinstance(x, CSEVariable): @@ -1874,7 +1892,8 @@ class NPUIndexTritonKernel(TritonKernel): ): if isinstance(size, int): size = sympy.Integer(size) - assert isinstance(size, sympy.Expr), size + if not (isinstance(size, sympy.Expr)): + raise RuntimeError("assert isinstance(size, sympy.Expr), size") # Skip CSE since this doesn't return an expression if var.bounds.lower < 0: # type: ignore[operator] @@ -1933,7 +1952,6 @@ class NPUIndexTritonKernel(TritonKernel): store_cache = self.cse.store_cache if name in store_cache: return self.load(name, index) - #return store_cache[name] out = self.load(name, index) # count load that is not in the store_cache, and also not in the # cse cache. @@ -2083,7 +2101,8 @@ class NPUIndexTritonKernel(TritonKernel): return h super().__enter__() - assert self.overrides + if not (self.overrides): + raise RuntimeError("assert self.overrides") parent_handler = self.overrides(V.get_ops_handler()) self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) self.exit_stack.enter_context(V.set_kernel_handler(self)) -- Gitee From fd7b035287465171021ac72d509d5318eb6c2f95 Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 11:32:20 +0800 Subject: [PATCH 331/358] 5 clean code commit --- torch_npu/_inductor/codegen/ir.py | 41 +++++----- torch_npu/_inductor/codegen/schduling.py | 31 +++---- torch_npu/_inductor/codegen/split_tiling.py | 75 +++++++++-------- torch_npu/_inductor/npu_triton_heuristics.py | 85 ++++++++++---------- 4 files changed, 119 insertions(+), 113 deletions(-) diff --git a/torch_npu/_inductor/codegen/ir.py b/torch_npu/_inductor/codegen/ir.py index 6a9baf5b8f..c16fce2549 100644 --- a/torch_npu/_inductor/codegen/ir.py +++ b/torch_npu/_inductor/codegen/ir.py @@ -1,7 +1,8 @@ from typing import List, Tuple, Dict, Any, Optional -import sympy import itertools +import sympy + from torch._inductor.virtualized import V from torch._inductor.ir import (ReductionHint, IRNode, ModularIndexing, FloorDiv) @@ -10,6 +11,7 @@ from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel from ..config import log + # NPU doesn't need to support ReductionHint.OUTER, and persistent reduction def num_splits( device, @@ -81,32 +83,32 @@ def detect_flattened_dims(kernel, index): expr = FloorDiv(var, length) new_vars[var][divisor][0] = (expr, length, parent_axis.length // length) #ModularIndexing not inplace - elif not pair[1] : + elif not pair[1]: expr = ModularIndexing(var, 1, divisor) new_vars[var][divisor][1] = (expr, 1, divisor) - else : + else: pass return new_vars -def rebuild_flattened_dims(indexing) : - def rebuild_flattened_dim(key, index, old_node, flatten_dim) : + +def rebuild_flattened_dims(indexing): + def rebuild_flattened_dim(key, index, old_node, flatten_dim): for _, pair in flatten_dim.items(): new_var_expr = sympy.Integer(0) origin_axis_length = 0 pair_is_valid = True # don't create duplicated axis, e.g. y1:1024, y1 % 1024 is duplicated expr, divisor, length = pair[1] - if not old_node.parent.duplicated_check(divisor, length) : - V.kernel.expr_substituted[expr] = old_node.symbol() - break + if not old_node.parent.duplicated_check(divisor, length): + V.kernel.expr_substituted[expr] = old_node.symbol() + break for axis in pair: expr, divisor, length = axis # 3. 
try to rebuild the axis in kernel new_node = old_node.parent.lookup(divisor, length) - # new_node = old_node - + # 4. substitute div/mod expression in indexing index = index.subs(expr, new_node.symbol()) indexing[key] = index @@ -117,11 +119,11 @@ def rebuild_flattened_dims(indexing) : new_var_expr = new_var_expr + new_node.symbol() V.kernel.expr_substituted[expr] = new_node.symbol() - if var not in V.kernel.range_tree_nodes_substituted : + if var not in V.kernel.range_tree_nodes_substituted: V.kernel.range_tree_nodes_substituted[var] = [] - V.kernel.range_tree_nodes_substituted[var].append((origin_axis_length,new_var_expr)) + V.kernel.range_tree_nodes_substituted[var].append((origin_axis_length, new_var_expr)) - def find_index_in_substitute(index, kernel) : + def find_index_in_substitute(index, kernel): return any([index.find(key) for key in kernel.expr_substituted.keys()]) kernel = V.kernel @@ -167,19 +169,18 @@ def substituted_dims_in_indexing(self, indexing, kernel, range_tree_nodes_substi def generate_body_indexing(body, indices): index = list(itertools.chain.from_iterable(indices)) - assert len(index) == len(body.var_ranges), (index, body.var_ranges) - assert all(v not in body.var_ranges for v in index) + if not (len(index) == len(body.var_ranges)): + raise RuntimeError("assert len(index) == len(body.var_ranges), (index, body.var_ranges)") + if not (all(v not in body.var_ranges for v in index)): + raise RuntimeError("assert all(v not in body.var_ranges for v in index)") + replacements = dict(zip(body.var_ranges.keys(), index)) - indexing_map = dict(zip( index, body.var_ranges.keys())) + indexing_map = dict(zip(index, body.var_ranges.keys())) setattr(body, 'indexing_map', indexing_map) body.indexing = { name: sympy_subs(expr, replacements) for name, expr in body.indexing_exprs.items() } - # body.indexing = { - # name: sympy_subs(expr, V.graph.sizevars.var_to_val) - # for name, expr in body.indexing.items() - # } def transform_dims_in_indexing(self, indices) : diff --git a/torch_npu/_inductor/codegen/schduling.py b/torch_npu/_inductor/codegen/schduling.py index a4d92f197f..cdd0743cdd 100644 --- a/torch_npu/_inductor/codegen/schduling.py +++ b/torch_npu/_inductor/codegen/schduling.py @@ -1,18 +1,19 @@ import itertools import contextlib -import sympy from typing import Union, Iterable from typing import Dict, Sequence, List, Iterable +import sympy + from torch.fx.immutable_collections import immutable_dict from torch._inductor.codegen.triton import (TritonScheduling, log, config) -from torch._inductor.codegen.simd import DisableReduction, EnableReduction,SIMDKernelFeatures, SIMDKernel +from torch._inductor.codegen.simd import DisableReduction, EnableReduction, SIMDKernelFeatures, SIMDKernel from torch._inductor.codegen.simd import schedule_log, scheduler from torch._inductor.codegen.multi_kernel import MultiKernel from torch._inductor.virtualized import (V,) from torch._inductor.codecache import code_hash from torch._dynamo.utils import counters -from torch._inductor.utils import sympy_index_symbol,ModularIndexing,FloorDiv +from torch._inductor.utils import sympy_index_symbol, ModularIndexing, FloorDiv from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel, flatten from .split_tiling import SplitTiling @@ -23,12 +24,13 @@ def flatten_groups(nums): res = [] for i in nums: if isinstance(i, Iterable): - for x in i : + for x in i: res.append(x) else: res.append(i) return res + @classmethod def create_tiling( cls, pw_tiling: Sequence[sympy.Expr], reduction_tiling: 
Sequence[sympy.Expr] @@ -38,7 +40,7 @@ def create_tiling( """ pw_tiling = flatten_groups(pw_tiling) - pw_prefixes = ["w", "v", "t", "z", "y", "x"][-len(pw_tiling) :] + pw_prefixes = ["w", "v", "t", "z", "y", "x"][-len(pw_tiling): ] reduction_tiling = flatten_groups(reduction_tiling) reduction_tiling = [NumelList(reduction_tiling).numels()] reduction_prefixes = ["r"][: len(reduction_tiling)] @@ -50,8 +52,8 @@ def create_tiling( class NPUTritonScheduling(TritonScheduling): - def __init__(self, scheduler): - super().__init__(scheduler) + def __init__(self, input_scheduler): + super().__init__(input_scheduler) self.kernel_type = NPUIndexTritonKernel def create_kernel_choices( @@ -76,7 +78,7 @@ class NPUTritonScheduling(TritonScheduling): kernel_features, [tiling], {"features": kernel_features} ) kernel = kernels[0] - setattr(kernel, "node_schedule", node_schedule ) + setattr(kernel, "node_schedule", node_schedule) self.decide_codegen_dims_in_kernel(node_schedule, kernel) for kernel in kernels: @@ -124,7 +126,9 @@ class NPUTritonScheduling(TritonScheduling): name = node.get_name() if name not in live_outs: continue - assert node.node is not None + if node.node is None: + raise RuntimeError("assert node.node is not None") + origin_node = node.node.get_origin_node() if origin_node is not None: counters["inductor"]["intermediate_hooks"] += 1 @@ -158,20 +162,19 @@ class NPUTritonScheduling(TritonScheduling): with kernel: # 1. transform dims: create new dims to substitute floor_divide and modular expression stack = contextlib.ExitStack() - for i, node in enumerate(node_schedule): + for _, node in enumerate(node_schedule): if node is DisableReduction: stack.enter_context(kernel.disable_reduction()) elif node is EnableReduction: stack.close() - # kernel.set_last_usage(current_reduction_nodes(node_schedule[i:])) else: index_vars = kernel.split_and_set_ranges(node.get_ranges()) node._body.transform_dims_in_indexing(index_vars) # 2. 
go through range_tree_nodes to findout, to find one axis could be substituted by others self.additional_nodes_to_be_subs(kernel, kernel.range_tree_nodes_substituted) # 3.do the substitution on all indexing - for node in node_schedule: - if node in (EnableReduction, DisableReduction): + for node in node_schedule: + if node in (EnableReduction, DisableReduction): continue indexing = node._body.indexing node._body.substituted_dims_in_indexing(indexing, kernel, kernel.range_tree_nodes_substituted) @@ -188,7 +191,7 @@ class NPUTritonScheduling(TritonScheduling): for node in node_schedule: if node in (EnableReduction, DisableReduction): continue - for x,y in zip( node._body.indexing_exprs.values(), node._body.indexing.values()) : + for x, y in zip(node._body.indexing_exprs.values(), node._body.indexing.values()): print(f"index transform:{x}->{y}") def additional_nodes_to_be_subs(self, kernel, node_to_be_substituted): diff --git a/torch_npu/_inductor/codegen/split_tiling.py b/torch_npu/_inductor/codegen/split_tiling.py index 52300c22a8..13220cb13c 100644 --- a/torch_npu/_inductor/codegen/split_tiling.py +++ b/torch_npu/_inductor/codegen/split_tiling.py @@ -13,7 +13,7 @@ from ..config import num_vector_core, log # split and tiling axis selector class SplitTiling: - def __init__(self, kernel: TritonKernel) : + def __init__(self, kernel: TritonKernel): self.kernel = kernel self.indexing = [] @@ -24,7 +24,7 @@ class SplitTiling: # to be lower than floor_dir elif isinstance(x.expr, ModularIndexing): return x.name[0] + "0" + x.name[1:] - else : + else: return x.name kernel.sorted_axis = [x for x in kernel.range_tree_nodes.values()] @@ -43,7 +43,6 @@ class SplitTiling: # Split 原则4: 如果高维规约类融合算子,而且高维尺寸非常大( >= 64KB),低维度尺寸比较小( <= 32B), 可以选择对规约轴切分,然后在核间用atomic # 原语做规约。 # Split 原则5 :根据算子逻辑,优先选择一维发射。 - def select_split_axis(self): def select_longest_dim(can_be_low_dim=True): @@ -68,12 +67,12 @@ class SplitTiling: return longest_dim #longest and not low dims - longest_dim = select_longest_dim( can_be_low_dim = False ) + longest_dim = select_longest_dim( can_be_low_dim=False) # longest and can be low dims - if longest_dim is None or SplitTiling.less_than(longest_dim.length , int(num_vector_core * 0.8)): - longest_dim = select_longest_dim( can_be_low_dim = True ) - if longest_dim is not None : + if longest_dim is None or SplitTiling.less_than(longest_dim.length, int(num_vector_core * 0.8)): + longest_dim = select_longest_dim(can_be_low_dim=True) + if longest_dim is not None: self.kernel.split_axis = longest_dim self.kernel.split_axis.is_split_axis = True elif len(self.kernel.sorted_axis) > 0: @@ -93,12 +92,12 @@ class SplitTiling: # fixme, two tiling axis might be insufficient when there're 3 or more low-dims in indexing def select_tiling_axis(self ): # True :self.kernel.axis2 is Not None and all reduction axis selected, False : other cases - def axis2_selection_done(axis) : - if self.kernel.total_numels <= 1 : + def axis2_selection_done(axis): + if self.kernel.total_numels <= 1: return True - elif self.kernel.axis2 is not None : + elif self.kernel.axis2 is not None: is_reduction = axis.prefix == "r" - if not is_reduction : + if not is_reduction: return True reduction_axis = self.kernel.numof_reduction_axis() return True if reduction_axis <= 1 else len(self.kernel.axis2_list) == reduction_axis @@ -112,80 +111,80 @@ class SplitTiling: self.kernel.persistent_reduction = True biggest = -1 dims = self.kernel.sorted_axis - if self.kernel.split_axis is None : + if self.kernel.split_axis is None: 
self.select_split_axis() - if self.kernel.split_axis is None : + if self.kernel.split_axis is None: return # select tiling_axis2 then tiling_axis1, for reduction, all reduction axis will be selected as tiling_axis2 - for i in range(len(dims)-1, -1, -1): + for i in range(len(dims) - 1, -1, -1): axis = dims[i] numel = axis.length if isinstance(numel, (sympy.Symbol, sympy.Expr)) and not isinstance(numel, sympy.Integer): - numel = numel.subs(V.graph.sizevars.var_to_val) - if axis.is_split_axis : + numel = numel.subs(V.graph.sizevars.var_to_val) + if axis.is_split_axis: dtype = self.kernel.get_axis_dtype(axis) min_aligned_numel = get_aligned_numel(dtype) - _, numel = SplitTiling.decide_nblocks_xblock(numel, len(self.kernel.sorted_axis) <=1, min_aligned_numel) + _, numel = SplitTiling.decide_nblocks_xblock(numel, len(self.kernel.sorted_axis) <= 1, min_aligned_numel) # choose reduction axis or low-dim as axis2 if not axis2_selection_done(axis): - axis.is_tiling_axis2 =True if SplitTiling.great_than(numel,1) else False + axis.is_tiling_axis2 = True if SplitTiling.great_than(numel,1) else False # axis2 must be the reduction axis in case inside_reduction - if axis.prefix == "r" : - axis.is_tiling_axis2 =True - if axis.is_tiling_axis2 and self.kernel.axis2 is None : + if axis.prefix == "r": + axis.is_tiling_axis2 = True + if axis.is_tiling_axis2 and self.kernel.axis2 is None: self.kernel.axis2 = axis.symbol() - if self.kernel.numof_reduction_axis() > 1 : + if self.kernel.numof_reduction_axis() > 1: self.kernel.axis2_list.append(axis.symbol()) self.kernel.axis2 = axis.symbol() if isinstance(axis.expr, ModularIndexing) else self.kernel.axis2 - else : + else: # for _higher_order_reduction, axis1 must be the lowest dimension - if self.kernel.inside_reduction and self.kernel.is_higher_order_reduction() : + if self.kernel.inside_reduction and self.kernel.is_higher_order_reduction(): self.kernel.axis1 = axis.symbol() break # low-dim should be selected as another tiling axis - if self.is_lowest_dimension(axis) : + if self.is_lowest_dimension(axis): self.kernel.axis1 = axis.symbol() break # select the longest in other cases - if numel > biggest : + if numel > biggest: self.kernel.axis1 = axis.symbol() biggest = numel - if self.kernel.axis1 is not None : - axis = self.kernel.range_tree_nodes[self.kernel.axis1 ] + if self.kernel.axis1 is not None: + axis = self.kernel.range_tree_nodes[self.kernel.axis1] axis.is_tiling_axis1 = True log.debug(f"split_tiling numels:{self.kernel.numels} split_axis: {self.kernel.split_axis.symbol()} " f"axis1:{self.kernel.axis1} axis2:{self.kernel.axis2} low_dims:{self.kernel.low_dims}, " - f"indexing: {self.indexing}" ) + f"indexing: {self.indexing}") def should_outer_reduce_me(self, x): - should_outer = self.kernel.is_higher_order_reduction(True) and SplitTiling.great_than(x.length, 32768 ) and x.is_loop - if should_outer : + should_outer = self.kernel.is_higher_order_reduction(True) and SplitTiling.great_than(x.length, 32768) and x.is_loop + if should_outer: self.should_outer_reduce = True self.kernel.split_axis = x self.kernel.split_axis.is_split_axis = True return should_outer @staticmethod - def decide_nblocks_xblock(numel, no_axis2, min_aligned_numel, xblock = None): + def decide_nblocks_xblock(numel, no_axis2, min_aligned_numel, xblock=None): #no_axis2 mean there's only on dims min_xblock = min_aligned_numel if no_axis2 else 1 # need to keep linearity for low_dims - if xblock is None : - xblock = ( numel + num_vector_core -1 ) // num_vector_core if numel > num_vector_core 
else min_xblock + if xblock is None: + xblock = (numel + num_vector_core -1 ) // num_vector_core if numel > num_vector_core else min_xblock xblock = next_power_of_2(xblock) - nblocks = (numel + xblock -1 ) // xblock + nblocks = (numel + xblock -1) // xblock return nblocks, xblock @staticmethod @@ -199,7 +198,7 @@ class SplitTiling: XBLOCK = numel NBLOCKS = 1 ret.append((NBLOCKS,XBLOCK)) - while NBLOCKS<=num_vector_core and XBLOCK>1: + while NBLOCKS <= num_vector_core and XBLOCK > 1: XBLOCK -= 1 NBLOCKS = (numel + XBLOCK - 1) // XBLOCK XBLOCK = (numel + NBLOCKS - 1) // NBLOCKS @@ -212,7 +211,7 @@ class SplitTiling: return x.sorted_order in self.kernel.low_dims def find_lowest_dimension(self): - def construct_low_dim() : + def construct_low_dim(): for index in self.indexing: coefficients_dict = index.as_coefficients_dict() for key, value in coefficients_dict.items(): @@ -246,7 +245,7 @@ class SplitTiling: if key in names and index not in self.indexing: self.indexing.append(index) - if self.kernel.inside_reduction : + if self.kernel.inside_reduction: construct_low_dim() return diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index 10c95a836d..527700d224 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -10,7 +10,7 @@ import json import torch from torch._inductor import config - +from torch._dynamo.utils import dynamo_timed from torch._inductor.runtime.triton_heuristics import ( CachingAutotuner, HeuristicType, @@ -403,7 +403,6 @@ class NPUCachingAutotuner(CachingAutotuner): launcher.n_regs = getattr(binary, "n_regs", None) launcher.n_spills = getattr(binary, "n_spills", None) launcher.shared = binary_shared - # launcher.store_cubin = self.inductor_meta.get("store_cubin", False) launcher.store_cubin = True # store this global variable to avoid the high overhead of reading it when calling run if launcher.store_cubin: @@ -412,48 +411,46 @@ class NPUCachingAutotuner(CachingAutotuner): return binary, launcher - def save_gpu_kernel(self, grid, stream, launcher): - self.save_npu_kernel(grid, stream, launcher) + def save_gpu_kernel(self, input_grid, input_stream, input_launcher): + self.save_npu_kernel(input_grid, input_stream, input_launcher) - def save_npu_kernel(self, grid, stream, launcher): - if callable(grid): - grid_x, grid_y, grid_z = grid(launcher.config.kwargs) + def save_npu_kernel(self, input_grid, input_stream, input_launcher): + if callable(input_grid): + grid_x, grid_y, grid_z = input_grid(input_launcher.config.kwargs) else: - grid_x, grid_y, grid_z = grid + grid_x, grid_y, grid_z = input_grid key = self.inductor_meta.get("kernel_name", None) # unique kernel name - assert key is not None, "kernel_name can not be None" + + if key is None: + raise RuntimeError("assert key is not None, kernel_name can not be None") params = { "mangled_name": ( - launcher.bin.metadata.name - if hasattr(launcher.bin.metadata, "name") - else launcher.bin.metadata["name"] + input_launcher.bin.metadata.name + if hasattr(input_launcher.bin.metadata, "name") + else input_launcher.bin.metadata["name"] ), "grid_x": grid_x, "grid_y": grid_y, "grid_z": grid_z, - # "x_block": launcher.config.kwargs.get("XBLOCK", 1), - # "y_block": launcher.config.kwargs.get("YBLOCK", None), - # "z_block": launcher.config.kwargs.get("ZBLOCK", None), - # "r_block": launcher.config.kwargs.get("RBLOCK", None), "num_warps": ( - launcher.bin.num_warps - if hasattr(launcher.bin, "num_warps") - else 
launcher.bin.metadata.num_warps + input_launcher.bin.num_warps + if hasattr(input_launcher.bin, "num_warps") + else input_launcher.bin.metadata.num_warps ), "shared_mem": ( - launcher.bin.shared - if hasattr(launcher.bin, "shared") - else launcher.bin.metadata.shared + input_launcher.bin.shared + if hasattr(input_launcher.bin, "shared") + else input_launcher.bin.metadata.shared ), - "stream": stream, + "stream": input_stream, # User defined triton kernels will have arbitrary kwarg names - "meta": launcher.config.kwargs, + "meta": input_launcher.config.kwargs, } from torch._inductor.codecache import CudaKernelParamCache bin_type = "npubin" - binary = launcher.bin.asm[bin_type] # npubin type = npubin + binary = input_launcher.bin.asm[bin_type] # npubin type = npubin CudaKernelParamCache.set(key, params, binary, bin_type='cubin') # CudaKernelParam self.cuda_kernel_saved = True @@ -745,11 +742,12 @@ def triton_config_npu_index( tmp = Config(cfg, num_warps=num_warps, num_stages=num_stages) configs.append(tmp) - for cfg in configs : + for cfg in configs: log.debug("generated tiling configs %s", cfg.kwargs) return configs + def pointwise_npu_index( size_hints, triton_meta, @@ -765,13 +763,14 @@ def pointwise_npu_index( ) return cached_autotune( size_hints, - triton_config_with_settings(size_hints, inductor_meta = inductor_meta), + triton_config_with_settings(size_hints, inductor_meta=inductor_meta), triton_meta=triton_meta, inductor_meta=inductor_meta, heuristic_type=HeuristicType.POINTWISE, filename=filename, ) + def reduction_npu_index( size_hints, reduction_hint=False, @@ -783,8 +782,10 @@ def reduction_npu_index( """args to @triton.heuristics()""" inductor_meta = {} if inductor_meta is None else inductor_meta inductor_meta["reduction_hint"] = reduction_hint - assert triton_meta is not None - contiguous_config = triton_config_npu_index(size_hints, inductor_meta = inductor_meta, reduction = True) + if triton_meta is None: + raise RuntimeError("assert triton_meta is not None") + + contiguous_config = triton_config_npu_index(size_hints, inductor_meta=inductor_meta, reduction=True) return cached_autotune( size_hints, [ @@ -796,6 +797,7 @@ def reduction_npu_index( heuristic_type=HeuristicType.REDUCTION, ) + def persistent_reduction_npu_index( size_hints, reduction_hint=False, @@ -805,8 +807,8 @@ def persistent_reduction_npu_index( ): inductor_meta = {} if inductor_meta is None else inductor_meta inductor_meta["reduction_hint"] = reduction_hint - configs = triton_config_npu_index(size_hints, inductor_meta = inductor_meta, reduction=True, - persistent_reduction = True ) + configs = triton_config_npu_index(size_hints, inductor_meta=inductor_meta, reduction=True, + persistent_reduction=True) return cached_autotune( @@ -818,7 +820,7 @@ def persistent_reduction_npu_index( heuristic_type=HeuristicType.PERSISTENT_REDUCTION, ) -# fixme , need to add npu_indexing tiling + def foreach(triton_meta, num_warps, filename=None, inductor_meta=None): """ Compile a triton foreach kernel @@ -832,9 +834,9 @@ def foreach(triton_meta, num_warps, filename=None, inductor_meta=None): filename=filename, ) -from torch._dynamo.utils import dynamo_timed + @dynamo_timed -def benchmark_all_configs(self, *args, grid, **kwargs): +def benchmark_all_configs(self, *args, input_grid, **kwargs): print(f"candidate launcher count = {len(self.launchers)}") tilling_kernel_list = [] @@ -849,7 +851,7 @@ def benchmark_all_configs(self, *args, grid, **kwargs): launcher( *cloned_args, **cloned_kwargs, - grid=grid, + grid=input_grid, 
stream=stream, ) return call_kernel @@ -886,11 +888,12 @@ def benchmark_all_configs(self, *args, grid, **kwargs): data_simplification=False ) - md5_hash = hashlib.md5() + # md5_hash = hashlib.md5() md5_hash = hashlib.md5(datetime.now().strftime('%Y-%m-%d').encode('utf-8')).hexdigest() - - torch_path="./profile_result/"+md5_hash - rep=1 + + + torch_path= "./profile_result/" + md5_hash + rep = 1 with torch_npu.profiler.profile( activities=[ torch_npu.profiler.ProfilerActivity.NPU @@ -904,14 +907,14 @@ def benchmark_all_configs(self, *args, grid, **kwargs): with_modules=False, experimental_config=experimental_config) as prof: stream.synchronize() - for i in range(rep+3): + for _ in range(rep + 3): for fn in tilling_kernel_list: fn() prof.step() stream.synchronize() import pandas as pd - for root, dirs, files in os.walk(torch_path): + for root, _, files in os.walk(torch_path): for file in files: if file != 'kernel_details.csv': continue -- Gitee From f346259d56baefaaa2ca3381b623672158d39766 Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 15:03:49 +0800 Subject: [PATCH 332/358] 5 clean code commit --- torch_npu/_inductor/codegen/__init__.py | 2 - torch_npu/_inductor/codegen/_sizevars.py | 2 +- .../_inductor/codegen/npu_kernel_features.py | 3 +- torch_npu/_inductor/codegen/split_tiling.py | 21 ++-- torch_npu/_inductor/codegen/tile_generator.py | 8 +- torch_npu/_inductor/codegen/triton.py | 107 +++++++++--------- torch_npu/_inductor/lowering.py | 2 +- .../_inductor/npu_fusion_attention_graph.py | 6 +- torch_npu/_inductor/npu_triton_heuristics.py | 22 ++-- 9 files changed, 87 insertions(+), 86 deletions(-) diff --git a/torch_npu/_inductor/codegen/__init__.py b/torch_npu/_inductor/codegen/__init__.py index 3027bc9d28..ee073a8f83 100644 --- a/torch_npu/_inductor/codegen/__init__.py +++ b/torch_npu/_inductor/codegen/__init__.py @@ -18,9 +18,7 @@ from torch_npu._inductor.codegen.schduling import create_tiling from ..config import log as npulog npulog.info("perform npu_indexing patch") -#from ..npu_indexing.graph import run_node #graph -#GraphLowering.run_node = run_node #common #ir diff --git a/torch_npu/_inductor/codegen/_sizevars.py b/torch_npu/_inductor/codegen/_sizevars.py index 37196fcad4..8420655404 100644 --- a/torch_npu/_inductor/codegen/_sizevars.py +++ b/torch_npu/_inductor/codegen/_sizevars.py @@ -4,7 +4,7 @@ from torch._inductor.utils import sympy_subs def simplify(self, expr: Expr): - if isinstance(expr, (tuple,list)): + if isinstance(expr, (tuple, list)): return [sympy.expand(s).xreplace(self.replacements) for s in expr] return sympy.expand(expr).xreplace(self.replacements) diff --git a/torch_npu/_inductor/codegen/npu_kernel_features.py b/torch_npu/_inductor/codegen/npu_kernel_features.py index 0f86e3d81a..57c1211e35 100644 --- a/torch_npu/_inductor/codegen/npu_kernel_features.py +++ b/torch_npu/_inductor/codegen/npu_kernel_features.py @@ -1,8 +1,7 @@ -import sympy import functools - from typing import Tuple, List from typing import Iterable +import sympy import torch from torch._inductor.codegen.simd_kernel_features import SIMDKernelFeatures, NodeScheduleEntry diff --git a/torch_npu/_inductor/codegen/split_tiling.py b/torch_npu/_inductor/codegen/split_tiling.py index 13220cb13c..7606b71316 100644 --- a/torch_npu/_inductor/codegen/split_tiling.py +++ b/torch_npu/_inductor/codegen/split_tiling.py @@ -13,6 +13,7 @@ from ..config import num_vector_core, log # split and tiling axis selector class SplitTiling: + def __init__(self, kernel: TritonKernel): self.kernel = kernel 
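
# The benchmark_all_configs hunk above replays one launcher per candidate
# tiling config under the torch_npu profiler and keeps the config with the
# smallest measured kernel time from kernel_details.csv.  The sketch below
# shows the same "time every candidate, keep the fastest" idea with plain
# wall-clock timing; the function name, the candidate dictionary and the
# warmup/rep counts are assumptions for illustration, not the real launchers.
import time
from typing import Callable, Dict

def pick_fastest(candidates: Dict[str, Callable[[], None]],
                 warmup: int = 3, rep: int = 10) -> str:
    timings = {}
    for name, fn in candidates.items():
        for _ in range(warmup):          # warm up before timing
            fn()
        start = time.perf_counter()
        for _ in range(rep):
            fn()
        timings[name] = (time.perf_counter() - start) / rep
    return min(timings, key=timings.get)

# Usage: pick_fastest({"XBLOCK_SUB=64": kernel_a, "XBLOCK_SUB=128": kernel_b})
# where kernel_a / kernel_b are zero-argument callables that launch a kernel.
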
self.indexing = [] @@ -67,7 +68,7 @@ class SplitTiling: return longest_dim #longest and not low dims - longest_dim = select_longest_dim( can_be_low_dim=False) + longest_dim = select_longest_dim(can_be_low_dim=False) # longest and can be low dims if longest_dim is None or SplitTiling.less_than(longest_dim.length, int(num_vector_core * 0.8)): @@ -89,8 +90,8 @@ class SplitTiling: # Tiling 原则3: 如果tiling轴是低维,在该轴上的切分的尺寸要与SIMD的BlockSize 对齐(32bytes) # Tiling 原则4: 低维轴的tile size 越大,性能越好。这个其实autotune 的原则,放在这里只是为了更好解释用例中使用的数值 。 - # fixme, two tiling axis might be insufficient when there're 3 or more low-dims in indexing - def select_tiling_axis(self ): + def select_tiling_axis(self): + # True :self.kernel.axis2 is Not None and all reduction axis selected, False : other cases def axis2_selection_done(axis): if self.kernel.total_numels <= 1: @@ -180,16 +181,16 @@ class SplitTiling: # need to keep linearity for low_dims if xblock is None: - xblock = (numel + num_vector_core -1 ) // num_vector_core if numel > num_vector_core else min_xblock + xblock = (numel + num_vector_core - 1) // num_vector_core if numel > num_vector_core else min_xblock xblock = next_power_of_2(xblock) - nblocks = (numel + xblock -1) // xblock + nblocks = (numel + xblock - 1) // xblock return nblocks, xblock @staticmethod def get_nblocks_before_launch(numel, xblock): - nblocks = (numel + xblock -1 ) // xblock + nblocks = (numel + xblock -1) // xblock return nblocks, xblock @staticmethod @@ -197,12 +198,12 @@ class SplitTiling: ret = [] XBLOCK = numel NBLOCKS = 1 - ret.append((NBLOCKS,XBLOCK)) + ret.append((NBLOCKS, XBLOCK)) while NBLOCKS <= num_vector_core and XBLOCK > 1: XBLOCK -= 1 NBLOCKS = (numel + XBLOCK - 1) // XBLOCK XBLOCK = (numel + NBLOCKS - 1) // NBLOCKS - ret.append((NBLOCKS,XBLOCK)) + ret.append((NBLOCKS, XBLOCK)) return ret @@ -226,15 +227,13 @@ class SplitTiling: self.kernel.low_dims.add(axis.sorted_order) # all read index should be considered - buf_names = [node.node.name for node in self.kernel.node_schedule if - node not in (EnableReduction, DisableReduction)] + buf_names = [node.node.name for node in self.kernel.node_schedule if node not in (EnableReduction, DisableReduction)] for node in self.kernel.node_schedule: if node in (EnableReduction, DisableReduction): continue names = [] for read in node._body.memory_usage[MemoryUsageType.LOAD]: - #name = node._body.indexing_exprs_name[read] name = read.index_name arg = read.buffer_name read_is_inptr = False if arg[:3] != 'arg' and arg in buf_names else True diff --git a/torch_npu/_inductor/codegen/tile_generator.py b/torch_npu/_inductor/codegen/tile_generator.py index 2acb301087..fc174631e7 100644 --- a/torch_npu/_inductor/codegen/tile_generator.py +++ b/torch_npu/_inductor/codegen/tile_generator.py @@ -77,7 +77,7 @@ class TileGenerator: break @staticmethod - def descend_xblock_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive = True) : + def descend_xblock_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True) : count_bytes = align_numel start_numel = 4096 // count_bytes if aggresive else 1024 // count_bytes @@ -89,12 +89,12 @@ class TileGenerator: xblock = next_power_of_2(xblock) rnumel = next_power_of_2(rnumel) - xblock_sub = xblock if xblock > start_numel else xblock + xblock_sub = xblock if xblock > start_numel else xblock rblock = start_numel if rnumel > start_numel else rnumel rblock_is_biggerr = rblock > xblock_sub - if xblock_sub * rblock <= start_numel : + if xblock_sub * rblock <= start_numel: newcfg = copy.deepcopy(cfg) 
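
# The TileGenerator.descend_* helpers being reformatted here enumerate
# candidate (XBLOCK_SUB, RBLOCK) tile sizes by repeatedly halving until the
# per-iteration element count fits a budget.  A self-contained approximation of
# that enumeration is sketched below; the function name and the 1024-element
# budget are illustrative assumptions only.
from typing import List, Tuple

def candidate_tiles(xblock: int, rblock: int, budget: int = 1024) -> List[Tuple[int, int]]:
    tiles = []
    x, r = xblock, rblock
    while x * r > budget and x > 1:      # shrink the x tile until the budget fits
        x //= 2
    while x >= 1 and r >= 1:
        tiles.append((x, r))             # record one autotune candidate
        if r > 1:
            r //= 2                      # then generate smaller r tiles as well
        else:
            break
    return tiles

# Usage: candidate_tiles(256, 64) yields [(16, 64), (16, 32), ..., (16, 1)].
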
newcfg["XBLOCK_SUB"] = xblock_sub newcfg["RBLOCK"] = rblock @@ -117,7 +117,7 @@ class TileGenerator: configs.append(Config(newcfg, num_warps=1, num_stages=1)) xblock_sub = xblock_sub // 2 - while xblock_sub * rblock > start_numel : + while xblock_sub * rblock > start_numel: newcfg = copy.deepcopy(cfg) newcfg["XBLOCK_SUB"] = xblock_sub newcfg["RBLOCK"] = rblock diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index ffc4ebe96e..52e6252fa1 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -1,11 +1,11 @@ import os -from typing import List, Set, Iterable, Callable,Sequence +from typing import List, Set, Iterable, Callable, Sequence from typing import Dict -import sympy import operator import itertools from enum import Enum import functools + from typing import ( Optional, Union, @@ -13,8 +13,10 @@ from typing import ( Any, cast ) + import re import textwrap +import sympy import torch from torch._inductor.utils import sympy_subs @@ -211,7 +213,7 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry): if self.is_split_axis: offset = f"{self.symbol()}_offset" index = f"{offset} + (loop1 * XBLOCK_SUB) + base1" - else : + else: index = f"(loop1 * XBLOCK_SUB) + base1" if V.kernel.axis2 is not None and direction != AxisDirection.Flat: @@ -219,7 +221,7 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry): return index elif self.is_tiling_axis2: if V.kernel.persistent_reduction: - index = f"tl.arange(0, RBLOCK_{self.symbol()})" if V.kernel.numof_reduction_axis() > 1 else "base2" + index = f"tl.arange(0, RBLOCK_{self.symbol()})" if V.kernel.numof_reduction_axis() > 1 else "base2" elif self.is_split_axis: offset = f"{self.symbol()}_offset" index = f"{offset} + (loop2 * RBLOCK) + base2" @@ -244,7 +246,7 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry): xblock = f"XBLOCK" if self.is_split_axis else f"{self.symbol()}_numel" lines.append(f"loops1 = ({xblock} + XBLOCK_SUB - 1) // XBLOCK_SUB") - elif self.is_tiling_axis2 and len(V.kernel.axis2_list) <= 1: + elif self.is_tiling_axis2 and len(V.kernel.axis2_list) <= 1: lines.append("base2 = tl.arange(0, RBLOCK)") if self.is_split_axis: lines.append(f"loops2 = (XBLOCK + RBLOCK - 1) // RBLOCK") @@ -428,7 +430,6 @@ class NPUIndexTritonKernel(TritonKernel): def initialize_range_tree(self, pid_cache): - #self.numels = flatten(self.numels) self.total_numels = 0 for k, x in self.numels.items(): if not isinstance(x, sympy.Integer): @@ -439,7 +440,7 @@ class NPUIndexTritonKernel(TritonKernel): no_r_dim = not self.inside_reduction or self.numels["r"] == 1 prefixes = "wvtzyxr" - active_prefixes = prefixes[-len(self.numels): ] + active_prefixes = prefixes[-len(self.numels):] #prefix can not be 's', 'u', 'ps' , 'i', 'z', 'q' #prefix can not be 'p' from torch 2.6.0 grid_dims = "xyztvw" @@ -541,7 +542,7 @@ class NPUIndexTritonKernel(TritonKernel): arg.name, V.graph.sizevars.inv_precomputed_replacements[symbol] ) - triton_meta_signature = signature_to_meta( signature, size_dtype=self.index_dtype, argdefs=argdefs) + triton_meta_signature = signature_to_meta(signature, size_dtype=self.index_dtype, argdefs=argdefs) triton_meta = { "signature": triton_meta_signature, @@ -671,7 +672,7 @@ class NPUIndexTritonKernel(TritonKernel): if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, var.name): return True for line in self.stores._lines: - if isinstance(line,DeferredLine): + if isinstance(line, DeferredLine): line = line.line if line.find('tl.store') >= 0 and 
self.is_isolated_symbol(line, var.name): return True @@ -719,10 +720,10 @@ class NPUIndexTritonKernel(TritonKernel): def codegen_range(index): - def loop_body(index, indexing_code, is_last_axis, do_indent=True) : + def loop_body(index, indexing_code, is_last_axis, do_indent=True): if do_indent: self.body.do_indent() - if indexing_code : + if indexing_code: self.body.splice(indexing_code) if is_last_axis: @@ -730,7 +731,7 @@ class NPUIndexTritonKernel(TritonKernel): else: codegen_range(index + 1) - if do_indent : + if do_indent: self.body.do_unindent() if index < 0 or index >= len(self.range_tree_nodes): @@ -745,7 +746,7 @@ class NPUIndexTritonKernel(TritonKernel): if is_tilling_asix1: do_indent = True reduction_1d = self.is_1d_reduction() - if reduction_1d : + if reduction_1d: self.body.splice(self.prefix) self.prefix.clear() @@ -755,14 +756,14 @@ class NPUIndexTritonKernel(TritonKernel): offset = f"{range_node.name}_offset" self.body.writeline(f"for {range_node.name} in range({offset}, " f"min({offset} + XBLOCK, {range_node.name}_numel)):") - else : + else: self.body.writeline(f"for {range_node.name} in range({range_node.name}_numel):") # 1D persistent_reduction or 1d reduction non-first-node elif self.axis2 is None and (self.persistent_reduction or len(self.loads._lines) == 0): do_indent = False if len(self.loads._lines) == 0: indexing_code = None - else : + else: self.body.writeline(f"for loop1 in range(loops1):") @@ -772,7 +773,7 @@ class NPUIndexTritonKernel(TritonKernel): self.prefix.clear() self.body.do_unindent() - loop_body(index, indexing_code, is_last_axis, do_indent = do_indent) + loop_body(index, indexing_code, is_last_axis, do_indent=do_indent) # for 1D reduction, need to add in suffix for persist_reduction or second node of 1d reduction if self.is_1d_reduction() or self.persistent_reduction: @@ -801,7 +802,7 @@ class NPUIndexTritonKernel(TritonKernel): if range_node.is_split_axis: offset = f"{range_node.symbol()}_offset" self.body.writeline(f"for {range_node.symbol()} in range({offset}, min({offset} + XBLOCK, {range_node.name}_numel)):") - else : + else: self.body.writeline(f"for {range_node.symbol()} in range({range_node.name}_numel):") loop_body(index, indexing_code, is_last_axis) @@ -812,13 +813,13 @@ class NPUIndexTritonKernel(TritonKernel): if self.first_node: codegen_range(0) - else : + else: if self.axis2 is None: codegen_range(0) - else : + else: axis2_order = self.range_tree_nodes[self.axis2].sorted_order if self.persistent_reduction and self.numof_reduction_axis() > 1: - axis2_order = axis2_order - self.numof_reduction_axis() +1 + axis2_order = axis2_order - self.numof_reduction_axis() + 1 for _ in range(axis2_order): self.body.do_indent() codegen_range(axis2_order) @@ -892,7 +893,7 @@ class NPUIndexTritonKernel(TritonKernel): # apply xxx_prime var in case dim are permuted def store( - self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode=None + self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None ) -> None: var = self.args.output(name) @@ -948,7 +949,6 @@ class NPUIndexTritonKernel(TritonKernel): return node return None - #fixme, this seems not reliable, need to refactor . 
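
# The get_next / get_prev scheduler-node helpers in this hunk walk the flat
# node schedule while skipping the EnableReduction / DisableReduction markers.
# A generic version of that scan is sketched below; the function name and the
# string sentinels are stand-ins for the real marker classes.
from typing import Any, List, Optional

SENTINELS = ("ENABLE_REDUCTION", "DISABLE_REDUCTION")

def next_real_node(schedule: List[Any], current: Any) -> Optional[Any]:
    try:
        start = schedule.index(current) + 1
    except ValueError:
        return None                      # current entry is not in the schedule
    for entry in schedule[start:]:
        if entry not in SENTINELS:       # skip the reduction markers
            return entry
    return None

# Usage: next_real_node(["a", "ENABLE_REDUCTION", "b"], "a") returns "b".
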
def get_next_scheduler_node(self, node): return self._get_next_scheduler_node(self.node_schedule, node) @@ -974,7 +974,6 @@ class NPUIndexTritonKernel(TritonKernel): # to generate the shape of the accumulator of RBLOCK loop - def dense_size_list(self, is_permute) -> List[str]: sizes = [] @@ -982,7 +981,7 @@ class NPUIndexTritonKernel(TritonKernel): sizes = [] if self.check_all_index_is_1d_for_dual_reduction() else [f"RBLOCK_{axis}" for axis in self.axis2_list] return sizes if self.persistent_reduction and self.axis2 is None: - sizes = ["RBLOCK" ] + sizes = ["RBLOCK"] return sizes # current computedbuffer is reduction cb_is_reduction = self.inside_reduction if not self.current_node else isinstance(self.current_node.node.data, ir.Reduction) @@ -998,7 +997,7 @@ class NPUIndexTritonKernel(TritonKernel): return sizes - def dense_size_str(self, is_permute = False): + def dense_size_str(self, is_permute=False): sizes = self.dense_size_list(is_permute) if self.numof_reduction_axis() > 1: return f"[{'* '.join(sizes)}]" @@ -1008,7 +1007,7 @@ class NPUIndexTritonKernel(TritonKernel): for node in self.sorted_axis: if not(node.is_tiling_axis1 or node.is_tiling_axis2): mask_vars.discard(f"{node.name}_mask") - if len(self.axis2_list) > 1 and not node.is_tiling_axis2: + if len(self.axis2_list) > 1 and not node.is_tiling_axis2: mask_vars.discard(f"{node.name}_mask") @@ -1016,7 +1015,7 @@ class NPUIndexTritonKernel(TritonKernel): def reduction_resize(self, value): ndims = self.triton_tensor_ndim() if ndims == 1: - return f"triton_helpers.promote_to_tensor({value})" + return f"triton_helpers.promote_to_tensor({value})" is_higher_order_reduction = self.is_higher_order_reduction() expand_str = "1," if is_higher_order_reduction else ",1" @@ -1025,28 +1024,30 @@ class NPUIndexTritonKernel(TritonKernel): else: return f"{value}.reshape(XBLOCK_SUB{expand_str})" - def get_axis_direction(self, is_axis1, reversed = False ): + def get_axis_direction(self, is_axis1, is_reversed=False): if self.check_all_index_is_1d_for_dual_reduction(): result = AxisDirection.Flat - elif not self.inside_reduction : + elif not self.inside_reduction: if self.numof_tiling_axis() > 1 : result = AxisDirection.Vertical if is_axis1 else AxisDirection.Horizontal - else : + else: result = AxisDirection.Flat - else : - if is_axis1 : + else: + if is_axis1: result = AxisDirection.Horizontal if V.kernel.is_higher_order_reduction() else AxisDirection.Vertical - else : + else: result = AxisDirection.Vertical if V.kernel.is_higher_order_reduction() else AxisDirection.Horizontal - result = reverse_direction(result) if reversed else result + result = reverse_direction(result) if is_reversed else result return result def is_higher_order_reduction(self, check_prev_node=False): if self.numof_reduction_axis() > 1: return False - assert self.inside_reduction + if not (self.inside_reduction): + raise RuntimeError("assert self.inside_reduction") + if self.inside_high_order_reduction: return self.inside_high_order_reduction @@ -1347,15 +1348,16 @@ class NPUIndexTritonKernel(TritonKernel): #implict broadcast result = (self.numof_tiling_axis() > 1 and not self.persistent_reduction) and (tiling_axis[1] ^ tiling_axis[0]) result = result and self.find_axis2_in_indexing() - return result, tiling_axis + return result, tiling_axis def current_node_has_permute(self): if not self.current_node: return False for index in self.current_node._body.indexing.values(): - if self.need_permuted(index): + if self.need_permuted(index): return True return False + def 
need_permuted(self, index: sympy.Expr): if self.numof_tiling_axis() <= 1: return False @@ -1426,8 +1428,8 @@ class NPUIndexTritonKernel(TritonKernel): return f"[{', '.join(sizes)}]" - #broadcast, permute handling + #broadcast, permute handling def load(self, name: str, index: sympy.Expr): var = self.args.input(name) original_index = index @@ -1540,7 +1542,6 @@ class NPUIndexTritonKernel(TritonKernel): self, index: sympy.Expr, ): - #index = self.simplify_indexing(index) index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) # if simple replacements didn't get rid of floor/ceil, try full subs if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): @@ -1612,7 +1613,9 @@ class NPUIndexTritonKernel(TritonKernel): mask_vars: Set[str] = set() for var in index_vars: - assert isinstance(var, sympy.Symbol) + if not (isinstance(var, sympy.Symbol)): + raise RuntimeError("assert isinstance(var, sympy.Symbol)") + has_rindex = has_rindex or var.name.startswith("r") if override_mask: pass @@ -1624,7 +1627,6 @@ class NPUIndexTritonKernel(TritonKernel): pass else: # var is one of xN, yN or rN - # assert var.name[0] in "xyr", var.name mask_vars.add(f"{var.name}_mask") expand_str = None @@ -1679,7 +1681,7 @@ class NPUIndexTritonKernel(TritonKernel): sv = V.graph.sizevars new_ranges: List[List[sympy.Expr]] = [[] for _ in groups] remaining = [sv.simplify(g) for g in groups] - for i, group in enumerate(remaining) : + for i, group in enumerate(remaining): if isinstance(group, (list, tuple)): remaining[i] = NumelList(group).numels() @@ -1697,16 +1699,17 @@ class NPUIndexTritonKernel(TritonKernel): def make_combined(strides, index_list): def getter(flat_vars): expr = sympy.Integer(0) - for stride, index in zip(strides, index_list) : + for stride, index in zip(strides, index_list): expr = stride * flat_vars[index] + expr return expr return getter def size_hints(group): - if isinstance(group, (list,tuple)) : + if isinstance(group, (list,tuple)): return sv.size_hint(NumelList(group).numels()) return sv.size_hint(group) + def add_multiple_range(size, return_getters): # need to break size in multiple index_list = [] @@ -1719,13 +1722,13 @@ class NPUIndexTritonKernel(TritonKernel): while (group < len(remaining) and remaining[group] > 1) and (remained_size > 1): group_size = remaining[group] # size should be divisible by group_size - if not sv.statically_known_multiple_of( remained_size, group_size ): + if not sv.statically_known_multiple_of(remained_size, group_size): raise CantSplit() index_list.append(add_range(group, group_size)) remained_size = FloorDiv(remained_size, group_size) stride_list.append(remained_size) group = group + 1 - if remained_size != 1 : + if remained_size != 1: raise CantSplit() return_getters.append(make_combined(stride_list, index_list)) @@ -1746,7 +1749,7 @@ class NPUIndexTritonKernel(TritonKernel): # scroll to next group with remaining elements current_group += 1 size_hint = sv.size_hint(size) - if current_group >= len(remaining) : + if current_group >= len(remaining): pdb.set_trace() if size_hint > size_hints(remaining[current_group]): #add multiple ranges (two or more) to the list, as well as the getter funcs @@ -1757,9 +1760,8 @@ class NPUIndexTritonKernel(TritonKernel): ) return_getters_groups.append(return_getters) - assert all( - V.graph.sizevars.size_hint(s) == 1 for s in remaining - ), f"failed to set ranges {remaining} {lengths}" + if not (all(V.graph.sizevars.size_hint(s) == 1 for s in remaining)): + raise RuntimeError("assert 
all(V.graph.sizevars.size_hint(s) == 1 for s in remaining)") return new_ranges, return_getters_groups @@ -1769,7 +1771,6 @@ class NPUIndexTritonKernel(TritonKernel): # just to override load method of CSEProxy, however, CSEProxy is an inner which can not be monkey patched, # we need to override the whole inner class def __enter__(self): - # TODO: hoist this to top level class CSEProxy: self.name = "CSEProxy" vr_analysis = ValueRangeAnalysis() @@ -1853,7 +1854,9 @@ class NPUIndexTritonKernel(TritonKernel): fx_node = V.interpreter.current_node if fx_node.target == name and self.node_to_bounds is not None: - assert isinstance(self.node_to_bounds, dict) + if not (isinstance(self.node_to_bounds, dict)): + raise RuntimeError("assert isinstance(self.node_to_bounds, dict)") + return self.node_to_bounds.get(fx_node, ValueRanges.unknown()) elif config.compute_all_bounds and hasattr(ValueRangeAnalysis, name): # These create lots of inner strings. We would need to compute the bounds at the ops diff --git a/torch_npu/_inductor/lowering.py b/torch_npu/_inductor/lowering.py index 15ac45b9b1..a7580f8cb4 100644 --- a/torch_npu/_inductor/lowering.py +++ b/torch_npu/_inductor/lowering.py @@ -35,7 +35,7 @@ from torch._inductor.lowering import ( _validate_reduction_axis, ) import torch_npu -from torch_npu import npu_dtype_cast +from torch_npu import npu_dtype_cast def make_reduction(reduction_type: str, override_return_dtype=None): diff --git a/torch_npu/_inductor/npu_fusion_attention_graph.py b/torch_npu/_inductor/npu_fusion_attention_graph.py index 9e796252c2..2e421b6a1c 100644 --- a/torch_npu/_inductor/npu_fusion_attention_graph.py +++ b/torch_npu/_inductor/npu_fusion_attention_graph.py @@ -69,9 +69,9 @@ def npu_fa(query, key, value, head_num, input_layout, pse=None, padding_mask=Non torch.empty_like(softmax_max), torch.empty_like(softmax_sum), torch.empty_like(softmax_out), - torch.tensor([0], device='meta',requires_grad=False), - torch.tensor([0], device='meta',requires_grad=False), - torch.tensor([0], device='meta',requires_grad=False)) + torch.tensor([0], device='meta', requires_grad=False), + torch.tensor([0], device='meta', requires_grad=False), + torch.tensor([0], device='meta', requires_grad=False)) @impl(meta_lib, "npu_fa_backward") diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index 527700d224..f1ea1f49af 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -455,7 +455,7 @@ class NPUCachingAutotuner(CachingAutotuner): self.cuda_kernel_saved = True - def bench(self, launcher, *args, grid, with_profiler=False, **kwargs): + def bench(self, launcher, *args, input_grid, with_profiler=False, **kwargs): """Measure the performance of a given launcher""" if not self.custom_kernel and launcher.n_spills > self.inductor_meta.get( @@ -476,7 +476,7 @@ class NPUCachingAutotuner(CachingAutotuner): launcher( *cloned_args, **cloned_kwargs, - grid=grid, + grid=input_grid, stream=stream, ) @@ -495,16 +495,16 @@ class NPUDebugAutotuner(NPUCachingAutotuner): super().__init__(*args, **kwargs) self.cached = None - def run(self, *args, grid, stream): + def run(self, *args, input_grid, stream): possible_names = _find_names(self) kernel_name = f"{max(possible_names, key=len)}" if not re.match(self.regex_filter, kernel_name): return - super().run(*args, grid=grid, stream=stream) + super().run(*args, grid=input_grid, stream=stream) (launcher,) = self.launchers if self.cached is None: - ms = self.bench(launcher, 
*args, grid=grid) + ms = self.bench(launcher, *args, input_grid=input_grid) num_in_out_ptrs = len( [ arg_name @@ -522,7 +522,7 @@ class NPUDebugAutotuner(NPUCachingAutotuner): create_bandwidth_info_str(ms, num_gb, gb_per_s, suffix=f" \t {kernel_name}") ) -#torch-260 + def cached_autotune( size_hints: Optional[List[int]], configs: List[Config], @@ -537,7 +537,9 @@ def cached_autotune( has additional debugging, error handling, and on-disk caching. """ configs = unique_configs(configs) - assert len(configs) == 1 or filename + if not (len(configs) == 1 or filename): + raise RuntimeError("assert len(configs) == 1 or filename") + inductor_meta = {} if inductor_meta is None else inductor_meta disabled = inductor_meta.get("force_disable_caches", False) @@ -655,8 +657,8 @@ def triton_config_npu_index( size_hints, inductor_meta, triton_meta=None, - reduction = False, - persistent_reduction = False, + reduction=False, + persistent_reduction=False, ) -> List[Config]: num_warps = 1 @@ -688,7 +690,7 @@ def triton_config_npu_index( # xblock is a range, don't auto_tune xnumel = split if split_axis_order == axis1_order else size_hints[axis1_order] rblock = 1 - if axis2_order is not None : + if axis2_order is not None: rblock = split if split_axis_order == axis2_order else size_hints[axis2_order] xblock_sub = xnumel -- Gitee From 4a097b9fa1704713c1dd04b5a2d2c8447e077840 Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 15:42:59 +0800 Subject: [PATCH 333/358] 7 clean code commit --- torch_npu/_inductor/__init__.py | 1 + torch_npu/_inductor/codegen/__init__.py | 2 +- torch_npu/_inductor/codegen/ir.py | 2 +- torch_npu/_inductor/codegen/schduling.py | 2 +- torch_npu/_inductor/codegen/split_tiling.py | 7 ++-- torch_npu/_inductor/codegen/tile_generator.py | 2 +- torch_npu/_inductor/codegen/triton.py | 33 ++++++++----------- torch_npu/_inductor/npu_triton_heuristics.py | 5 +-- 8 files changed, 24 insertions(+), 30 deletions(-) diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index d15589aa09..bd98467d4a 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -78,6 +78,7 @@ inductor_lowering.make_reduction = make_reduction _register_npu_inductor_fallbacks() _register_npu_inductor_decompositons() + def _replace_benchmark_all_configs(): from torch._inductor.triton_heuristics import CachingAutotuner from .npu_triton_heuristics import benchmark_all_configs diff --git a/torch_npu/_inductor/codegen/__init__.py b/torch_npu/_inductor/codegen/__init__.py index ee073a8f83..d3557c012a 100644 --- a/torch_npu/_inductor/codegen/__init__.py +++ b/torch_npu/_inductor/codegen/__init__.py @@ -11,7 +11,7 @@ from torch._inductor.codegen.triton import TritonKernel from torch._inductor.codegen.simd import SIMDKernel from torch_npu._inductor.codegen._sizevars import simplify -from torch_npu._inductor.codegen.ir import (num_splits,loopbody__call__,transform_dims_in_indexing, substituted_dims_in_indexing) +from torch_npu._inductor.codegen.ir import (num_splits,loopbody__call__, transform_dims_in_indexing, substituted_dims_in_indexing) from torch_npu._inductor.codegen.triton import is_compatible from torch_npu._inductor.codegen.triton import group_fn, select_index_dtype from torch_npu._inductor.codegen.schduling import create_tiling diff --git a/torch_npu/_inductor/codegen/ir.py b/torch_npu/_inductor/codegen/ir.py index c16fce2549..2cb83e5c1f 100644 --- a/torch_npu/_inductor/codegen/ir.py +++ b/torch_npu/_inductor/codegen/ir.py @@ -183,7 +183,7 @@ def 
generate_body_indexing(body, indices): } -def transform_dims_in_indexing(self, indices) : +def transform_dims_in_indexing(self, indices): if self.indexing is None: generate_body_indexing(self, indices) diff --git a/torch_npu/_inductor/codegen/schduling.py b/torch_npu/_inductor/codegen/schduling.py index cdd0743cdd..ca4b306144 100644 --- a/torch_npu/_inductor/codegen/schduling.py +++ b/torch_npu/_inductor/codegen/schduling.py @@ -40,7 +40,7 @@ def create_tiling( """ pw_tiling = flatten_groups(pw_tiling) - pw_prefixes = ["w", "v", "t", "z", "y", "x"][-len(pw_tiling): ] + pw_prefixes = ["w", "v", "t", "z", "y", "x"][-len(pw_tiling):] reduction_tiling = flatten_groups(reduction_tiling) reduction_tiling = [NumelList(reduction_tiling).numels()] reduction_prefixes = ["r"][: len(reduction_tiling)] diff --git a/torch_npu/_inductor/codegen/split_tiling.py b/torch_npu/_inductor/codegen/split_tiling.py index 7606b71316..7be80830d9 100644 --- a/torch_npu/_inductor/codegen/split_tiling.py +++ b/torch_npu/_inductor/codegen/split_tiling.py @@ -36,7 +36,6 @@ class SplitTiling: self.find_lowest_dimension() self.should_outer_reduce = False - # Split 原则1 :先做维度合并,再切分 。通过维度合并降维降低,split和tiling轴选择策略的复杂性 。 # Split 原则2: 切分的数量要和AIcore的数量对齐(相同或是倍数)。每个核要分配的split的量一致。每个split形状要一致(包括维度和尺寸)。 # Split 原则3: 对于规约类融合算子, 从非规约选择切分轴。对于非规约类融合算子, 从所有轴中选切分轴。 @@ -131,7 +130,7 @@ class SplitTiling: # choose reduction axis or low-dim as axis2 if not axis2_selection_done(axis): - axis.is_tiling_axis2 = True if SplitTiling.great_than(numel,1) else False + axis.is_tiling_axis2 = True if SplitTiling.great_than(numel, 1) else False # axis2 must be the reduction axis in case inside_reduction if axis.prefix == "r": axis.is_tiling_axis2 = True @@ -190,7 +189,7 @@ class SplitTiling: @staticmethod def get_nblocks_before_launch(numel, xblock): - nblocks = (numel + xblock -1) // xblock + nblocks = (numel + xblock - 1) // xblock return nblocks, xblock @staticmethod @@ -276,7 +275,7 @@ class SplitTiling: if isinstance(xnumel, sympy.Integer) and isinstance(ynumel, int): ynumel = sympy.Integer(ynumel) - if isinstance(ynumel, sympy.Integer) and isinstance(xnumel, int): + if isinstance(ynumel, sympy.Integer) and isinstance(xnumel, int): xnumel = sympy.Integer(xnumel) return (xnumel, ynumel) diff --git a/torch_npu/_inductor/codegen/tile_generator.py b/torch_npu/_inductor/codegen/tile_generator.py index fc174631e7..6cca5e4d76 100644 --- a/torch_npu/_inductor/codegen/tile_generator.py +++ b/torch_npu/_inductor/codegen/tile_generator.py @@ -77,7 +77,7 @@ class TileGenerator: break @staticmethod - def descend_xblock_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True) : + def descend_xblock_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True): count_bytes = align_numel start_numel = 4096 // count_bytes if aggresive else 1024 // count_bytes diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index 52e6252fa1..d10f132d43 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -971,8 +971,7 @@ class NPUIndexTritonKernel(TritonKernel): if not all_index_is_1d: break return all_index_is_1d - - + # to generate the shape of the accumulator of RBLOCK loop def dense_size_list(self, is_permute) -> List[str]: @@ -1010,7 +1009,6 @@ class NPUIndexTritonKernel(TritonKernel): if len(self.axis2_list) > 1 and not node.is_tiling_axis2: mask_vars.discard(f"{node.name}_mask") - # and add to shape to value def reduction_resize(self, value): ndims = self.triton_tensor_ndim() 
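
# reduction_resize() above reshapes a reduced value so it keeps a singleton
# dimension (for example (XBLOCK_SUB, 1) when reducing the last axis) and can
# still broadcast against the original 2-D tile.  In plain PyTorch the same
# effect is a keepdim reduction; the 8x16 tile below is an arbitrary example
# shape, not one taken from the patch.
import torch

tile = torch.randn(8, 16)                  # stands in for an (XBLOCK_SUB, RBLOCK) tile
row_sum = tile.sum(dim=1, keepdim=True)    # shape (8, 1), broadcasts across columns
col_sum = tile.sum(dim=0, keepdim=True)    # shape (1, 16), broadcasts across rows
centered = tile - row_sum / tile.shape[1]  # keepdim lets the subtraction broadcast
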
@@ -1029,7 +1027,7 @@ class NPUIndexTritonKernel(TritonKernel): if self.check_all_index_is_1d_for_dual_reduction(): result = AxisDirection.Flat elif not self.inside_reduction: - if self.numof_tiling_axis() > 1 : + if self.numof_tiling_axis() > 1: result = AxisDirection.Vertical if is_axis1 else AxisDirection.Horizontal else: result = AxisDirection.Flat @@ -1056,7 +1054,7 @@ class NPUIndexTritonKernel(TritonKernel): return False reduction = node.node.data - while check_prev_node and reduction is not None and not isinstance(reduction, ir.Reduction): + while check_prev_node and reduction is not None and not isinstance(reduction, ir.Reduction): node = self.get_prev_scheduler_node(node) if node is None: reduction = None @@ -1092,7 +1090,7 @@ class NPUIndexTritonKernel(TritonKernel): if node in (EnableReduction, DisableReduction): continue for key, _ in node._body.indexing_map.items(): - if key in self.range_tree_nodes: + if key in self.range_tree_nodes: dim = self.range_tree_nodes[key] else: dim = self.range_tree_nodes_removed[key] @@ -1182,8 +1180,7 @@ class NPUIndexTritonKernel(TritonKernel): # use tl # use tl.max if reduction_type in {"max", "min"}: - return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})" - ) + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})") return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})") def final_argreduce(buffer, result_var, value, index): @@ -1225,7 +1222,7 @@ class NPUIndexTritonKernel(TritonKernel): masked_value = [_mask_value(v, d) for v, d in zip(value, default)] else: masked_value = _mask_value(value, default) - else : + else: masked_value = value if reduction_type in {"argmax", "argmin", "max", "min"}: @@ -1253,11 +1250,10 @@ class NPUIndexTritonKernel(TritonKernel): result_var = self.cse.generate( self.compute, final_reduction(masked_value), dtype=masked_value.dtype, ) - elif reduction_type == "welford_reduce": - assert False, "welford_reduction is not supported now.." + raise RuntimeError("assert False, welford_reduction and is not supported now..") elif reduction_type == "welford_combine": - assert False, "welford_combine is not supported now.." 
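
# The change at this spot -- like many edits in these "clean code" commits --
# swaps a bare `assert` for an explicit exception, so the check still runs
# under `python -O`, which strips assert statements.  A hedged sketch of the
# general pattern (the function name and the supported set are illustrative):
def check_reduction_supported(reduction_type: str) -> None:
    supported = {"sum", "prod", "max", "min", "any", "argmax", "argmin"}
    # before: assert reduction_type in supported, f"{reduction_type} not supported"
    if reduction_type not in supported:
        raise RuntimeError(f"{reduction_type} is not supported now")
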
+ raise RuntimeError("assert False, welford_combine and is not supported now..") else: result_var = self.cse.generate( self.compute, final_reduction(masked_value), dtype=masked_value.dtype, @@ -1317,6 +1313,7 @@ class NPUIndexTritonKernel(TritonKernel): self.outside_loop_vars.add(result_var) return result_var + #XBLICK:split size, XBLOCK_SUB : tile1 size, RBLOCK:tile2 size def add_autotune_args(self, argdefs): # no tiling in this case @@ -1330,7 +1327,9 @@ class NPUIndexTritonKernel(TritonKernel): def _get_heuristic(self): if self.persistent_reduction: - assert self.inside_reduction + if not (self.inside_reduction): + raise RuntimeError(" assert self.inside_reduction") + return "persistent_reduction_npu_index" elif self.inside_reduction: return "reduction_npu_index" @@ -1427,8 +1426,6 @@ class NPUIndexTritonKernel(TritonKernel): sizes = reversed(sizes) return f"[{', '.join(sizes)}]" - - #broadcast, permute handling def load(self, name: str, index: sympy.Expr): var = self.args.input(name) @@ -1559,7 +1556,6 @@ class NPUIndexTritonKernel(TritonKernel): replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} index = sympy_subs(index, replacements) - #simp_index = self.simplify_indexing(index) simp_index = index # Now that we are done simplifying we can unwrap Identity so that downstream handling @@ -1591,7 +1587,6 @@ class NPUIndexTritonKernel(TritonKernel): index = self.prepare_indexing(index) index_vars = index.free_symbols has_rindex = False - #index = self.simplify_indexing(index) index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) # if simple replacements didn't get rid of floor/ceil, try full subs if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): @@ -1764,9 +1759,7 @@ class NPUIndexTritonKernel(TritonKernel): raise RuntimeError("assert all(V.graph.sizevars.size_hint(s) == 1 for s in remaining)") return new_ranges, return_getters_groups - - - + # torch260 done # just to override load method of CSEProxy, however, CSEProxy is an inner which can not be monkey patched, # we need to override the whole inner class diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index f1ea1f49af..cfdaf3f9ba 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -455,7 +455,8 @@ class NPUCachingAutotuner(CachingAutotuner): self.cuda_kernel_saved = True - def bench(self, launcher, *args, input_grid, with_profiler=False, **kwargs): + # bench method is called by torch, grid can not be modified + def bench(self, launcher, *args, grid, with_profiler=False, **kwargs): """Measure the performance of a given launcher""" if not self.custom_kernel and launcher.n_spills > self.inductor_meta.get( @@ -476,7 +477,7 @@ class NPUCachingAutotuner(CachingAutotuner): launcher( *cloned_args, **cloned_kwargs, - grid=input_grid, + grid=grid, stream=stream, ) -- Gitee From f0781cb84117fc40a8fbf89404f73d19f27606ff Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 15:56:13 +0800 Subject: [PATCH 334/358] 8 clean code commit --- torch_npu/_inductor/codegen/__init__.py | 2 +- torch_npu/_inductor/codegen/triton.py | 7 ++--- torch_npu/_inductor/npu_triton_heuristics.py | 33 ++++++++++---------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/torch_npu/_inductor/codegen/__init__.py b/torch_npu/_inductor/codegen/__init__.py index d3557c012a..e7a6832665 100644 --- a/torch_npu/_inductor/codegen/__init__.py +++ b/torch_npu/_inductor/codegen/__init__.py @@ -11,7 
+11,7 @@ from torch._inductor.codegen.triton import TritonKernel from torch._inductor.codegen.simd import SIMDKernel from torch_npu._inductor.codegen._sizevars import simplify -from torch_npu._inductor.codegen.ir import (num_splits,loopbody__call__, transform_dims_in_indexing, substituted_dims_in_indexing) +from torch_npu._inductor.codegen.ir import (num_splits, loopbody__call__, transform_dims_in_indexing, substituted_dims_in_indexing) from torch_npu._inductor.codegen.triton import is_compatible from torch_npu._inductor.codegen.triton import group_fn, select_index_dtype from torch_npu._inductor.codegen.schduling import create_tiling diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index d10f132d43..8d7363a57c 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -1286,7 +1286,7 @@ class NPUIndexTritonKernel(TritonKernel): ) final_argreduce(self.post_loop_store, result_var, accumulator, accumulator_index) elif is_welford_reduction(reduction_type): - assert False, "welford_reduction is not supported now.." + raise RuntimeError("assert False, welford_reduction and is not supported now..") else: combine_fn = ir.get_reduction_combine_fn(reduction_type, src_dtype) updated = combine_fn(accumulator, value) @@ -1347,7 +1347,7 @@ class NPUIndexTritonKernel(TritonKernel): #implict broadcast result = (self.numof_tiling_axis() > 1 and not self.persistent_reduction) and (tiling_axis[1] ^ tiling_axis[0]) result = result and self.find_axis2_in_indexing() - return result, tiling_axis + return result, tiling_axis def current_node_has_permute(self): if not self.current_node: @@ -1602,7 +1602,6 @@ class NPUIndexTritonKernel(TritonKernel): replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} index = sympy_subs(index, replacements) - #index = self.simplify_indexing(index) index_vars = index.free_symbols has_rindex = False @@ -1701,7 +1700,7 @@ class NPUIndexTritonKernel(TritonKernel): return getter def size_hints(group): - if isinstance(group, (list,tuple)): + if isinstance(group, (list, tuple)): return sv.size_hint(NumelList(group).numels()) return sv.size_hint(group) diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index cfdaf3f9ba..bc686a8c67 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -265,14 +265,14 @@ class NPUCachingAutotuner(CachingAutotuner): scope["function"] = get_first_attr(binary, "function", "cu_function") def get_launch_args_without_kernel_launch_metadata( - grid, + input_grid, grid_0, grid_1, grid_2, stream, function, metadata, - bin, + input_bin, launch_enter_hook, launch_exit_hook, num_warps, @@ -305,14 +305,14 @@ class NPUCachingAutotuner(CachingAutotuner): if binary.launch_enter_hook: def get_launch_args_with_kernel_launch_metadata( - grid, + input_grid, grid_0, grid_1, grid_2, stream, function, metadata, - bin, + input_bin, launch_enter_hook, launch_exit_hook, num_warps, @@ -331,7 +331,7 @@ class NPUCachingAutotuner(CachingAutotuner): stream, function, metadata, - bin.launch_metadata(grid, stream, *args), + input_bin.launch_metadata(input_grid, stream, *args), launch_enter_hook, launch_exit_hook, ) @@ -339,14 +339,14 @@ class NPUCachingAutotuner(CachingAutotuner): else: def get_launch_args_with_kernel_launch_metadata( - grid, + input_grid, grid_0, grid_1, grid_2, stream, function, metadata, - bin, + input_bin, launch_enter_hook, launch_exit_hook, num_warps, @@ -692,7 
+692,7 @@ def triton_config_npu_index( xnumel = split if split_axis_order == axis1_order else size_hints[axis1_order] rblock = 1 if axis2_order is not None: - rblock = split if split_axis_order == axis2_order else size_hints[axis2_order] + rblock = split if split_axis_order == axis2_order else size_hints[axis2_order] xblock_sub = xnumel cfg = {"NBLOCKS": nblocks, "XBLOCK": split, "XBLOCK_SUB": xblock_sub} @@ -702,9 +702,9 @@ def triton_config_npu_index( cfg["is_low_dim"] = is_low_dim cfg["min_aligned_numel"] = min_aligned_numel is_1d_reduction = reduction and axis2_order is None - if persistent_reduction : + if persistent_reduction: numof_reduction_axis = inductor_meta["numof_reduction_axis"] - if numof_reduction_axis > 1 : + if numof_reduction_axis > 1: del cfg["XBLOCK_SUB"] configs.append(Config(cfg, num_warps=1, num_stages=1)) elif axis2_order is None : @@ -712,23 +712,23 @@ def triton_config_npu_index( del cfg["XBLOCK_SUB"] cfg["NBLOCKS"] = 1 configs.append(Config(cfg, num_warps=1, num_stages=1)) - else : - TileGenerator.descend_xblock(rnumel = rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + else: + TileGenerator.descend_xblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) elif is_1d_reduction: cfg["NBLOCKS"] = 1 cfg["XBLOCK"] = split_numel cfg["XBLOCK_SUB"] = split_numel - TileGenerator.descend_xblock(rnumel = rblock, xblock=split_numel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + TileGenerator.descend_xblock(rnumel=rblock, xblock=split_numel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) # both of the two axis are low dims elif axis1_order in low_dims and axis2_order in low_dims : cfg["RBLOCK"] = rblock - TileGenerator.descend_xblock_rblock(rnumel = rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + TileGenerator.descend_xblock_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) elif axis2_order is None and axis1_order is not None: TileGenerator.descend_xblock(rnumel=0, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) # need to maximize xblock_sub elif axis1_order in low_dims: cfg["RBLOCK"] = rblock - TileGenerator.descend_rblock(rnumel = rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) + TileGenerator.descend_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) elif axis2_order in low_dims: cfg["RBLOCK"] = rblock TileGenerator.descend_xblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) @@ -894,8 +894,7 @@ def benchmark_all_configs(self, *args, input_grid, **kwargs): # md5_hash = hashlib.md5() md5_hash = hashlib.md5(datetime.now().strftime('%Y-%m-%d').encode('utf-8')).hexdigest() - - torch_path= "./profile_result/" + md5_hash + torch_path = "./profile_result/" + md5_hash rep = 1 with torch_npu.profiler.profile( activities=[ -- Gitee From 508e63724a1f30d49135e490dba1ca593c2375ae Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 16:10:07 +0800 Subject: [PATCH 335/358] 9 clean code commit --- torch_npu/_inductor/config.py | 4 +++- torch_npu/_inductor/npu_triton_heuristics.py | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/torch_npu/_inductor/config.py b/torch_npu/_inductor/config.py index fe356796b9..9c0e168c78 100644 --- a/torch_npu/_inductor/config.py +++ b/torch_npu/_inductor/config.py @@ -41,4 +41,6 @@ logging.basicConfig( ) log 
= logging.getLogger(__name__) -aggresive_autotune = os.getenv("INDUCTOR_ASCEND_AGGRESSIVE_AUTOTUNE", '0').lower() in ('1', 'true') \ No newline at end of file +aggresive_autotune = os.getenv("INDUCTOR_ASCEND_AGGRESSIVE_AUTOTUNE", '0').lower() in ('1', 'true') + +profile_path = "./profile_result/" \ No newline at end of file diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index bc686a8c67..59f6c501b7 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -707,7 +707,7 @@ def triton_config_npu_index( if numof_reduction_axis > 1: del cfg["XBLOCK_SUB"] configs.append(Config(cfg, num_warps=1, num_stages=1)) - elif axis2_order is None : + elif axis2_order is None: del cfg["XBLOCK"] del cfg["XBLOCK_SUB"] cfg["NBLOCKS"] = 1 @@ -720,7 +720,7 @@ def triton_config_npu_index( cfg["XBLOCK_SUB"] = split_numel TileGenerator.descend_xblock(rnumel=rblock, xblock=split_numel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) # both of the two axis are low dims - elif axis1_order in low_dims and axis2_order in low_dims : + elif axis1_order in low_dims and axis2_order in low_dims: cfg["RBLOCK"] = rblock TileGenerator.descend_xblock_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) elif axis2_order is None and axis1_order is not None: @@ -881,8 +881,7 @@ def benchmark_all_configs(self, *args, input_grid, **kwargs): shutil.rmtree(base_path) import torch_npu - from datetime import datetime - + stream = torch.npu.current_stream() experimental_config = torch_npu.profiler._ExperimentalConfig( aic_metrics=torch_npu.profiler.AiCMetrics.PipeUtilization, @@ -890,11 +889,14 @@ def benchmark_all_configs(self, *args, input_grid, **kwargs): l2_cache=False, data_simplification=False ) - - # md5_hash = hashlib.md5() - md5_hash = hashlib.md5(datetime.now().strftime('%Y-%m-%d').encode('utf-8')).hexdigest() - torch_path = "./profile_result/" + md5_hash + import uuid + random_uuid = uuid.uuid4().hex + md5_hash = hashlib.md5(random_uuid.encode()).hexdigest() + + from torch_npu._inductor.config import profile_path + + torch_path = profile_path + md5_hash rep = 1 with torch_npu.profiler.profile( activities=[ -- Gitee From bf95e31e572c97a76c3f3351ff8c5b5aab5585a2 Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 16:45:10 +0800 Subject: [PATCH 336/358] 10 clean code commit --- torch_npu/_inductor/codegen/triton.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index 8d7363a57c..7172e6098c 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -231,6 +231,8 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry): if direction != AxisDirection.Flat: index += ("[:, None]" if direction == AxisDirection.Vertical else "[None, :]") return index + else: + raise RuntimeError("codegen_index") def codegen_header(self, code): # generate offset index loop @@ -1980,7 +1982,7 @@ class NPUIndexTritonKernel(TritonKernel): if name not in V.graph.removed_buffers: return self.store_reduction(name, index, value) - + raise RuntimeError("store_reduction") @staticmethod def reduction( dtype: torch.dtype, -- Gitee From c570fd16598aa4522db19ac8d2b555ab1f1769fd Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 16:58:00 +0800 Subject: [PATCH 337/358] 11 clean code commit --- torch_npu/_inductor/codegen/triton.py | 1 + 1 file changed, 
1 insertion(+) diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index 7172e6098c..68ffb16a4e 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -1983,6 +1983,7 @@ class NPUIndexTritonKernel(TritonKernel): if name not in V.graph.removed_buffers: return self.store_reduction(name, index, value) raise RuntimeError("store_reduction") + @staticmethod def reduction( dtype: torch.dtype, -- Gitee From 9ade564651ae08bb6bc24a45234646d188fe8f1a Mon Sep 17 00:00:00 2001 From: wl1259 Date: Sat, 19 Apr 2025 17:19:52 +0800 Subject: [PATCH 338/358] 11 remove minganci --- torch_npu/_inductor/npu_triton_heuristics.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index 59f6c501b7..42575d3171 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -301,7 +301,6 @@ class NPUCachingAutotuner(CachingAutotuner): # `bin.launch_metadata` is relatively expensive, and returns None unless a # `launch_enter_hook` is installed. So if we don't have that hook installed, # we want to burn None in to the launch args with zero overhead. - # See https://github.com/pytorch/pytorch/issues/123597 if binary.launch_enter_hook: def get_launch_args_with_kernel_launch_metadata( @@ -322,7 +321,6 @@ class NPUCachingAutotuner(CachingAutotuner): ): """ Construct launch args after CompiledKernel.launch_metadata is added - by https://github.com/openai/triton/pull/3492 . """ return ( grid_0, @@ -356,7 +354,6 @@ class NPUCachingAutotuner(CachingAutotuner): ): """ Construct launch args after CompiledKernel.launch_metadata is added - by https://github.com/openai/triton/pull/3492 . """ return ( grid_0, -- Gitee From 97a8bd4843146a33107ee4cc3e4d97ddaccfcd85 Mon Sep 17 00:00:00 2001 From: wl1259 Date: Tue, 22 Apr 2025 22:21:25 +0800 Subject: [PATCH 339/358] rename file --- .../{dynamo_patch3.py => dynamo_embedding_backward_dispatch.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename torch_npu/_inductor/{dynamo_patch3.py => dynamo_embedding_backward_dispatch.py} (100%) diff --git a/torch_npu/_inductor/dynamo_patch3.py b/torch_npu/_inductor/dynamo_embedding_backward_dispatch.py similarity index 100% rename from torch_npu/_inductor/dynamo_patch3.py rename to torch_npu/_inductor/dynamo_embedding_backward_dispatch.py -- Gitee From 79b1830495325569b95225e4f22447d242c61ac5 Mon Sep 17 00:00:00 2001 From: wl1259 Date: Mon, 28 Apr 2025 20:48:17 +0800 Subject: [PATCH 340/358] fix import error --- torch_npu/_inductor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index bd98467d4a..75a6c81d58 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -19,7 +19,7 @@ from . import config as npu_config #register fx_pass should be put behind of _register_npu_inductor_decompositons from . import codegen from . import npu_fusion_attention_graph -from . import dynamo_patch3 +from . 
import dynamo_embedding_backward_dispatch npulog.info("perform torch_npu._inductor patch") -- Gitee From 2530409cc35b425896b10f8723bc23f68a1213bb Mon Sep 17 00:00:00 2001 From: rmch Date: Wed, 23 Apr 2025 10:19:00 +0800 Subject: [PATCH 341/358] support check accuracy --- torch_npu/_inductor/__init__.py | 11 +- torch_npu/_inductor/codegen/ir_fx.py | 837 ++++++ torch_npu/_inductor/codegen/schduling.py | 136 +- torch_npu/_inductor/codegen/triton.py | 3 +- torch_npu/_inductor/config.py | 14 +- torch_npu/_inductor/lowering_fx.py | 2405 ++++++++++++++++++ torch_npu/_inductor/npu_triton_heuristics.py | 172 +- 7 files changed, 3563 insertions(+), 15 deletions(-) create mode 100644 torch_npu/_inductor/codegen/ir_fx.py create mode 100644 torch_npu/_inductor/lowering_fx.py diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index 75a6c81d58..dd90221072 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -72,9 +72,18 @@ register_interface_for_device("npu", NewNpuInterface) register_interface_for_device("npu:0", NewNpuInterface) device = get_interface_for_device("npu") +inductor_lowering.make_reduction = make_reduction -inductor_lowering.make_reduction = make_reduction +if npu_config.check_accuracy: + from .codegen.ir_fx import _patch_npu_inductor_ir + _patch_npu_inductor_ir() + +if npu_config.check_accuracy: + from .lowering_fx import _register_npu_inductor_fallbacks +else: + from .lowering import _register_npu_inductor_fallbacks + _register_npu_inductor_fallbacks() _register_npu_inductor_decompositons() diff --git a/torch_npu/_inductor/codegen/ir_fx.py b/torch_npu/_inductor/codegen/ir_fx.py new file mode 100644 index 0000000000..4ac670b858 --- /dev/null +++ b/torch_npu/_inductor/codegen/ir_fx.py @@ -0,0 +1,837 @@ +import traceback +from unittest.mock import patch + +import typing + +from typing import ( + Any, + Callable, + List, + Optional, + Union +) +from typing import Optional + +import sympy +from sympy import Expr + +import torch +from torch._inductor import ir +from torch._inductor import config + +from torch._inductor.virtualized import ops, V +from torch.utils._ordered_set import OrderedSet + +from ..lowering_fx import ( + fetch_graphs, + merge_traced_graphs, + node_id, + clone, + create_fake_input, + subtract_graph +) + + +def _patch_loops_get_name(self): + return self.node_name + +def _patch_loops_get_traced_graph(self): + return self.traced_graph + +@classmethod +def _patch_loops_create(cls, *args, **kwargs): + origin_node = kwargs.pop("origin_node", None) + traced_graph = kwargs.pop("traced_graph", None) + node_name = kwargs.pop("node_name", None) + tb = kwargs.pop("traceback", None) + r = cls(*args, **kwargs) + # Need to explicitly set origin_node here to propagate it down. + # todo(chilli): I think it would be better for IRNode to directly set + # origin_node + r._post_init_setattr("origin_node", origin_node) + r._post_init_setattr("traceback", tb or r.traceback) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return ir.TensorBox.create(r) + +def _patch_pointwise_constant_to_device(self, device, traced_graph=None, node_name=None): + """Move this to a given device. 
Requires that all reads are to constants.""" + loader = self.make_loader() + loader = patch.object(ir.ConstantBuffer, "override_device", device)(loader) + + r = ir.Pointwise(device, self.dtype, loader, self.ranges) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + +@classmethod +def _patch_reduction_create( + cls, + device: torch.device, + dst_dtype: torch.dtype, + src_dtype: torch.dtype, + inner_fn: Callable[..., Any], + ranges: ir.Sequence[Expr], + reduction_ranges: ir.Sequence[Expr], + reduction_type: str, + reduction_hint: ir.ReductionHint = ir.ReductionHint.DEFAULT, + input_node: Optional[ir.IRNode] = None, + traced_graph = None, + node_name: str = None +) -> ir.TensorBox: + reduction_numel = V.graph.sizevars.simplify(ir.sympy_product(reduction_ranges)) + + if reduction_numel == 0: + # N.B. This is a hack to generate the literal of the given type + # Ideally, we should be fixing `def constant` in triton.py + # but it breaks due to hardcoded dtypes in other places + def py_cnst(val: object) -> Union[bool, float, int]: + if dst_dtype == torch.bool: + return bool(val) + elif dst_dtype.is_floating_point: + assert isinstance(val, typing.SupportsFloat) + return float(val) + else: + assert isinstance(val, typing.SupportsInt) + return int(val) + + rtypes_to_inits = { + "sum": py_cnst(0), + "xor_sum": py_cnst(0), + "prod": py_cnst(1), + "any": py_cnst(0), + # "all" is desugared to `!any(!val)` + } + + assert ( + reduction_type in rtypes_to_inits.keys() + ), f"{reduction_type} not supported for zero-dimension tensors!" + + def const_fn(index: int) -> ir.OpsValue: + return ops.constant(rtypes_to_inits[reduction_type], dst_dtype) + + return ir.Pointwise.create( + device=device, + dtype=src_dtype, + inner_fn=const_fn, + ranges=list(ranges), + traced_graph=traced_graph, + node_name=node_name + ) + + if reduction_numel == 1: + # this reduction is actually a pointwise op + if reduction_type in ("argmin", "argmax"): + + def fn(index: int) -> ir.OpsValue: + return ops.constant(0, dst_dtype) + + else: + + def fn(index: int) -> ir.OpsValue: + reduction_index = [sympy.S.Zero for _ in reduction_ranges] + return inner_fn(index, reduction_index) + + return ir.Pointwise.create( + device=device, dtype=dst_dtype, inner_fn=fn, ranges=ranges + ) + + if ( + isinstance(reduction_numel, ir.Integer) + and V.graph.sizevars.size_hint(reduction_numel) + < config.unroll_reductions_threshold + and (ir.sympy_product(ranges) != 1 or ir.is_gpu(device.type)) + ): + # NB: This works around https://github.com/pytorch/pytorch/issues/140457 + # since turning reductions into pointwise ops can exacerbate this problem + return ir.Pointwise.create( + device=device, + dtype=dst_dtype, + inner_fn=cls._unroll_reduction_fn( + inner_fn, reduction_ranges, reduction_type, src_dtype + ), + ranges=ranges, + traced_graph=traced_graph, + node_name=node_name + ) + + # triton doesn't support reduce to single element well, so break it up + hint, split = cls.num_splits( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + reduction_numel, + input_node, + ) + # intermediate reduction in split can contain complex indexing, + # and num_splits will fail to correctly set the hint + # reuse the passed hint if available + if reduction_hint == ir.ReductionHint.DEFAULT: + reduction_hint = hint + if split == -1: + assert input_node is not None + new_ranges, new_reduction_ranges = ir.extract_input_node_reduction_ranges( + input_node + ) + assert 
new_ranges is not None + assert new_reduction_ranges is not None + return cls.create_multilayer_existing_ranges( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + new_ranges, + new_reduction_ranges, + reduction_type, + reduction_hint, + ) + elif split > 1: + # triton doesn't support reduce to single element well, so break it up + return cls.create_multilayer( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + split, + reduction_hint, + ) + + r = ir.Reduction( + device=device, + dtype=dst_dtype, + inner_fn=inner_fn, + ranges=ranges, + reduction_ranges=reduction_ranges, + reduction_type=reduction_type, + src_dtype=src_dtype, + reduction_hint=reduction_hint, + ) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + + return ir.TensorBox.create(r) + + +def _patch_baseview_get_traced_graph(self): + if hasattr(self, 'traced_graph') and self.traced_graph is not None: + return self.traced_graph + return self.data.get_traced_graph() + + +def _patch_base_view_get_reads(self): + with patch.object(ir.FlexibleLayout, "allow_indexing", True): + r = ir.extract_read_writes( + self.make_loader(), + self.get_size(), + ).reads + for md in r: + if md.index.has(ir.ModularIndexing): + if md.index.has(ir.FloorDiv): + self.realize() + return r + else: + for m in md.index.find(ir.ModularIndexing): + for arg in m.args: + if arg.has(ir.ModularIndexing): + self.realize() + return r + return r + + +def has_buffer(inp): + if not hasattr(inp, 'data'): + return False + if isinstance(inp.data, ir.Buffer): + return True + return has_buffer(inp.data) + +def get_buffer(inp): + if isinstance(inp.data, ir.Buffer): + return inp.data + return get_buffer(inp.data) + +def _patch_baseview_realize(self): + if hasattr(self, 'traced_graph') and self.traced_graph is not None: + r = self.data.realize() + buffer = get_buffer(self) + if isinstance(buffer, (ir.MultiOutput, ir.InputBuffer, ir.ConcatKernel)): + return r + traced_graph = buffer.data.get_traced_graph() + buf_name = buffer.get_name() + new_traced_graph, placeholder = subtract_graph(self.traced_graph, traced_graph, node_name=buf_name) + if placeholder is not None: + placeholder.name = buf_name + device = buffer.get_device() + dtype = buffer.get_dtype() + size = buffer.get_size() + stride = buffer.get_stride() + fake_input = create_fake_input(size, stride, device, dtype) + placeholder.meta['val'] = fake_input + self._post_init_setattr("traced_graph", new_traced_graph) + return r + else: + return self.data.realize() + +def _patch_baseview_realize_hint(self): + if hasattr(self, 'traced_graph') and self.traced_graph is not None: + r = self.data.realize_hint() + if not has_buffer(self): + return r + buffer = get_buffer(self) + if isinstance(buffer, (ir.MultiOutput, ir.InputBuffer, ir.ConcatKernel)): + return r + traced_graph = buffer.data.get_traced_graph() + buf_name = buffer.get_name() + new_traced_graph, placeholder = subtract_graph(self.traced_graph, traced_graph, node_name=buf_name) + if placeholder is not None: + placeholder.name = buf_name + device = buffer.get_device() + dtype = buffer.get_dtype() + size = buffer.get_size() + stride = buffer.get_stride() + fake_input = create_fake_input(size, stride, device, dtype) + placeholder.meta['val'] = fake_input + self._post_init_setattr("traced_graph", new_traced_graph) + return r + else: + return self.data.realize_hint() + + +def _patch_mark_reuse(self, users): + if isinstance(self.data, ir.StorageBox): + if 
self.data.should_realize_on_reuse(users): + if hasattr(self, 'traced_graph') and self.traced_graph is not None: + r = self.data.realize() + buffer = get_buffer(self) + if isinstance(buffer, (ir.MultiOutput, ir.InputBuffer, ir.ConcatKernel)): + return r + traced_graph = buffer.data.get_traced_graph() + buf_name = buffer.get_name() + new_traced_graph, placeholder = subtract_graph(self.traced_graph, traced_graph, node_name=buf_name) + if placeholder is not None: + placeholder.name = buf_name + device = buffer.get_device() + dtype = buffer.get_dtype() + size = buffer.get_size() + stride = buffer.get_stride() + fake_input = create_fake_input(size, stride, device, dtype) + placeholder.meta['val'] = fake_input + self._post_init_setattr("traced_graph", new_traced_graph) + return r + else: + return self.data.realize() + else: + return self.data.mark_reuse(users) + + +@classmethod +def _patch_expandview_create(cls, x, new_size, traced_graph=None, node_name=None): + new_size = cls._normalize_size(x, new_size) + + if ir.is_storage_and_layout(x): + storage, old_layout = ir.as_storage_and_layout(x) + skip = len(new_size) - len(old_layout.size) + assert skip >= 0 + new_stride = [sympy.Integer(0)] * skip + for stride, size in zip(old_layout.stride, old_layout.size): + new_stride.append( + stride + if not V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(size, 1), size_oblivious=True + ) + else sympy.Integer(0) + ) + new_layout = ir.FixedLayout( + old_layout.device, + old_layout.dtype, + list(new_size), + new_stride, + old_layout.offset, + ) + + r = ir.ReinterpretView(data=storage, layout=new_layout) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + r = ir.ExpandView(data=x, size=new_size) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + + return r + + +@classmethod +def _patch_permuteview_create(cls, x, dims, traced_graph=None, node_name=None): + dims = cls._map_neg_dims(dims) + assert OrderedSet(dims) == OrderedSet(range(len(dims))) + + if ir.is_storage_and_layout(x): + storage, old_layout = ir.as_storage_and_layout(x) + new_layout = ir.FixedLayout( + old_layout.device, + old_layout.dtype, + [old_layout.size[i] for i in dims], + [old_layout.stride[i] for i in dims], + old_layout.offset, + ) + r = ir.ReinterpretView(data=storage, layout=new_layout) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + + r = ir.PermuteView(data=x, dims=dims) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + +@classmethod +def _patch_view_create(cls, x, new_size, traced_graph=None, node_name=None): + assert isinstance(new_size, (tuple, list)) + old_size, new_size = cls.resolve_negative_size(x.get_size(), new_size) + # Skip pointless views + if V.graph.sizevars.statically_known_list_equals(old_size, new_size): + return x + + unbacked_symbols_in_sizes = False + if ( + len(ir.free_unbacked_symbols(old_size)) > 0 + or len(ir.free_unbacked_symbols(new_size)) > 0 + ): + unbacked_symbols_in_sizes = True + + if 0 in new_size: + + def fake_reindex(index): + return tuple([0] * len(old_size)) + + r = cls(x, list(new_size), fake_reindex) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + # TODO: a new class for FixedTransferLayout that output layout is constrained by input layout + elif (ir.is_contiguous_storage_and_layout(x) or 
unbacked_symbols_in_sizes): # and not isinstance(x.data, ir.ReinterpretView): + if unbacked_symbols_in_sizes and (not ir.is_contiguous_storage_and_layout(x)): + # realize x; otherwise, the dynamic_reshape_indexer below will fail + # due to the size_hint's inability to process unbacked SymInts + x = ir.ExternKernel.realize_input(x) + + storage, old_layout = ir.as_contiguous_storage_and_layout(x) + new_layout = ir.FixedLayout( + old_layout.device, + old_layout.dtype, + new_size, + ir.FlexibleLayout.contiguous_strides(new_size), + old_layout.offset, + ) + + r = ir.ReinterpretView(data=storage, layout=new_layout) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + reindex = cls.dynamic_reshape_indexer(old_size, new_size) + + r = cls(data=x, size=list(new_size), reindex=reindex) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + +@classmethod +def _patch_sliceview_create(cls, x, dim, start, end, step=1, clamp=True, traced_graph=None, node_name=None): # TODO: crm, clamp=True + step = sympy.expand(step) + assert isinstance(step, sympy.Expr) or step > 0 + try: + if start == 0 and end >= 2**63 - 1 and step == 1: + return x + except TypeError: + pass + sizevars = V.graph.sizevars + new_size = list(x.get_size()) + + if clamp: + start, end = cls.normalize_start_end(x, dim, start, end) + + new_size[dim] = ir.FloorDiv(end - start + (step - 1), step) + + if ir.is_storage_and_layout(x): + # Fast path + storage, old_layout = ir.as_storage_and_layout(x) + new_stride = list(old_layout.stride) + new_stride[dim] = new_stride[dim] * step + new_layout = ir.FixedLayout( + old_layout.device, + old_layout.dtype, + new_size, + new_stride, + old_layout.offset + old_layout.stride[dim] * start, + ) + r = ir.ReinterpretView(data=storage, layout=new_layout) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + def reindex(index): + assert len(index) == len(new_size), f"wrong ndim {index} {new_size}" + index = list(index) + index[dim] = index[dim] * step + start + return index + + # redirect to a generic view + r = ir.SliceView(data=x, size=new_size, reindex=reindex) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + +def _patch_buffer_get_traced_graph(self): + return self.traced_graph + + +@classmethod +def _patch_concatkernel_create(cls, inputs, dim): + device = inputs[0].get_device() + dtype = inputs[0].get_dtype() + new_size = list(inputs[0].get_size()) + offsets_start = [0] + offsets_end = [new_size[dim]] + assert 0 <= dim < len(new_size) + for i in range(1, len(inputs)): + input_size = inputs[i].get_size() + offsets_start.append(new_size[dim]) + assert len(input_size) == len(new_size) + assert inputs[i].get_dtype() == dtype + assert inputs[i].get_device() == device + for j in range(len(new_size)): + if j == dim: + new_size[j] = new_size[j] + input_size[j] + else: + new_size[j] = V.graph.sizevars.guard_equals( + new_size[j], input_size[j] + ) + offsets_end.append(new_size[dim]) + + output_stride = ir.FlexibleLayout.contiguous_strides(new_size) + # If any of the inputs is in CL format, use CL format for the output + for i in range(len(inputs)): + x = inputs[i] + if ir.is_storage_and_layout(x): + layout = x.get_layout() + if ( + isinstance(layout, ir.FixedLayout) + and layout.is_channels_last_contiguous(layout.size, layout.stride) + ): + # use CL stride for the output + 
output_stride = ir.make_channels_last_strides_for(new_size) + break + + any_input_is_storage_and_layout = any(ir.is_storage_and_layout(x) for x in inputs) + fx_node_args = V.graph.current_node.args[0] + assert isinstance(fx_node_args, list) + # If any of the inputs has meta tensor and the meta tensor is in CL format, use CL format for the output + if any_input_is_storage_and_layout is False and any( + "val" in arg.meta + and ( + arg.meta["val"].is_contiguous(memory_format=torch.channels_last) + or arg.meta["val"].is_contiguous(memory_format=torch.channels_last_3d) + ) + for arg in fx_node_args + ): + output_stride = ir.make_channels_last_strides_for(new_size) + + concat_kernel = ir.ConcatKernel( + name=None, + layout=ir.FixedLayout( + device=device, + dtype=dtype, + size=new_size, + stride=output_stride, + ), + inputs=[], + ) + + kernel = ir.StorageBox(concat_kernel) + op_names = [] + for i in range(len(inputs)): + input_buffer = cls.realize_into( + inputs[i], + ir.SliceView.create( + kernel, dim, offsets_start[i], offsets_end[i], clamp=False + ), + ) + concat_kernel.inputs.append(input_buffer) + + if isinstance(inputs[i].data, ir.BaseView): + input_unwrapped = inputs[i].data.unwrap_view() + else: + input_unwrapped = inputs[i].data + + if ( + input_unwrapped.is_input_buffer() + and ir.is_gpu(inputs[i].get_device().type) + and not ir.is_dynamic(input_buffer) + ): + op_names.append(input_buffer.get_operation_name()) + + if len(op_names) > 1 and V.graph.has_feature(device, ir.BackendFeature.FOREACH): + V.graph.register_operation_list(op_names) + + cat_inputs = [ir.TensorBox(ir.StorageBox(inp)) for inp in concat_kernel.inputs] + input_graphs = fetch_graphs([cat_inputs]) + node_name = f'cat_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, torch.ops.aten.cat, node_name, dim=dim) + + concat_kernel._post_init_setattr("name", V.graph.register_buffer(concat_kernel)) + concat_kernel._post_init_setattr("inputs", cls.unwrap_storage(concat_kernel.inputs)) + concat_kernel._post_init_setattr("traced_graph", new_graph) + concat_kernel._post_init_setattr("node_name", node_name) + + return kernel + +def _patch_concatkernel_get_traced_graph(self): + return self.traced_graph + +@classmethod +def _patch_concatkernel_realize_into(cls, src, dst): + # Attempt to turn this into a ReinterpretView rather than assert. + # This has concessions around layout, as as_storage_and_layout + # can cause us to go from flexible to fixed layout. 
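+    # Sketch of the flow below: wrap dst into a ReinterpretView, unwrap
+    # TensorBox/StorageBox from src, then either alias src's buffer into dst via
+    # NonOwningLayout or fall back to an explicit copy through clone(), which also
+    # records the copy in the traced FX graph used for accuracy checking.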
+ if not isinstance(dst, ir.ReinterpretView): + if ir.is_storage_and_layout(dst): + storage, layout = ir.as_storage_and_layout(dst) + dst = ir.ReinterpretView(data=storage, layout=layout) + assert isinstance(dst, ir.ReinterpretView), dst + if isinstance(src, ir.TensorBox): + # unwrap a TensorBox + return cls.realize_into(src.data, dst) + if isinstance(src, ir.StorageBox): + src.realize() + # ExternKernelAlloc has specific requirements for output layout, should create a copy + assert hasattr(src.data, "layout") + if cls.can_realize_into_without_copy(src): + src.data.layout = ir.NonOwningLayout(dst) + return src.data + pw = clone(src, memory_format=torch.contiguous_format) + return cls.realize_into(pw, dst) + + +def _patch_externkernel_copy_input(x): + traced_graph = x.get_traced_graph() + node_name = x.get_name() + if traced_graph is None: + traced_graph = fetch_graphs([x])[0] + node_name = f'getitem_{next(node_id)}' + + pw = ir.Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=x.make_loader(), + ranges=x.get_size(), + origin_node=x.get_origin_node(), + traceback=x.get_traceback(), + traced_graph=traced_graph, + node_name=node_name + ) + pw.realize() + return pw + + +@classmethod +def _patch_externkernel_convert_to_reinterpret_view(cls, x): + """ + In order to pass this to an extern kernel we need a + ReinterpretView not a View. This allows us to avoid some + unneeded copies. + """ + assert isinstance(x, ir.BaseView) + if isinstance(x, ir.ReinterpretView): + return x + + # NOTE: Don't use extract_read_writes here as it fails when + # make_loader() inlines the computation + x_unwrap_view = x.unwrap_view() + buf = V.graph.get_buffer(x_unwrap_view.get_name()) + assert buf is not None + x_unwrap_view_fx_node = buf.get_origin_node() + # Prefer channels last format according to how the format is set from eager. 
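+    # Once the layout is frozen, the view is re-derived as explicit strides and an
+    # offset, and returned as a ReinterpretView that keeps the original
+    # traced_graph/node_name so the accuracy-check tracing is not lost.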
+ if ( + x_unwrap_view_fx_node is not None + and "val" in x_unwrap_view_fx_node.meta + and isinstance(x_unwrap_view.layout, ir.FlexibleLayout) + and ( + x_unwrap_view_fx_node.meta["val"].is_contiguous( + memory_format=torch.channels_last + ) + or x_unwrap_view_fx_node.meta["val"].is_contiguous( + memory_format=torch.channels_last_3d + ) + ) + ): + x_unwrap_view.freeze_layout_with_same_order( + ir.make_channels_last_strides_for(x_unwrap_view.get_size()) + ) + else: + x_unwrap_view.freeze_layout() + + index_args, var_ranges = ir.dependencies.index_vars_squeeze( + x.get_size(), prefix="r" + ) + range_vars = index_args[0] + index = x.make_indexer()(range_vars) + + index = V.graph.sizevars.simplify_with_ranges(index, var_ranges) + strides = V.graph.sizevars.stride_vars(index, range_vars) + offset = V.graph.sizevars.offset_var(index, range_vars) + expected = ir.sympy_dot(range_vars, strides) + offset + + if index != expected: + ir.log.debug( + "convert_to_reinterpret_view failed: stride=%s offset=%s index=%s", + strides, + offset, + index, + ) + raise NotImplementedError + + r = ir.ReinterpretView( + data=x.data, + layout=ir.FixedLayout( + device=x.get_device(), + dtype=x.get_dtype(), + size=x.get_size(), + stride=strides, + offset=offset, + ), + ) + r._post_init_setattr("traced_graph", x.get_traced_graph()) + r._post_init_setattr("node_name", x.get_name()) + return r + + +@classmethod +def _patch_devicecopy_create(cls, x, device, non_blocking, traced_graph=None, node_name=None): + if ( + not x.is_extern() + and all(r in V.graph.constants for r in x.get_read_names()) + and not config.aot_inductor.use_runtime_constant_folding + ): + return x.constant_to_device(device) + + V.graph.add_device_info(device) + V.graph.add_device_info(x.get_device()) + + ir.developer_warning("DeviceCopy in input program") + constant_args = (non_blocking,) + r = ir.DeviceCopy( + ir.FlexibleLayout( + device=device, + dtype=x.get_dtype(), + size=x.get_size(), + ), + [cls.realize_input(x)], + constant_args, + ) + r._post_init_setattr("traced_graph", traced_graph) + r._post_init_setattr("node_name", node_name) + return r + + +def _patch_devicecopy_get_traced_graph(self): + return self.traced_graph + + +def _patch_multioutput_get_traced_graph(self): + return None + +ir.MultiOutput.get_traced_graph = _patch_multioutput_get_traced_graph + +def _patch_mutablebox_get_name(self): + return self.data.get_name() + +def _patch_mutablebox_get_traced_graph(self): + return self.data.get_traced_graph() + + +@classmethod +def _patch_mutationlayout_realize_into(cls, src, dst, unsafe_alias=False): + dst.realize() + # NOTE: We must realize users of `dst` before we realize `src`, since + # realization order determines scheduling order. Otherwise, src's + # mutation would be scheduled before the existing users of dst! + V.graph.mark_buffer_mutated(dst.get_name()) + + if isinstance(src, ir.TensorBox): + src = src.data + + # We copy the contents of src into dst. In most cases this should + # be fused into a single kernel by the scheduler. + # NOTE: We cannot change src's layout to mutate dst directly as this + # would alias src to dst, which is not correct as further s to + # dst would effect users of src. However if there are no more users of + # dst, we can alias src to dst. 
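+    # NPU-specific addition: the copy below is also recorded in the traced FX graph
+    # as torch.ops.aten.copy, so the generated kernel can later be compared against
+    # eager execution when INDUCTOR_ASCEND_CHECK_ACCURACY is enabled.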
+ src.realize_hint() + + if not unsafe_alias: + + input_graphs = fetch_graphs([dst, src]) + node_name = f'copy__{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, torch.ops.aten.copy, node_name) + + src = ir.Pointwise.create( + device=src.get_device(), + dtype=src.get_dtype(), + inner_fn=src.make_loader(), + ranges=[ + V.graph.sizevars.guard_equals(a, b) + for a, b in zip(src.get_size(), dst.get_size()) + ], + traced_graph=new_graph, + node_name=node_name, + ).data + + src.realize() + assert isinstance(src.data.layout, ir.FlexibleLayout) + src.data.layout = ir.MutationLayoutSHOULDREMOVE(dst) + return src.data + +def _patch_npu_inductor_ir(): + ir.Reduction.create = _patch_reduction_create + ir.BaseView.get_traced_graph = _patch_baseview_get_traced_graph + ir.BaseView.get_reads = _patch_base_view_get_reads + ir.BaseView.realize = _patch_baseview_realize + ir.BaseView.realize_hint = _patch_baseview_realize_hint + ir.BaseView.mark_reuse = _patch_mark_reuse + ir.ExpandView.create = _patch_expandview_create + ir.PermuteView.create = _patch_permuteview_create + ir.View.create = _patch_view_create + ir.SliceView.create = _patch_sliceview_create + ir.Buffer.traced_graph = None + ir.Buffer.get_traced_graph = _patch_buffer_get_traced_graph + ir.ConcatKernel.create = _patch_concatkernel_create + ir.ConcatKernel.get_traced_graph = _patch_concatkernel_get_traced_graph + ir.ConcatKernel.realize_into = _patch_concatkernel_realize_into + ir.ExternKernel.copy_input = _patch_externkernel_copy_input + ir.ExternKernel.convert_to_reinterpret_view = _patch_externkernel_convert_to_reinterpret_view + ir.DeviceCopy.create = _patch_devicecopy_create + ir.DeviceCopy.get_traced_graph = _patch_devicecopy_get_traced_graph + ir.MutableBox.get_name = _patch_mutablebox_get_name + ir.MutableBox.get_traced_graph = _patch_mutablebox_get_traced_graph + ir.Loops.get_name = _patch_loops_get_name + ir.Loops.get_traced_graph = _patch_loops_get_traced_graph + ir.Loops.create = _patch_loops_create + ir.Pointwise.constant_to_device = _patch_pointwise_constant_to_device + ir.MutationLayoutSHOULDREMOVE.realize_into = _patch_mutationlayout_realize_into \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/schduling.py b/torch_npu/_inductor/codegen/schduling.py index ca4b306144..e531f6b8a7 100644 --- a/torch_npu/_inductor/codegen/schduling.py +++ b/torch_npu/_inductor/codegen/schduling.py @@ -20,6 +20,43 @@ from .split_tiling import SplitTiling from .npu_kernel_features import NumelList, NPUKernelFeatures +import os +from typing import List, Union, Any +import collections + +from .triton import NPUIndexTritonKernel +from .. 
import config as npu_config +from torch._inductor.codegen.triton import ( + TritonScheduling, + log, + config, + schedule_log, + get_fused_kernel_name, + get_kernel_category_by_source_code, + Placeholder, + get_kernel_metadata, + get_path, + IndentedBuffer + ) +from torch._inductor.codegen.simd import DisableReduction, EnableReduction +from torch._inductor import scheduler, metrics +from torch._inductor.virtualized import ( + V, +) +from torch._inductor.codecache import code_hash +from torch._dynamo.utils import counters +import itertools, contextlib +from torch._inductor.utils import sympy_index_symbol +import sympy +from .split_tiling import SplitTiling +from ..lowering_fx import ( + create_fx_from_snodes_by_traced_graph, + create_compile_kwargs, + generate_fx_graph_code, + dump_fx_graph_code + ) + + def flatten_groups(nums): res = [] for i in nums: @@ -68,7 +105,7 @@ class NPUTritonScheduling(TritonScheduling): ] # transform indexing before call codegen_node_schedule_with_kernel - def codegen_node_schedule(self, kernel_features: SIMDKernelFeatures): + def codegen_node_schedule(self, kernel_features: SIMDKernelFeatures, nodes): node_schedule = kernel_features.node_schedule tiling = self.select_tiling( node_schedule, kernel_features.numel, kernel_features.reduction_numel @@ -88,7 +125,20 @@ class NPUTritonScheduling(TritonScheduling): for kernel in kernels: with V.set_kernel_handler(kernel): src_code = kernel.codegen_kernel() - kernel_name = self.define_kernel(src_code, node_schedule, kernel) + + V.graph.removed_buffers |= kernel.removed_buffers + V.graph.inplaced_to_remove |= kernel.inplaced_to_remove + + if npu_config.check_accuracy: + if not npu_config.traced_fx_graph_cache: + npu_config.traced_fx_graph_cache = os.path.join(os.getenv("TORCHINDUCTOR_CACHE_DIR"), 'traced_fx_graph_cache') + os.makedirs(npu_config.traced_fx_graph_cache, exist_ok=True) + traced_graph, fx_call_args, fx_args, compile_kwargs = create_fx_from_snodes_by_traced_graph(nodes) + traced_graph_hash = code_hash(traced_graph.print_readable(print_output=False)) + + kernel_name, src_code = self.define_kernel(src_code, node_schedule, kernel, traced_graph_hash \ + if npu_config.check_accuracy else None) + log.debug("Generating kernel code with kernel_name: %s", kernel_name) kernel.kernel_name = kernel_name kernel.code_hash = code_hash(src_code) @@ -107,13 +157,19 @@ class NPUTritonScheduling(TritonScheduling): self.codegen_comment(node_schedule) final_kernel.call_kernel(final_kernel.kernel_name) + if npu_config.check_accuracy: + compile_kwargs |= create_compile_kwargs(final_kernel, fx_call_args, fx_args) + fx_dump_path = os.path.join(npu_config.traced_fx_graph_cache, traced_graph_hash) + os.makedirs(fx_dump_path, exist_ok=True) + fx_code = generate_fx_graph_code(traced_graph.code, src_code, kernel_name, compile_kwargs) + dump_fx_graph_code(fx_code, fx_dump_path, traced_graph_hash) + os.environ[traced_graph_hash] = fx_dump_path + if config.nan_asserts: final_kernel.codegen_nan_check() if config.warn_mix_layout: final_kernel.warn_mix_layout(kernels[0].kernel_name) - V.graph.removed_buffers |= final_kernel.removed_buffers - V.graph.inplaced_to_remove |= final_kernel.inplaced_to_remove if ( V.graph.wrapper_code.supports_intermediate_hooks @@ -138,6 +194,67 @@ class NPUTritonScheduling(TritonScheduling): self.scheduler.free_buffers() + + def define_kernel(self, src_code, node_schedule, kernel, traced_graph_hash: str): + wrapper = V.graph.wrapper_code + if (src_code, traced_graph_hash) in wrapper.src_to_kernel: + kernel_name = 
wrapper.src_to_kernel[(src_code, traced_graph_hash)] + if npu_config.check_accuracy: + src_code = src_code.replace(str(Placeholder.DESCRIPTIVE_NAME), kernel_name) + subs_name = kernel_name if config.triton.unique_kernel_names else "triton_" + src_code = src_code.replace(str(Placeholder.KERNEL_NAME), subs_name) + if traced_graph_hash: + src_code = src_code.replace('TRACED_GRAPH_HASH', traced_graph_hash) + else: + fused_name = ( + get_fused_kernel_name(node_schedule, config.triton.descriptive_names) + if config.triton.descriptive_names + else "" + ) + kernel_category = get_kernel_category_by_source_code(src_code)[:3] + kernel_name = "_".join( + ["triton", kernel_category, fused_name, wrapper.next_kernel_suffix()] + ) + # use the original src_code as the key + wrapper.src_to_kernel[(src_code, traced_graph_hash)] = kernel_name + subs_name = kernel_name if config.triton.unique_kernel_names else "triton_" + + # DESCRIPTIVE_NAME is used for profiling purposes; it shows the full kernel name + # even when unique_kernel_names is turned off. Meanwhile, KERNEL_NAME is sometimes set + # to "triton_" to maximize caching opportunities (when unique_kernel_names = False). + src_code = src_code.replace(str(Placeholder.DESCRIPTIVE_NAME), kernel_name) + src_code = src_code.replace(str(Placeholder.KERNEL_NAME), subs_name) + if traced_graph_hash: + src_code = src_code.replace('TRACED_GRAPH_HASH', traced_graph_hash) + + # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does + # not use BracesBuffer, so we have no good indicator of a C++ buffer atm. + src_code = src_code.replace("#pragma CMT", "#") + + basename, _, kernel_path = get_path(code_hash(src_code.strip()), "py") + + compile_wrapper = IndentedBuffer() + compile_wrapper.writeline(f"async_compile.triton({subs_name!r}, '''") + compile_wrapper.splice(src_code, strip=True) + current_device = V.graph.get_current_device_or_throw() + compile_wrapper.writeline(f"''', device_str='{current_device.type}')") + + metadata_comment = f"# kernel path: {kernel_path}" + origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper) + metadata_comment += "\n" + origins + "\n" + detailed_origins + wrapper.define_kernel( + kernel_name, compile_wrapper.getvalue(), metadata_comment + ) + + # log kernel metadata for offline analysis. + # E.g. one can find all unaligned inner reduction and check if + # padding helps with the perf kernel by kernel. 
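+        # Note: unlike the base TritonScheduling.define_kernel, this override also
+        # returns the substituted src_code, which codegen_node_schedule hashes and
+        # embeds into the dumped FX repro script.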
+ if metrics.is_metric_table_enabled("kernel_metadata"): + metrics.log_kernel_metadata(kernel_name, kernel_path, src_code) + + return kernel_name, src_code + + def codegen_node( self, node: Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode] ): @@ -152,9 +269,10 @@ class NPUTritonScheduling(TritonScheduling): schedule_log.debug("Schedule:\n %s", node_schedule) return self.codegen_node_schedule( - NPUKernelFeatures(node_schedule, numel, rnumel) + NPUKernelFeatures(node_schedule, numel, rnumel), nodes ) + def decide_codegen_dims_in_kernel(self, node_schedule, kernel): def current_reduction_nodes(nodes): return itertools.takewhile(lambda n: n is not DisableReduction, nodes) @@ -194,6 +312,7 @@ class NPUTritonScheduling(TritonScheduling): for x, y in zip(node._body.indexing_exprs.values(), node._body.indexing.values()): print(f"index transform:{x}->{y}") + def additional_nodes_to_be_subs(self, kernel, node_to_be_substituted): for node in kernel.range_tree_nodes.values(): if node.expr != sympy_index_symbol(f"{node.parent.prefix}index") \ @@ -213,9 +332,4 @@ class NPUTritonScheduling(TritonScheduling): node_to_be_substituted[node.symbol()] = [(node.length, new_var_expr)] else: log.warning("sub nodes (expr%s, numel:%d) can not make up parent node(%s:%d)", - new_var_expr, numel, node.symbol(), node.length) - - - - - + new_var_expr, numel, node.symbol(), node.length) \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index 68ffb16a4e..e2fe0cdb5e 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -1132,7 +1132,8 @@ class NPUIndexTritonKernel(TritonKernel): "axis2_order": axis2_order, "low_dims": self.low_dims, "numof_reduction_axis": self.numof_reduction_axis(), - "split_axis_dtype": split_axis_dtype + "split_axis_dtype": split_axis_dtype, + "traced_graph_hash": "TRACED_GRAPH_HASH" } return inductor_meta diff --git a/torch_npu/_inductor/config.py b/torch_npu/_inductor/config.py index 9c0e168c78..6817d3e393 100644 --- a/torch_npu/_inductor/config.py +++ b/torch_npu/_inductor/config.py @@ -1,5 +1,5 @@ import os # noqa: C101 -import sys +import torch import logging from typing import Any, Callable, Dict, Optional, TYPE_CHECKING from triton.runtime.driver import driver @@ -23,6 +23,18 @@ num_vector_core = prop["num_aicore"] # unit byte npu_block = 32 +traced_fx_graph_cache = os.environ.get("INDUCTOR_ASCEND_FX_GRAPH_CACHE", None) +check_accuracy = os.environ.get("INDUCTOR_ASCEND_CHECK_ACCURACY", False) +auto_fallback = os.environ.get("INDUCTOR_ASCEND_AUTO_FALLBACK", True) +fallback_warning = os.environ.get("INDUCTOR_ASCEND_FALLBACK_WARNING", False) + +acc_comp_tol = { + torch.float32: {'rtol': 1.3e-6, 'atol': 1e-5}, + torch.float16: {'rtol': 1e-3, 'atol': 1e-5}, + torch.bfloat16: {'rtol': 1.6e-2, 'atol': 1e-5}, + "default": {'rtol': 1.3e-6, 'atol': 1e-5}, +} + if ("Ascend910B" in target.arch): num_vector_core = num_cube_core * 2 diff --git a/torch_npu/_inductor/lowering_fx.py b/torch_npu/_inductor/lowering_fx.py new file mode 100644 index 0000000000..5de2721d5d --- /dev/null +++ b/torch_npu/_inductor/lowering_fx.py @@ -0,0 +1,2405 @@ +import itertools +import functools + +import os +import textwrap + +import sympy +from sympy.core import Expr, Integer, Symbol +from torch._inductor.ir import Reduction +from torch._inductor.utils import sympy_product +from torch._inductor import ir +from torch._inductor.ir import ExpandView, TensorBox +from torch._inductor.lowering import sum_ +from 
torch._inductor import lowering +from torch._prims_common import ( + is_boolean_dtype, + is_integer_dtype, + get_computation_dtype, +) +from torch._inductor.decomposition import decompositions, pw_cast_for_opmath +import torch._ops + +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Set, + Tuple, + Union, + ) + +from torch._prims_common import ( + canonicalize_dims, + check, + dtype_to_type, + ELEMENTWISE_TYPE_PROMOTION_KIND, + get_computation_dtype, + is_boolean_dtype, + is_float_dtype, + is_integer_dtype, + Number, +) + +from torch.utils._sympy.functions import ( + FloorDiv, + Identity, + ModularIndexing, +) + + +from torch._inductor.ir import ( + ExpandView, + IndexingConstant, + is_triton, + ops_wrapper, + PermuteView, + Pointwise, + Reduction, + SqueezeView, + TensorBox, + IRNode, + validate_ir, + View, +) + +from torch._inductor.utils import ( + decode_device, + sympy_product, + +) + + +from torch.fx.experimental.proxy_tensor import make_fx +from torch._inductor.fx_passes.post_grad import view_to_reshape +from torch._inductor import scheduler +from torch._inductor.utils import ModularIndexing, FloorDiv +import sympy + + +from torch._inductor.virtualized import ops, V + +from torch._inductor import scheduler + +from torch._inductor.ir import Reduction +from torch._inductor.utils import sympy_product +from torch._inductor import ir +from torch._inductor.ir import ExpandView, TensorBox +from torch._inductor import lowering +from torch._prims_common import ( + is_boolean_dtype, + is_integer_dtype, + get_computation_dtype, +) +from torch._inductor.decomposition import decompositions +import torch._ops + +aten = torch.ops.aten +tr_c10d = torch.ops.tr_c10d +prims = torch.ops.prims +npu = torch.ops.npu + + +def _init_set(input_list, output_set): + for fn in input_list: + output_set.add(fn) + if isinstance(fn, torch._ops.OpOverloadPacket): + for overload in fn.overloads(): + other_fn = getattr(fn, overload) + output_set.add(other_fn) + + +GENERATE_LIST = [ + aten.mul, + aten.add, + aten.sub, + aten.div, + aten.exp, + aten.maximum, + aten.sum, + aten.select, + aten.unsqueeze, + aten.repeat, + #aten.clone, + aten.reshape, + aten.where, + aten.lt, + aten.minimum, + aten.gt, + aten.le, + aten.ceil, + aten.floor, + aten.rsqrt, + aten.abs, + aten.log, + aten.bitwise_xor, + aten.amax, + # backward + prims.convert_element_type, + aten.min, + aten.max, + aten.erf, + aten.argmax, + aten.argmin, + aten.clamp_min, + aten.slice, + aten.neg, + aten.cat, + aten.arange, + aten.expand, + aten.eq, + aten.where, + aten.scalar_tensor, + aten.ge, + aten.permute, + aten.sqrt, + aten.relu, + aten.clamp, + aten.clamp_max, + aten.mean, + # npu.npu_dtype_cast + npu.npu_dtype_cast, + aten.select_scatter, + aten.slice_scatter, + prims.broadcast_in_dim, + prims.maximum, + aten.ne, + aten.sigmoid, + aten.sign, + aten.logical_and, + aten.logical_or, + aten.logical_not, + aten.pow, + aten.gelu, + aten.tanh, + aten.isnan, + aten.bitwise_and, + aten.squeeze, + aten.copy, + aten.reciprocal +] + +GENERATE_LIST2 = [ + "foreach" +] + +FALLBACK_LIST = [] + +# 先删除从lowering已经注册的op,再更新,不然会lowering的时候找到在torch注册的op +LOWERING_OVERLOAD_OP = [ + aten.cumsum, + aten.mean, + # aten.max, + # aten.min, + # aten.mul, + aten.var_mean, + aten.var, + + aten.embedding, + aten.split, + aten.split_with_sizes, + aten.nll_loss_forward, + aten.gather, + aten.cat, + aten.clone +] + +LOWERING_OVERLOAD_OP = list(set(GENERATE_LIST) | set(LOWERING_OVERLOAD_OP)) + + +fn_to_aten_fn = {} +node_id = itertools.count(0) + +def 
register_fn_to_aten_fn(fn, aten_fn=None): + if fn not in fn_to_aten_fn: + fn_to_aten_fn[fn] = aten_fn + return fn + +def register_to_aten(aten_fn=None): + def decorator(fn): + if fn not in fn_to_aten_fn: + fn_to_aten_fn[fn] = aten_fn + return fn + return decorator + +reduction_type_to_aten_fn = { + "sum": aten.sum, + "prod": aten.prod, + "xor_sum": prims.xor_sum, + "any": aten.any, + "max": aten.amax, + "min": aten.amin, + "argmax": aten.argmax, + "argmin": aten.argmin +} + +operator_to_string = { + '+': 'a', + '-': 'sub', + '*': 'm', + '/': 'd', + '(': 'l', + ')': 'r', + '.': 'p', +} + +string_to_operator = {v: k for k, v in operator_to_string.items()} + +def map_operators_to_strings(expr_str: str): + expr_str = expr_str.replace(' ', '') + for op, string in operator_to_string.items(): + expr_str = expr_str.replace(op, string) + return '_' + expr_str + +def map_strings_to_operators(expr_str: str): + for op, string in string_to_operator.items(): + expr_str = expr_str.replace(op, string) + return expr_str[1:] + + +class TracedGraph: + def __init__(self): + self.graph = torch.fx.Graph() + self.last_node: Optional[torch.fx.Node] = None + self.sym_nodes: Dict[str, torch.fx.Node] = {} + + def __str__(self): + return str(self.graph) + + def get_placeholder_names(self): + placeholder_names = set() + for node in self.graph.nodes: + if node.op == 'placeholder' and node.name not in self.sym_nodes: + placeholder_names.add(node.name) + return placeholder_names + + __repr__ = __str__ + + + +def create_fake_input(size, stride, device, dtype): + size = [V.graph.sizevars.shape_env.create_symintnode(s, hint=None) \ + if isinstance(s, Expr) and not isinstance(s, Integer) else s for s in size] + stride = [V.graph.sizevars.shape_env.create_symintnode(s, hint=None) \ + if isinstance(s, Expr) and not isinstance(s, Integer) else s for s in stride] + with V.graph.fake_mode: + fake_input = torch.empty_strided(size, stride, device=device, dtype=dtype) + return fake_input + + +def create_sym_inputs(traced_graph: TracedGraph, size: List[Expr]): + for s in size: + if isinstance(s, (List, Tuple)): + create_sym_inputs(traced_graph, s) + continue + if isinstance(s, Expr) and not isinstance(s, Integer): + s_name = str(s) + if not isinstance(s, Symbol): + s_name = map_operators_to_strings(s_name) + if s_name in traced_graph.sym_nodes: + continue + new_node = traced_graph.graph.placeholder(s_name) + new_node.meta['val'] = V.graph.sizevars.shape_env.create_symintnode(s, hint=None) + traced_graph.sym_nodes.update({s_name: new_node}) + + +def process_ir_constant(inp: ExpandView) -> Union[TracedGraph, int, float]: + skip = False + if isinstance(inp.data, IndexingConstant): + dtype = inp.data.dtype + inp = inp.data.index + # convert to original dtype. 
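+        # Floating-point IndexingConstants become plain Python floats; symbolic
+        # (SymPy) indices are wrapped in a one-node TracedGraph so the SymInt is
+        # exposed as a placeholder when graphs are merged later.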
+ if dtype in [torch.float32, torch.float16, torch.bfloat16]: + # sympy inputs + if isinstance(inp, Expr) and not isinstance(inp, sympy.core.numbers.Number): + traced_graph = TracedGraph() + create_sym_inputs(traced_graph, [inp]) + s_name = str(inp) + if not isinstance(inp, Symbol): + s_name = map_operators_to_strings(str(inp)) + traced_graph.last_node = traced_graph.sym_nodes[s_name] + inp = traced_graph + else: + inp = float(inp) + elif isinstance(inp.data, ir.Constant): + dtype = inp.data.dtype + inp = inp.data.value + else: + skip = True + return inp, skip + + +def fetch_graphs(inputs: Optional[List[TensorBox]]): + if isinstance(inputs, (TensorBox, ir.StorageBox, ir.View, sympy.Symbol, ir.Constant)): + inputs = [inputs] + input_graphs = [] + for inp in inputs: + if isinstance(inp, List): + input_graphs.append(fetch_graphs(inp)) + continue + if not isinstance(inp, (TensorBox, ir.StorageBox, ir.View, ir.ReinterpretView, ir.PermuteView, ir.SliceView, ir.ExpandView)): + input_graphs.append(inp) + continue + if isinstance(inp, ExpandView): + inp, skip = process_ir_constant(inp) + if not skip: + input_graphs.append(inp) + continue + name = inp.get_name() + traced_graph = inp.get_traced_graph() + if traced_graph is not None: + input_graphs.append(traced_graph) + continue + traced_graph = TracedGraph() + device = inp.get_device() + dtype = inp.get_dtype() + size = inp.get_size() + stride = inp.get_stride() + new_node = traced_graph.graph.placeholder(name) + fake_input = create_fake_input(size, stride, device, dtype) + new_node.meta['val'] = fake_input + traced_graph.last_node = new_node + input_graphs.append(traced_graph) + return input_graphs + + +def merge_traced_graphs(input_graphs: List[TracedGraph], origin_fn, node_name, **kwargs): + new_graph = TracedGraph() + exist_nodes: Dict[str, torch.fx.Node] = {} + def merge_graph(input_graphs: List[TracedGraph]): + for input_graph in input_graphs: + if isinstance(input_graph, List): + merge_graph(input_graph) + continue + if not isinstance(input_graph, TracedGraph): + continue + for node in input_graph.graph.nodes: + if node.name in exist_nodes: + continue + new_node = new_graph.graph.node_copy(node, lambda n: exist_nodes[n.name]) + exist_nodes[node.name] = new_node + if node.name in input_graph.sym_nodes: + new_graph.sym_nodes.update({node.name: new_node}) + + def parse_args(input_graphs, exist_nodes): + args = [] + for input_graph in input_graphs: + if isinstance(input_graph, TracedGraph): + args.append(exist_nodes[input_graph.last_node.name]) + elif isinstance(input_graph, (List, Tuple)): + args.append(parse_args(input_graph, exist_nodes)) + else: + if isinstance(input_graph, Expr) and not isinstance(input_graph, Integer): + if not isinstance(input_graph, Symbol): + input_graph = map_operators_to_strings(str(input_graph)) + args.append(new_graph.sym_nodes[str(input_graph)]) + else: + args.append(input_graph) + return args + + num_args = len(input_graphs) + + for k, v in kwargs.items(): + if isinstance(v, Expr) and not isinstance(v, Integer): + traced_graph = TracedGraph() + create_sym_inputs(traced_graph, [v]) + s_name = str(v) + if not isinstance(v, Symbol): + s_name = map_operators_to_strings(str(v)) + traced_graph.last_node = traced_graph.sym_nodes[s_name] + kwargs[k] = traced_graph.sym_nodes[s_name] + input_graphs.append(traced_graph) + merge_graph(input_graphs) + input_graphs = input_graphs[:num_args] + # if inputs do not have any valid graphs, like full/iota + create_sym_inputs(new_graph, input_graphs) + args = parse_args(input_graphs, 
exist_nodes) + with new_graph.graph.inserting_after(new_graph.last_node): + new_node = new_graph.graph.call_function(origin_fn, args=tuple(args), kwargs=kwargs) + new_node.name = node_name + new_graph.last_node = new_node + return new_graph + +def merge_fx_graphs(traced_graphs: List[TracedGraph]): + new_graph = TracedGraph() + exist_nodes: Dict[str, torch.fx.Node] = {} + last_nodes = [] + def merge_graph(input_graphs: List[TracedGraph]): + for input_graph in input_graphs: + if isinstance(input_graph, List): + merge_graph(input_graph) + continue + if not isinstance(input_graph, TracedGraph): + continue + for node in input_graph.graph.nodes: + if node.name in exist_nodes: + continue + new_node = new_graph.graph.node_copy(node, lambda n: exist_nodes[n.name]) + exist_nodes[node.name] = new_node + last_nodes.append(exist_nodes[input_graph.last_node.name]) + merge_graph(traced_graphs) + new_graph.last_node = last_nodes + return new_graph + +def subtract_graph(graph1: TracedGraph, graph2: TracedGraph, node_name=None) -> Tuple[TracedGraph, torch.fx.Node]: + new_graph = TracedGraph() + last_node2 = graph2.last_node + graph1_node_names = {node.name for node in graph1.graph.nodes} + graph2_node_names = {node.name for node in graph2.graph.nodes} + placeholder = None + exist_nodes: Dict[str, torch.fx.Node] = {} + if node_name not in graph1_node_names: + placeholder = new_graph.graph.placeholder(last_node2.name if node_name is None else node_name) + exist_nodes[last_node2.name] = placeholder + for node in graph1.graph.nodes: + if node.name in graph2_node_names and node.name not in graph1.sym_nodes: + continue + new_node = new_graph.graph.node_copy(node, lambda n: exist_nodes[n.name]) + exist_nodes[node.name] = new_node + new_graph.last_node = exist_nodes[graph1.last_node.name] + new_graph.sym_nodes = graph1.sym_nodes + return new_graph, placeholder + + +def get_last_node(gm: torch.fx.GraphModule): + last_node = None + for node in gm.graph.nodes: + last_node = node + return last_node + +def create_fx_from_snodes_by_traced_graph(snodes: List[scheduler.SchedulerNode]): + fx_call_inputs = [] + for snode in snodes: + snode.node.data.traced_graph.last_node.name = snode.node.get_name() + if len(snodes) == 1: + traced_graph = snodes[0].node.data.traced_graph + else: + traced_graph = merge_fx_graphs([snode.node.data.traced_graph for snode in snodes]) + fx_inputs = [] + for node in traced_graph.graph.nodes: + if node.op == 'placeholder': + fx_call_inputs.append(node.target) + fx_inputs.append(node.meta['val']) + non_contiguous_indices = {} + non_contiguous_indices["inputs"] = [i for i, inp in enumerate(fx_inputs) if torch.is_tensor(inp) and not inp.is_contiguous()] + num_inputs = len(fx_call_inputs) + fx_call_outputs = [] + for snode in snodes: + if snode.has_aliasing_or_mutation(): + for buf in snode.get_outputs(): + if len(buf.get_mutations()): + fx_call_outputs.extend(buf.get_mutations()) + elif len(buf.get_aliases()): + fx_call_outputs.append(buf.get_name()) + elif snode.node.get_name() not in (V.graph.removed_buffers | V.graph.inplaced_to_remove): + fx_call_outputs.append(snode.node.get_name()) + num_outputs = len(fx_call_outputs) + outputs = traced_graph.last_node if isinstance(traced_graph.last_node, List) \ + else [traced_graph.last_node] + outputs = [output for output in outputs if output.name not in (V.graph.removed_buffers | V.graph.inplaced_to_remove)] + fx_call_args = fx_call_inputs + fx_call_outputs + traced_graph.graph.output(tuple(outputs)) + traced_graph.graph.lint() + orig_module = 
torch.nn.Module() + gm = torch.fx.GraphModule(orig_module, traced_graph.graph) + gm.recompile() + def runnable_gm(*args): + return torch.fx.Interpreter(gm).run(*args) + with V.graph.fake_mode: + gm = make_fx(runnable_gm)(*fx_inputs) + view_to_reshape(gm) + last_node = get_last_node(gm) + fx_output_nodes = last_node.args[0] + fx_outputs = [node.meta['val'] for node in fx_output_nodes] + non_contiguous_indices["outputs"] = [i for i, node in enumerate(fx_output_nodes) \ + if not node.meta['val'].is_contiguous()] + fx_args = fx_inputs + fx_outputs + + return gm, fx_call_args, fx_args, { + "num_inputs": num_inputs, + "num_outputs": num_outputs, + "non_contiguous_indices": non_contiguous_indices, + } + + +def create_compile_kwargs(final_kernel, fx_call_args, fx_args): + + _, kernel_call_args, _, arg_types = final_kernel.args.python_argdefs() + for idx, call_arg in enumerate(fx_call_args): + if call_arg in final_kernel.args.inplace_buffers: + fx_call_args[idx] = final_kernel.args.inplace_buffers[call_arg].other_names[-1] + fx_arg_shapes = [fx_arg.shape for fx_arg in fx_args if isinstance(fx_arg, torch.Tensor)] + + if set(kernel_call_args) != set(fx_call_args): + # breakpoint() + raise RuntimeError(f"kernel call args and fx graph call args are NOT SAME. kernel_call_args: {kernel_call_args}, fx_call_args: {fx_call_args}") + grid: List[Any] = [] + final_kernel.add_numel_to_call_args_and_grid(final_kernel.kernel_name, kernel_call_args, arg_types, grid) + + index_map = {element: idx for idx, element in enumerate(kernel_call_args)} + call_args_mapping = [index_map[element] for element in fx_call_args] + + mismatch_indices_shapes = {} + + for i in range(len(fx_call_args)): + mismatch_indices_shapes[i] = fx_arg_shapes[i] + + return { + "call_args_mapping": call_args_mapping, + 'grid': tuple(grid), + "mismatch_indices_shapes": mismatch_indices_shapes, + } + +def generate_fx_graph_code(code, kernel_code, kernel_name, compile_kwargs): + code = textwrap.indent(code, ' ') + code_template = f""" +import os +import torch +from torch._inductor.compile_fx import clone_preserve_strides +from torch._dynamo.testing import rand_strided +from torch import device + +import math +import random +import os +import tempfile +from math import inf, nan +from torch._inductor.hooks import run_intermediate_hooks +from torch._inductor.utils import maybe_profile +from torch._inductor.codegen.memory_planning import _align as align +from torch import device, empty_strided +from torch._inductor.async_compile import AsyncCompile +from torch._inductor.select_algorithm import extern_kernels +from torch._inductor.codegen.multi_kernel import MultiKernelCall +import triton +import triton.language as tl +from torch._inductor.runtime.triton_heuristics import grid, split_scan_grid, grid_combo_kernels, start_graph, end_graph +from torch_npu._inductor.npu_triton_heuristics import grid +from torch_npu._inductor import get_current_raw_stream as get_raw_stream +from torch_npu._inductor import config as npu_config + +aten = torch.ops.aten +inductor_ops = torch.ops.inductor +_quantized = torch.ops._quantized +assert_size_stride = torch._C._dynamo.guards.assert_size_stride +empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu +empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda +empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu +reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor +alloc_from_pool = torch.ops.inductor._alloc_from_pool + +file_path = os.path.abspath(__file__) +dir_path = 
os.path.dirname(file_path) + + +class GraphModule(torch.nn.Module): + def __init__(self): + super().__init__() +{code} +model = GraphModule().npu() +call_args_mapping = {compile_kwargs['call_args_mapping']} +num_inputs = {compile_kwargs['num_inputs']} +num_outputs = {compile_kwargs['num_outputs']} +non_contiguous_indices = {compile_kwargs['non_contiguous_indices']} +mismatch_indices_shapes = {compile_kwargs['mismatch_indices_shapes']} + +def run(): + async_compile = AsyncCompile() + {kernel_name} = async_compile.triton('{kernel_name}', ''' +{kernel_code} + ''', device_str='npu') + + async_compile.wait(globals()) + del async_compile + + stream0 = get_raw_stream(0) + + + args = torch.load(os.path.join(dir_path, "data.pth")) + + call_inputs_indices = call_args_mapping[:num_inputs] + call_outputs_indices = call_args_mapping[num_inputs:] + + args = [arg.npu() if isinstance(arg, torch.Tensor) else arg for arg in args] + + fx_args = [] + for idx in call_args_mapping: + arg = args[idx] + if isinstance(arg, torch.Tensor): + fx_arg = clone_preserve_strides(arg).float() if arg.dtype == torch.bfloat16 else clone_preserve_strides(arg) + fx_args.append(fx_arg) + + fx_inputs = [fx_args[idx].contiguous() if idx in non_contiguous_indices['inputs'] else fx_args[idx] for idx in range(num_inputs)] + if len(mismatch_indices_shapes): + for ind, shape in mismatch_indices_shapes.items(): + if ind >= num_inputs: + break + fx_inputs[ind] = fx_inputs[ind].reshape(shape) + model_outputs = model.forward(*fx_inputs) + for idx, (out1, out2) in enumerate(zip(model_outputs, fx_args[num_inputs:(num_inputs + num_outputs)])): + out1 = out1.reshape(out2.shape) + if idx in non_contiguous_indices['outputs']: + out2.copy_(out1) + else: + out2.data = out1.data + + {kernel_name}.run(*args, grid=grid{compile_kwargs['grid']}, stream=stream0) + + for actual, expected in zip([args[i] for i in call_outputs_indices], fx_args[num_inputs:]): + if actual.dtype != expected.dtype: + expected = expected.to(actual.dtype) + acc_comp_tol = npu_config.acc_comp_tol.get(actual.dtype, npu_config.acc_comp_tol['default']) + rtol = acc_comp_tol['rtol'] + atol = acc_comp_tol['atol'] + try: + torch.testing.assert_close(actual, expected, rtol=rtol, atol=atol, equal_nan=False) + except Exception as e: + print(e) + +if __name__ == "__main__": + run() +""" + return code_template + + +def dump_fx_graph_code(code, dump_path, traced_graph_hash): + py_path = os.path.join(dump_path, traced_graph_hash + '.py') + with open(py_path, 'w') as f: + f.write(code) + + +def clone(x, *, memory_format=None): + # TODO(jansel): memory format + input_graphs = fetch_graphs(x) + node_name = f'clone_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.clone, node_name) + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=x.make_loader(), + ranges=list(x.get_size()), + traced_graph=new_graph, + node_name=node_name + ) + + +def _register_npu_inductor_fallbacks(): + gen_set = set() + _init_set(GENERATE_LIST, gen_set) + overload_op_set = set() + _init_set(LOWERING_OVERLOAD_OP, overload_op_set) + + # 把不在白名单的op fallback + for op in lowering.lowerings: + if op not in decompositions and op not in gen_set: + if isinstance(op, torch._ops.OpOverloadPacket) or \ + isinstance(op, (torch._ops.OpOverload, torch._ops.HigherOrderOperator)): + flag = False + for gens in GENERATE_LIST2: + if str(op).find(gens) != -1: + flag = True + if flag: + continue + else: + lowering.make_fallback(op) + FALLBACK_LIST.append(op) + + # 把需要overload的op在lowering里删除 + 
for op in overload_op_set: + if op in lowering.lowerings: + del lowering.lowerings[op] + + + def transform_args( + args: List[Any], + kwargs: Dict[str, Any], + broadcast: bool, + type_promotion_kind: Optional[ELEMENTWISE_TYPE_PROMOTION_KIND], + convert_input_to_bool: bool, + ) -> Tuple[List[Any], Dict[str, Any]]: + args_indices = [i for i, x in enumerate(args) if isinstance(x, TensorBox)] + kwargs_indices = [k for k, v in kwargs.items() if isinstance(v, TensorBox)] + # check that there's something to transform + if not args_indices and not kwargs_indices: + return args, kwargs + + if type_promotion_kind or convert_input_to_bool: + if convert_input_to_bool: + dtype = torch.bool + else: + # FIXME this is a crude approximation for promoting args + promoting_args = [ + a + for a in args + if isinstance(a, (Number, sympy.Basic)) or hasattr(a, "dtype") + ] + # only consider tensor kwargs for promotion, for now + promoting_args.extend(a for a in kwargs.values() if hasattr(a, "dtype")) + dtype = lowering.get_promoted_dtype( + *promoting_args, type_promotion_kind=type_promotion_kind # type: ignore[arg-type] + ) + + device = ( + args[args_indices[0]] if args_indices else kwargs[kwargs_indices[0]] + ).get_device() + + # sometimes args are an immutable list so we can't mutate them + def promote(arg): + if isinstance(arg, TensorBox): + return to_dtype(arg, dtype) + elif isinstance(arg, ir.Constant): + return ir.Constant(value=arg.value, dtype=dtype, device=device) + else: + return arg + + args = [promote(a) for a in args] + kwargs = {k: promote(v) for k, v in kwargs.items()} + + if broadcast: + broadcasted = broadcast_tensors( + *list( + itertools.chain( + (args[i] for i in args_indices), + (kwargs[k] for k in kwargs_indices), + ) + ) + ) + size = list(broadcasted[0].get_size()) + + for i, x in zip(args_indices, broadcasted[: len(args_indices)]): + args[i] = x + for k, x in zip(kwargs_indices, broadcasted[len(args_indices) :]): + kwargs[k] = x + + for i in range(len(args)): + if isinstance(args[i], ir.Constant): + args[i] = ExpandView.create(args[i], size) + for k in kwargs: + if isinstance(kwargs[k], ir.Constant): + kwargs[k] = ExpandView.create(kwargs[k], size) + + return args, kwargs + + + def _register_lowering( + aten_fn, decomp_fn, broadcast, type_promotion_kind, convert_input_to_bool + ): + + """ + Add a lowering to lowerings dict + + Arguments: + aten_fn: torch.ops.aten.* fn we are lowering + decomp_fn: alternate implementation on our IR + broadcast: True to apply broadcasting to tensor inputs + type_promotion_kind: kind of type promotion applied to tensor inputs, `None` means no type promotion + convert_input_to_bool: some logical ops require inputs are converted to bool + """ + + @functools.wraps(decomp_fn) + def wrapped(*args, **kwargs): + args: List[Any] = list(args) + kwargs: Dict[str, Any] = dict(kwargs) + unpacked = False + # TODO maybe we need to use pytrees here + if len(args) == 1 and isinstance(args[0], (list, tuple)): + unpacked = True + args = list(args[0]) + + if not all( + (fn in lowering.fallbacks or lowering.in_namespace(fn, "_c10d_functional")) for fn in aten_fn + ): + # explicitly assert for "out=" ops for better error messages + assert not any( + x == "out" for x in kwargs.keys() + ), "out= ops aren't yet supported" + + args, kwargs = transform_args( + args, kwargs, broadcast, type_promotion_kind, convert_input_to_bool + ) + + if unpacked: + args = [args] + + out = decomp_fn(*args, **kwargs) + validate_ir(out) + + return out + + aten_fn = lowering.get_overloads(aten_fn) + + 
lowering.lowerings.update(dict.fromkeys(aten_fn, wrapped)) + return wrapped + + + def register_lowering( + aten_fn, + broadcast=False, + type_promotion_kind=lowering.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + convert_input_to_bool=False, + ): + + """ + Shim to support decorator syntax. + """ + return functools.partial( + _register_lowering, + aten_fn, + broadcast=broadcast, + type_promotion_kind=type_promotion_kind, + convert_input_to_bool=convert_input_to_bool, + ) + + + def _make_reduction_inner(x, *, axis, keepdims, dtype, override_return_dtype): + if dtype is not None: + x = to_dtype(x, dtype) + size = x.get_size() + axis = set(lowering._validate_reduction_axis(x, axis)) + + kept_sizes = [] + kept_idx = [] + reduced_sizes = [] + reduced_idx = [] + for i in range(len(size)): + if i in axis: + reduced_idx.append(i) + reduced_sizes.append(size[i]) + else: + kept_idx.append(i) + kept_sizes.append(size[i]) + + def loader(index, reduction_index): + assert len(reduction_index) == len(reduced_idx) + if keepdims: + assert len(index) == len(size) + index = [index[i] for i in kept_idx] + assert len(index) == len(kept_idx) + new_index = [None] * (len(index) + len(reduction_index)) + for idx, var in itertools.chain( + zip(kept_idx, index), zip(reduced_idx, reduction_index) + ): + new_index[idx] = var + return inner_loader(new_index) + + if keepdims: + new_size = list(size) + for i in reduced_idx: + new_size[i] = sympy.S.One + else: + new_size = kept_sizes + + inner_loader = x.make_loader() + return dict( + device=x.get_device(), + dst_dtype=override_return_dtype or x.get_dtype(), + src_dtype=x.get_dtype(), + inner_fn=loader, + ranges=new_size, + reduction_ranges=reduced_sizes, + ) + + + def make_reduction(reduction_type: str, override_return_dtype=None): + def inner(x, axis=None, keepdims=False, *, dtype=None): + kwargs = _make_reduction_inner( + x, + axis=axis, + keepdims=keepdims, + dtype=dtype, + override_return_dtype=override_return_dtype, + ) + node_name = f'reduction_{next(node_id)}' + input_graphs = fetch_graphs([x, axis if axis is not None else list(range(len(x.get_size())))]) + new_graph = merge_traced_graphs(input_graphs, reduction_type_to_aten_fn[reduction_type], + node_name, keepdim=keepdims) + + result = Reduction.create(reduction_type=reduction_type, + input_node=x, + node_name=node_name, + traced_graph=new_graph, + **kwargs) + if isinstance( + result.data.data, Reduction + ): + #Only realize if reduction isn't unrolled + size = x.get_size() + axis = set(lowering._validate_reduction_axis(x, axis)) + kept_idx = [] + reduced_idx = [] + for i in range(len(size)): + if i in axis: + reduced_idx.append(i) + else: + kept_idx.append(i) + + object.__setattr__(result.data.data, "kept_idx", kept_idx) + object.__setattr__(result.data.data, "reduced_idx", reduced_idx) + + result.realize() + return result + + return inner + + + lowering.make_reduction = make_reduction + + + def to_dtype(x: TensorBox, dtype: torch.dtype, copy=False): + src_dtype = x.get_dtype() + if src_dtype == dtype: + return clone(x) if copy else x + + def _to_dtype(x): + return ops.to_dtype(x, dtype, src_dtype=src_dtype) + register_fn_to_aten_fn(_to_dtype, aten.to.dtype) + return make_pointwise(_to_dtype, override_return_dtype=dtype, dtype=dtype)(x) + + + @register_lowering(prims.convert_element_type, type_promotion_kind=None) + def _convert_element_type(x: TensorBox, dtype: torch.dtype): + if dtype.is_complex or x.get_dtype().is_complex: + if x.get_size(): + # Decompose since aa aten fallback is more friendly for c++ codegen. 
+ # This decomposition doesn't work for empty tensor, which needs more investigation. + dst = empty_like(x, dtype=dtype) + ir.InplaceCopyFallback.create(dst, x) + return dst + else: + return lowering.fallback_handler( + prims.convert_element_type.default, add_to_fallback_set=False + )(x, dtype) + return to_dtype(x, dtype, copy=True) + + + def register_pointwise( + aten_fn, + name=None, + broadcast=True, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + convert_input_to_bool=False, + override_return_dtype=None, + override_fn_when_input_bool=None, + allow_alpha=False, + use_libdevice_for_f64=False, + triton_fallback=None, + ): + """A pointwise function that maps ops.{name} to inputs""" + name = name or aten_fn.__name__ + fn = ops_wrapper(name) + if use_libdevice_for_f64: + fn_libdevice = ops_wrapper("libdevice_" + name) + lowering.register_op_dtype_propagation_rules( + "libdevice_" + name, type_promotion_kind, override_return_dtype + ) + + lowering.register_op_dtype_propagation_rules( + name, type_promotion_kind, override_return_dtype + ) + + if override_fn_when_input_bool is not None: + override_fn_when_input_bool = ops_wrapper(override_fn_when_input_bool) + + fn = register_fn_to_aten_fn(fn, aten_fn) + + fn = make_pointwise( + fn, + override_return_dtype=override_return_dtype, + override_fn_when_input_bool=override_fn_when_input_bool, + override_fn_when_gpu_float64=fn_libdevice if use_libdevice_for_f64 else None, # type: ignore[possibly-undefined] + allow_alpha=allow_alpha, + triton_fallback=triton_fallback, + ) + fn = register_lowering( + aten_fn, + broadcast=broadcast, + type_promotion_kind=type_promotion_kind, + convert_input_to_bool=convert_input_to_bool, + )(fn) + + if hasattr(prims, name): + register_lowering( + getattr(prims, name), + type_promotion_kind=None, + convert_input_to_bool=convert_input_to_bool, + )(fn) + return fn + + + def make_pointwise( + fn, + override_return_dtype=None, + override_device=None, + override_fn_when_input_bool=None, + override_fn_when_gpu_float64=None, + allow_alpha=False, + triton_fallback=None, + **kwargs + ): + def inner(*inputs: TensorBox, alpha=None): + if triton_fallback is not None and any( + isinstance(inp, IRNode) and is_triton(inp) for inp in inputs + ): + assert not allow_alpha # not implemented + return triton_fallback(*inputs) + + inputs = lowering.promote_constants(inputs, override_return_dtype) + if allow_alpha: + if alpha is not None and alpha != 1: + inputs = list(inputs) + inputs[-1] = mul(inputs[-1], alpha) + else: + assert alpha is None + loaders = [x.make_loader() for x in inputs] + ranges = inputs[0].get_size() + dtype = override_return_dtype or inputs[0].get_dtype() + is_gpu_device = lowering.is_gpu(decode_device(inputs[0].get_device()).type) + + for other in inputs[1:]: + assert isinstance(other, ir.BaseConstant) or len(ranges) == len( + other.get_size() + ), f"ndim mismatch {fn} {ranges} {other.get_size()}" + + # in tracing, we will annotate pointwise nodes that correspond to the output of + # a pointwise node that would have been run in eager. intermediary pointwise nodes + # during decompositions are not annotated. 
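# [editor's illustrative aside -- not part of this patch]
# The emulate_precision_casts flag computed just below makes the generated
# pointwise kernel mimic eager mode for annotated bf16/fp16 ops: the math runs
# in fp32, but values are rounded back through the low-precision dtype after
# each load and after the final op. A rough standalone torch analogy
# (emulated_bf16_pointwise is a hypothetical helper, not part of inductor):
import torch

def emulated_bf16_pointwise(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # upcast to fp32 for the actual computation, as the generated kernel does
    out = a.to(torch.float32) + b.to(torch.float32)
    # round through bf16 and back up, emulating the casts eager would perform
    return out.to(torch.bfloat16).to(torch.float32)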
+ emulate_precision_casts = ( + V.graph is not None + and getattr(V.graph, "current_node", None) is not None + and V.graph.current_node.meta is not None + and V.graph.current_node.meta.get("low_precision_pointwise_barrier", False) + and dtype in (torch.bfloat16, torch.float16) + ) + + def inner_fn(index): + assert len(index) == len(ranges), f"wrong ndim {index} {ranges}" + if dtype == torch.bool and override_fn_when_input_bool is not None: + return override_fn_when_input_bool(*[load(index) for load in loaders]) + elif ( + override_fn_when_gpu_float64 + and is_gpu_device + and dtype == torch.float64 + ): + return override_fn_when_gpu_float64(*[load(index) for load in loaders]) + else: + inputs_loaded = [] + for load in loaders: + out = load(index) + if emulate_precision_casts: + downcast = ops.to_dtype(out, dtype, use_compute_types=False) + out = ops.to_dtype(downcast, dtype) + inputs_loaded.append(out) + + out = fn(*inputs_loaded) + if emulate_precision_casts: + # fp16/bf16 kernels are computed in fp32. Casting down to fp16/bf16 here, + # then upcasting again, to emulate casts that eager would do. + downcast = ops.to_dtype(out, dtype, use_compute_types=False) + return ops.to_dtype(downcast, dtype) + return out + + if not override_device: + device = None + for i in inputs: + if lowering.is_gpu(i.get_device().type): + device = i.get_device() + break + if not device: + device = inputs[0].get_device() + + device = override_device or device + + input_graphs = fetch_graphs(inputs) + node_name = f'pointwise_{next(node_id)}' + origin_fn = fn_to_aten_fn[fn] + new_graph = merge_traced_graphs(input_graphs, origin_fn, node_name, **kwargs) + + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=ranges, + node_name=node_name, + traced_graph=new_graph, + ) + + return inner + + @register_lowering(aten.where, broadcast=False, type_promotion_kind=None) + def where(cond, a, b): + def fn(*args): + return ops.where(*args) + + if isinstance(a, (float, int)): + a = lowering.constant_like(a)(b) + if isinstance(b, (float, int)): + b = lowering.constant_like(b)(a) + + args = [cond, a, b] + dtype = lowering.get_promoted_dtype( + args[1], args[2], type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + indices = [i for i, x in enumerate(args) if isinstance(x, TensorBox)] + for i, x in zip(indices, broadcast_tensors(*[args[i] for i in indices])): + args[i] = x + for i in range(len(args)): + if isinstance(args[i], ir.Constant): + args[i] = ExpandView.create(args[i], list(args[indices[0]].get_size())) + register_fn_to_aten_fn(fn, aten.where) + return make_pointwise(fn, override_return_dtype=dtype)( + args[0], to_dtype(args[1], dtype), to_dtype(args[2], dtype) + ) + + + @register_lowering(aten.broadcast_tensors, broadcast=False, type_promotion_kind=None) + def broadcast_tensors(*inputs): + if len(inputs) == 1 and isinstance(inputs[0], (list, tuple)): + return broadcast_tensors(*inputs[0]) + target: List[sympy.Expr] = functools.reduce( + lowering.broadcast_symbolic_shapes, [x.get_size() for x in inputs], [] + ) + outputs = [] + for x in inputs: + sizes = x.get_size() + if len(sizes) != len(target) or any( + ( + ( + V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(a, 1), size_oblivious=True + ) + and not V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(b, 1), size_oblivious=True + ) + ) + or ( + not V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(a, 1), size_oblivious=True + ) + and V.graph.sizevars.shape_env.evaluate_expr( + sympy.Eq(b, 1), size_oblivious=True + ) 
+ ) + ) + for a, b in zip(sizes, target) + ): + x = expand(x, target) + outputs.append(x) + return outputs + + + @register_lowering(aten.squeeze, type_promotion_kind=None) + def squeeze(x, dim=None): + assert isinstance(x, TensorBox) + if dim is None: + return TensorBox(SqueezeView.create(x.data)) + + dim = ( + V.graph.sizevars.evaluate_static_shape(dim) + if isinstance(dim, (int, sympy.Expr)) + else tuple(V.graph.sizevars.evaluate_static_shape(d) for d in dim) + ) + dim = canonicalize_dims(len(x.get_size()), dim) # type: ignore[call-overload] + dims = set((dim,) if not isinstance(dim, tuple) else dim) + + new_shape = [] + for d, s in enumerate(x.get_size()): + if not ( + d in dims + and V.graph.sizevars.evaluate_expr(sympy.Eq(s, 1, size_oblivious=True)) + ): + new_shape.append(s) + + # squeeze does nothing if the size isn't 1 + return view(x, new_shape) if new_shape != x.get_size() else x + + + @register_lowering([aten.squeeze_]) + def squeeze_(x, dim=None): + val = squeeze(x, dim) + assert isinstance(x, TensorBox) + assert isinstance(val, TensorBox) + x.data = val.data + return x + + + @register_lowering(aten.isinf) + def isinf(x): + if lowering.is_integer_type(x): + return full_like(x, False, dtype=torch.bool) + fn = ops_wrapper("isinf") + register_fn_to_aten_fn(fn, aten.isinf) + return make_pointwise(fn, override_return_dtype=torch.bool)(x) + + + @register_lowering(aten.isnan) + def isnan(x): + if lowering.is_integer_type(x): + return full_like(x, False, dtype=torch.bool) + fn = ops_wrapper("isnan") + register_fn_to_aten_fn(fn, aten.isnan) + return make_pointwise(fn, override_return_dtype=torch.bool)(x) + + + @register_lowering(aten.ceil) + def ceil(x): + if lowering.is_integer_type(x): + return clone(x) + fn = ops_wrapper("ceil") + register_fn_to_aten_fn(fn, aten.ceil) + return make_pointwise(fn)(x) + + + @register_lowering(aten.floor) + def floor(x): + if lowering.is_integer_type(x): + return clone(x) + fn = ops_wrapper("floor") + register_fn_to_aten_fn(fn, aten.floor) + return make_pointwise(fn)(x) + + + @register_lowering(aten.round.default) + def round(x): + if lowering.is_integer_type(x): + return clone(x) + else: + fn = ops_wrapper("round") + register_fn_to_aten_fn(fn, aten.round) + return make_pointwise(fn)(x) + + + @register_lowering(aten.trunc) + def trunc(x): + if lowering.is_integer_type(x): + return clone(x) + fn = ops_wrapper("trunc") + register_fn_to_aten_fn(fn, aten.trunc) + return make_pointwise(fn)(x) + + + @register_lowering(aten.expand, type_promotion_kind=None) + def expand(x, sizes): + from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols + + (x,) = lowering.promote_constants([x]) + if isinstance(x, ir.BaseConstant): + return ExpandView.create(x, tuple(sizes)) + assert isinstance(x, TensorBox) + assert isinstance(sizes, (list, tuple)) + if tuple(x.get_size()) == tuple(sizes): + return x + + if not free_unbacked_symbols(x.get_size()): + x_size_product = V.graph.sizevars.size_hint(sympy_product(x.get_size())) + # TODO: It would be better to realize the input if any of its sizes + # are unbacked, because typically the size will be non-zero. 
However, + # this cannot be done directly as below as we'll choke on the size_hint + # here + if x_size_product > 0 and not free_unbacked_symbols(sizes): + # maybe realize input before broadcasting it + x.mark_reuse( + V.graph.sizevars.size_hint(sympy_product(sizes)) // x_size_product + ) + input_graphs = fetch_graphs([x.data, tuple(sizes)]) + node_name = f'expand_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.expand, node_name) + return TensorBox(ExpandView.create(x.data, tuple(sizes), traced_graph=new_graph, node_name=node_name)) + + + @register_lowering(aten.expand_as, type_promotion_kind=None) + def expand_as(x, y): + return expand(x, y.get_size()) + + + @register_lowering(aten.repeat) + def repeat(x, repeats): + input_graphs = fetch_graphs([x, repeats]) + node_name = f'repeat_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.repeat, node_name) + old_size = list(x.get_size()) + if len(repeats) > len(old_size): + old_size = [sympy.S.One] * (len(repeats) - len(old_size)) + old_size + x = view(x, list(old_size)) + assert len(repeats) == len(x.get_size()) + + new_size = list(x.get_size()) + + zero_tensor = False + for i in range(len(repeats)): + if repeats[i] == 0: + zero_tensor = True + new_size[i] = new_size[i] * repeats[i] + + if zero_tensor: + return empty(new_size, dtype=x.get_dtype(), device=x.get_device()) + if all((a == 1 or b == 1) for a, b in zip(repeats, old_size)): + return clone(expand(x, new_size)) + + x_loader: Callable[[Any], Any] + + def inner_fn(index): + assert len(index) == len(repeats) + index = list(index) + for i in range(len(repeats)): + if repeats[i] != 1: + if old_size[i] == 1: + index[i] = sympy.S.Zero + else: + index[i] = ModularIndexing(index[i], 1, old_size[i]) + return x_loader(index) + + old_size_product = V.graph.sizevars.size_hint(sympy_product(old_size)) + if old_size_product > 0: + # maybe realize the input + x.mark_reuse( + V.graph.sizevars.size_hint(sympy_product(new_size)) // old_size_product + ) + + x_loader = x.make_loader() + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=list(new_size), + traced_graph=new_graph, + node_name=node_name + ) + + @register_lowering(aten._unsafe_view, type_promotion_kind=None) + @register_lowering(aten.view, type_promotion_kind=None) + @register_lowering(aten.reshape, type_promotion_kind=None) + def view(x, sizes): + assert isinstance(x, TensorBox) + assert isinstance(sizes, (list, tuple)) + input_graphs = fetch_graphs([x.data, sizes]) + node_name = f'view_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.reshape, node_name) + return TensorBox(View.create(x.data, sizes, traced_graph=new_graph, node_name=node_name)) + + + @register_lowering(aten.permute, type_promotion_kind=None) + def permute(x, dims): + assert isinstance(x, TensorBox) + assert isinstance(dims, (list, tuple)) + input_graphs = fetch_graphs([x.data, dims]) + node_name = f'permute_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.permute, node_name) + return TensorBox(PermuteView.create(x.data, tuple(dims), traced_graph=new_graph, node_name=node_name)) + + + @register_lowering(aten.slice, type_promotion_kind=None) + def slice_(x, dim=0, start=0, end=2**63, step=1, clamp=True): + assert isinstance(x, TensorBox) + dim = _validate_dim(x, dim, 0) + input_graphs = fetch_graphs([x.data]) + node_name = f'slice_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.slice, node_name, dim=dim, start=start, end=end, 
step=step) + + return TensorBox(ir.SliceView.create(x.data, dim, start, end, step, traced_graph=new_graph, node_name=node_name)) + + + @register_lowering(aten.select, type_promotion_kind=None) + def select(x, dim, idx): + idx = View.handle_negative_index(idx, x.get_size()[dim]) + return squeeze(slice_(x, dim, idx, idx + 1), dim) + + + @register_lowering(aten.split, type_promotion_kind=None) + def split(x, sizes, dim=0): + dim = _validate_dim(x, dim, 0) + sizes_ = sizes + + # If sizes is an integer (or a SymInt), we turn it into a list of sizes + # by computing what the actual size of each chunk should be. + if not isinstance(sizes, (list, tuple)): + x_size = x.get_size()[dim] + chunks = V.graph.sizevars.evaluate_static_shape( + FloorDiv(x_size + sizes - 1, sizes) + ) + sizes_ = [sizes] * chunks + # The last chunk might have a smaller size than the rest. + sizes_[-1] = x_size - (chunks - 1) * sizes + + # From this point, we assume that the sum of the sizes of all chunks + # equals the size of the base tensor. + result = [] + start = 0 + for size in sizes_: + end = start + size + # No need for clamping here, since we compute the exact + # start and end values. + result.append(slice_(x, dim, start, end, clamp=False)) + start = end + return result + + + @register_lowering(aten.split_with_sizes, type_promotion_kind=None) + def split_with_sizes(x, sizes, dim=0): + return split(x, sizes, dim) + + + @register_lowering(aten.unbind, type_promotion_kind=None) + def unbind(x, dim=0): + dim = _validate_dim(x, dim, 0) + x_size = V.graph.sizevars.evaluate_static_shape(x.get_size()[dim]) + result = [select(x, dim, i) for i in range(x_size)] + return result + + + @register_lowering(aten.unsqueeze, type_promotion_kind=None) + def unsqueeze(x, dim): + dim = _validate_dim(x, dim, 1) + new_shape = list(x.get_size()) + new_shape.insert(dim, sympy.S.One) + return view(x, new_shape) + + + @register_lowering(aten.unsqueeze_, type_promotion_kind=None) + def unsqueeze_(x, dim): + val = unsqueeze(x, dim) + assert isinstance(x, TensorBox) + assert isinstance(val, TensorBox) + x.data = val.data + return x + + + def _validate_dim(x, dim, offset=0): + dim = V.graph.sizevars.shape_env.evaluate_expr(sympy.sympify(dim)) + ndim = len(x.get_size()) + if dim < 0: + dim += ndim + offset + assert 0 <= dim < ndim + offset + return dim + + + @register_lowering(aten.copy, type_promotion_kind=None) + def copy(self, src, non_blocking=False): + x = src + if self.get_device() != src.get_device(): + x = lowering.to_device(x, self.get_device()) + if self.get_dtype() != src.get_dtype(): + x = to_dtype(x, self.get_dtype()) + + if self.get_size() != src.get_size(): + out = expand(x, self.get_size()) + return clone(out) + return clone(x) + + + @register_lowering(prims.iota) + def iota( + length, + *, + start, + step, + dtype, + device, + requires_grad, + ): + def fn(index): + return ops.index_expr(step * index[0] + start, dtype=dtype) + + node_name = f'iota_{next(node_id)}' + new_graph = merge_traced_graphs([length], prims.iota, node_name, \ + start=start, step=step, \ + dtype=dtype, device=device, \ + requires_grad=requires_grad) + return Pointwise.create( + device=decode_device(device), + dtype=dtype, + inner_fn=fn, + ranges=[length], + traced_graph=new_graph, + node_name=node_name + ) + + + @register_lowering(aten.select_scatter, type_promotion_kind=None) + def select_scatter(x, src, dim: int, index: int): + assert x.get_dtype() == src.get_dtype() + input_graphs = fetch_graphs([x, src, dim, index]) + node_name = 
f'select_scatter_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.select_scatter, node_name) + x_loader = x.make_loader() + dim = _validate_dim(x, dim, 0) + if V.graph.sizevars.evaluate_expr(sympy.Lt(index, 0)): + index = index + x.get_size()[dim] + V.graph.sizevars.guard_leq(0, index) # type: ignore[arg-type] + V.graph.sizevars.guard_lt(index, x.get_size()[dim]) # type: ignore[arg-type] + src = expand(unsqueeze(src, dim), x.get_size()) + src_loader = src.make_loader() + + def inner_fn(idx): + return ops.where( + ops.eq( + ops.index_expr(idx[dim], torch.int32), + ops.index_expr(index, torch.int32), + ), + src_loader(idx), + x_loader(idx), + ) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=list(x.get_size()), + traced_graph=new_graph, + node_name=node_name + ) + + + @register_lowering(aten.slice_scatter, type_promotion_kind=None) + def slice_scatter(x, src, dim=0, start=None, end=None, step=1): + assert x.get_dtype() == src.get_dtype() + input_graphs = fetch_graphs([x, src]) + node_name = f'slice_scatter_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.slice_scatter, node_name, \ + dim=dim, + start=start, + end=end, + step=step) + x_loader = x.make_loader() + dim = _validate_dim(x, dim, 0) + dim_size = x.get_size()[dim] + + start, end = ir.SliceView.normalize_start_end(x, dim, start, end) + + src_size = list(x.get_size()) + src_size[dim] = FloorDiv(end - start + (step - 1), step) + src = expand(src, src_size) + src_loader = src.make_loader() + + def inner_fn(idx): + if start == 0 and end == dim_size and step == 1: + # selecting every element is the same as just src.clone() + return src_loader(idx) + + idx_dim = ops.index_expr(idx[dim], torch.int64) + src_idx = list(idx) + src_idx[dim] = FloorDiv(idx[dim] - start, step) + + mask = [] + if start != 0: + mask.append( + ops.ge( + idx_dim, + ops.index_expr(sympy.expand(start), torch.int64), + ) + ) + if end != dim_size: + mask.append( + ops.lt( + idx_dim, + ops.index_expr(sympy.expand(end), torch.int64), + ) + ) + if step != 1: + mask.append( + ops.eq( + ops.index_expr( + ModularIndexing(idx[dim] - start, 1, step), torch.int64 + ), + ops.constant(0, torch.int64), + ) + ) + assert mask + mask = functools.reduce(ops.and_, mask) + src_val = ops.masked( + mask, + lambda: src_loader(src_idx), + 0 if lowering.is_integer_type(x) else 0.0, + ) + return ops.where( + mask, + src_val, + x_loader(idx), + ) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=list(x.get_size()), + traced_graph=new_graph, + node_name=node_name + ) + + + @register_lowering([torch.tensor, aten.scalar_tensor]) + def tensor(data, *, dtype=None, device=None, layout=None, pin_memory=False): + lowering.assert_nyi(layout in (None, torch.strided), f"layout={layout}") + lowering.assert_nyi(not pin_memory, "pin_memory") + input_graphs = fetch_graphs([data]) + node_name = f'tensor_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.scalar_tensor, node_name, \ + dtype=dtype, + device='npu', + layout=layout, + pin_memory=False) + if isinstance(lowering._unwrap(data), int): + dtype = dtype or torch.int64 + else: + dtype = dtype or torch.get_default_dtype() + + ranges: List[sympy.Expr] = [] + + if isinstance(data, sympy.Basic): + + def inner_fn(index): + return ops.index_expr(data, dtype) + + elif isinstance(data, (float, int)): + + def inner_fn(index): + return ops.constant(data, dtype) + + elif len(data) == 0 or 
isinstance(data[0], (float, int)) and len(data) <= 8: + # inline small tensors + ranges.append(sympy.Integer(len(data))) + + def inner_fn(index): + def binary_search(start, end): + assert start < end + if end - start == 1: + return ops.constant(data[start], dtype) + mid = (end - start) // 2 + start + return ops.where( + ops.lt( + ops.index_expr(index[0], torch.int64), + ops.constant(mid, torch.int64), + ), + binary_search(start, mid), + binary_search(mid, end), + ) + + if len(data) == 0: + return ops.constant(0, dtype) + return binary_search(0, len(data)) + + else: + return V.graph.add_tensor_constant( + torch.tensor(data, dtype=dtype, device=device) + ) + + return Pointwise.create( + device=decode_device(device), + dtype=dtype, + inner_fn=inner_fn, + ranges=ranges, + traced_graph=new_graph, + node_name=node_name + ) + + + def tensor_constructor(fill_value): + # torch.zeros, torch.ones, etc + def inner( + *size, + names=None, + dtype=None, + device=None, + layout=None, + pin_memory=False, + memory_format=None, + ): + lowering.assert_nyi(names is None, "named tensors") + lowering.assert_nyi(layout in (None, torch.strided), f"layout={layout}") + lowering.assert_nyi(not pin_memory, "pin_memory") + device = decode_device(device) + dtype = dtype or torch.get_default_dtype() + if len(size) == 1 and isinstance(size[0], (list, tuple, torch.Size)): + size = tuple(size[0]) + # See https://github.com/pytorch/pytorch/issues/118102 + # All sizes at lowering time should be sympy.Symbol, not SymInt! + for s in size: + assert not isinstance(s, torch.SymInt) + size = [sympy.expand(s) for s in size] + return _full(fill_value, device, dtype, size) + + return inner + + + def _full(fill_value, device, dtype, size): + value = fill_value + if not isinstance(fill_value, (int, float)) and hasattr(value, "value"): + value = value.value + + if isinstance(value, (int, float)): + + def inner_fn(index): + return ops.constant(value, dtype) + + elif isinstance(value, sympy.Basic): + + def inner_fn(index): + return ops.index_expr(value, dtype) + + else: + assert len(value.get_size()) == 0 + value_loader = value.make_loader() + + def inner_fn(index): + return value_loader([]) + + node_name = f'full_{next(node_id)}' + new_graph = merge_traced_graphs([size, fill_value], aten.full.default, node_name, \ + device='npu', dtype=dtype, layout = torch.strided, pin_memory = False) + + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=list(size), + traced_graph=new_graph, + node_name=node_name + ) + + + @register_lowering(aten.empty_strided) + def empty_strided( + size, stride, *, dtype=None, layout=None, device=None, pin_memory=None + ): + assert isinstance(size, (list, tuple)) + assert isinstance(stride, (list, tuple, type(None))) + lowering.assert_nyi(not pin_memory, "pin_memory") + lowering.assert_nyi(layout in (None, torch.strided), f"layout={layout}") + dtype = lowering.decode_dtype(dtype) or torch.get_default_dtype() + device = device or torch.tensor(0.0).device + device = decode_device(device) + pointwise = _full(fill_value=0, device=device, dtype=dtype, size=size) + pointwise.realize() + buffer = pointwise.data.data + # explicitly set ranges to zeros in order to make a NopKernelSchedulerNode + buffer.data = lowering.dataclasses.replace(buffer.data, ranges=[0] * len(size)) + assert isinstance(buffer, ir.ComputedBuffer) + size = [sympy.expand(s) for s in size] + stride = ( + [sympy.expand(s) for s in stride] + if stride + else ir.FlexibleLayout.contiguous_strides(size) + ) + buffer.layout = 
ir.FixedLayout( + device=device, + dtype=dtype, + size=size, + stride=stride, + ) + return pointwise + + + @register_lowering([torch.empty, aten.empty]) + def empty( + *size, + names=None, + dtype=None, + layout=None, + device=None, + pin_memory=None, + memory_format=None, + ): + lowering.assert_nyi(names is None, "named tensors") + device = decode_device(device) + if len(size) == 1 and isinstance(size[0], (list, tuple, torch.Size)): + size = tuple(size[0]) + return empty_strided( + size, None, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + + @register_lowering([torch.full, aten.full]) + def full(size, fill_value, **kwargs): + assert kwargs.get("dtype") is not None, "dtype should be handled by decomposition" + return tensor_constructor(fill_value)(size, **kwargs) + + + register_lowering(aten.clone)(clone) + + + @register_lowering(aten.constant_pad_nd, type_promotion_kind=None) + def constant_pad_nd(x, padding, fill_value=0): + assert (len(padding) % 2) == 0 + + input_graphs = fetch_graphs([x, padding]) + node_name = f'constand_pad_nd_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.constant_pad_nd, node_name, value=fill_value) + + if all(p == 0 for p in padding): + return clone(x) + + sizes = x.get_size() + + bounds = list(reversed(list(zip(padding[::2], padding[1::2])))) + n = len(sizes) - len(bounds) + + # if padding is a complicated expression, hoist it + bounds_precomp: List[Tuple[sympy.Symbol, Any]] = [] + for l, h in bounds: + bounds_precomp.append((V.graph.sizevars.lookup_precomputed_size(l), h)) # type: ignore[arg-type] + + output_size = list(sizes[:n]) + mask_sizes = [] + for (low, high), size in zip(bounds, sizes[n:]): + mask_sizes.append(size) + output_size.append(sympy.expand(size + low + high)) + assert len(output_size) == len(sizes) + fill_value = dtype_to_type(x.get_dtype())(fill_value) + + def mask(index): + mask = [] + for idx, (low, high), length in zip(index[n:], bounds, mask_sizes): + if low != 0: + mask.append(lowering.range_mask_low(idx, 0)) + if high != 0: + mask.append(lowering.range_mask_high(idx, length)) + mask = functools.reduce(ops.and_, mask) + return ops.masked(mask, lambda: x_loader(index), fill_value) + + def offset_fn(index): + new_index = list(index[:n]) + for idx, (low, high) in zip(index[n:], bounds_precomp): + new_index.append(idx - low) + assert len(new_index) == len(index) + return mask(new_index) + + x_loader = x.make_loader() + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=offset_fn, + ranges=output_size, + traced_graph=new_graph, + node_name=node_name + ) + + + @make_pointwise + @register_to_aten(aten_fn=aten.pow) + def pow_native(a, b): + return ops.pow(a, b) + + + @register_lowering(aten.pow, broadcast=True) + def pow(a, b): + if isinstance(b, float) and b == int(b): + return pow(a, int(b)) + elif isinstance(b, float) and b == 0.5: + return sqrt(a) + elif isinstance(b, int) and b == 1: + return clone(a) + + input_graphs = fetch_graphs([a, b]) + node_name = f'pointwise_{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.pow, node_name) + + # Type promotion ensures all tensor arguments have the same type + dtype = next(x.get_dtype() for x in (a, b) if isinstance(x, ir.TensorBox)) + is_integer_pow = is_integer_dtype(dtype) + + # Optimize away small fixed powers, or for integers avoid falling back to ATen + embed_exponent = isinstance(b, int) and ( + -32 < b < 32 or (is_integer_pow and b >= 0) + ) + if embed_exponent: + loader = a.make_loader() + + 
def fn(idx): + return lowering.pow_recursive(loader(idx), b, a.get_dtype()) + + return Pointwise.create( + device=a.get_device(), + dtype=a.get_dtype(), + inner_fn=fn, + ranges=a.get_size(), + node_name=node_name, + traced_graph=new_graph, + ) + + if isinstance(a, Number): + if a == 1: + return full_like(b, 1) + if a == 2 and is_float_dtype(b.get_dtype()): + return exp2(b) + + if is_integer_pow: + # ops.pow doesn't work for integers + if isinstance(a, Number): + return lowering.fallback_pow_scalar(a, b) + elif isinstance(b, Number): + return lowering.fallback_pow_tensor_scalar(a, b) + else: + return lowering.fallback_pow_tensor_tensor(a, b) + + return pow_native(a, b) + + + def mutate_to(changed, val, unsafe_alias=False): + if isinstance(changed, TensorBox): + changed_data = changed.data + else: + changed_data = changed + if isinstance(val, TensorBox): + val = val.data + + if not isinstance(val, ir.StorageBox): + # introduce a copy to handle views + input_graphs = fetch_graphs([changed, val]) + node_name = f'copy__{next(node_id)}' + new_graph = merge_traced_graphs(input_graphs, aten.copy_, node_name) + val = Pointwise.create( + device=changed.get_device(), + dtype=changed.get_dtype(), + inner_fn=val.make_loader(), + ranges=changed.get_size(), + traced_graph=new_graph, + node_name=node_name + ).data + assert isinstance(val, ir.StorageBox) + + if isinstance(changed_data, ir.StorageBox) and not ( + changed_data.is_input_buffer() + # In AOTI, module parameters and buffers are not lifted as graph inputs + or changed_data.is_module_buffer() + or isinstance(changed_data.data, ir.NopKernel) + ): + # Fast path, just swing the data pointer + val.realize() + changed_data.data = val.data + return changed + + ir.MutationLayoutSHOULDREMOVE.realize_into( + val, changed_data, unsafe_alias=unsafe_alias + ) + return changed + + + empty_like = register_lowering(aten.empty_like)(lowering.create_tensor_like(empty)) + ones_like = lowering.create_tensor_like(tensor_constructor(1)) + zeros_like = lowering.create_tensor_like(tensor_constructor(0)) + + + @register_lowering(aten.full_like, type_promotion_kind=None) + def full_like(x, fill_value, **kwargs): + return lowering.create_tensor_like(tensor_constructor(fill_value))(x, **kwargs) + + + @register_lowering(aten.fill_) + def fill_(x, fill_value): + return mutate_to(x, full_like(x, fill_value)) + + + @register_lowering(aten.copy_, type_promotion_kind=None) + def copy_(dst, src, non_blocking=False): + if dst is src: + # dst.copy_(dst) can happen from the reinplacing pass + return dst + src = lowering.to_device(src, dst.get_device()) + src = to_dtype(src, dst.get_dtype()) + src = expand(src, dst.get_size()) + return mutate_to(dst, src) + + + @make_pointwise + def floordiv(a, b): + return ops.floordiv(a, b) + + + @make_pointwise + def truncdiv(a, b): + return ops.truncdiv(a, b) + + + @register_lowering(aten.div, broadcast=True) + def div_mode(a, b, rounding_mode=None): + both_integer = lowering.is_integer_type(a) and lowering.is_integer_type(b) + both_boolean = lowering.is_boolean_type(a) and lowering.is_boolean_type(b) + + # floordiv and truncdiv need special handling for integer tensors on Triton, + # see the discussion at https://github.com/openai/triton/issues/605 + if rounding_mode == "floor": + assert not both_boolean, "floordiv operands can not be boolean at the same time" + return floordiv(a, b) if both_integer else floor(div(a, b)) + if rounding_mode == "trunc": + assert not both_boolean, "truncdiv operands can not be boolean at the same time" + return 
truncdiv(a, b) if both_integer else trunc(div(a, b)) + return div(a, b) + + + @register_lowering([aten.mul], broadcast=True) + def mul(a, b): + both_bool = lowering.is_boolean_type(a) and lowering.is_boolean_type(b) + if both_bool: + return logical_and(a, b) + else: + fn = ops_wrapper(aten.mul.__name__) + fn = register_fn_to_aten_fn(fn, aten.mul) + return make_pointwise(fn)(a, b) + + + @register_lowering([aten.reciprocal], broadcast=True,) + def reciprocal(a): + return div(1.0, a) + + + @register_lowering([prims.div], broadcast=True) + def div_prim(a, b): + is_integral = all(lowering.is_boolean_type(x) or lowering.is_integer_type(x) for x in [a, b]) + + if is_integral: + return truncdiv(a, b) + + def fn(*args): + return ops.truediv(*args) + + fn = register_fn_to_aten_fn(fn, aten.div) + return make_pointwise(fn)(a, b) + + + @register_lowering( + [aten.true_divide, aten.div.Tensor], + broadcast=True, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + ) + def div(a, b): + a, b = lowering.promote_constants( + (a, b), type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT + ) + return div_prim(a, b) + + + @register_lowering(aten.rsqrt) + def rsqrt(x): + dtype = x.get_dtype() + if is_integer_dtype(dtype) or is_boolean_dtype(dtype): + x = to_dtype(x, torch.get_default_dtype()) + + def _rsqrt(x): + return ops.rsqrt(x) + + register_fn_to_aten_fn(_rsqrt, aten.rsqrt) + return make_pointwise(_rsqrt)(x) + + + @register_lowering([aten.sum, prims.sum]) + def sum_(x, axis=None, keepdims=False, *, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + dtype = torch.int64 + + fn = make_reduction("sum", override_return_dtype=dtype) + return fn(x, axis, keepdims, dtype=dtype) + + + @register_lowering(aten.prod) + def prod(x, axis=None, keepdims=False, *, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + dtype = torch.int64 + + fn = make_reduction("prod", override_return_dtype=dtype) + return fn(x, axis, keepdims, dtype=dtype) + + + @register_lowering(aten.any) + def reduce_any(x, dim=None, keepdim=False): + x = to_dtype(x, torch.bool) + return make_reduction("any")(x, axis=dim, keepdims=keepdim) + + + @register_lowering(aten.max, type_promotion_kind=None) + def reduce_max(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amax(x, axis=dim, keepdims=keepdim), + reduce_argmax(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amax(x, axis=None, keepdims=keepdim) + + + @register_lowering(aten.min, type_promotion_kind=None) + def reduce_min(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amin(x, axis=dim, keepdims=keepdim), + reduce_argmin(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amin(x, axis=None, keepdims=keepdim) + + + register_lowering(prims.xor_sum)(make_reduction("xor_sum")) + reduce_amax = register_lowering(aten.amax)(make_reduction("max")) + reduce_amin = register_lowering(aten.amin)(make_reduction("min")) + reduce_argmax = register_lowering(aten.argmax)( + make_reduction("argmax", override_return_dtype=torch.int64) + ) + reduce_argmin = register_lowering(aten.argmin)( + make_reduction("argmin", override_return_dtype=torch.int64) + ) + + add = register_pointwise( + aten.add, allow_alpha=True, override_fn_when_input_bool="logical_or" + ) + + def register_pointwise_numeric(op, name=None, triton_fallback=None): + return register_pointwise( + op, + name=name, + 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + triton_fallback=triton_fallback, + ) + + + def register_pointwise_numeric_ldf64(op): + return register_pointwise( + op, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + use_libdevice_for_f64=True, + ) + + + def register_inplace(aten_op, outplace_op): + @register_lowering(aten_op, type_promotion_kind=None) + def fn(*args, **kwargs): + result = outplace_op(*args, **kwargs) + result = to_dtype(result, args[0].get_dtype()) + return mutate_to(args[0], result) + + return fn + + + rsqrt = register_pointwise_numeric(aten.rsqrt) + exp = register_pointwise_numeric_ldf64(aten.exp) + exp2 = register_pointwise_numeric(aten.exp2) + expm1 = register_pointwise_numeric(aten.expm1) + relu = register_pointwise(aten.relu) + sigmoid = register_pointwise_numeric_ldf64(aten.sigmoid) + sqrt = register_pointwise_numeric_ldf64(aten.sqrt) + square = register_pointwise(aten.square) + sub = register_pointwise(aten.sub, allow_alpha=True) + register_pointwise_numeric_ldf64(aten.cos) + register_pointwise_numeric_ldf64(aten.sin) + abs = register_pointwise(aten.abs) + bitwise_and = register_pointwise(aten.bitwise_and) + bitwise_left_shift = register_pointwise(aten.bitwise_left_shift) + bitwise_not = register_pointwise( + aten.bitwise_not, override_fn_when_input_bool="logical_not" + ) + bitwise_or = register_pointwise(aten.bitwise_or) + bitwise_right_shift = register_pointwise(aten.bitwise_right_shift) + bitwise_xor = register_pointwise(aten.bitwise_xor) + register_pointwise_numeric(aten.lgamma) + erf = register_pointwise_numeric(aten.erf) + register_lowering( + aten.special_erf, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT + )(erf) + + register_pointwise_numeric(aten.log1p) + register_pointwise_numeric(aten.tan) + register_pointwise_numeric(aten.tanh) + register_pointwise_numeric_ldf64(aten.log) + logical_and = register_pointwise( + aten.logical_and, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, + ) + logical_not = register_pointwise( + aten.logical_not, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, + ) + logical_or = register_pointwise( + aten.logical_or, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, + ) + logical_xor = register_pointwise( + aten.logical_xor, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, + ) + maximum = register_pointwise(aten.maximum) + minimum = register_pointwise(aten.minimum) + clamp_min = register_pointwise(aten.clamp_min, name='maximum') + clamp_max = register_pointwise(aten.clamp_max, name='minimum') + neg = register_pointwise(aten.neg) + abs = register_pointwise(aten.abs) + register_pointwise(aten.remainder) + sign = register_pointwise(aten.sign, override_fn_when_input_bool="identity") + register_pointwise(aten.ceil) + register_pointwise(aten.signbit, override_return_dtype=torch.bool) + + register_lowering(aten._neg_view)(neg) + + register_pointwise(aten.le, override_return_dtype=torch.bool) + register_pointwise(aten.lt, override_return_dtype=torch.bool) + register_pointwise(aten.ge, override_return_dtype=torch.bool) + gt = register_pointwise(aten.gt, override_return_dtype=torch.bool) + register_pointwise(aten.eq, override_return_dtype=torch.bool) + register_pointwise(aten.ne, override_return_dtype=torch.bool) + + register_pointwise_numeric(aten.cosh) + register_pointwise_numeric(aten.sinh) + 
register_pointwise_numeric(aten.acos) + register_pointwise_numeric(aten.acosh) + register_pointwise_numeric(aten.asin) + register_pointwise_numeric(aten.asinh) + register_pointwise_numeric(aten.atan2) + register_pointwise_numeric(aten.atan) + register_pointwise_numeric(aten.atanh) + register_pointwise_numeric(aten.copysign) + register_pointwise_numeric(aten.erfc) + register_pointwise_numeric(aten.erfinv) + register_pointwise_numeric(aten.hypot) + register_pointwise_numeric(aten.log10) + register_pointwise_numeric(aten.log2) + register_pointwise_numeric(aten.nextafter) + + + register_inplace(aten.add_, add) + register_inplace(aten.bitwise_and_, bitwise_and) + register_inplace(aten.bitwise_left_shift_, bitwise_left_shift) + register_inplace(aten.bitwise_not_, bitwise_not) + register_inplace(aten.bitwise_or_, bitwise_or) + register_inplace(aten.bitwise_right_shift_, bitwise_right_shift) + register_inplace(aten.bitwise_xor_, bitwise_xor) + register_inplace(aten.mul_, mul) + register_inplace(aten.div_.Tensor, div) + register_inplace(aten.div_.Tensor_mode, div_mode) + register_inplace(aten.logical_and_, logical_and) + register_inplace(aten.logical_not_, logical_not) + register_inplace(aten.logical_or_, logical_or) + register_inplace(aten.logical_xor_, logical_xor) + register_inplace(aten.sub_, sub) + register_inplace(aten.relu_, relu) + register_inplace(aten.sigmoid_, sigmoid) + + + register_lowering(aten.__and__)(bitwise_and) + register_lowering(aten.__lshift__)(bitwise_left_shift) + register_lowering(aten.__or__)(bitwise_or) + register_lowering(aten.__rshift__)(bitwise_right_shift) + register_lowering(aten.__xor__)(bitwise_xor) + + register_inplace(aten.__iand__, aten.__and__) + register_inplace(aten.__ilshift__, aten.__lshift__) + register_inplace(aten.__ior__, aten.__or__) + register_inplace(aten.__irshift__, aten.__rshift__) + register_inplace(aten.__ixor__, aten.__xor__) + + + +########################################################################## + + @register_lowering(aten.mean) + def mean(x, axis=None, keepdim=False, *, dtype=None): + if dtype is not None: + x = to_dtype(x, dtype) + size = x.get_size() + axis = lowering._validate_reduction_axis(x, axis) + # compute in higher-precision until end of mean lowering + output_dtype = x.get_dtype() + if output_dtype in (torch.float16, torch.bfloat16): + x = to_dtype(x, torch.float) + sum_result = sum_(x, axis, keepdim) + denom = sympy_product(size[i] for i in axis) + denom = ir.IndexingConstant(index=denom, dtype=x.get_dtype(), device=x.get_device()) + denom = ExpandView.create(denom, list(sum_result.get_size())) + return to_dtype(div(sum_result, denom), output_dtype) + + + @register_lowering(aten.cumsum) + def cumsum(x, axis=None, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + # torch.int64->torch.int32 + dtype = torch.int32 + if len(x.get_size()) == 0: + if axis not in [0, -1]: + raise ValueError("axis must be 0 or -1") + dtype = dtype or x.get_dtype() + return to_dtype(x, dtype, copy=True) + return lowering.fallback_cumsum(x, dim=axis, dtype=dtype) + + + @register_lowering(npu.npu_dtype_cast, type_promotion_kind=None) + def _convert_npu_type(x: TensorBox, dtype: torch.dtype): + return to_dtype(x, dtype, copy=True) + + + def var_mean_sum_(x, axis, correction, keepdim, return_mean): + if correction is None: + correction = 1 + + size = x.get_size() + axis = lowering._validate_reduction_axis(x, axis) + x_mean = mean(x, axis, keepdim=True) + if return_mean: + x_mean.realize() 
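# [editor's illustrative aside -- not part of this patch]
# A quick eager-mode check of the formula that var_mean_sum_ lowers to below:
#   var = sum((x - mean) ** 2) / max(N - correction, 0)
# which agrees with torch.var_mean for the same correction value.
import torch

x = torch.randn(4, 8)
var_ref, mean_ref = torch.var_mean(x, dim=1, correction=1, keepdim=True)
mean_manual = x.mean(dim=1, keepdim=True)
var_manual = (x - mean_manual).pow(2).sum(dim=1, keepdim=True) / max(x.shape[1] - 1, 0)
torch.testing.assert_close(var_ref, var_manual)
torch.testing.assert_close(mean_ref, mean_manual)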
+ + diffs = square(sub(x, x_mean)) + sum_result = sum_(diffs, axis, keepdim) + denom = sympy_product(size[i] for i in axis) + if correction: + denom = sympy.Max(denom - correction, 0) + denom = ir.IndexingConstant(index=denom, dtype=x.get_dtype(), device=x.get_device()) + denom = ExpandView.create(denom, list(sum_result.get_size())) + x_var = div(sum_result, denom) + if not return_mean: + return (x_var,) + + x_mean = x_mean if keepdim else squeeze(x_mean, axis) + return x_var, x_mean + + + def var_mean_helper_(x, *, axis, correction, keepdim, return_mean): + out_dtype = x.get_dtype() + compute_dtype = get_computation_dtype(out_dtype) + x = to_dtype(x, compute_dtype, copy=False) + kwargs = dict( + x=x, + axis=axis, + correction=correction, + keepdim=keepdim, + return_mean=return_mean, + ) + output = ( + var_mean_sum_(**kwargs) + ) + output = tuple(to_dtype(x, out_dtype, copy=False) for x in output) + return output[0] if not return_mean else output + + @register_lowering(aten.var_mean) + def var_mean(x, axis=None, *, correction=None, keepdim=False): + return var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=True + ) + + @register_lowering([aten.var, prims.var]) + def var_(x, axis=None, *, correction=None, keepdim=False): + return var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=False + ) + + @register_lowering(aten.embedding, type_promotion_kind=None) + def embedding(weight, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False): + return lowering.fallback_handler(aten.embedding.default)(weight, indices, padding_idx=-1, scale_grad_by_freq=False, + sparse=False) + + @register_lowering(aten.cat) + def cat(inputs, dim=0): + return lowering.fallback_handler(aten.cat.default)(inputs, dim) + + lowering.make_fallback(aten._log_softmax) + lowering.make_fallback(aten.gather) + lowering.make_fallback(aten.nll_loss_forward) \ No newline at end of file diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index 42575d3171..c01db2fff3 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -1,7 +1,10 @@ # This file is based on triton_heuristics with heuristics designed for NPU import os +import sys import functools +import time import copy +import importlib from typing import Any, Callable, List, Optional import logging import re @@ -21,6 +24,7 @@ from torch._inductor.runtime.triton_heuristics import ( _find_names, get_first_attr, collected_calls, + _dump_launch_params, ) from torch._inductor.runtime.benchmarking import benchmarker from torch._inductor.runtime.autotune_cache import AutotuneCache @@ -32,6 +36,8 @@ from torch._inductor.runtime.runtime_utils import ( ) +from torch._inductor.compile_fx import clone_preserve_strides + import triton from triton.compiler import CompiledKernel @@ -50,6 +56,7 @@ from .codegen.tile_generator import TileGenerator from .codegen.triton_utils import get_aligned_numel from .config import aggresive_autotune from .config import log +from . 
import config as npu_config # torch-261 @@ -120,7 +127,6 @@ class NPUCachingAutotuner(CachingAutotuner): self.configs = None - def _precompile_config(self, cfg: Config, warm_cache_only: bool): """Ahead of time compile a given autotuner config.""" @@ -484,7 +490,171 @@ class NPUCachingAutotuner(CachingAutotuner): return do_bench_using_profiling(kernel_call, warmup=10, rep=40) # remove fast_flush=True for high version triton return benchmarker.benchmark_gpu(kernel_call, rep=40) + + + def get_fx_graph_call(self, auto_fallback=False): + kernel_name = self.inductor_meta.get("kernel_name", "triton_") + traced_graph_hash = self.inductor_meta.get("traced_graph_hash") + dump_path = os.getenv(traced_graph_hash, None) + if not dump_path: + raise RuntimeError(f"Can not find DUMP PATH of traced fx graph. kernel_name is {kernel_name}.") + sys.path.append(dump_path) + fx_module = importlib.import_module(traced_graph_hash) + sys.path.remove(dump_path) + + model = fx_module.model + num_inputs = fx_module.num_inputs + num_outputs = fx_module.num_outputs + non_contiguous_indices = fx_module.non_contiguous_indices + mismatch_indices_shapes = fx_module.mismatch_indices_shapes + + def fx_graph_call(*fx_args): + fx_inputs = [fx_args[idx].contiguous() if idx in non_contiguous_indices['inputs'] else \ + fx_args[idx] for idx in range(num_inputs)] + if len(mismatch_indices_shapes): + for ind, shape in mismatch_indices_shapes.items(): + if ind >= num_inputs: + break + fx_inputs[ind] = fx_inputs[ind].reshape(shape) + model_outputs = model.forward(*fx_inputs) + for idx, (out1, out2) in enumerate(zip(model_outputs, fx_args[num_inputs:(num_inputs + num_outputs)])): + out1 = out1.reshape(out2.shape) + if idx in non_contiguous_indices['outputs']: + out2.copy_(out1) + else: + out2.data = out1.data + + def fallback_call(*args): + fx_args = [args[idx] for idx in fx_module.call_args_mapping] + return fx_graph_call(*fx_args) + if auto_fallback: + return fallback_call, kernel_name + return fx_graph_call, kernel_name, dump_path, fx_module + + def data_dump(self, *args, dump_path=None): + data_dump_path = os.path.join(dump_path, 'data.pth') + torch.save(args, data_dump_path) + + def check_accuracy(self, *args, launcher, grid, stream, **kwargs): + + fx_graph_call, kernel_name, dump_path, fx_module = self.get_fx_graph_call() + call_outputs_indices = fx_module.call_args_mapping[fx_module.num_inputs:] + self.data_dump(*args, dump_path=dump_path) + + fx_args = [] + for idx in fx_module.call_args_mapping: + arg = args[idx] + if isinstance(arg, torch.Tensor): + fx_arg = clone_preserve_strides(arg).float() if arg.dtype == torch.bfloat16 else clone_preserve_strides(arg) + fx_args.append(fx_arg) + + fx_graph_call(*fx_args) + + ret = launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) + for actual, expected in zip([args[i] for i in call_outputs_indices], fx_args[fx_module.num_inputs:]): + if actual.dtype != expected.dtype: + expected = expected.to(actual.dtype) + acc_comp_tol = npu_config.acc_comp_tol.get(actual.dtype, npu_config.acc_comp_tol['default']) + rtol = acc_comp_tol['rtol'] + atol = acc_comp_tol['atol'] + + matches = torch.isclose( + actual, expected, rtol=rtol, atol=atol, equal_nan=False + ) + if not matches.all(): + abs_diff = torch.abs(actual - expected) + rel_diff = abs_diff / torch.abs(expected) + rel_diff.masked_fill_(matches, 0) + print(f"CHECK ACCURACY FAILED! 
Greatest Relative Difference: {rel_diff.max().item()}, " f"Kernel Name: {kernel_name}", flush=True) + print(f"kernel {kernel_name} Dump Path: {dump_path}") + actual.copy_(expected) + del matches + for arg in fx_args: + del arg + return ret + + + def run( + self, *args, grid, stream, benchmark_run=False, **kwargs + ): # type:ignore[override] + if self.triton_interpret: + return self.fn[grid]( + *args, + **kwargs, + **self.configs[0].kwargs, + ) + + if hasattr(self.launchers[0], "fallback"): + return self.launchers[0]( + *args, + **kwargs, + ) + if len(self.launchers) != 1: + if len(self.launchers) == 0: + start_time = time.time_ns() + self.precompile() + self.precompile_time_taken_ns = time.time_ns() - start_time + if len(self.launchers) > 1: + self.autotune_to_one_config(*args, grid=grid, **kwargs) + + if not getattr( + self.launchers[0].config, "found_by_coordesc", False + ) and self.inductor_meta.get("coordinate_descent_tuning", False): + self.launchers = [ + self.coordinate_descent_tuning( + self.launchers[0], *args, grid=grid, **kwargs + ) + ] + + (launcher,) = self.launchers + if launcher.store_cubin and (not benchmark_run or not self.cuda_kernel_saved): + self.save_gpu_kernel(grid, stream, launcher) + + if self.dump_launch_params: + _dump_launch_params(args, kwargs, launcher, self.fn.__name__) + + if npu_config.check_accuracy: + return self.check_accuracy(*args, launcher=launcher, grid=grid, stream=stream, **kwargs) + + # it is faster than entering and exiting a context manager, even if the context + # manager is a nullcontext. + if autograd_profiler._is_profiler_enabled: + # grid can be a tuple of ints or a string. + if isinstance(grid, tuple): + grid_info = str(grid) + else: + grid_info = getattr(grid, "grid_fn_str", "") + + with torch._C._profiler._RecordFunctionFast( + self.inductor_meta.get("kernel_name", "triton kernel"), + args, + { + "kernel_file": (self.filename or ""), + "kernel_hash": self.kernel_hash, + "kernel_backend": "triton", + "grid": grid_info, + "stream": stream, + }, + ): + return launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) + else: + return launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) class NPUDebugAutotuner(NPUCachingAutotuner): -- Gitee From 2fe37ca8f66a3f950747fc7b52d116153158290e Mon Sep 17 00:00:00 2001 From: rmch Date: Tue, 29 Apr 2025 17:13:31 +0800 Subject: [PATCH 342/358] skip instead of raise runtime error --- torch_npu/_inductor/codegen/schduling.py | 14 ++++++++------ torch_npu/_inductor/lowering_fx.py | 7 +++---- torch_npu/_inductor/npu_triton_heuristics.py | 11 +++++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/torch_npu/_inductor/codegen/schduling.py b/torch_npu/_inductor/codegen/schduling.py index e531f6b8a7..2b6decf07b 100644 --- a/torch_npu/_inductor/codegen/schduling.py +++ b/torch_npu/_inductor/codegen/schduling.py @@ -158,12 +158,14 @@ class NPUTritonScheduling(TritonScheduling): final_kernel.call_kernel(final_kernel.kernel_name) if npu_config.check_accuracy: - compile_kwargs |= create_compile_kwargs(final_kernel, fx_call_args, fx_args) - fx_dump_path = os.path.join(npu_config.traced_fx_graph_cache, traced_graph_hash) - os.makedirs(fx_dump_path, exist_ok=True) - fx_code = generate_fx_graph_code(traced_graph.code, src_code, kernel_name, compile_kwargs) - dump_fx_graph_code(fx_code, fx_dump_path, traced_graph_hash) - os.environ[traced_graph_hash] = fx_dump_path + new_compile_kwargs = create_compile_kwargs(final_kernel, fx_call_args, fx_args) + if new_compile_kwargs: + 
compile_kwargs |= new_compile_kwargs + fx_dump_path = os.path.join(npu_config.traced_fx_graph_cache, traced_graph_hash) + os.makedirs(fx_dump_path, exist_ok=True) + fx_code = generate_fx_graph_code(traced_graph.code, src_code, kernel_name, compile_kwargs) + dump_fx_graph_code(fx_code, fx_dump_path, traced_graph_hash) + os.environ[traced_graph_hash] = fx_dump_path if config.nan_asserts: final_kernel.codegen_nan_check() diff --git a/torch_npu/_inductor/lowering_fx.py b/torch_npu/_inductor/lowering_fx.py index 5de2721d5d..e0ff1308be 100644 --- a/torch_npu/_inductor/lowering_fx.py +++ b/torch_npu/_inductor/lowering_fx.py @@ -513,8 +513,8 @@ def create_fx_from_snodes_by_traced_graph(snodes: List[scheduler.SchedulerNode]) last_node = get_last_node(gm) fx_output_nodes = last_node.args[0] fx_outputs = [node.meta['val'] for node in fx_output_nodes] - non_contiguous_indices["outputs"] = [i for i, node in enumerate(fx_output_nodes) \ - if not node.meta['val'].is_contiguous()] + non_contiguous_indices["outputs"] = [i + num_inputs for i, call_output in enumerate(fx_call_outputs) \ + if not V.graph.try_get_buffer(call_output).layout.is_contiguous()] fx_args = fx_inputs + fx_outputs return gm, fx_call_args, fx_args, { @@ -533,8 +533,7 @@ def create_compile_kwargs(final_kernel, fx_call_args, fx_args): fx_arg_shapes = [fx_arg.shape for fx_arg in fx_args if isinstance(fx_arg, torch.Tensor)] if set(kernel_call_args) != set(fx_call_args): - # breakpoint() - raise RuntimeError(f"kernel call args and fx graph call args are NOT SAME. kernel_call_args: {kernel_call_args}, fx_call_args: {fx_call_args}") + return None grid: List[Any] = [] final_kernel.add_numel_to_call_args_and_grid(final_kernel.kernel_name, kernel_call_args, arg_types, grid) diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index c01db2fff3..6a8dd8e0f1 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -497,7 +497,7 @@ class NPUCachingAutotuner(CachingAutotuner): traced_graph_hash = self.inductor_meta.get("traced_graph_hash") dump_path = os.getenv(traced_graph_hash, None) if not dump_path: - raise RuntimeError(f"Can not find DUMP PATH of traced fx graph. kernel_name is {kernel_name}.") + return None sys.path.append(dump_path) fx_module = importlib.import_module(traced_graph_hash) sys.path.remove(dump_path) @@ -536,7 +536,9 @@ class NPUCachingAutotuner(CachingAutotuner): torch.save(args, data_dump_path) def check_accuracy(self, *args, launcher, grid, stream, **kwargs): - + fx_call_and_kwargs = self.get_fx_graph_call() + if not fx_call_and_kwargs: + return None fx_graph_call, kernel_name, dump_path, fx_module = self.get_fx_graph_call() call_outputs_indices = fx_module.call_args_mapping[fx_module.num_inputs:] self.data_dump(*args, dump_path=dump_path) @@ -576,7 +578,7 @@ class NPUCachingAutotuner(CachingAutotuner): del matches for arg in fx_args: del arg - return ret + return True def run( @@ -620,7 +622,8 @@ class NPUCachingAutotuner(CachingAutotuner): _dump_launch_params(args, kwargs, launcher, self.fn.__name__) if npu_config.check_accuracy: - return self.check_accuracy(*args, launcher=launcher, grid=grid, stream=stream, **kwargs) + if self.check_accuracy(*args, launcher=launcher, grid=grid, stream=stream, **kwargs): + return # it is faster than entering and exiting a context manager, even if the context # manager is a nullcontext. 
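
For reference, the accuracy check touched by the two patches above reduces to a per-dtype torch.isclose comparison between the Triton kernel's outputs and the traced FX graph's reference outputs, falling back to the reference values on mismatch. The following is a minimal, self-contained sketch of that comparison step only; the helper name compare_kernel_output and the tolerance values are illustrative (the real table lives in torch_npu._inductor.config.acc_comp_tol), and this is not the patched implementation itself.

    import torch

    # Illustrative tolerance table; mirrors the structure used by acc_comp_tol
    # (per-dtype entries plus a 'default' fallback). Values here are assumptions.
    ACC_COMP_TOL = {
        torch.float16: {"rtol": 1e-3, "atol": 1e-3},
        torch.bfloat16: {"rtol": 1e-2, "atol": 1e-2},
        "default": {"rtol": 1e-4, "atol": 1e-4},
    }

    def compare_kernel_output(actual: torch.Tensor, expected: torch.Tensor, kernel_name: str) -> bool:
        # Compare one kernel output against the FX-graph reference, in the spirit of check_accuracy.
        if actual.dtype != expected.dtype:
            expected = expected.to(actual.dtype)
        tol = ACC_COMP_TOL.get(actual.dtype, ACC_COMP_TOL["default"])
        matches = torch.isclose(actual, expected, rtol=tol["rtol"], atol=tol["atol"], equal_nan=False)
        if not bool(matches.all()):
            abs_diff = torch.abs(actual - expected)
            rel_diff = abs_diff / torch.abs(expected)
            rel_diff.masked_fill_(matches, 0)  # report only the elements that actually failed
            print(f"CHECK ACCURACY FAILED! Greatest Relative Difference: {rel_diff.max().item()}, "
                  f"Kernel Name: {kernel_name}", flush=True)
            actual.copy_(expected)  # overwrite with the reference result, as the kernel path does
            return False
        return True

With the 342 change, a missing dump path makes the whole check return None instead of raising, so run() simply proceeds with the normal launcher call.
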
-- Gitee From 5a2c5a4aaed529bbbce5d78a312cabf396a7b402 Mon Sep 17 00:00:00 2001 From: kaixin Date: Wed, 30 Apr 2025 00:03:32 +0800 Subject: [PATCH 343/358] move inductor_npu auto_tiling refactoring to torch_npu --- test/_inductor/test_embedding.py | 18 +- test/_inductor/test_issue54.py | 9 +- test/_inductor/test_issue57.py | 3 - test/_inductor/test_issue59.py | 3 - torch_npu/_inductor/__init__.py | 14 +- .../_inductor/codegen/kernel_analysis.py | 312 +++++ .../_inductor/codegen/npu_kernel_features.py | 2 +- torch_npu/_inductor/codegen/schduling.py | 14 +- torch_npu/_inductor/codegen/split_tiling.py | 363 +++-- torch_npu/_inductor/codegen/tile_generator.py | 348 +++-- torch_npu/_inductor/codegen/triton.py | 1226 +++++++---------- torch_npu/_inductor/codegen/wrapper.py | 16 +- torch_npu/_inductor/config.py | 2 +- torch_npu/_inductor/decomposition.py | 4 +- torch_npu/_inductor/lowering.py | 109 +- torch_npu/_inductor/npu_triton_heuristics.py | 173 +-- torch_npu/_inductor/runtime.py | 28 + 17 files changed, 1451 insertions(+), 1193 deletions(-) create mode 100644 torch_npu/_inductor/codegen/kernel_analysis.py diff --git a/test/_inductor/test_embedding.py b/test/_inductor/test_embedding.py index c7732ec5c6..dce1208cb0 100644 --- a/test/_inductor/test_embedding.py +++ b/test/_inductor/test_embedding.py @@ -5,13 +5,17 @@ import torch_npu import torch_npu._inductor import pytest -#from testutils import OperatorType, TestUtils +#from .testutils import OperatorType, TestUtils import torch.nn as nn class TestSub(): - def op_calc(self): + def op_calc(self, input): embedding = nn.Embedding(16, 128).npu() + output = embedding(input) + return output + + def test_pointwise_cases(self): input = torch.tensor([[14, 1, 2, 10, 0, 10, 0], [ 9, 13, 13, 4, 7, 15, 14], @@ -27,16 +31,10 @@ class TestSub(): [ 3, 9, 8, 4, 13, 8, 3], [ 4, 10, 8, 13, 6, 8, 3]], device='npu:0') - output = embedding(input.npu()) - return output - - def test_pointwise_cases(self): - torch_npu._inductor.config.enable_npu_indexing = True - - std_sub = self.op_calc() + std_sub = self.op_calc(input) compiled_op_calc = torch.compile(self.op_calc, backend="inductor") - inductor_sum = compiled_op_calc() + inductor_sum = compiled_op_calc(input) #torch.testing.assert_close(std_sub, inductor_sum) diff --git a/test/_inductor/test_issue54.py b/test/_inductor/test_issue54.py index 2f532c059b..ce943a616d 100644 --- a/test/_inductor/test_issue54.py +++ b/test/_inductor/test_issue54.py @@ -8,7 +8,6 @@ import torch_npu._inductor import pytest from torch.nn import CrossEntropyLoss from torch import nn -from test2.npu_indexing.utils import benchmark_test class Test_issue54(): @@ -62,14 +61,10 @@ class Test_issue54(): func = torch.compile(test.func_layernorm, backend="inductor", dynamic=False, options={"unroll_reductions_threshold": 1, "aggressive_fusion": True}) calc = func(add_3, primals_6, primals_7, view, primals_9, permute_1, primals_10, primals_11) - torch.testing.assert_close(ref[0], calc[0], rtol=1e-3, atol=1e-3) - torch.testing.assert_close(ref[1], calc[1], rtol=1e-3, atol=1e-3) + torch.testing.assert_close(ref[0], calc[0], rtol=1e-2, atol=1e-2) + torch.testing.assert_close(ref[1], calc[1], rtol=1e-2, atol=1e-2) print("valid ok") - benchmark_test(test.func_layernorm, func, - args=(add_3, primals_6, primals_7, view, primals_9, permute_1, primals_10, primals_11,), - name="test_layernorm", times=10, repeat=10, profile=False) - if __name__ == "__main__": test = Test_issue54() diff --git a/test/_inductor/test_issue57.py b/test/_inductor/test_issue57.py 
index 5ad6be8e2d..ac0cde11ae 100644 --- a/test/_inductor/test_issue57.py +++ b/test/_inductor/test_issue57.py @@ -5,7 +5,6 @@ import torch import torch_npu import torch_npu._inductor import pytest -from test2.npu_indexing.utils import benchmark_test class Test_issue57(): @@ -39,8 +38,6 @@ class Test_issue57(): torch.testing.assert_close(ref, calc, rtol=1e-3, atol=1e-3) print("valid ok") - benchmark_test(test.op_sum, func, args=(view_12, embedding_1, primals_221), - name="issue57", times=10, repeat=10, profile=False) if __name__ == "__main__": diff --git a/test/_inductor/test_issue59.py b/test/_inductor/test_issue59.py index a1644749e4..eac1ae795b 100644 --- a/test/_inductor/test_issue59.py +++ b/test/_inductor/test_issue59.py @@ -4,7 +4,6 @@ import torch import torch_npu import torch_npu._inductor import pytest -from test2.npu_indexing.utils import benchmark_test class Test_issue59(): @@ -38,8 +37,6 @@ class Test_issue59(): torch.testing.assert_close(mean_2, mean_2_t, rtol=1e-3, atol=1e-3) print("valid ok") - benchmark_test(test.layernorm_backward, func, args=(x, y, z), - name="issue59", times=10, repeat=10, profile=False) if __name__ == "__main__": diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index dd90221072..c513d215b8 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -4,7 +4,7 @@ from torch._inductor.codegen.common import register_backend_for_device, register from torch._dynamo.device_interface import register_interface_for_device, get_interface_for_device from torch._inductor import lowering as inductor_lowering from torch._inductor.choices import InductorChoices - +from torch._inductor.runtime import autotune_cache from torch_npu.utils._inductor import NPUDeviceOpOverrides from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device from torch_npu.npu.utils import device_count @@ -20,6 +20,8 @@ from . import config as npu_config from . import codegen from . import npu_fusion_attention_graph from . 
import dynamo_embedding_backward_dispatch +from .runtime import _load_cached_autotuning + npulog.info("perform torch_npu._inductor patch") @@ -67,9 +69,14 @@ class NewNpuInterface(NpuInterface): def maybe_exchange_device(device_id: int) -> int: return device_id + @staticmethod + def is_bf16_supported(including_emulation: bool = False): + return True + register_interface_for_device("npu", NewNpuInterface) -register_interface_for_device("npu:0", NewNpuInterface) +for i in range(16) : + register_interface_for_device(f"npu:{i}", NewNpuInterface) device = get_interface_for_device("npu") inductor_lowering.make_reduction = make_reduction @@ -99,4 +106,5 @@ if (aggresive_autotune): import os os.environ["TRITON_BENCH_METHOD"] = "npu" -InductorChoices.should_use_persistent_reduction = should_use_persistent_reduction \ No newline at end of file +InductorChoices.should_use_persistent_reduction = should_use_persistent_reduction +autotune_cache._load_cached_autotuning = _load_cached_autotuning \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/kernel_analysis.py b/torch_npu/_inductor/codegen/kernel_analysis.py new file mode 100644 index 0000000000..266a752626 --- /dev/null +++ b/torch_npu/_inductor/codegen/kernel_analysis.py @@ -0,0 +1,312 @@ +from torch._inductor.virtualized import V +from torch._inductor.utils import sympy_index_symbol +from torch._inductor.scheduler import SchedulerNode +from typing import List, Tuple +from torch._inductor import ir +import sympy +import pdb + +class IndexAnalysis : + def __init__(self, kernel, raw_index, is_store_index = False ) : + self.index = raw_index.subs(V.graph.sizevars.var_to_val) + self.kernel = kernel + self.tiling_axis = [x.symbol() for x in self.kernel.tiling_axis] + # self.var_stride = None # var list [(r,1),(x,2),(y,4),(z,24)] + # self.var_list = None # sorted by stride [r,x,y,z], in reversed order + self.stride_list = None # stride list [1,2,4,24] + self.reshape_sizes = [] # [RBLOCK, 1, 1, XBLOCK_SUB] + self.broadcast_sizes = [] # [RBLOCK, XBLOCK_SUB] + self.permute_shape = [] # [0,2,1,3] + self.var_replacements = {} # r2 ->r2_0, etc + self.var_directions = {} # r2_0 -> [None,:,None] + self.similar = None #(r,x,z,y) + self.need_permute = False + self.need_broadcast = False + self.need_reshape = False + self.gold = kernel.golden_var_list #tuple([x.symbol() for x in reversed(kernel.tiling_axis)]) + self.var_stride = [(key,coeff) for key, coeff in self.index.as_coefficients_dict().items() if not isinstance(key, sympy.Integer)] + # sort by stride + self.var_stride.sort(key = lambda x : x[1] ) + # only contains tiing axis var + self.var_list= tuple([x[0] for x in self.var_stride if x[0] in self.tiling_axis ]) + self.stride_list = tuple([x[1] for x in self.var_stride if x[0] in self.tiling_axis]) + self.is_store_index = is_store_index + + + def get_most_similar_shape(self) : + matched_dims = 0 + self.similar = None + for vars in self.kernel.index_analysis.keys() : + if len(vars) != len(self.gold) : + continue + i = 0 + while i < len(self.var_list) : + if vars[i] == self.var_list[i] : + i = i + 1 + else : + break + + if i > matched_dims : + matched_dims = i + self.similar = vars + return self.similar + + def same_var_list(self, var1, var2) : + if len(var1) != len(var2) : + return False + for i, v in enumerate(var1) : + if v != var2[i] : + return False + return True + + def shrink_permute_shape(self, permute_shape) : + diff = len(self.gold) - len(self.kernel.tiling_axis) + new_shape = [x for x in permute_shape if x - diff >= 0] + return 
new_shape + + def analyze_permute_shape(self): + if self.gold == self.similar: + self.need_permute = False + return + + similar = tuple(reversed(self.similar)) + gold = tuple(reversed(self.gold)) + self.permute_shape = [None] * len(gold) + + # kernel_name = self.kernel.get_kernel_name("", self.kernel.node_schedule, self.kernel) + # if kernel_name == "triton_unk_fused_add_clone_tanh_20" : + # pdb.set_trace() + + if self.is_store_index : + for i, x in enumerate(similar) : + if x != gold[i] : + index = gold.index(x) + self.permute_shape[i] = index + self.need_permute = True + else : + self.permute_shape[i] = i + return + + for i, x in enumerate(gold) : + if x != similar[i] : + index = similar.index(x) + self.permute_shape[i] = index + self.need_permute = True + else : + self.permute_shape[i] = i + + def analyze_broadcast_sizes(self) : + if not self.need_reshape : + self.need_broadcast = False + return + self.need_broadcast = True + reversed_similar = reversed(self.similar) + similar = [x for x in reversed_similar] + self.broadcast_sizes = ["1"] * len(similar) + for i, x in enumerate(similar) : + self.broadcast_sizes[i] = f"{x.name.upper()}BLOCK_SUB" + + def analyze_reshape_sizes(self) : + if all(x in self.var_list for x in self.tiling_axis ) : + self.need_reshape = False + return + self.need_reshape = True + reversed_similar = reversed(self.similar) + similar = [x for x in reversed_similar ] + var_list = [x for x in reversed(self.var_list) ] + self.reshape_sizes = ["1"] * len(similar) + for i, x in enumerate(var_list): + index = similar.index(x) + self.reshape_sizes[index] = f"{x.name.upper()}BLOCK_SUB" + + def analyze_var_direction(self) : + if self.var_list == self.gold : + return + var_list = self.var_list if len(self.var_list) == len(self.gold) else self.similar + if var_list == self.gold : + return + if not var_list : + return + var_list = list(tuple(reversed(var_list))) + gold = list(tuple(reversed(self.gold))) + assert len(var_list) == len(gold) + var_list = [x for x in var_list if x in self.kernel.tiling_axis] + gold = [x for x in gold if x in self.kernel.tiling_axis] + for i, x in enumerate(gold ): + index = var_list.index(x) + if(index == i) : + continue + new_var = sympy_index_symbol(f"{x}_{index}") + if new_var in self.var_replacements: + continue + direction = ["None"] * len(gold) + direction[index] = ":" + direction_str = f"[{','.join(direction)}]" + self.var_replacements[x] = new_var + self.var_directions[new_var] = direction_str + self.kernel.range_tree_nodes[x].var_directions[new_var] = direction_str + + + def analyze_index(self) : + if isinstance(self.index, sympy.Integer ) : + return + if not self.kernel.golden_var_list : + self.kernel.select_golden_varlist() + self.gold = self.kernel.golden_var_list + + assert self.gold is not None + assert len(self.gold) == len(self.tiling_axis) + + def all_tiling_in_var_list() : + return all([x in self.var_list for x in self.tiling_axis] ) + #2 analyze permute shape for full_dim_len index + if all_tiling_in_var_list() : + self.similar = self.var_list + self.analyze_permute_shape() + if self.var_list not in self.kernel.index_analysis : + self.kernel.index_analysis[self.var_list] = self + #3. 
analyze reshape and broadcast sizes + else : + pass + # self.similar = self.get_most_similar_shape() + # if self.similar is None : + # return + # self.analyze_reshape_sizes() + # self.analyze_broadcast_sizes() + # self.analyze_permute_shape() + + #4 analyze var direction + self.analyze_var_direction() + + def generate_statement(self) : + statement = "" + if self.need_reshape : + reshape_sizes = f"[{','.join(self.reshape_sizes)}]" + statement = f".reshape({reshape_sizes})" + if self.need_broadcast: + broadcast_sizes = f"[{','.join(self.broadcast_sizes)}]" + statement = f"{statement}.broadcast_to({broadcast_sizes})" + if self.need_permute: + statement = f"{statement}.permute({self.permute_shape})" + return statement + +class ReductionAnalysis : + def __init__(self, kernel) : + self.kernel = kernel + self.reduction = None + self.reduced_dim = None + if self.numof_reduction_axis() > 1 : + self.kernel.persistent_reduction = True + self.reduced_dim = 0 + return + + reduction = self.kernel.find_reduction_node() + if reduction is None or not isinstance(reduction, ir.Reduction) : + raise RuntimeError("failed to get one reduction node") + if not hasattr(reduction, "reduced_idx") : + raise RuntimeError("reduction node doesn't have attr reduced_idx") + self.reduction = reduction + self.reduced_dim = self.analyze_reduction_dim() + + def is_higher_order_reduction(self ): + return self.dim < len(self.kernel.tiling_axis) -1 + + def is_1d_reduction(self) : + return self.kernel.numels["r"] > 1 and len(self.kernel.numels) == 1 + + def get_reduce_dim_reshape(self, reduce_axis) : + if self.is_1d_reduction(): + shape_str = f"[{reduce_axis.name.upper()}BLOCK_SUB]" + else : + shape = ["1"] * len(self.kernel.tiling_axis) + shape[self.reduced_dim] = f"{reduce_axis.name.upper()}BLOCK_SUB" + shape_str = f"[{','.join(shape)}]" + return shape_str + + def dense_size_list(self) -> List[str]: + sizes = [f"{x.name.upper()}BLOCK_SUB" for x in self.kernel.tiling_axis] + if self.numof_reduction_axis() > 1 : + return sizes + + reduce_axis = self.kernel.tiling_axis[-1] + sizes.pop(-1) + sizes.insert(self.reduced_dim, f"{reduce_axis.name.upper()}BLOCK_SUB" ) + return sizes + + def dense_size_str(self) : + sizes = self.dense_size_list() + if self.numof_reduction_axis() > 1: + return f"[{'* '.join(sizes)}]" + return f"[{', '.join(sizes)}]" + + def numof_reduction_axis(self): + return self.kernel.numof_reduction_axis() + + def reduction_axis_list(self): + return self.kernel.reduction_axis_list() + + def analyze_reduction_dim(self) : + + if self.numof_reduction_axis() > 1 : + self.kernel.persistent_reduction = True + self.reduced_dim = 0 + return 0 + + if not self.kernel.golden_var_list : + self.kernel.select_golden_varlist() + assert self.kernel.golden_var_list is not None + + dim = -1 + for i, x in enumerate(reversed(self.kernel.golden_var_list)) : + if x.name[0] == 'r' : + dim = i + break + return dim + + + + def analyze_reduction_dim1(self) : + # kernel_name = self.kernel.get_kernel_name("", self.kernel.node_schedule, self.kernel) + # if kernel_name == "triton_unk_fused_14" : + # pdb.set_trace() + + if self.numof_reduction_axis() > 1 : + self.kernel.persistent_reduction = True + self.reduced_dim = 0 + return 0 + reduction = self.reduction + # kept = [0,1,3], reduced = [2] + for i,x in enumerate(reduction.reduced_idx) : + if reduction.reduction_ranges[i] <=1 : + continue + reduced_idx = x + break + # the index (in reduction.ranges) of low_dims + low_dims = [i for i, x in enumerate(reduction.kept_idx) if x > reduced_idx] + if not 
low_dims : + return len(self.kernel.tiling_axis) -1 + elif len(low_dims) == len(reduction.kept_idx) : + return 0 + # reduction dim when low_dims are not meraged + dim = len(reduction.kept_idx) - len(low_dims) + + tiling_axis = self.kernel.tiling_axis[:-1] + merged =1 + j = len(tiling_axis) -1 + # remove all low_dims from tiling_axis + # all axis before ahead of j are high-orders + # then following is reduced dim + ranges = [x for x in reduction.ranges if x > 1] + for i in reversed(low_dims) : + len_axis = tiling_axis[j].length + len_reduction = ranges[i] * merged + if len_reduction < len_axis : + merged = merged * len_reduction + elif len_reduction == len_axis: + j = j - 1 + merged = 1 + else : + assert False, f"should not reach here low_dims({i})={len_reduction}, axis[{j}]=len)" + dim = j + 1 + return dim + \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/npu_kernel_features.py b/torch_npu/_inductor/codegen/npu_kernel_features.py index 57c1211e35..6dd3189b67 100644 --- a/torch_npu/_inductor/codegen/npu_kernel_features.py +++ b/torch_npu/_inductor/codegen/npu_kernel_features.py @@ -9,7 +9,7 @@ from torch._inductor.utils import cache_on_self from torch.utils._ordered_set import OrderedSet from torch._inductor.virtualized import V from torch._inductor.codegen.simd import SIMDScheduling - +from typing import Iterable class NumelList(Tuple): diff --git a/torch_npu/_inductor/codegen/schduling.py b/torch_npu/_inductor/codegen/schduling.py index 2b6decf07b..2fae3cd018 100644 --- a/torch_npu/_inductor/codegen/schduling.py +++ b/torch_npu/_inductor/codegen/schduling.py @@ -17,6 +17,7 @@ from torch._inductor.utils import sympy_index_symbol, ModularIndexing, FloorDiv from torch_npu._inductor.codegen.triton import NPUIndexTritonKernel, flatten from .split_tiling import SplitTiling +from torch.fx.immutable_collections import immutable_dict from .npu_kernel_features import NumelList, NPUKernelFeatures @@ -55,7 +56,7 @@ from ..lowering_fx import ( generate_fx_graph_code, dump_fx_graph_code ) - +from .kernel_analysis import ReductionAnalysis def flatten_groups(nums): res = [] @@ -172,6 +173,8 @@ class NPUTritonScheduling(TritonScheduling): if config.warn_mix_layout: final_kernel.warn_mix_layout(kernels[0].kernel_name) + V.graph.removed_buffers |= final_kernel.removed_buffers + V.graph.inplaced_to_remove |= final_kernel.inplaced_to_remove if ( V.graph.wrapper_code.supports_intermediate_hooks @@ -306,15 +309,18 @@ class NPUTritonScheduling(TritonScheduling): root.remove_entry(var) # select split and tiling axis split_tiling = SplitTiling(kernel) - split_tiling.select_tiling_axis() + split_tiling.select_split_tiling_axis() + kernel.load_store_indexing = split_tiling.indexing # debug print index transforms for node in node_schedule: if node in (EnableReduction, DisableReduction): continue for x, y in zip(node._body.indexing_exprs.values(), node._body.indexing.values()): print(f"index transform:{x}->{y}") - - + # ReductionAnalysis depends on kernel.load_store_indexing + if kernel.inside_reduction : + kernel.reduce_analysis = ReductionAnalysis(kernel) + def additional_nodes_to_be_subs(self, kernel, node_to_be_substituted): for node in kernel.range_tree_nodes.values(): if node.expr != sympy_index_symbol(f"{node.parent.prefix}index") \ diff --git a/torch_npu/_inductor/codegen/split_tiling.py b/torch_npu/_inductor/codegen/split_tiling.py index 7be80830d9..d52717fe71 100644 --- a/torch_npu/_inductor/codegen/split_tiling.py +++ b/torch_npu/_inductor/codegen/split_tiling.py @@ -1,217 +1,198 @@ 
-import sympy as sympy +import pdb from torch._inductor.codegen.triton import TritonKernel -from torch._inductor.utils import ModularIndexing, sympy_subs +from torch._inductor.utils import ModularIndexing,sympy_subs +import sympy as sympy +from ..config import num_vector_core, log from torch._inductor.virtualized import V -from torch._inductor.codegen.simd import (EnableReduction, DisableReduction) -from torch._inductor.runtime.runtime_utils import next_power_of_2 -from torch._inductor.loop_body import MemoryUsageType - +from torch._inductor.codegen.simd import ( EnableReduction, DisableReduction) +from torch._inductor.runtime.runtime_utils import next_power_of_2 from .triton_utils import get_aligned_numel -from ..config import num_vector_core, log - +from torch._inductor.loop_body import MemoryUsageType +from functools import reduce +from .kernel_analysis import IndexAnalysis # split and tiling axis selector -class SplitTiling: - - def __init__(self, kernel: TritonKernel): +class SplitTiling : + def __init__(self, kernel : TritonKernel) : self.kernel = kernel - self.indexing = [] - - def key(x): - # to be higher than x and y - if x.name[0] == 'w' or x.name[0] == 'v' or x.name[0] == 'p' or x.name[0] == 't': - return "z" + x.name - # to be lower than floor_dir - elif isinstance(x.expr, ModularIndexing): - return x.name[0] + "0" + x.name[1:] - else: - return x.name - + self.indexing = [] # load and store indexing among all scheduler nodes kernel.sorted_axis = [x for x in kernel.range_tree_nodes.values()] - kernel.sorted_axis.sort(reverse=True, key=key) + kernel.sorted_axis.sort(reverse=True, key = self.key) for i, dim in enumerate(kernel.sorted_axis): dim.sorted_order = i self.find_lowest_dimension() self.should_outer_reduce = False + self.possible_need_permute = self.find_possible_permutes() + + def find_possible_permutes(self) : + if len(self.kernel.low_dims) <= 1 : + return False + var_lists = [] + low_dims = [self.kernel.sorted_axis[x].symbol() for x in self.kernel.low_dims] + for index in self.indexing : + var_stride = [(key,coeff) for key, coeff in index.as_coefficients_dict().items() if not isinstance(key, sympy.Integer)] + var_stride.sort(key = lambda x : x[1] ) + var_list= tuple([x[0] for x in var_stride if x[0] in low_dims ]) + var_lists.append(var_list) + for i, var_list in enumerate(var_lists) : + if len(var_list) < len(low_dims) : + continue + for j, other in enumerate(var_lists) : + if i == j or len(other) < len(low_dims): + continue + if var_list != other : + return True + return False + + + def key(self, x) : + # to be higher than x and y + if x.name[0] == 'w' or x.name[0] == 'v' or x.name[0] == 't': + return "zz" + x.name + # to be lower than floor_dir + elif isinstance(x.expr, ModularIndexing): + return x.name[0] + "0" + x.name[1:] + else : + return x.name - # Split 原则1 :先做维度合并,再切分 。通过维度合并降维降低,split和tiling轴选择策略的复杂性 。 - # Split 原则2: 切分的数量要和AIcore的数量对齐(相同或是倍数)。每个核要分配的split的量一致。每个split形状要一致(包括维度和尺寸)。 - # Split 原则3: 对于规约类融合算子, 从非规约选择切分轴。对于非规约类融合算子, 从所有轴中选切分轴。 - # 为了tiling时刻的低维tilesize最大化,切分轴最好不是低维轴且长度大于aicore的数量 。 - # Split 原则4: 如果高维规约类融合算子,而且高维尺寸非常大( >= 64KB),低维度尺寸比较小( <= 32B), 可以选择对规约轴切分,然后在核间用atomic - # 原语做规约。 - # Split 原则5 :根据算子逻辑,优先选择一维发射。 + def total_split_numels(self, axis_list): + numels = [x.length for x in axis_list] + return reduce(lambda x,y:x*y, numels) if numels else 1 + + # Split 原则1 :先做维度合并,再切分 。通过维度合并降维降低split和tiling轴选择策略的复杂性 。 + # Split 原则2 : 切分轴尽量选择高维度的轴, 这样load/store 能够有比较好的线性度 , + # Split 原则3 : 规约轴和低维轴不应选为切分轴 。但如果高维规约类融合算子,而且高维尺寸非常大( >= 
64KB),其他维度不足以支持切分,可以考虑对规约轴切分。 + # Split 原则4 :切分轴的总numel 要超过 aicore总数。切分轴的数量最好不要超过3个(triton 最多支持三维发射), 因此 如果一点要超, 需要维度合并。 def select_split_axis(self): - - def select_longest_dim(can_be_low_dim=True): - longest = -1 - longest_dim = None - for x in candidates: - if SplitTiling.great_than(x.length, longest) and (can_be_low_dim or not self.is_lowest_dimension(x)): - longest_dim = x - longest = x.length - return longest_dim - # point-wise : all dims , reduction: outer_reduction dim or non-reduction dims - is_reduction = lambda x: x.prefix == 'r' - candidates = [x for x in self.kernel.sorted_axis if not is_reduction(x) or self.should_outer_reduce_me(x)] - if self.should_outer_reduce: - return self.kernel.split_axis - - # 0307 patch 5lines - if len(candidates) > 0: - longest_dim = candidates[0] - self.kernel.split_axis = longest_dim - self.kernel.split_axis.is_split_axis = True - return longest_dim - - #longest and not low dims - longest_dim = select_longest_dim(can_be_low_dim=False) - - # longest and can be low dims - if longest_dim is None or SplitTiling.less_than(longest_dim.length, int(num_vector_core * 0.8)): - longest_dim = select_longest_dim(can_be_low_dim=True) - if longest_dim is not None: - self.kernel.split_axis = longest_dim - self.kernel.split_axis.is_split_axis = True - elif len(self.kernel.sorted_axis) > 0: - longest_dim = self.kernel.sorted_axis[0] - self.kernel.split_axis = longest_dim - self.kernel.split_axis.is_split_axis = True + self.kernel.split_axis.clear() + + # total numel exceed aicore or total split axis exceed 3 + def meet_stop_condition() : + if self.total_split_numels(self.kernel.split_axis) >= num_vector_core : + return True + if len(self.kernel.split_axis) == 3 : + return True + return False + + def select_one_split_axis(not_reduction = True, not_low_dims = True ) : + for axis in self.kernel.sorted_axis : + if not_reduction and axis.prefix == "r" : + continue + if not_low_dims and axis.sorted_order in self.kernel.low_dims : + continue + if axis in self.kernel.split_axis : + continue + axis.is_split_axis = True + return axis + return None + count = 0 + while not meet_stop_condition() : + count += 1 + axis = select_one_split_axis(not_reduction=True, not_low_dims=True) + if axis is not None : + self.kernel.split_axis.append(axis) + continue + axis = select_one_split_axis(not_reduction=True, not_low_dims=False ) + if axis is not None : + self.kernel.split_axis.append(axis) + continue + #fixme later, to split reduction dim + if count > 10 : + break + + if not self.kernel.split_axis and self.kernel.sorted_axis: + self.kernel.split_axis.append(self.kernel.sorted_axis[0]) + + self.kernel.split_axis.sort(reverse=True, key = self.key) + for i, x in enumerate(self.kernel.split_axis) : + x.split_order = i + + + # Tiling 原则1:load / store 中索引表达式的中的低维轴都要成为tiling 轴. 
+ # Tiling 原则2:对于规约算子,规约轴要成为tiling轴。 + # Tiling 原则3: 多维规约, 只有规约轴可以被选择为tiling轴 + # Tiling 原则4: tiling轴 要覆盖 total numel 的 80% + + + # fixme, two tiling axis might be insufficient when there're 3 or more low-dims in indexing + def select_tiling_axis(self ): + self.kernel.tiling_axis.clear() + #longest = self.find_longest_dimension() + # cover the biggest axis and not exceed 3 axis + def meet_stop_condition() : + total_numel = reduce(lambda x,y : x + y, map(lambda x:x.length, self.kernel.sorted_axis)) if self.kernel.sorted_axis else 1 + tiling_numel = reduce(lambda x,y :x + y, map(lambda x:x.length, self.kernel.tiling_axis)) if self.kernel.tiling_axis else 1 + if self.kernel.numof_reduction_axis() > 1 and all(self.kernel.range_tree_nodes[var].is_tiling_axis for var in self.kernel.reduction_axis_list()) : + return True + #currently, the maximum dim that triton-ascend support is 2 + max_transpose_dims = 2 + if (self.possible_need_permute or tiling_numel / total_numel >= 0.8) and \ + len(self.kernel.tiling_axis) >= min(max_transpose_dims, len(self.kernel.sorted_axis)) : + return True + return False - return longest_dim - - # Tiling 原则1:切分要照顾所有load / store 中索引表达式的中的低维轴 :所有的低维轴都被切分 从而成为tiling 轴。写代码的时候对所有的tiling - # 轴通过make_range产生连续索引,从而保证load / store的连续性。 - # Tiling 原则2 :规约的tile必须要二维。 对于低维规约算子,规约轴和至少一个非规约轴要选择为tiling轴。对于高维规约,规约轴和低维轴要选择为tiling轴 - # 对于是多维规约, 所有的规约轴都要选择为tiling 轴 。 - # Tiling 原则3: 如果tiling轴是低维,在该轴上的切分的尺寸要与SIMD的BlockSize 对齐(32bytes) - # Tiling 原则4: 低维轴的tile size 越大,性能越好。这个其实autotune 的原则,放在这里只是为了更好解释用例中使用的数值 。 - - def select_tiling_axis(self): - - # True :self.kernel.axis2 is Not None and all reduction axis selected, False : other cases - def axis2_selection_done(axis): - if self.kernel.total_numels <= 1: - return True - elif self.kernel.axis2 is not None: - is_reduction = axis.prefix == "r" - if not is_reduction: - return True - reduction_axis = self.kernel.numof_reduction_axis() - return True if reduction_axis <= 1 else len(self.kernel.axis2_list) == reduction_axis - else: - return False - - if self.kernel.axis2 is not None or self.kernel.axis1 is not None: - return - # two or more reduction axises, need to flatten reduction dims to one to do 1 dim reduction . 
- if self.kernel.numof_reduction_axis() > 1: - self.kernel.persistent_reduction = True - biggest = -1 - dims = self.kernel.sorted_axis - if self.kernel.split_axis is None: - self.select_split_axis() + def select_tiling(low_dim = True, reduction = True ) : + for axis in reversed(self.kernel.sorted_axis) : + if low_dim and axis.sorted_order in self.kernel.low_dims and axis not in self.kernel.tiling_axis: + axis.is_tiling_axis = True + self.kernel.tiling_axis.append(axis) + if reduction and axis.prefix == 'r' and axis not in self.kernel.tiling_axis: + axis.is_tiling_axis = True + self.kernel.tiling_axis.append(axis) + if low_dim or reduction : + continue + # using principle 4, select one longest + longest = axis #self.find_longest_dimension(check_in_tiling = True) + if longest and longest not in self.kernel.tiling_axis: + self.kernel.tiling_axis.append(longest) + longest.is_tiling_axis = True + if meet_stop_condition(): + break + + select_tiling(low_dim=True, reduction=True) + count = 0 + while not meet_stop_condition(): + select_tiling(low_dim=False, reduction=False) + count += 1 + if count > 10 : + break + self.kernel.tiling_axis.sort(reverse=True, key = self.key) + for i , x in enumerate(self.kernel.tiling_axis) : + x.tiling_order = i + - if self.kernel.split_axis is None: - return - # select tiling_axis2 then tiling_axis1, for reduction, all reduction axis will be selected as tiling_axis2 - for i in range(len(dims) - 1, -1, -1): - axis = dims[i] - numel = axis.length - if isinstance(numel, (sympy.Symbol, sympy.Expr)) and not isinstance(numel, sympy.Integer): - numel = numel.subs(V.graph.sizevars.var_to_val) - if axis.is_split_axis: - dtype = self.kernel.get_axis_dtype(axis) - - min_aligned_numel = get_aligned_numel(dtype) - _, numel = SplitTiling.decide_nblocks_xblock(numel, len(self.kernel.sorted_axis) <= 1, min_aligned_numel) - - # choose reduction axis or low-dim as axis2 - if not axis2_selection_done(axis): - axis.is_tiling_axis2 = True if SplitTiling.great_than(numel, 1) else False - # axis2 must be the reduction axis in case inside_reduction - if axis.prefix == "r": - axis.is_tiling_axis2 = True - if axis.is_tiling_axis2 and self.kernel.axis2 is None: - self.kernel.axis2 = axis.symbol() - if self.kernel.numof_reduction_axis() > 1: - self.kernel.axis2_list.append(axis.symbol()) - self.kernel.axis2 = axis.symbol() if isinstance(axis.expr, ModularIndexing) else self.kernel.axis2 - else: - # for _higher_order_reduction, axis1 must be the lowest dimension - if self.kernel.inside_reduction and self.kernel.is_higher_order_reduction(): - self.kernel.axis1 = axis.symbol() - break - - # low-dim should be selected as another tiling axis - if self.is_lowest_dimension(axis): - self.kernel.axis1 = axis.symbol() - break - # select the longest in other cases - if numel > biggest: - self.kernel.axis1 = axis.symbol() - biggest = numel - if self.kernel.axis1 is not None: - axis = self.kernel.range_tree_nodes[self.kernel.axis1] - axis.is_tiling_axis1 = True - - - log.debug(f"split_tiling numels:{self.kernel.numels} split_axis: {self.kernel.split_axis.symbol()} " - f"axis1:{self.kernel.axis1} axis2:{self.kernel.axis2} low_dims:{self.kernel.low_dims}, " - f"indexing: {self.indexing}") - - - - + def select_split_tiling_axis(self) : + self.select_split_axis() + self.select_tiling_axis() + log.info(f"split_tiling numels:{self.kernel.numels} split_axis: {','.join([x.name for x in self.kernel.split_axis])} " + f"tiling_axis: {','.join([x.name for x in self.kernel.tiling_axis])} 
low_dims:{self.kernel.low_dims}, " + f"indexing: {self.indexing} possible_need_permute:{self.possible_need_permute}" ) + + # fixme the below logic doesn't work when there're two reduction axis, but only one need outer reduction def should_outer_reduce_me(self, x): - should_outer = self.kernel.is_higher_order_reduction(True) and SplitTiling.great_than(x.length, 32768) and x.is_loop - if should_outer: + should_outer = self.kernel.is_higher_order_reduction(True) and SplitTiling.great_than(x.length, 32768 ) and x.is_loop + if should_outer : self.should_outer_reduce = True self.kernel.split_axis = x self.kernel.split_axis.is_split_axis = True return should_outer - @staticmethod - def decide_nblocks_xblock(numel, no_axis2, min_aligned_numel, xblock=None): - #no_axis2 mean there's only on dims - min_xblock = min_aligned_numel if no_axis2 else 1 - - # need to keep linearity for low_dims - if xblock is None: - xblock = (numel + num_vector_core - 1) // num_vector_core if numel > num_vector_core else min_xblock - - xblock = next_power_of_2(xblock) - - nblocks = (numel + xblock - 1) // xblock - return nblocks, xblock - - @staticmethod - def get_nblocks_before_launch(numel, xblock): - nblocks = (numel + xblock - 1) // xblock - return nblocks, xblock - - @staticmethod - def get_nblocks_xblock_list(numel): - ret = [] - XBLOCK = numel - NBLOCKS = 1 - ret.append((NBLOCKS, XBLOCK)) - while NBLOCKS <= num_vector_core and XBLOCK > 1: - XBLOCK -= 1 - NBLOCKS = (numel + XBLOCK - 1) // XBLOCK - XBLOCK = (numel + NBLOCKS - 1) // NBLOCKS - ret.append((NBLOCKS, XBLOCK)) - - return ret - + def find_longest_dimension(self, check_in_tiling = False ): + longest = None + for axis in self.kernel.sorted_axis: + if (longest is None or axis.length > longest.length) and \ + (not check_in_tiling or axis not in self.kernel.tiling_axis ) : + longest = axis + return longest + # return True when x is the low-dim in indexing def is_lowest_dimension(self, x): return x.sorted_order in self.kernel.low_dims def find_lowest_dimension(self): - def construct_low_dim(): + def construct_low_dim() : for index in self.indexing: coefficients_dict = index.as_coefficients_dict() for key, value in coefficients_dict.items(): @@ -226,7 +207,8 @@ class SplitTiling: self.kernel.low_dims.add(axis.sorted_order) # all read index should be considered - buf_names = [node.node.name for node in self.kernel.node_schedule if node not in (EnableReduction, DisableReduction)] + buf_names = [node.node.name for node in self.kernel.node_schedule if + node not in (EnableReduction, DisableReduction)] for node in self.kernel.node_schedule: if node in (EnableReduction, DisableReduction): continue @@ -238,12 +220,11 @@ class SplitTiling: read_is_inptr = False if arg[:3] != 'arg' and arg in buf_names else True if read_is_inptr: names.append(name) - for key, index in node._body.indexing.items(): if key in names and index not in self.indexing: self.indexing.append(index) - if self.kernel.inside_reduction: + if self.kernel.inside_reduction : construct_low_dim() return @@ -272,10 +253,10 @@ class SplitTiling: if isinstance(ynumel, (sympy.Symbol, sympy.Expr)) and not isinstance(ynumel, sympy.Integer): ynumel = ynumel.subs(V.graph.sizevars.var_to_val) - if isinstance(xnumel, sympy.Integer) and isinstance(ynumel, int): + if isinstance(xnumel, sympy.Integer) and isinstance(ynumel, int): ynumel = sympy.Integer(ynumel) - if isinstance(ynumel, sympy.Integer) and isinstance(xnumel, int): + if isinstance(ynumel, sympy.Integer) and isinstance(xnumel, int): xnumel = 
sympy.Integer(xnumel) return (xnumel, ynumel) diff --git a/torch_npu/_inductor/codegen/tile_generator.py b/torch_npu/_inductor/codegen/tile_generator.py index 6cca5e4d76..40e60bd3ca 100644 --- a/torch_npu/_inductor/codegen/tile_generator.py +++ b/torch_npu/_inductor/codegen/tile_generator.py @@ -3,133 +3,251 @@ import math from torch._inductor.runtime.triton_heuristics import Config from torch._inductor.runtime.runtime_utils import next_power_of_2 -from .triton_utils import get_aligned_numel, byte_per_numel - - +from .triton_utils import byte_per_numel +import functools +from ..config import num_vector_core +import sys # generate tiling configs class TileGenerator: - - @staticmethod - def aligned_numel(numel): + + def __init__(self, numels, axis_names, tiling_axis, split_axis, low_dims, persistent_reduction, + configs, dtype, dual_reduction = False) : + self.numels = numels.copy() + + self.blocks = [x for x in self.numels] + self.candidate_blocks=[] + self.sub_blocks = self.blocks.copy() + self.axis_name = axis_names + self.tiling_axis = tiling_axis + self.split_axis = split_axis + self.low_dims = low_dims + self.configs = configs + self.dtype_bytes = self.get_byte_per_numel(dtype) + self.stop_numel = 1024 // self.dtype_bytes + self.block_name = {} + self.sub_block_name = {} + self.persistent_reduction = persistent_reduction + self.dual_reduction = dual_reduction + for axis, name in enumerate(self.axis_name) : + if axis not in tiling_axis and axis not in split_axis : + self.blocks[axis] = 1 + self.sub_blocks[axis] =1 + continue + if axis in self.split_axis : + self.block_name[axis] = f"{name.upper()}BLOCK" + if axis in self.tiling_axis : + self.sub_block_name[axis] = f"{name.upper()}BLOCK_SUB" + + + def aligned_numel(self, numel): aligned = next_power_of_2(numel) return aligned - @staticmethod - def get_byte_per_numel(dtype): - if dtype is None: + + def get_byte_per_numel(self, dtype): + if dtype is None : return 1 return byte_per_numel[dtype] - @staticmethod - def valid_config(config, align_numel, rnumel=1): - - count_bytes = align_numel - max_numel = 16384 * 4 // count_bytes - rblock = config["RBLOCK"] if "RBLOCK" in config else rnumel - xblock_sub = config["XBLOCK_SUB"] - if rblock * xblock_sub <= max_numel: - return True + def valid_tile_numel(self, total_numel): + bytes = self.dtype_bytes + max_numel = 16384 * 4 // bytes + return total_numel <= max_numel - return False - # when rblock is low dim, need to maximize rblock - @staticmethod - def descend_xblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True): + def calculate_config_numel(self, config) : + total_numel = 1 + # for axis in self.split_axis : + # if axis not in self.tiling_axis : + # total_numel = total_numel * config[self.block_name[axis]] + for axis in self.tiling_axis : + total_numel = total_numel * config[self.sub_block_name[axis]] + return total_numel + + def calculate_total_numel(self) : + smallest = sys.maxsize + def calculate_total_numel_candi( blocks) : + total_numel = 1 + # for axis in self.split_axis : + # if axis not in self.tiling_axis : + # total_numel = total_numel * blocks[axis] + for axis in self.tiling_axis : + total_numel = total_numel * self.sub_blocks[axis] + return total_numel + for candi_blocks in self.candidate_blocks : + numel = calculate_total_numel_candi(candi_blocks) + if numel < smallest : + smallest = numel + return smallest + + def fill_config(self, config, blocks) : + for axis in self.split_axis : + config[self.block_name[axis]] = blocks[axis] + for axis in self.tiling_axis : + 
tiling_numel = self.aligned_numel(self.sub_blocks[axis] ) + config[self.sub_block_name[axis]] = tiling_numel + def find_config(self, cfg) : + for config in self.configs : + if config.kwargs == cfg : + return True + return False + + def add_to_configs(self, candi_block) : + newcfg = {} + self.fill_config(newcfg, candi_block ) + total_numel = self.calculate_config_numel(newcfg) + if self.valid_tile_numel(total_numel) and not self.find_config(newcfg): + self.configs.append(Config(newcfg, num_warps=1, num_stages=1)) + + def descend_one_axis(self, axis, is_split = False ): + def calc_total_programs(): + grids = [] + for axis in self.split_axis : + numel = self.numels[axis] + block_size = self.blocks[axis] + programs = (numel + block_size -1 ) // block_size + grids.append(programs) + + total_programs = functools.reduce(lambda x,y : x * y, grids) if grids else 1 + return total_programs - count_bytes = align_numel - start_numel = 2048 // count_bytes if aggresive else 1024 // count_bytes - # include rblock is too big, need to decend rblock first - rblock = rnumel if rnumel > 0 else 1 - while (rblock > start_numel): - newcfg = copy.deepcopy(cfg) - newcfg["RBLOCK"] = rblock - if TileGenerator.valid_config(newcfg, align_numel): - configs.append(Config(newcfg, num_warps=1, num_stages=1)) - rblock = rblock // 2 - cfg["RBLOCK"] = rblock - xblock_sub = TileGenerator.aligned_numel(xblock) - - while True: - newcfg = copy.deepcopy(cfg) - newcfg["XBLOCK_SUB"] = xblock_sub - if TileGenerator.valid_config(newcfg, align_numel, rnumel=rblock): - configs.append(Config(newcfg, num_warps=1, num_stages=1)) - xblock_sub = xblock_sub // 2 - if xblock_sub * rblock <= start_numel: - break - - @staticmethod - def descend_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True): - count_bytes = align_numel - start_numel = 4096 // count_bytes if aggresive else 1024 // count_bytes - - xblock_sub = start_numel if xblock > start_numel else xblock - cfg["XBLOCK_SUB"] = xblock_sub - rblock = rnumel - while True: - newcfg = copy.deepcopy(cfg) - newcfg["RBLOCK"] = rblock - if TileGenerator.valid_config(newcfg, align_numel): - configs.append(Config(newcfg, num_warps=1, num_stages=1)) - rblock = rblock // 2 - if xblock_sub * rblock <= start_numel: + reached_stop_numel = False + slow_decend_split = False + + while True : + total_numel = self.stop_numel + 100 + for candi_block in self.candidate_blocks : + self.add_to_configs(candi_block) + + # tile numel reached threshold + total_numel = self.calculate_total_numel() + if total_numel <= self.stop_numel: + self.add_to_configs(self.blocks) + reached_stop_numel = True + break + + numel = self.blocks[axis] if is_split else self.sub_blocks[axis] + if numel == 1 : + self.add_to_configs(self.blocks) break - @staticmethod - def descend_xblock_rblock(rnumel, xblock, configs, cfg, align_numel, aggresive=True): - count_bytes = align_numel - start_numel = 4096 // count_bytes if aggresive else 1024 // count_bytes + if is_split : + if self.persistent_reduction and self.axis_name[axis][0] == "r" : + reached_stop_numel = True + break + total_programs = calc_total_programs() + if total_programs > num_vector_core : + break + if total_programs > num_vector_core // 2 or self.dual_reduction: + if len(self.candidate_blocks) > 2 : + self.candidate_blocks.pop(0) + self.candidate_blocks.append(tuple(self.blocks)) + + self.blocks[axis] = numel // 2 + self.sub_blocks[axis] = self.blocks[axis] + total_programs = calc_total_programs() + if total_programs > num_vector_core: + slow_decend_split = True + 
step = numel // 4 if numel // 4 > 1 else 1 + self.blocks[axis] = numel // 2 if not slow_decend_split else numel -step + self.sub_blocks[axis] = self.blocks[axis] + else : + if numel >= 128 : + self.sub_blocks[axis] = next_power_of_2(numel // 2 ) + else :# numel >4 and numel < 128 : + self.slow_descend_axis(axis) + # else : + # break + return reached_stop_numel + + def slow_descend_axis(self, axis) : + numel = self.sub_blocks[axis] + self.sub_blocks[axis] = self.aligned_numel( numel // 2 ) + # numel = self.aligned_numel( max(numel - 4, numel //2 )) + # if (numel == self.sub_blocks[axis]) : + # numel = self.aligned_numel( max(numel - 8, numel //2 )) + # self.sub_blocks[axis] = numel + + def descend_all_low_dims(self) : + low_dim_numels = [self.sub_blocks[x] for x in self.low_dims] + if not low_dim_numels : + return + + def descent_all_axis(min_numel ) : + for axis in self.low_dims : + if self.axis_name[axis][0] == "r" and self.persistent_reduction : + continue + numel = self.sub_blocks[axis] + if numel == 1 : + continue + if min_numel > 1 and abs(numel - min_numel) / min_numel < 0.2 : + continue + if numel >= 128 : + self.sub_blocks[axis] = next_power_of_2(numel // 2 ) + else :# numel >4 and numel < 128 : + self.slow_descend_axis(axis) + + count = 0 + total_numel = self.calculate_total_numel() + while total_numel > self.stop_numel and count < 100: + count += 1 + total_numel = self.calculate_total_numel() + for candi_block in self.candidate_blocks : + self.add_to_configs(candi_block) + min_numel = min(low_dim_numels) + descent_all_axis(min_numel) + total_numel_2 = self.calculate_total_numel() + if total_numel == total_numel_2 : + descent_all_axis(0) + + return total_numel < self.stop_numel + + def descend_split_tiling(self ): + + tiling_not_low_dims = [x for x in self.tiling_axis if x not in self.low_dims ] + def descend_split_axis () : + + for axis in self.split_axis : + if self.descend_one_axis(axis, is_split=True) : + return True + + total = self.calculate_total_numel() + return total <= self.stop_numel + + def desceond_tiling_not_low_dims() : + for axis in tiling_not_low_dims : + if self.axis_name[axis][0] == "r" and self.persistent_reduction : + continue + if self.descend_one_axis( axis) : + return True + total = self.calculate_total_numel() + return total <= self.stop_numel - # Depending on the number of bytes available to the hardware UB, - # 4096 bytes is an appropriate empirical value for an intra-core split. 
- # Rule: xblock_sub * rblock <= start_numel - end_numel = math.floor(math.sqrt(start_numel)) - - xblock = next_power_of_2(xblock) - rnumel = next_power_of_2(rnumel) - - xblock_sub = xblock if xblock > start_numel else xblock - rblock = start_numel if rnumel > start_numel else rnumel - - rblock_is_biggerr = rblock > xblock_sub - - if xblock_sub * rblock <= start_numel: - newcfg = copy.deepcopy(cfg) - newcfg["XBLOCK_SUB"] = xblock_sub - newcfg["RBLOCK"] = rblock - if TileGenerator.valid_config(newcfg, align_numel): - configs.append(Config(newcfg, num_warps=1, num_stages=1)) - - if rblock_is_biggerr: - while rblock > xblock_sub and xblock_sub * rblock > start_numel: - newcfg = copy.deepcopy(cfg) - newcfg["RBLOCK"] = rblock - xblock_sub = xblock - if TileGenerator.valid_config(newcfg, align_numel): - configs.append(Config(newcfg, num_warps=1, num_stages=1)) - rblock = rblock // 2 - else: - while rblock < xblock_sub and xblock_sub * rblock > start_numel: - newcfg = copy.deepcopy(cfg) - newcfg["XBLOCK_SUB"] = xblock_sub - if TileGenerator.valid_config(newcfg, align_numel): - configs.append(Config(newcfg, num_warps=1, num_stages=1)) - xblock_sub = xblock_sub // 2 - - while xblock_sub * rblock > start_numel: - newcfg = copy.deepcopy(cfg) - newcfg["XBLOCK_SUB"] = xblock_sub - newcfg["RBLOCK"] = rblock - if TileGenerator.valid_config(newcfg, align_numel): - configs.append(Config(newcfg, num_warps=1, num_stages=1)) - if xblock_sub >= end_numel: - xblock_sub = xblock_sub // 2 - if rblock >= end_numel: - rblock = rblock // 2 - - @staticmethod - def nearest_power_of_2(n): - big = next_power_of_2(n) - small = big // 2 - return big if (big - n) < (n - small) else small + #fixme, need to all low dims fairly + def descend_low_dims() : + for axis in self.tiling_axis : + if self.axis_name[axis][0] == "r" and self.persistent_reduction : + continue + if axis in tiling_not_low_dims : + continue + if self.descend_one_axis(axis) : + return True + total = self.calculate_total_numel() + return total <= self.stop_numel + + while True : + # descend split axis + if descend_split_axis() : + break + if len(self.candidate_blocks) > 0 : + self.sub_blocks = list(self.candidate_blocks[0]) + # descend tiling but not low dims + if desceond_tiling_not_low_dims() : + break + # descend low dims, fixme, need to descend all axis at the same time + # descend_low_dims() + self.descend_all_low_dims() + break + + \ No newline at end of file diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index e2fe0cdb5e..a25421917a 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -1,6 +1,5 @@ import os from typing import List, Set, Iterable, Callable, Sequence -from typing import Dict import operator import itertools from enum import Enum @@ -11,7 +10,8 @@ from typing import ( Union, Tuple, Any, - cast + cast, + Dict ) import re @@ -42,7 +42,9 @@ from torch._inductor.codegen.triton import ( triton_acc_type, constant_repr, is_welford_reduction, FixedTritonConfig, - prefix_is_reduction, upcast_acc_dtype + prefix_is_reduction, upcast_acc_dtype, + get_kernel_category_by_source_code, + get_fused_kernel_name ) from torch.utils._sympy.functions import FloorDiv, Identity, ModularIndexing @@ -71,6 +73,8 @@ from torch.utils._sympy.symbol import SymT, symbol_is_type from torch.utils._sympy.value_ranges import bound_sympy, ValueRangeAnalysis, ValueRanges from torch.utils._sympy.numbers import int_oo from torch._inductor.dtype_propagation import DtypePropagationOpsHandler 
+from .kernel_analysis import IndexAnalysis, ReductionAnalysis +import torch_npu._inductor.config as inductor_npu_config from ..runtime import NPUDeviceProperties from .npu_kernel_features import NumelList @@ -85,22 +89,6 @@ def flatten(nums): res.append(i) return res - -class AxisDirection(Enum): - Flat = 0, - Vertical = 1, - Horizontal = 2 - - -def reverse_direction(direction): - if direction == AxisDirection.Vertical: - return AxisDirection.Horizontal - elif direction == AxisDirection.Horizontal: - return AxisDirection.Vertical - else: - return AxisDirection.Flat - - class NPUTritonKernelOverrides(TritonKernelOverrides): @staticmethod @@ -134,14 +122,14 @@ class NPUTritonKernelOverrides(TritonKernelOverrides): def group_fn(self, sizes): groups = list() - for s in sizes: - if not s: - groups.append(1) - elif isinstance(s, list): - group = flatten(s) - groups.append(NumelList(tuple(group)) if isinstance(group, list) else group) - else: - groups.append(s) + for s in sizes : + if not s : + groups.append(1) + elif isinstance(s, list): + group = flatten(s) + groups.append(NumelList(tuple(group)) if isinstance(group, list) else group) + else : + groups.append(s) return tuple(groups) @@ -156,48 +144,71 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry): self, *args, **kwargs): super().__init__(*args, **kwargs) - self.is_tiling_axis1 = False - self.is_tiling_axis2 = False + self.is_tiling_axis = False self.is_split_axis = False self.indexing_code = IndentedBuffer() self.sorted_order = None - self.low_dims = set() - - + self.tiling_order = None + self.split_order = None + self.var_directions = {} + self.directions = [] + # don't use functools.lru_cache(None), so that previous indexing_code produdec by previous index, + # could be overwritten + self.codegen = self._codegen + # axis mask def _codegen_mask(self): - if self.is_tiling_axis1 or self.is_tiling_axis2: + if self.is_tiling_axis : upper = f"{self.name}_numel" line = f"{self.name}_mask = {self.name} < {upper}" self.writeline(line) - line = f"{self.name}_prime_mask = {self.name}_prime < {upper}" - self.writeline(line) + for var in self.var_directions.keys(): + line = f"{var.name}_mask = {var.name} < {upper}" + self.writeline(line) else: pass - + + def get_axis_direction(self ) : + + #assume self.golden_var_list is to be correct axis order + + if self.directions: + return f"[{','.join(self.directions)}]" + tiling_axis = [x.symbol() for x in self.kernel.tiling_axis] + + rev_orders = [x for x in self.kernel.golden_var_list if x in tiling_axis] + self.directions = ["None"] * len(tiling_axis) + assert len(tiling_axis) == len(rev_orders), f"tiling len={len(tiling_axis)}, golden varlist len ={len(rev_orders)}" + var_orders = list(reversed(rev_orders)) + index = var_orders.index(self.symbol()) + self.directions[index] = ":" + return f"[{','.join(self.directions)}]" + + # axis var, FIXME, need to define var with diffent direction def _codegen(self): + self.indexing_code.clear() index = None - vertical = self.is_tiling_axis1 if V.kernel.numof_reduction_axis() <= 1 else not isinstance(self.expr, ModularIndexing) - direction = V.kernel.get_axis_direction(vertical) # for multiple reduce dims, don't need this - if self.is_tiling_axis1 and V.kernel.numof_reduction_axis() <= 1: - index = f"{self.name} = {self.codegen_index(direction)}" - #to be fixed, only permute need to this . 
- self.writeline(f"{self.name}_prime = {self.codegen_index(reverse_direction(direction))}") - - elif self.is_tiling_axis2: - index = f"{self.name} = {self.codegen_index(direction)}" - #to be fixed, only permute need to this . - self.writeline(f"{self.name}_prime = {self.codegen_index(reverse_direction(direction))}") + if not self.is_tiling_axis : + return self.name + + direction = self.get_axis_direction() + index = f"{self.name} = {self.codegen_index(direction)}" + for var, dir in self.var_directions.items(): + line = f"{var.name} = {self.codegen_index(dir)}" + self.writeline(line) + + # reduction axis + if self.prefix == 'r': if V.kernel.inside_reduction and V.kernel.current_node \ and isinstance(V.kernel.current_node, SchedulerNode) \ and V.kernel.current_node.node \ and V.kernel.current_node.node.data \ and isinstance(V.kernel.current_node.node.data, ir.Reduction): reduction_type = V.kernel.current_node.node.data.reduction_type - if reduction_type in {"argmax", "argmin"}: + if reduction_type in {"argmax", "argmin"} : self.writeline(f"{self.parent.prefix}index = " - f"{self.codegen_index(reverse_direction(AxisDirection.Flat))}") - if index: + f"{self.codegen_index(None)}") + if index: self.writeline(index) self._codegen_mask() return self.name @@ -205,55 +216,51 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry): def writeline(self, line): self.indexing_code.writeline(line) + def is_1d_persisent_reduction(self) : + return len(V.kernel.tiling_axis) == 1 and V.kernel.persistent_reduction + def codegen_index(self, direction): - if self.is_tiling_axis1 and V.kernel.axis2 is None and V.kernel.persistent_reduction: - index = f"tl.arange(0, RBLOCK)" - return index - elif self.is_tiling_axis1: - if self.is_split_axis: - offset = f"{self.symbol()}_offset" - index = f"{offset} + (loop1 * XBLOCK_SUB) + base1" + BLOCK_NAME = f"{self.name.upper()}BLOCK" + BLOCK_NAME_SUB = f"{BLOCK_NAME}_SUB" + index = None + if self.prefix == 'r' : + if V.kernel.persistent_reduction : + if self.is_1d_persisent_reduction() : + index = f"tl.arange(0, {BLOCK_NAME_SUB})" + else : + index = f"base_{self.name}" else: - index = f"(loop1 * XBLOCK_SUB) + base1" - - if V.kernel.axis2 is not None and direction != AxisDirection.Flat: - index += ("[None, :]" if direction == AxisDirection.Horizontal else "[:, None]") - return index - elif self.is_tiling_axis2: - if V.kernel.persistent_reduction: - index = f"tl.arange(0, RBLOCK_{self.symbol()})" if V.kernel.numof_reduction_axis() > 1 else "base2" - elif self.is_split_axis: + index = f"(loop_{self.name} * {BLOCK_NAME_SUB}) + base_{self.name}" + else : + if self.is_split_axis : offset = f"{self.symbol()}_offset" - index = f"{offset} + (loop2 * RBLOCK) + base2" - else: - index = "loop2 * RBLOCK + base2" + index = f"{offset} + (loop_{self.name} * {BLOCK_NAME_SUB}) + base_{self.name}" + else : + index = f"(loop_{self.name} * {BLOCK_NAME_SUB}) + base_{self.name}" + + if len(V.kernel.tiling_axis) > 1 and direction is not None : + index += direction + + return index - if direction != AxisDirection.Flat: - index += ("[:, None]" if direction == AxisDirection.Vertical else "[None, :]") - return index - else: - raise RuntimeError("codegen_index") def codegen_header(self, code): # generate offset index loop lines = [] + BLOCK_NAME = f"{self.name.upper()}BLOCK" + BLOCK_NAME_SUB = f"{BLOCK_NAME}_SUB" + + if self.is_1d_persisent_reduction() : + return - if self.is_split_axis and not (V.kernel.axis2 is None and V.kernel.persistent_reduction): - lines.append(f"{self.symbol()}_offset = 
tl.program_id(0) * XBLOCK") + if self.is_split_axis : + lines.append(f"{self.symbol()}_offset = tl.program_id({self.split_order}) * {BLOCK_NAME}") - if self.is_tiling_axis1 and not (V.kernel.axis2 is None and V.kernel.persistent_reduction): - # don't create loops for multi-reductions - if V.kernel.numof_reduction_axis() <= 1: - lines.append("base1 = tl.arange(0, XBLOCK_SUB)") - xblock = f"XBLOCK" if self.is_split_axis else f"{self.symbol()}_numel" - lines.append(f"loops1 = ({xblock} + XBLOCK_SUB - 1) // XBLOCK_SUB") + if self.is_tiling_axis : + lines.append(f"base_{self.name}= tl.arange(0, {BLOCK_NAME_SUB})") + block = f"{BLOCK_NAME}" if self.is_split_axis else f"{self.symbol()}_numel" + lines.append(f"loops_{self.name} = ({block} + {BLOCK_NAME_SUB} - 1) // {BLOCK_NAME_SUB}") - elif self.is_tiling_axis2 and len(V.kernel.axis2_list) <= 1: - lines.append("base2 = tl.arange(0, RBLOCK)") - if self.is_split_axis: - lines.append(f"loops2 = (XBLOCK + RBLOCK - 1) // RBLOCK") - else: - lines.append(f"loops2 = ({self.name}_numel + RBLOCK - 1) // RBLOCK") else: pass @@ -292,13 +299,13 @@ class IterationRangesRootNPUIndex(IterationRangesRoot): grid_dim: Optional[int], ): super().__init__(name, numel, prefix, index, kernel, pid_cache, is_loop=is_loop, tensor_dim=tensor_dim, - grid_dim=grid_dim, has_zdim=False) + grid_dim=grid_dim, has_zdim= False ) def __repr__(self): return f"IterationRangesRootNPUIndex({self.name!r}, {self.numel}, ...)" def remove_entry(self, name): - if name in self.var_ranges: + if name in self.var_ranges : del self.var_ranges[name] if name in self.var_list: del self.var_list[self.var_list.index(name)] @@ -377,21 +384,19 @@ class NPUIndexTritonKernel(TritonKernel): **kwargs) self.first_node = True self.inside_high_order_reduction = False - # split axis - self.split_axis = None - # tiling axis - self.axis1 = None - self.axis2 = None - # incase two reduction axis - self.axis2_list = [] - self.low_dims = set() - + self.low_dims = set() + self.split_axis = [] + self.tiling_axis = [] self.range_tree_nodes_removed: Dict[sympy.Symbol, IterationRangesEntry] = {} self.range_tree_nodes_substituted = {} self.expr_substituted = {} self.sorted_axis = [] self.prefix: IndentedBuffer = IndentedBuffer() - + self.index_analysis = {} # var_list -> indexAnalysis + self.golden_var_list = None + self.reduce_analysis = None + self.load_store_indexing = None + def gen_triton_ext_imports(self): imports = IndentedBuffer() imports.splice( @@ -402,6 +407,7 @@ class NPUIndexTritonKernel(TritonKernel): from torch_npu._inductor.runtime import NPUDeviceProperties from torch_npu._inductor.npu_triton_helpers import libdevice, math as tl_math import torch + import torch_npu """ ) return imports.getvalue() @@ -416,15 +422,8 @@ class NPUIndexTritonKernel(TritonKernel): key = f"{triton_key()}-{backend.hash()}" return hashlib.sha256(key.encode("utf-8")).hexdigest() - def numof_reduction_axis(self): - root = self.range_trees[-1] - if root is None: - return 0 - - return len(root.var_list) - def numof_tiling_axis(self): - return (1 if self.axis1 is not None else 0) + (1 if self.axis2 is not None else 0) + return len(self.tiling_axis) #do nothing in NpuTritonKernel def codegen_range_tree(self): @@ -432,19 +431,20 @@ class NPUIndexTritonKernel(TritonKernel): def initialize_range_tree(self, pid_cache): + #self.numels = flatten(self.numels) self.total_numels = 0 - for k, x in self.numels.items(): - if not isinstance(x, sympy.Integer): - x = x.subs(V.graph.sizevars.var_to_val) - self.numels[k] = x - if x > 1: - 
self.total_numels += 1 + for k, x in self.numels.items() : + if not isinstance(x, sympy.Integer) : + x = x.subs(V.graph.sizevars.var_to_val) + self.numels[k] = x + if x > 1 : + self.total_numels +=1 no_r_dim = not self.inside_reduction or self.numels["r"] == 1 prefixes = "wvtzyxr" - active_prefixes = prefixes[-len(self.numels):] - #prefix can not be 's', 'u', 'ps' , 'i', 'z', 'q' - #prefix can not be 'p' from torch 2.6.0 + active_prefixes = prefixes[-len(self.numels) :] + #prefix can not be 's', 'u', 'ps' , 'i', 'z' + #prefix can not be 'p' but can be 'z' since 2.6 grid_dims = "xyztvw" if self.no_x_dim: tensor_dims = "r" @@ -472,14 +472,82 @@ class NPUIndexTritonKernel(TritonKernel): ) ) + + + def get_axis_dtype(self, axis): + dtype = None + if axis is None : + return None + for node in self.node_schedule : + if node in (EnableReduction, DisableReduction) : + continue + if axis.symbol() in node._body.indexing_map : + dtype = V.graph.get_dtype(node.node.name) + break + if dtype is None : + should_break_all = False + for node in self.node_schedule: + if should_break_all: + break + if node in (EnableReduction, DisableReduction): + continue + for key, value in node._body.indexing_map.items(): + if key in self.range_tree_nodes : + dim = self.range_tree_nodes[key] + else : + dim = self.range_tree_nodes_removed[key] + + if dim.parent == axis.parent : + dtype = V.graph.get_dtype(node.node.name) + should_break_all = True + break + return dtype + + def create_inductor_meta(self): + mutated_args = set() + for mutation in self.mutations: + if mutation in self.args.input_buffers: + mutated_args.add(self.args.input_buffers[mutation]) + if ( + mutation in self.args.inplace_buffers + and mutation not in V.graph.removed_buffers + and mutation not in self.removed_buffers + ): + mutated_args.add(self.args.inplace_buffers[mutation].inner_name) + if mutation in self.args.output_buffers: + mutated_args.add(self.args.output_buffers[mutation]) + mutated_args = sorted(mutated_args) + tiling_axis = [x.sorted_order for x in self.tiling_axis] + split_axis = [x.sorted_order for x in self.split_axis] + axis_names = [x.name for x in self.sorted_axis] + split_axis_dtype = self.get_axis_dtype(self.split_axis[0]) if self.split_axis else None + inductor_meta = { + "autotune_hints": set(self.autotune_hints), + "kernel_name": str(Placeholder.DESCRIPTIVE_NAME), + "mutated_arg_names": mutated_args, + + # Due to breaking change of triton 3.0, the original invocation is broken + "backend_hash": self.patch_triton_hash(), # torch.utils._triton.triton_hash_with_backend(), + "split_axis" : split_axis, + "tiling_axis" : tiling_axis, + "axis_names" : axis_names, + "low_dims" : self.low_dims, + "numof_reduction_axis": self.numof_reduction_axis(), + "split_axis_dtype": split_axis_dtype, + "dual_reduction": self.numof_reduction_axis() > 1, + "traced_graph_hash": "TRACED_GRAPH_HASH" + #"coordinate_descent_tuning" : True + + } + return inductor_meta + # numels sent to autotune configs def get_size_hints(self): size_hints = [] - if (len(self.range_tree_nodes.values()) == 0): - return size_hints + return [v for _,v in self.numels.items()] - for _, node in enumerate(self.sorted_axis): + for i, node in enumerate(self.sorted_axis): if isinstance(node.expr, ModularIndexing): numel_expr = node.length else: @@ -491,11 +559,11 @@ class NPUIndexTritonKernel(TritonKernel): return size_hints # torch251 done - def add_numel_to_call_args_and_grid(self, name, call_args, arg_types, grid): + def add_numel_to_call_args_and_grid(self, name, call_args, 
arg_types, grid): for node in self.sorted_axis: - if isinstance(node.expr, ModularIndexing): + if isinstance(node.expr, ModularIndexing) : numel_expr = node.length - else: + else : numel_expr = node.expr.subs({sympy_index_symbol(r.name): r.numel for r in self.range_trees}) if isinstance(numel_expr, (sympy.Integer, sympy.Symbol)): @@ -507,21 +575,55 @@ class NPUIndexTritonKernel(TritonKernel): if node.parent.grid_dim is not None: grid.append(expr) - def gen_numel_args(self, signature, triton_meta_signature, argdefs): + def gen_numel_args(self, signature, triton_meta_signature, argdefs ): for node in self.sorted_axis: arg_name = f"{node.name}_numel" - if not os.environ.get('INDUCTOR_STATIC_MODE'): + if not inductor_npu_config.inductor_static_mode: sizearg = SizeArg(arg_name, node.length) signature.append(sizearg) triton_meta_signature[arg_name] = signature_of( sizearg, size_dtype=self.index_dtype ) argdefs.append(arg_name) - else: + else : argdefs.append(f"{arg_name}: tl.constexpr") self.triton_meta["constants"][arg_name] = node.length + # BLOCK and SUB_BLOCK definitions + def add_autotune_args(self, argdefs): + for axis in self.split_axis : + argdefs.append(f"{axis.name.upper()}BLOCK: tl.constexpr") + + for axis in self.tiling_axis : + if axis.name[0] == 'r' and self.persistent_reduction: + continue + argdefs.append(f"{axis.name.upper()}BLOCK_SUB: tl.constexpr") + + def _get_heuristic(self): + if self.persistent_reduction: + assert self.inside_reduction + return "persistent_reduction_npu_index" + elif self.inside_reduction: + return "reduction_npu_index" + return "pointwise_npu_index" + + def get_kernel_name(self, src_code, node_schedule, kernel): + wrapper = V.graph.wrapper_code + if src_code in wrapper.src_to_kernel: + kernel_name = wrapper.src_to_kernel[src_code] + else: + fused_name = ( + get_fused_kernel_name(node_schedule, config.triton.descriptive_names) + if config.triton.descriptive_names + else "" + ) + kernel_category = get_kernel_category_by_source_code(src_code)[:3] + kernel_name = "_".join( + ["triton", kernel_category, fused_name, wrapper.get_next_kernel_suffix()] + ) + return kernel_name + # modify triton_meta, inductor_meta , etc. 
def codegen_kernel(self, name=None): code = IndentedBuffer() size_hints = self.get_size_hints() @@ -544,7 +646,7 @@ class NPUIndexTritonKernel(TritonKernel): arg.name, V.graph.sizevars.inv_precomputed_replacements[symbol] ) - triton_meta_signature = signature_to_meta(signature, size_dtype=self.index_dtype, argdefs=argdefs) + triton_meta_signature = signature_to_meta( signature, size_dtype=self.index_dtype, argdefs = argdefs ) triton_meta = { "signature": triton_meta_signature, @@ -627,67 +729,54 @@ class NPUIndexTritonKernel(TritonKernel): def codegen_static_numels(self, code): - no_x_axis = self.numof_reduction_axis() > 1 - symbols = [] - if self.axis2 is not None: - symbols = list(self.axis2_list) if no_x_axis else list([self.axis2]) - elif self.persistent_reduction and self.axis1 is not None: - symbols = list([self.axis1]) - - nodes = [self.range_tree_nodes[symbol] for symbol in symbols if symbol is not None] - for node in nodes: - if node.prefix == "r" and self.persistent_reduction: - simplified_tree_numel = V.graph.sizevars.simplify(node.length) - if isinstance(simplified_tree_numel, (sympy.Integer, int)): - val = int(simplified_tree_numel) - else: - continue - val = next_power_of_2(val) - if no_x_axis: - code.writeline(f"RBLOCK_{node.symbol()}: tl.constexpr = {val}") - else: - code.writeline(f"RBLOCK: tl.constexpr = {val}") + for symbol in self.reduction_axis_list(): + if symbol.name[0] != "r" or not self.persistent_reduction: + continue + + node = self.range_tree_nodes[symbol] + simplified_tree_numel = V.graph.sizevars.simplify(node.length) + if isinstance(simplified_tree_numel, (sympy.Integer, int)): + val = int(simplified_tree_numel) + else: + continue + val = next_power_of_2(val) + code.writeline(f"{node.name.upper()}BLOCK_SUB: tl.constexpr = {val}") + + + def lowest_axis_variable(self): + if len(self.tiling_axis) == 0 : + return None + return self.tiling_axis[-1] - def axis2_variable(self): - if self.axis2 is not None: - return self.range_tree_nodes[self.axis2] - return None - def is_isolated_symbol(self, input_str, symbol): - # 使用正则表达式查找独立的符号, 防止out_ptr0 匹配上r0 r0_prime - pattern1 = r'\b' + re.escape(symbol) + r'\b' - pattern2 = r'\b' + re.escape(symbol + '_prime') + r'\b' + def is_isolated_symbol(self, input_str, range): + patterns = [r'\b' + re.escape(range.name) + r'\b'] + for var in range.var_directions.keys(): + pattern = r'\b' + re.escape(var.name) + r'\b' + patterns.append(pattern) - return bool(re.search(pattern1, input_str)) or bool(re.search(pattern2, input_str)) + for pattern in patterns : + if re.search(pattern, input_str) : + return True + return False - def find_axis2_in_load_store(self): - var = self.axis2_variable() - if not var: + + def find_axis_in_load_store(self, range): + if not range : return False - for line in self.loads._lines: - if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, var.name): + for line in self.loads._lines : + if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, range): return True - for line in self.compute._lines: - if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, var.name): + for line in self.compute._lines : + if line.find('tl.load') >= 0 and self.is_isolated_symbol(line, range): return True - for line in self.post_loop_store._lines: - if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, var.name): + for line in self.post_loop_store._lines : + if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, range): return True - for line in self.stores._lines: - if isinstance(line, 
DeferredLine): + for line in self.stores._lines : + if isinstance(line,DeferredLine) : line = line.line - if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, var.name): - return True - return False - - def find_axis2_in_indexing(self): - var = self.axis2_variable() - if not var: - return False - if self.current_node is None: - return False - for index in self.current_node._body.indexing.values(): - if var.symbol() in index.free_symbols: + if line.find('tl.store') >= 0 and self.is_isolated_symbol(line, range): return True return False @@ -701,9 +790,6 @@ class NPUIndexTritonKernel(TritonKernel): self.stores.clear() self.post_loop_store.clear() self.prefix.clear() - - def is_1d_reduction(self): - return self.numels["r"] > 1 and self.axis2 is None def codegen_body(self): if not ( @@ -714,119 +800,113 @@ class NPUIndexTritonKernel(TritonKernel): ): return - def write_pointwise(): + def write_pointwise() : self.body.splice(self.indexing_code) self.body.splice(self.loads) self.body.splice(self.compute) self.body.splice(self.stores) - def codegen_range(index): - - def loop_body(index, indexing_code, is_last_axis, do_indent=True): + def codegen_range(index) : + def is_1d_reduction() : + return self.numels["r"] > 1 and len(self.numels) == 1 + + def loop_body(index, indexing_code, is_last_axis, do_indent = True ) : if do_indent: self.body.do_indent() - if indexing_code: + if indexing_code : self.body.splice(indexing_code) - if is_last_axis: write_pointwise() else: codegen_range(index + 1) - - if do_indent: + if do_indent : self.body.do_unindent() if index < 0 or index >= len(self.range_tree_nodes): return - nodes = self.sorted_axis - range_node = nodes[index] - is_tilling_asix1 = getattr(range_node, "is_tiling_axis1") - is_tilling_asix2 = getattr(range_node, "is_tiling_axis2") - is_last_axis = index == len(nodes) - 1 - indexing_code = getattr(range_node, "indexing_code") - numof_axis2 = self.numof_reduction_axis() - if is_tilling_asix1: - do_indent = True - reduction_1d = self.is_1d_reduction() - if reduction_1d: - self.body.splice(self.prefix) - self.prefix.clear() - - # multi-dim reduction, i.e. 
var_mean[1,2] - if numof_axis2 > 1: - if range_node.is_split_axis: - offset = f"{range_node.name}_offset" - self.body.writeline(f"for {range_node.name} in range({offset}, " - f"min({offset} + XBLOCK, {range_node.name}_numel)):") - else: - self.body.writeline(f"for {range_node.name} in range({range_node.name}_numel):") - # 1D persistent_reduction or 1d reduction non-first-node - elif self.axis2 is None and (self.persistent_reduction or len(self.loads._lines) == 0): - do_indent = False - if len(self.loads._lines) == 0: - indexing_code = None - else: - self.body.writeline(f"for loop1 in range(loops1):") - - - if not reduction_1d and self.persistent_reduction: - self.body.do_indent() - self.body.splice(self.prefix) - self.prefix.clear() - self.body.do_unindent() - - loop_body(index, indexing_code, is_last_axis, do_indent=do_indent) - - # for 1D reduction, need to add in suffix for persist_reduction or second node of 1d reduction - if self.is_1d_reduction() or self.persistent_reduction: - self.body.splice(self.post_loop_store) - self.post_loop_store.clear() - - - elif is_tilling_asix2: + + range = self.sorted_axis[index] + numof_tilings = len(self.tiling_axis) + last_tiling = range.is_tiling_axis and numof_tilings >=1 and range.tiling_order == len(self.tiling_axis) -1 + next_is_dual_reduction_tiling = index == len(self.sorted_axis) - numof_tilings -1 and self.numof_reduction_axis() + + is_last_axis = index == len(self.sorted_axis) -1 + indexing_code = getattr(range, "indexing_code") + reduction_1d = is_1d_reduction() + do_indent = False + # do nothing except for writing porintwise + if len(self.loads._lines) == 0: do_indent = False - need_axis2_loop = self.find_axis2_in_load_store() - if not need_axis2_loop: + indexing_code = None + #loop_body(index, indexing_code, is_last_axis, do_indent = do_indent) + #return + # tiling axis and last tiling + if range.is_tiling_axis and last_tiling: + do_indent = False + need_axis_loop = self.find_axis_in_load_store(range) + if not need_axis_loop : indexing_code = None - if (not self.inside_reduction or not self.persistent_reduction) \ - and need_axis2_loop: + if (range.prefix != 'r' or not self.persistent_reduction) and need_axis_loop: self.body.splice(self.prefix) - self.body.writeline(f"for loop2 in range(loops2):") + self.body.writeline(f"for loop_{range.name} in range(loops_{range.name}):") do_indent = True loop_body(index, indexing_code, is_last_axis, do_indent) self.body.splice(self.post_loop_store) self.post_loop_store.clear() + + # tiling axis and but not last tiling + elif range.is_tiling_axis : + do_indent = False + if len(self.loads._lines) == 0: + do_indent = False + indexing_code = None + if self.numof_reduction_axis() <= 1 : + do_indent = True + self.body.writeline(f"for loop_{range.name} in range(loops_{range.name}):") + loop_body(index, indexing_code, is_last_axis, do_indent = do_indent) + + elif not is_last_axis : + do_indent = True + if range.is_split_axis : + offset = f"{range.name}_offset" + self.body.writeline(f"for {range.name} in range({offset}, " + f"min({offset} + {range.name.upper()}BLOCK, {range.name}_numel)):") + else : + self.body.writeline(f"for {range.name} in range({range.name}_numel):") + + if not reduction_1d and self.persistent_reduction : + self.body.do_indent() + self.body.splice(self.prefix) + self.prefix.clear() + self.body.do_unindent() - elif is_last_axis and range_node.numel == 1: - #pointwise , last axis =1 + loop_body(index, indexing_code, is_last_axis, do_indent = do_indent) + else : write_pointwise() - else: - if 
range_node.is_split_axis: - offset = f"{range_node.symbol()}_offset" - self.body.writeline(f"for {range_node.symbol()} in range({offset}, min({offset} + XBLOCK, {range_node.name}_numel)):") - else: - self.body.writeline(f"for {range_node.symbol()} in range({range_node.name}_numel):") - loop_body(index, indexing_code, is_last_axis) if self.first_node: for node in self.sorted_axis: node.codegen_header(self.body) - + + while True : + if not self.sorted_axis[-1].is_tiling_axis : + x = self.sorted_axis[-1] + self.sorted_axis.pop(-1) + self.sorted_axis.insert(0, x) + else : + break if self.first_node: codegen_range(0) - else: - if self.axis2 is None: - codegen_range(0) - else: - axis2_order = self.range_tree_nodes[self.axis2].sorted_order - if self.persistent_reduction and self.numof_reduction_axis() > 1: - axis2_order = axis2_order - self.numof_reduction_axis() + 1 - for _ in range(axis2_order): - self.body.do_indent() - codegen_range(axis2_order) - for _ in range(axis2_order): - self.body.do_unindent() + else : + last_axis_order = self.tiling_axis[-1].sorted_order + if self.persistent_reduction and self.numof_reduction_axis() > 1 : + last_axis_order = last_axis_order - self.numof_reduction_axis() + 1 + for _ in range(last_axis_order) : + self.body.do_indent() + codegen_range(last_axis_order) + for _ in range(last_axis_order) : + self.body.do_unindent() self.cse.invalidate(self.outside_loop_vars) self.loads.clear() @@ -838,14 +918,12 @@ class NPUIndexTritonKernel(TritonKernel): # for creat constant tensor, if have two axis, constant=tl.full([1,1]) else tl.full([1]) def triton_tensor_ndim(self): - if self.numof_reduction_axis() > 1: + if self.numof_reduction_axis() > 1 : return 1 - if self.axis1 is not None and self.axis2 is not None: - ndim = 2 - else: - ndim = 1 - return ndim + + return len(self.tiling_axis) + # fixme, indexing.mask_str is None , see varmean_test.py def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): if not self.inside_reduction: raise RuntimeError("assert self.inside_reduction") @@ -871,46 +949,29 @@ class NPUIndexTritonKernel(TritonKernel): if not isinstance(indexing, IndexingOptions): raise RuntimeError("assert isinstance(indexing, IndexingOptions)") line = f"tl.store({var} + ({indexing.index_str} ), {value}, {indexing.mask_str})" - if self.numof_reduction_axis() > 1: + if self.numof_reduction_axis() > 1 : line = f"tl.store({var} + ({indexing.index_str} + tl.arange(0,1) ), {value}, {indexing.mask_str})" self.post_loop_store.writeline( - DeferredLine(name, line) + DeferredLine( name, line ) ) - def apply_var_prime(self, index, line, mask): - # axis should only be replaced once - axis_list = [] - for key in index.as_coefficients_dict().keys(): - if not key.free_symbols: - continue - symbol = list(key.free_symbols)[0] - if symbol not in self.range_tree_nodes: - continue - range_node = self.range_tree_nodes[symbol] - if (range_node.is_tiling_axis1 or range_node.is_tiling_axis2) and (symbol not in axis_list): - line = line.replace(f"{range_node.name}", f"{range_node.name}_prime") - mask = mask.replace(f"{range_node.name}", f"{range_node.name}_prime") - axis_list.append(symbol) - return line, mask - - # apply xxx_prime var in case dim are permuted + + # apply new var in case dim are permuted/broadcast def store( self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None ) -> None: var = self.args.output(name) original_index = index - indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None) + index_analyze = 
IndexAnalysis(self, index, is_store_index=True) + index_analyze.analyze_index() + indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None, index_analyze=index_analyze) index_str = indexing.index_str value_str = f"{value}" - - # need to reshape when value's dimensions > 2, e.g. (XBLOCK,1,RBLOCK) - is_permuted = self.need_permuted(index) - mask_str = indexing.mask_str - if is_permuted: - index_str, mask_str = self.apply_var_prime(index, index_str, indexing.mask_str) - value_str = value_str.replace(f"{value}", f"{value}.permute(1,0)") + + if index_analyze.need_permute : + value_str = value_str.replace(f"{value}", f"{value}{index_analyze.generate_statement()}") advance_block_ptr = None if isinstance(indexing, BlockPtrOptions): @@ -923,7 +984,7 @@ class NPUIndexTritonKernel(TritonKernel): ) elif mode is None: line = f"tl.store({var} + ({index_str}), {value_str}, {mask_str})" - if len(self.axis2_list) > 1: + if self.numof_reduction_axis() > 1 : line = f"tl.store({var} + ({index_str} + tl.arange(0,1) ), {value_str}, {indexing.mask_str})" elif mode == "atomic_add": @@ -938,215 +999,106 @@ class NPUIndexTritonKernel(TritonKernel): if not self.inside_reduction: self.outside_loop_vars.add(value) + def find_reduction_node(self): + node = self.current_node + if node is not None and isinstance(node, SchedulerNode) : + reduction = node.node.data + if reduction is not None and isinstance(reduction, ir.Reduction) : + return reduction + + for node in self.node_schedule: + if node in (EnableReduction, DisableReduction): + continue + reduction = node.node.data + if reduction is not None and isinstance(reduction, ir.Reduction) : + return reduction - @staticmethod - def _get_next_scheduler_node(node_schedule, current_node): - found_current = False if current_node else True - for node in node_schedule: - if isinstance(node, SchedulerNode): - if not found_current and node.get_name() == current_node.get_name(): - found_current = True - continue - if found_current: - return node return None - def get_next_scheduler_node(self, node): - return self._get_next_scheduler_node(self.node_schedule, node) - - def get_prev_scheduler_node(self, node): - return self._get_next_scheduler_node(reversed(self.node_schedule), node) - - def check_all_index_is_1d_for_dual_reduction(self): - if self.numof_reduction_axis() <= 1: - return False - - all_index_is_1d = True - for _, index in self.current_node._body.indexing.items(): - count = 0 - for symbol in index.free_symbols: - if symbol in self.axis2_list: - count = count + 1 - if count > 1: - all_index_is_1d = False - - if not all_index_is_1d: - break - return all_index_is_1d - - # to generate the shape of the accumulator of RBLOCK loop - def dense_size_list(self, is_permute) -> List[str]: - - sizes = [] - if self.numof_reduction_axis() > 1: - sizes = [] if self.check_all_index_is_1d_for_dual_reduction() else [f"RBLOCK_{axis}" for axis in self.axis2_list] - return sizes - if self.persistent_reduction and self.axis2 is None: - sizes = ["RBLOCK"] - return sizes - # current computedbuffer is reduction - cb_is_reduction = self.inside_reduction if not self.current_node else isinstance(self.current_node.node.data, ir.Reduction) - - for tree in self.sorted_axis: - if tree.is_tiling_axis1: - sizes.append("XBLOCK_SUB") - elif tree.is_tiling_axis2: - sizes.append("RBLOCK") - - if cb_is_reduction and self.inside_reduction and self.is_higher_order_reduction() or is_permute: - sizes = sizes[::-1] - + # select the golden varlist, from to which to deduce permute, broadcast shape 
+ def select_golden_varlist(self) : + longest = None + maximum_length = 0 + self.golden_var_list = None + def all_tiling_in_var_list(var_list) : + return all([x in var_list for x in self.tiling_axis]) + # all are load indexings, select the longest as gold + for index in self.load_store_indexing: + index = index.subs(V.graph.sizevars.var_to_val) + analyze = IndexAnalysis(self, index) + if len(analyze.var_list) > maximum_length and all_tiling_in_var_list(analyze.var_list) : + longest = analyze.var_list + maximum_length = len(longest) + #fixme , this may cause problems + if not longest : + self.golden_var_list = tuple([x.symbol() for x in self.tiling_axis]) if self.tiling_axis else [] + else : + self.golden_var_list = tuple([x for x in longest if x in self.tiling_axis]) if self.tiling_axis else [] + assert self.golden_var_list is not None + + # to generate shape of the tile + def dense_size_list(self) -> List[str]: + if self.inside_reduction : + if not self.reduce_analysis: + self.reduce_analysis = ReductionAnalysis(self) + return self.reduce_analysis.dense_size_list() + + if not self.golden_var_list : + self.select_golden_varlist() + + golden_var_list = self.golden_var_list if self.golden_var_list else [x.symbol() for x in self.tiling_axis] + assert golden_var_list is not None + #shape = range(len(self.golden_var_list)) + sizes = [None for _ in golden_var_list ] + for i, var in enumerate(reversed(golden_var_list)) : + axis = self.range_tree_nodes[var] + sizes[i] = f"{axis.name.upper()}BLOCK_SUB" return sizes - def dense_size_str(self, is_permute=False): - sizes = self.dense_size_list(is_permute) - if self.numof_reduction_axis() > 1: - return f"[{'* '.join(sizes)}]" + def dense_size_str(self): + if self.inside_reduction : + if not self.reduce_analysis: + self.reduce_analysis = ReductionAnalysis(self) + return self.reduce_analysis.dense_size_str() + sizes = self.dense_size_list() return f"[{', '.join(sizes)}]" - def filter_masks(self, mask_vars): - for node in self.sorted_axis: - if not(node.is_tiling_axis1 or node.is_tiling_axis2): - mask_vars.discard(f"{node.name}_mask") - if len(self.axis2_list) > 1 and not node.is_tiling_axis2: - mask_vars.discard(f"{node.name}_mask") - # and add to shape to value - def reduction_resize(self, value): + def reduction_resize(self, value, dim): ndims = self.triton_tensor_ndim() if ndims == 1: - return f"triton_helpers.promote_to_tensor({value})" - is_higher_order_reduction = self.is_higher_order_reduction() - - expand_str = "1," if is_higher_order_reduction else ",1" - if is_higher_order_reduction: - return f"{value}.reshape({expand_str}XBLOCK_SUB)" - else: - return f"{value}.reshape(XBLOCK_SUB{expand_str})" - - def get_axis_direction(self, is_axis1, is_reversed=False): - - if self.check_all_index_is_1d_for_dual_reduction(): - result = AxisDirection.Flat - elif not self.inside_reduction: - if self.numof_tiling_axis() > 1: - result = AxisDirection.Vertical if is_axis1 else AxisDirection.Horizontal - else: - result = AxisDirection.Flat - else: - if is_axis1: - result = AxisDirection.Horizontal if V.kernel.is_higher_order_reduction() else AxisDirection.Vertical - else: - result = AxisDirection.Vertical if V.kernel.is_higher_order_reduction() else AxisDirection.Horizontal - - result = reverse_direction(result) if is_reversed else result - return result - - def is_higher_order_reduction(self, check_prev_node=False): - if self.numof_reduction_axis() > 1: - return False - if not (self.inside_reduction): - raise RuntimeError("assert self.inside_reduction") - - if 
self.inside_high_order_reduction: - return self.inside_high_order_reduction - - node = self.current_node if self.current_node is not None else self.get_prev_scheduler_node(None) - if node is None or not isinstance(node, SchedulerNode): - return False - - reduction = node.node.data - while check_prev_node and reduction is not None and not isinstance(reduction, ir.Reduction): - node = self.get_prev_scheduler_node(node) - if node is None: - reduction = None - else: - reduction = node.node.data - - - if reduction is None or not isinstance(reduction, ir.Reduction): - return False - if not hasattr(reduction, "reduced_idx"): - return False - - reduced_order = reduction.reduced_idx[0] - is_last_axis = all(_ < reduced_order for _ in reduction.kept_idx) - self.inside_high_order_reduction = not is_last_axis - return self.inside_high_order_reduction - - def get_axis_dtype(self, axis): - dtype = None - if axis is None: - return None - for node in self.node_schedule: - if node in (EnableReduction, DisableReduction): - continue - if axis.symbol() in node._body.indexing_map: - dtype = V.graph.get_dtype(node.node.name) - break - if dtype is None: - should_break_all = False - for node in self.node_schedule: - if should_break_all: - break - if node in (EnableReduction, DisableReduction): - continue - for key, _ in node._body.indexing_map.items(): - if key in self.range_tree_nodes: - dim = self.range_tree_nodes[key] - else: - dim = self.range_tree_nodes_removed[key] - - if dim.parent == axis.parent: - dtype = V.graph.get_dtype(node.node.name) - should_break_all = True - break - return dtype - - def create_inductor_meta(self): - mutated_args = set() - for mutation in self.mutations: - if mutation in self.args.input_buffers: - mutated_args.add(self.args.input_buffers[mutation]) - if ( - mutation in self.args.inplace_buffers - and mutation not in V.graph.removed_buffers - and mutation not in self.removed_buffers - ): - mutated_args.add(self.args.inplace_buffers[mutation].inner_name) - if mutation in self.args.output_buffers: - mutated_args.add(self.args.output_buffers[mutation]) - mutated_args = sorted(mutated_args) - axis1_order = self.range_tree_nodes[self.axis1].sorted_order if self.axis1 is not None else None - axis2_order = self.range_tree_nodes[self.axis2].sorted_order if self.axis2 is not None else None - split_axis_dtype = self.get_axis_dtype(self.split_axis) - inductor_meta = { - "autotune_hints": set(self.autotune_hints), - "kernel_name": str(Placeholder.DESCRIPTIVE_NAME), - "mutated_arg_names": mutated_args, - "no_x_dim": self.no_x_dim, - # Due to breaking change of triton 3.0, the original invocation is broken - "backend_hash": self.patch_triton_hash(), # torch.utils._triton.triton_hash_with_backend(), - "split_axis_order": self.split_axis.sorted_order if self.split_axis is not None else None, - "axis1_order": axis1_order, - "axis2_order": axis2_order, - "low_dims": self.low_dims, - "numof_reduction_axis": self.numof_reduction_axis(), - "split_axis_dtype": split_axis_dtype, - "traced_graph_hash": "TRACED_GRAPH_HASH" - } - return inductor_meta + return f"triton_helpers.promote_to_tensor({value})" + dense_list = self.dense_size_list() + dense_list[dim] = "1" + expand_str = ", ".join(dense_list) + return f"{value}.reshape({expand_str})" + #return f"{value}" + + # FIXME, to determine reduction_dim def reduction_dim(self): - if not self.inside_reduction: - raise RuntimeError("assert self.inside_reduction") - if self.numof_reduction_axis() > 1: + if not self.reduce_analysis: + self.reduce_analysis = 
ReductionAnalysis(self) + return self.reduce_analysis.reduced_dim + + def filter_masks(self, mask_vars): + for node in self.sorted_axis: + if not(node.is_tiling_axis ): + mask_vars.discard(f"{node.name}_mask") + + def numof_reduction_axis(self): + root = self.range_trees[-1] + if root is None : return 0 - return 0 if self.is_higher_order_reduction() or len(self.sorted_axis) == 1 else 1 - def reduction_var(self): - var = self.axis2 - return var + return len(root.var_list) + + + def reduction_axis_list(self): + root = self.range_trees[-1] + if root is None : + return [] + return root.var_list def reduction( self, @@ -1163,8 +1115,9 @@ class NPUIndexTritonKernel(TritonKernel): if self._load_mask: masks.append(self._load_mask) reduction_range_prefix = self.range_trees[-1].prefix - - dense_size_str = self.dense_size_str(False) + if not self.reduce_analysis: + self.reduce_analysis = ReductionAnalysis(self) + dense_size_str = self.dense_size_str() if len(dense_size_str) > 2: value = self._map_tuple_or_scalar( @@ -1179,22 +1132,21 @@ class NPUIndexTritonKernel(TritonKernel): root_op: str def final_reduction(value): - module = "tl" - # use tl - # use tl.max + #use_helper = reduction_type in {"any", "max", "min", "prod"} + module = "tl" # use tl if reduction_type in {"max", "min"}: - return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})") - return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})") + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})", dim) + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})", dim) def final_argreduce(buffer, result_var, value, index): buffer.splice( f"""\ _, {result_var}_tmp = triton_helpers.{root_op}_with_index({value}, {index}, {dim}) - {result_var} = {self.reduction_resize(f'{result_var}_tmp')} + {result_var} = {self.reduction_resize(f'{result_var}_tmp', dim)} """ ) - def get_reduction_axis(): + def get_reduction_axis() : return list(self.range_tree_nodes.values())[-1] cache_key = (src_dtype, reduction_type, value) @@ -1208,6 +1160,7 @@ class NPUIndexTritonKernel(TritonKernel): result_var.mask_vars = {var for var in masks if var[0] != "r"} cond = " & ".join(masks) + def where_cond(tval, fval): if not cond: return tval @@ -1218,25 +1171,21 @@ class NPUIndexTritonKernel(TritonKernel): default = self._map_tuple_or_scalar(constant_repr, default) def _mask_value(value, default): - return self.cse.generate(self.compute, where_cond(value, default), dtype=value.dtype) - - if self.numof_reduction_axis() == 1: + return self.cse.generate(self.compute, where_cond(value, default) , dtype=value.dtype) + # fixme masked_value doesn't work dual reduction + if self.numof_reduction_axis() == 1 : if isinstance(value, tuple): masked_value = [_mask_value(v, d) for v, d in zip(value, default)] else: masked_value = _mask_value(value, default) - else: + else : masked_value = value if reduction_type in {"argmax", "argmin", "max", "min"}: reduce_axis = get_reduction_axis() broadcast_string: str - if self.is_1d_reduction(): - broadcast_string = f"tl.broadcast_to({reduce_axis.symbol()}.reshape({reduction_range_prefix.upper()}BLOCK), {masked_value}.shape)" - elif self.is_higher_order_reduction(): - broadcast_string = f"tl.broadcast_to({reduce_axis.symbol()}.reshape({reduction_range_prefix.upper()}BLOCK,1), {masked_value}.shape)" - else: - broadcast_string = f"tl.broadcast_to({reduce_axis.symbol()}.reshape(1,{reduction_range_prefix.upper()}BLOCK), {masked_value}.shape)" + reshape_str = 
self.reduce_analysis.get_reduce_dim_reshape(reduce_axis) + broadcast_string = f"tl.broadcast_to({reduce_axis.symbol()}.reshape({reshape_str}), {masked_value}.shape)" accumulator_index = str( self.cse.generate( self.compute, @@ -1316,139 +1265,24 @@ class NPUIndexTritonKernel(TritonKernel): self.outside_loop_vars.add(result_var) return result_var - - #XBLICK:split size, XBLOCK_SUB : tile1 size, RBLOCK:tile2 size - def add_autotune_args(self, argdefs): - # no tiling in this case - if self.persistent_reduction and self.axis2 is None: - return - argdefs.append(f"XBLOCK: tl.constexpr") - if self.numof_reduction_axis() <= 1: - argdefs.append(f"XBLOCK_SUB: tl.constexpr") - if self.axis2 is not None and not self.persistent_reduction: - argdefs.append(f"RBLOCK: tl.constexpr") - - def _get_heuristic(self): - if self.persistent_reduction: - if not (self.inside_reduction): - raise RuntimeError(" assert self.inside_reduction") - - return "persistent_reduction_npu_index" - elif self.inside_reduction: - return "reduction_npu_index" - return "pointwise_npu_index" - - def need_broadcast(self, index: sympy.Expr): - tiling_axis = [False, False] - for axis in index.free_symbols: - if axis not in self.range_tree_nodes: - continue - if self.range_tree_nodes[axis].is_tiling_axis1: - tiling_axis[0] = True - elif self.range_tree_nodes[axis].is_tiling_axis2: - tiling_axis[1] = True - #implict broadcast - result = (self.numof_tiling_axis() > 1 and not self.persistent_reduction) and (tiling_axis[1] ^ tiling_axis[0]) - result = result and self.find_axis2_in_indexing() - return result, tiling_axis - - def current_node_has_permute(self): - if not self.current_node: - return False - for index in self.current_node._body.indexing.values(): - if self.need_permuted(index): - return True - return False - - def need_permuted(self, index: sympy.Expr): - if self.numof_tiling_axis() <= 1: - return False - - need_permute = False - tmp_list = [] - coefficients_dict = index.as_coefficients_dict() - need_permute_axis1 = False - need_permute_axis2 = False - for key, value in coefficients_dict.items(): - if not key.free_symbols: - continue - key = list(key.free_symbols)[0] - if key not in self.range_tree_nodes: - continue - axis = self.range_tree_nodes[key] - # normally, axis2 is lowest dimension, except for higher_order_reduction - if (self.inside_reduction and self.is_higher_order_reduction(True)): - if axis.is_tiling_axis1 and value > sympy.Integer(1): - need_permute_axis1 = True - elif axis.is_tiling_axis2 and value > sympy.Integer(1): - need_permute_axis2 = True if self.numof_reduction_axis() <= 1 else isinstance(axis.expr, ModularIndexing) - tmp_list.append(True if value > sympy.Integer(1) else False) - - # If all axes have coefficients greater than 1, - # then the stride is not 1, and in this case, return false, - # indicating that the transpose is not required. 
- if all(tmp_list): - return False - return need_permute_axis1 or need_permute_axis2 - - def get_reshape_dense_str(self, tiling_axis): - # there must be one tiling asis missing - if not (tiling_axis[1] or tiling_axis[0]): - raise RuntimeError("assert tiling_axis[1] or tiling_axis[0]") - - sizes = ["XBLOCK_SUB", "1"] - if not tiling_axis[0]: - sizes = ["1", "RBLOCK"] - - if self.inside_reduction and self.is_higher_order_reduction(): - sizes = reversed(sizes) - return f"[{', '.join(sizes)}]" - - def get_reshape_str(self, tiling_axis, check_prev_node=True): - # there must be one tiling asis missing - if not (tiling_axis[1] or tiling_axis[0]): - raise RuntimeError("assert tiling_axis[1] or tiling_axis[0]") - - sizes = ["XBLOCK_SUB", "RBLOCK"] - if not tiling_axis[0]: - sizes[0] = "1" - elif not tiling_axis[1]: - sizes[1] = "1" - if self.inside_reduction and self.is_higher_order_reduction(check_prev_node): - sizes = reversed(sizes) - - return f"[{', '.join(sizes)}]" - - def get_broadcast_dense_str(self, tiling_axis, check_prev_node=True): - # there must be one tiling asis missing - if not (tiling_axis[1] or tiling_axis[0]): - raise RuntimeError("assert tiling_axis[1] or tiling_axis[0]") - - sizes = ["XBLOCK_SUB", "RBLOCK"] - if self.inside_reduction and self.is_higher_order_reduction(check_prev_node): - sizes = reversed(sizes) - return f"[{', '.join(sizes)}]" - + #broadcast, permute handling def load(self, name: str, index: sympy.Expr): var = self.args.input(name) original_index = index - is_permuted = self.need_permuted(index) store_cache = self.cse.store_cache if name in store_cache: - broadcasted, tiling_axis = self.need_broadcast(original_index) + index_analyze = IndexAnalysis(self, index) + index_analyze.analyze_index() result_var = store_cache[name] - if broadcasted: - line = f"{result_var}.broadcast_to({self.get_broadcast_dense_str(tiling_axis, True)})" + if index_analyze.need_permute: + line = f"{result_var}{index_analyze.generate_statement()}" buffer = self.compute if self.persistent_reduction else self.loads result_var = self.cse.generate(buffer, line, dtype=result_var.dtype) - elif is_permuted: - line = f"{result_var}.permute(1,0)" - buffer = self.compute if self.persistent_reduction else self.loads - result_var = self.cse.generate(self.loads, line, dtype=result_var.dtype) return result_var - need_broadcast, tiling_axis = self.need_broadcast(index) + index_analyze = IndexAnalysis(self, index) + index_analyze.analyze_index() indirect_indexing = self.is_indirect_indexing(index) indexing = self.indexing(index, block_ptr=True) has_rindex = indexing.has_rindex() @@ -1484,17 +1318,15 @@ class NPUIndexTritonKernel(TritonKernel): ) elif isinstance(original_index, sympy.Integer): line = f"tl.load({var} + ({original_index}))" - num_size = len(self.dense_size_list(is_permuted)) - append_broadcast = "[1, 1]" if (num_size > 1) else "[1]" + full_list = ["1"] * (len(self.tiling_axis) if self.tiling_axis else 1 ) + append_broadcast = f"[{', '.join(full_list)} ]" else: index_str = indexing.index_str mask_str = indexing.mask_str - if is_permuted: - index_str, mask_str = self.apply_var_prime(index, index_str, mask_str) line = f"tl.load({var} + ({index_str}), {mask_str}{ep}{other})" dtype = V.graph.get_dtype(name) - if dtype in (torch.bfloat16, ): + if dtype in (torch.bfloat16, ): line += ".to(tl.float32)" if dtype == torch.bool and torch.version.hip is None: line += ".to(tl.int1)" @@ -1521,13 +1353,14 @@ class NPUIndexTritonKernel(TritonKernel): if append_broadcast and append_broadcast != '[]': line = 
f"tl.broadcast_to({result_var}, {append_broadcast})" - result_var = self.cse.generate(load_buffer, line, dtype=dtype) - elif need_broadcast and not indirect_indexing: - line = f"{result_var}.broadcast_to({self.get_broadcast_dense_str(tiling_axis)})" - result_var = self.cse.generate(load_buffer, line, dtype=dtype) - elif is_permuted: - line = f"{result_var}.permute(1,0)" - result_var = self.cse.generate(self.loads, line, dtype=dtype) + result_var = self.cse.generate(load_buffer, line, dtype = dtype) + # triton can handle broadcast + # elif need_broadcast and not indirect_indexing: + # line = f"{result_var}.broadcast_to({self.get_broadcast_dense_str(broadcast_shape)})" + # result_var = self.cse.generate(load_buffer, line, dtype = dtype) + elif index_analyze.need_permute : + line = f"{result_var}{index_analyze.generate_statement()}" + result_var = self.cse.generate(self.loads, line, dtype = dtype) if advance_block_ptr: load_buffer.writeline(advance_block_ptr) @@ -1541,7 +1374,9 @@ class NPUIndexTritonKernel(TritonKernel): def prepare_indexing( self, index: sympy.Expr, + index_analyze ): + #index = self.simplify_indexing(index) index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) # if simple replacements didn't get rid of floor/ceil, try full subs if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): @@ -1559,17 +1394,32 @@ class NPUIndexTritonKernel(TritonKernel): replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} index = sympy_subs(index, replacements) + #simp_index = self.simplify_indexing(index) simp_index = index - # Now that we are done simplifying we can unwrap Identity so that downstream handling - # for its contained expression will work. previously, tl.full wrapping of sympy.Integer - # would not occur simp_index = ( simp_index if not isinstance(simp_index, Identity) else simp_index.args[0] ) - + + # to generate range.var_directions for permuted axis + index_analyze.analyze_index() return self.codegen_indexing(simp_index) + + def replace_index_vars(self, index, index_analyze) : + + new_index = index + if index_analyze.var_replacements : + new_index = sympy_subs(index, index_analyze.var_replacements) + return new_index + + + def index_to_str(self, index: sympy.Expr) -> str: + if isinstance(index, list): + return f"[{', '.join(map(self.index_to_str, index))}]" + index = self.rename_indexing(index) + return self.kexpr(index) # type: ignore[call-arg] + #1. only remove the line which asserts index var should be in "xyr" #2. don't do simplify_indexing, which combine continuous dims #3. 
removed block_ptr, removed dense mask/broadcast support @@ -1583,13 +1433,19 @@ class NPUIndexTritonKernel(TritonKernel): dense_indexing=False, override_mask=None, block_ptr=False, + index_analyze = None ) -> Union[IndexingOptions, BlockPtrOptions]: """ Compute the index and mask to pass to tl.load() or tl.store() """ - index = self.prepare_indexing(index) + if not index_analyze : + index_analyze = IndexAnalysis(self, index) + index_analyze.analyze_index() + + index = self.prepare_indexing(index, index_analyze) index_vars = index.free_symbols has_rindex = False + #index = self.simplify_indexing(index) index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) # if simple replacements didn't get rid of floor/ceil, try full subs if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): @@ -1605,6 +1461,9 @@ class NPUIndexTritonKernel(TritonKernel): replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} index = sympy_subs(index, replacements) + #if not self.inside_reduction : + index = self.replace_index_vars(index, index_analyze) + #index = self.simplify_indexing(index) index_vars = index.free_symbols has_rindex = False @@ -1628,9 +1487,9 @@ class NPUIndexTritonKernel(TritonKernel): expand_str = None index_str = self.index_to_str(index) - is_permute = self.need_permuted(index) + if isinstance(index, sympy.Integer): - expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str(is_permute) + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() if (index != 0): index_str = f"tl.full({expand_str}, {index_str}, tl.int32)" else: @@ -1663,6 +1522,7 @@ class NPUIndexTritonKernel(TritonKernel): self.range_tree_nodes[sym].codegen() # type: ignore[index] return expr + #FIXME, when xindex(16) -> x2:2,x3:8, when new length:16 in , should return (x2,x3) def split_and_set_ranges(self, lengths: Sequence[Sequence[sympy.Expr]]): groups = [rt.numel for rt in self.range_trees] if not self.inside_reduction: @@ -1746,8 +1606,6 @@ class NPUIndexTritonKernel(TritonKernel): # scroll to next group with remaining elements current_group += 1 size_hint = sv.size_hint(size) - if current_group >= len(remaining): - pdb.set_trace() if size_hint > size_hints(remaining[current_group]): #add multiple ranges (two or more) to the list, as well as the getter funcs add_multiple_range(size_hint, return_getters) @@ -2025,66 +1883,6 @@ class NPUIndexTritonKernel(TritonKernel): sorter: Optional[Tuple[str, sympy.Expr]] = None, sorter_indices: Optional[CSEVariable] = None, ) -> CSEVariable: - """ - [Note: Inductor bucketize op] - - Inputs: - ------- - values: the values to be bucketized. - boundaries: a tuple containing - (a) the name of the boundaries tensor (which must be sorted, unless - the sorting tensor is present), - (b) the length of the tensor in the last dimension (i.e. the length of - one set of boundaries), - (c) the number of elements in the underlying storage (i.e. the length - of the flattened tensor, ignoring striding), and - (d) the stride of the tensor in the last dimension. - boundary_indices: indices into a flattened version of the boundaries - tensor, of the same size and shape as "values". Each index points to - the first element in the set of boundaries to be used for the - corresponding value. - indexing_dtype: the dtype to use when indexing into the boundaries - tensor. This must be int64 or int32. This additionally specifies the - dtype of the return value. - right: see "Details" below. 
- sorter: an optional tuple containing - (a) the name of an optional sorting tensor, used to access unsorted - boundaries without reordering the boundaries tensor, and - (b) the stride of the tensor in the last dimension. - The values in the sorting tensor are used as indices into the *last* - dimension of the boundaries tensor, with all other indices matching. - The size of the sorting and boundaries tensors must be equivalent. - sorter_indices: must be present if the sorting array is present; see - "boundary_indices" for the equivalent definition for the boundaries - tensor. - - Output: - ------- - The buckets each value belongs in, within a given set of boundaries. 0 - indicates a position before the first boundary, and len(boundaries_set) - represents a position after the last boundary. - - Details: - -------- - Given a value and a set of boundaries, calculate the bucket that each - value belongs to. This works differently in 1-D and N-D cases. - - for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [0, 4, 4, 8], right=True - return = [[ 0, 1, 1, 1], [1, 3, 3, 4]]. - - for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [[0, 4], [4, 8]], right=True - return = [[ 0, 1, 1, 1], [0, 1, 1, 2]] - - Note that in the N-D boundaries case, the shape of "values" and - "boundaries" must match in every dimension _except_ the last. - - When right == False, bucket i refers to range (boundaries[i], boundaries[i+1]]. - When right == True, bucket i refers to range [boundaries[i], boundaries[i+1]). - - Boundaries must be non-decreasing, or a sorter must be provided which - would re-index offsets in a non-decreasing order (e.g. the second output - of torch.sort(offsets)). Otherwise, the result is undefined. - """ return self.bucketize( values, boundaries, diff --git a/torch_npu/_inductor/codegen/wrapper.py b/torch_npu/_inductor/codegen/wrapper.py index 8daeafbf63..2f6772b5e5 100644 --- a/torch_npu/_inductor/codegen/wrapper.py +++ b/torch_npu/_inductor/codegen/wrapper.py @@ -5,7 +5,7 @@ from torch._inductor.utils import ( ) from torch._inductor.runtime import triton_heuristics from torch._inductor import config - +import copy class NPUWrapperCodeGen(PythonWrapperCodegen): def __init__(self): @@ -19,6 +19,15 @@ class NPUWrapperCodeGen(PythonWrapperCodegen): return SubgraphPythonWrapperCodegen(subgraph_name, parent_wrapper) return NPUWrapperCodeGen() + def write_header(self) -> None: + super().write_header() + self.imports.splice( + f""" + import torch_npu + """, + strip=True, + ) + @cache_on_self def write_triton_header_once(self) -> None: import_str = f""" @@ -32,6 +41,7 @@ class NPUWrapperCodeGen(PythonWrapperCodegen): cooperative_reduction_grid, ) from torch_npu._inductor.npu_triton_heuristics import grid + import torch_npu """ if config.triton.autotune_at_compile_time: self.kernel_autotune_calls.splice(import_str) @@ -71,3 +81,7 @@ class NPUWrapperCodeGen(PythonWrapperCodegen): # don't assert def codegen_input_size_asserts(self) -> None: pass + + def get_next_kernel_suffix(self) -> str: + iter = copy.copy(self._names_iter) + return f"{next(iter)}" diff --git a/torch_npu/_inductor/config.py b/torch_npu/_inductor/config.py index 6817d3e393..e7bc046065 100644 --- a/torch_npu/_inductor/config.py +++ b/torch_npu/_inductor/config.py @@ -54,5 +54,5 @@ logging.basicConfig( log = logging.getLogger(__name__) aggresive_autotune = os.getenv("INDUCTOR_ASCEND_AGGRESSIVE_AUTOTUNE", '0').lower() in ('1', 'true') - +inductor_static_mode = os.environ.get('INDUCTOR_STATIC_MODE', '0').lower() in ('1', 'yes', 'true') 
profile_path = "./profile_result/" \ No newline at end of file diff --git a/torch_npu/_inductor/decomposition.py b/torch_npu/_inductor/decomposition.py index af5ecbf311..17a9b00adc 100644 --- a/torch_npu/_inductor/decomposition.py +++ b/torch_npu/_inductor/decomposition.py @@ -13,7 +13,9 @@ DECOMPOSITION_OVERLOAD_OP = [ # aten.gelu, aten.nll_loss_backward, aten._log_softmax_backward_data, - aten.embedding_dense_backward + aten.embedding_dense_backward, + aten.addmm, + aten.gelu ] diff --git a/torch_npu/_inductor/lowering.py b/torch_npu/_inductor/lowering.py index a7580f8cb4..caac343535 100644 --- a/torch_npu/_inductor/lowering.py +++ b/torch_npu/_inductor/lowering.py @@ -13,30 +13,6 @@ from torch._prims_common import ( from torch._inductor.decomposition import decompositions, pw_cast_for_opmath import torch._ops -from torch._inductor.lowering import ( - lowerings, - make_fallback, - register_lowering, - to_dtype, - # make_reduction, - # reduce_amax, - # reduce_amin, - fallback_cumsum, - _validate_reduction_axis, - div, - squeeze, - square, - sub, - fallback_handler, - is_boolean_type, - logical_and, - make_pointwise, - _make_reduction_inner, - _validate_reduction_axis, -) -import torch_npu -from torch_npu import npu_dtype_cast - def make_reduction(reduction_type: str, override_return_dtype=None): def inner(x, axis=None, keepdims=False, *, dtype=None): @@ -50,8 +26,7 @@ def make_reduction(reduction_type: str, override_return_dtype=None): result = Reduction.create(reduction_type=reduction_type, input_node=x, **kwargs) if isinstance( result.data.data, Reduction - ): - #Only realize if reduction isn't unrolled + ): #Only realize if reduction isn't unrolled size = x.get_size() axis = set(_validate_reduction_axis(x, axis)) kept_idx = [] @@ -73,11 +48,36 @@ def make_reduction(reduction_type: str, override_return_dtype=None): lowering.make_reduction = make_reduction +from torch._inductor.lowering import ( + lowerings, + make_fallback, + register_lowering, + to_dtype, + # make_reduction, + # reduce_amax, + # reduce_amin, + fallback_cumsum, + _validate_reduction_axis, + div, + squeeze, + square, + sub, + fallback_handler, + is_boolean_type, + logical_and, + make_pointwise, + _make_reduction_inner, + _validate_reduction_axis, +) aten = torch.ops.aten tr_c10d = torch.ops.tr_c10d prims = torch.ops.prims +import torch_npu + +from torch_npu import npu_dtype_cast + def _init_set(input_list, output_set): for fn in input_list: @@ -99,7 +99,7 @@ GENERATE_LIST = [ aten.select, aten.unsqueeze, aten.repeat, - #aten.clone, + aten.clone, #remove this, case permute_reshape will fail aten.reshape, aten.where, aten.lt, @@ -168,9 +168,14 @@ FALLBACK_LIST = [] LOWERING_OVERLOAD_OP = [ aten.cumsum, aten.mean, - # aten.max, - # aten.min, - # aten.mul, + aten.max, + aten.min, + aten.amin, + aten.amax, + aten.argmax, + aten.argmin, + aten.sum, + aten.var_mean, aten.var, @@ -180,7 +185,7 @@ LOWERING_OVERLOAD_OP = [ aten.nll_loss_forward, aten.gather, aten.cat, - aten.clone + #aten.clone, cast permute_reshape will fail if enable this ] @@ -209,6 +214,48 @@ def _register_npu_inductor_fallbacks(): if op in lowerings: del lowerings[op] + # register the reductions useing custom make_reduction + reduce_amax = register_lowering(aten.amax)(make_reduction("max")) + reduce_amin = register_lowering(aten.amin)(make_reduction("min")) + reduce_argmax = register_lowering(aten.argmax)( + make_reduction("argmax", override_return_dtype=torch.int64) + ) + reduce_argmin = register_lowering(aten.argmin)( + make_reduction("argmin", 
override_return_dtype=torch.int64) + ) + @register_lowering([aten.sum, prims.sum]) + def sum_(x, axis=None, keepdims=False, *, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + dtype = torch.int64 + + fn = make_reduction("sum", override_return_dtype=dtype) + return fn(x, axis, keepdims, dtype=dtype) + + + @register_lowering(aten.max, type_promotion_kind=None) + def reduce_max(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amax(x, axis=dim, keepdims=keepdim), + reduce_argmax(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amax(x, axis=None, keepdims=keepdim) + + + @register_lowering(aten.min, type_promotion_kind=None) + def reduce_min(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amin(x, axis=dim, keepdims=keepdim), + reduce_argmin(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amin(x, axis=None, keepdims=keepdim) + + @register_lowering(aten.mean) def mean(x, axis=None, keepdim=False, *, dtype=None): if dtype is not None: diff --git a/torch_npu/_inductor/npu_triton_heuristics.py b/torch_npu/_inductor/npu_triton_heuristics.py index 6a8dd8e0f1..43acb8dd8c 100644 --- a/torch_npu/_inductor/npu_triton_heuristics.py +++ b/torch_npu/_inductor/npu_triton_heuristics.py @@ -25,6 +25,7 @@ from torch._inductor.runtime.triton_heuristics import ( get_first_attr, collected_calls, _dump_launch_params, + builtins ) from torch._inductor.runtime.benchmarking import benchmarker from torch._inductor.runtime.autotune_cache import AutotuneCache @@ -109,6 +110,7 @@ class NPUCachingAutotuner(CachingAutotuner): raise RuntimeError("No triton configs are available") for c in self.configs: try: + print(f"start compile kernel {self.inductor_meta['kernel_name']} config:{c.kwargs}", flush=True) compiled_binary, launcher = self._precompile_config( c, warm_cache_only ) @@ -228,7 +230,7 @@ class NPUCachingAutotuner(CachingAutotuner): self.fn.src, compile_meta, ) - return None, None + raise call_args = [ arg @@ -486,11 +488,28 @@ class NPUCachingAutotuner(CachingAutotuner): if with_profiler: from torch._inductor.utils import do_bench_using_profiling + ret = do_bench_using_profiling(kernel_call, warmup=10, rep=1) + - return do_bench_using_profiling(kernel_call, warmup=10, rep=40) + print(f"start bench for kernel {self.inductor_meta['kernel_name']} config:{launcher.config}", flush=True) # remove fast_flush=True for high version triton - return benchmarker.benchmark_gpu(kernel_call, rep=40) + ret = benchmarker.benchmark_gpu(kernel_call, rep=1) + print(f"do bench ret = {ret} ",flush=True) + return ret + def autotune_to_one_config(self, *args, **kwargs): + """Do the actual autotuning""" + start_time = time.time_ns() + timings = self.benchmark_all_configs(*args, **kwargs) + benchmark_time_taken_ns = time.time_ns() - start_time + self.launchers = [builtins.min(timings, key=timings.get)] + self.autotune_time_taken_ns = ( + self.precompile_time_taken_ns + benchmark_time_taken_ns + ) + if self.save_cache_hook: + self.save_cache_hook(self.launchers[0].config, self.autotune_time_taken_ns) + print(f"saved best_config:{self.launchers[0].config.kwargs}", flush=True) + def get_fx_graph_call(self, auto_fallback=False): kernel_name = self.inductor_meta.get("kernel_name", "triton_") @@ -659,7 +678,7 @@ class NPUCachingAutotuner(CachingAutotuner): stream=stream, ) - + class NPUDebugAutotuner(NPUCachingAutotuner): def __init__(self, *args, regex_filter="", **kwargs): self.regex_filter = regex_filter @@ -729,7 +748,7 @@ def 
cached_autotune( if autotune_cache: if best_config := autotune_cache.read_best(inductor_meta, configs): configs = [best_config] - + print(f"loaded best_config: {best_config.kwargs}", flush=True) else: if disabled: log.debug("autotune caching is disabled by config.force_disable_caches") @@ -745,18 +764,6 @@ def cached_autotune( reset_to_zero_arg_names.extend(triton_meta.pop("reset_to_zero")) def decorator(fn): - # Remove XBLOCK from config if it's not a function argument. - # This way, coordinate descent tuning will not try to tune it. - # - # Context: When TritonKernel.no_x_dim is True, we hardcode XBLOCK to 1. - import inspect - - if "XBLOCK" not in inspect.signature(fn.fn).parameters: - for tconfig in configs: - if "XBLOCK" in tconfig.kwargs: - if tconfig.kwargs["XBLOCK"] != 1: - raise ValueError("tconfig.kwargs[XBLOCK] != 1") - tconfig.kwargs.pop("XBLOCK") if inductor_meta.get("profile_bandwidth"): return NPUDebugAutotuner( @@ -804,21 +811,20 @@ def cached_autotune( def grid(*numels): def grid_fn(meta): - split_axis_order = meta["split_axis_order"] - - if split_axis_order is not None and split_axis_order < len(numels): - numel = numels[split_axis_order] if split_axis_order is not None else 1 - xblock = meta["XBLOCK"] - NBLOCKS, _ = SplitTiling.get_nblocks_before_launch(numel, xblock) - else: - NBLOCKS = 1 - - log.debug("launch grid(%s), NBLOCKS:%d, meta:%s", numels, NBLOCKS, meta) - return ( - NBLOCKS, - 1, - 1, - ) + split_axis = meta["split_axis"] + split_blocks = meta["split_blocks"] + programs = [ ] + for i, order in enumerate(split_axis) : + if not numels : + continue + numel = numels[order] + block = split_blocks[i] + programs.append((numel + block -1) // block) + + for _ in range(3 - len(programs)) : + programs.append(1) + #log.debug("launch grid(numels:%s), programs:%s, meta:%s", numels, programs, meta) + return tuple(programs) return grid_fn @@ -836,87 +842,38 @@ def triton_config_npu_index( num_stages = 1 configs = [] log.info("[InductorNPU] processing kernel %s", inductor_meta['kernel_name']) - split_axis_order = inductor_meta["split_axis_order"] - axis1_order = inductor_meta["axis1_order"] - axis2_order = inductor_meta["axis2_order"] + split_axis = inductor_meta["split_axis"] + tiling_axis = inductor_meta["tiling_axis"] low_dims = inductor_meta["low_dims"] split_axis_dtype = inductor_meta["split_axis_dtype"] - split_numel = size_hints[split_axis_order] if split_axis_order is not None else 1 - is_low_dim = True if split_axis_order is not None and split_axis_order in low_dims else False + axis_names = inductor_meta["axis_names"] + dual_reduction = inductor_meta["dual_reduction"] - min_aligned_numel = get_aligned_numel(split_axis_dtype) + tile_generator = TileGenerator(size_hints, axis_names, tiling_axis, split_axis, low_dims, + persistent_reduction = persistent_reduction, configs=configs, + dtype = split_axis_dtype, dual_reduction=dual_reduction ) - grid_list = [] - if (aggresive_autotune): - grid_list = SplitTiling.get_nblocks_xblock_list(split_numel) - else: - nblocks, split = SplitTiling.decide_nblocks_xblock(split_numel, axis2_order is None, min_aligned_numel) - grid_list.append((nblocks, split)) - - for nblocks, split in grid_list: - log.debug("generating tiling : size_hints:%s split_axis_order:%s, axis1_order:%s, axis2_order:%s, " - "low_dims:%s nblocks %s, split:%s persistent_reduction:%s split_axis_dtype:%s", size_hints, - split_axis_order, axis1_order, axis2_order, low_dims, nblocks, split, - persistent_reduction, split_axis_dtype) - # xblock is a range, don't 
auto_tune - xnumel = split if split_axis_order == axis1_order else size_hints[axis1_order] - rblock = 1 - if axis2_order is not None: - rblock = split if split_axis_order == axis2_order else size_hints[axis2_order] - - xblock_sub = xnumel - cfg = {"NBLOCKS": nblocks, "XBLOCK": split, "XBLOCK_SUB": xblock_sub} - # forward to grid() - cfg["split_axis_order"] = split_axis_order - cfg["axis2_order"] = axis2_order if not(axis2_order is None) else -1 - cfg["is_low_dim"] = is_low_dim - cfg["min_aligned_numel"] = min_aligned_numel - is_1d_reduction = reduction and axis2_order is None - if persistent_reduction: - numof_reduction_axis = inductor_meta["numof_reduction_axis"] - if numof_reduction_axis > 1: - del cfg["XBLOCK_SUB"] - configs.append(Config(cfg, num_warps=1, num_stages=1)) - elif axis2_order is None: - del cfg["XBLOCK"] - del cfg["XBLOCK_SUB"] - cfg["NBLOCKS"] = 1 - configs.append(Config(cfg, num_warps=1, num_stages=1)) - else: - TileGenerator.descend_xblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) - elif is_1d_reduction: - cfg["NBLOCKS"] = 1 - cfg["XBLOCK"] = split_numel - cfg["XBLOCK_SUB"] = split_numel - TileGenerator.descend_xblock(rnumel=rblock, xblock=split_numel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) - # both of the two axis are low dims - elif axis1_order in low_dims and axis2_order in low_dims: - cfg["RBLOCK"] = rblock - TileGenerator.descend_xblock_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) - elif axis2_order is None and axis1_order is not None: - TileGenerator.descend_xblock(rnumel=0, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) - # need to maximize xblock_sub - elif axis1_order in low_dims: - cfg["RBLOCK"] = rblock - TileGenerator.descend_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) - elif axis2_order in low_dims: - cfg["RBLOCK"] = rblock - TileGenerator.descend_xblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel) - elif len(low_dims) == 0: - cfg["RBLOCK"] = rblock - if (axis1_order is not None) and (axis2_order is not None): - TileGenerator.descend_xblock_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel, aggresive=False) - elif axis1_order is not None: - TileGenerator.descend_xblock(rnumel=0, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel, aggresive=False) - else: - TileGenerator.descend_rblock(rnumel=rblock, xblock=xnumel, configs=configs, cfg=cfg, align_numel=min_aligned_numel, aggresive=False) - else: - cfg["RBLOCK"] = rblock - tmp = Config(cfg, num_warps=num_warps, num_stages=num_stages) - configs.append(tmp) + tile_generator.descend_split_tiling() + + if not configs : + cfg = {} + for x in split_axis : + cfg[f"{axis_names[x].upper()}BLOCK"] = size_hints[x] + if not cfg : + cfg["dummy"] = 1 + tmp = Config(cfg, num_warps=num_warps, num_stages=num_stages) + configs.append(tmp) + + for cfg in configs : + split_blocks = [None for x in split_axis] + for i,axis in enumerate(split_axis) : + name = axis_names[axis] + block_name = f"{name.upper()}BLOCK" + split_blocks[i] = cfg.kwargs[block_name] + cfg.kwargs["split_axis"] = tuple(split_axis) + cfg.kwargs["split_blocks"] = tuple(split_blocks) + #log.info("generated tiling configs %s", cfg.kwargs) - for cfg in configs: - log.debug("generated tiling configs %s", cfg.kwargs) return configs diff --git a/torch_npu/_inductor/runtime.py 
b/torch_npu/_inductor/runtime.py index ae00e99043..296fff4de1 100644 --- a/torch_npu/_inductor/runtime.py +++ b/torch_npu/_inductor/runtime.py @@ -3,6 +3,34 @@ import functools from torch._inductor.runtime.hints import DeviceProperties from .config import num_vector_core +from typing import List , Dict +from torch.utils._triton import has_triton, has_triton_package +from torch._inductor.remote_cache import JsonDataTy + + +if has_triton_package(): + from triton import Config + +# overload this to avoid autotune after best_config already generated +def _load_cached_autotuning( + best_config: Dict[str, JsonDataTy], + configs_hash: str, + configs: List[Config], + inductor_meta: Dict, +) -> Optional[Config]: + if best_config is None: + return None + if best_config.pop("configs_hash", None) != configs_hash: + return None + # Remove time taken for comparison + best_config.pop("time_taken_ms", None) + + #if inductor_meta.get("coordinate_descent_tuning") : + num_warps = best_config.pop("num_warps") + num_stages = best_config.pop("num_stages") + triton_config = Config(best_config, num_warps=num_warps, num_stages=num_stages) + triton_config.found_by_coordesc = True + return triton_config class NPUDeviceProperties(DeviceProperties): -- Gitee From 4fedea198cb24920cbf8dcee8cec5f0b8f23a764 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Tue, 29 Apr 2025 16:11:13 +0000 Subject: [PATCH 344/358] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torch_npu/_inductor/patch/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 torch_npu/_inductor/patch/.keep diff --git a/torch_npu/_inductor/patch/.keep b/torch_npu/_inductor/patch/.keep new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From 63c263ccca73d240899793dcfb214f54e7277d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Tue, 29 Apr 2025 16:12:01 +0000 Subject: [PATCH 345/358] temporarily save patch features of AOTI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- torch_npu/_inductor/patch/_runner.cpp | 130 ++ torch_npu/_inductor/patch/aoti_torch_npu.cpp | 127 ++ .../_inductor/patch/ascend_aot_package.py | 533 ++++++++ torch_npu/_inductor/patch/c_shim_npu.cpp | 69 + .../patch/model_container_runner_npu.cpp | 56 + torch_npu/_inductor/patch/torch_changes.patch | 1138 +++++++++++++++++ .../_inductor/patch/torch_npu_changes.patch | 14 + 7 files changed, 2067 insertions(+) create mode 100644 torch_npu/_inductor/patch/_runner.cpp create mode 100644 torch_npu/_inductor/patch/aoti_torch_npu.cpp create mode 100644 torch_npu/_inductor/patch/ascend_aot_package.py create mode 100644 torch_npu/_inductor/patch/c_shim_npu.cpp create mode 100644 torch_npu/_inductor/patch/model_container_runner_npu.cpp create mode 100644 torch_npu/_inductor/patch/torch_changes.patch create mode 100644 torch_npu/_inductor/patch/torch_npu_changes.patch diff --git a/torch_npu/_inductor/patch/_runner.cpp b/torch_npu/_inductor/patch/_runner.cpp new file mode 100644 index 0000000000..adab65e162 --- /dev/null +++ b/torch_npu/_inductor/patch/_runner.cpp @@ -0,0 +1,130 @@ +#include +#include + +#include "torch/script.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +void removeWhitespace(std::string& str) { + std::string result; + for (char c : str) { + if (!std::isspace(c)) { + result += c; + } + } + str = result; +} + +int 
extractValue(const std::string& json_str, const std::string& key) { + std::string target_key = "\"" + key + "\":"; // "input_ids": + size_t pos = json_str.find(target_key); + + if (pos == std::string::npos) { + std::cerr << "Key '" << target_key << "' not found!" << std::endl; + return -1; + } + + pos += target_key.length(); + size_t end = json_str.find_first_not_of("0123456789", pos); + return std::stoi(json_str.substr(pos, end - pos)); +} + +std::string parseArgMapJson(const std::string &argMapPath){ + std::ifstream jsonfile(argMapPath); + if (!jsonfile.is_open()) { + std::cerr << "Failed to open file!" << std::endl; + return nullptr; + } + std::string json_str{ + std::istreambuf_iterator(jsonfile), + std::istreambuf_iterator() + }; + removeWhitespace(json_str); + return json_str; +} + +void loadDebertaWeights(std::vector &inputs, std::string weightArgPath, int num){ + if(inputs.size()==num)return; + inputs.reserve(num); + torch::jit::script::Module weightTensors = torch::jit::load(weightArgPath); + for(int i=0;i getDebertaFilepathFromBatch(int batchSize){ + std::map ret; + // "/host/zcl/deberta_aoti/deberta_new.pt2", + // "/host/zcl/weights/args_aoti.pt", + // "/host/zcl/weights/input_args_map_aoti.json" + std::string batchString = std::to_string(batchSize); + std::string basePath = "/host/deberta_files/batch_" + batchString; + + ret["pt2Path"] = basePath + "/deberta_" + batchString + ".pt2"; + ret["weightArgPath"] = basePath + "/data/aotinductor/model/weight_args_" + batchString + ".pt"; + ret["argMapPath"] = basePath + "/data/aotinductor/model/args_map_" + batchString + ".json"; + return ret; +} + +std::vector runDebertaModelInference( + const std::map &userInputs, const int batchSize){ + + const auto paths = getDebertaFilepathFromBatch(batchSize); + const std::string pt2Path = paths.at("pt2Path"); + const std::string weightArgPath = paths.at("weightArgPath"); + const std::string argMapPath = paths.at("argMapPath"); + + std::string json_str = parseArgMapJson(argMapPath); + + std::vector inputs; + loadDebertaWeights(inputs, weightArgPath, extractValue(json_str, "input_arg_length")); + + inputs[extractValue(json_str, "input_ids")] = userInputs.at("input_ids"); + inputs[extractValue(json_str, "segment_ids")] = userInputs.at("segment_ids"); + inputs[extractValue(json_str, "input_mask")] = userInputs.at("input_mask"); + + torch::inductor::AOTIModelPackageLoader loader(pt2Path); + torch::inductor::AOTIModelContainerRunner* runner = loader.get_runner(); + std::vector outputs = runner->run(inputs); + + return outputs; +} + +int main() { + c10::InferenceMode mode; + torch::inductor::RegistNpu(); + + // Status QSEngineInterfaceInherit::Infer(QSIOTensor& in, QSIOTensor& out, void* stream) + //QSIOTensor -> userInputs + torch::jit::script::Module tensors = torch::jit::load("/host/zcl/deberta_aoti/deberta_inputs.pth"); + + // tensors.to(at::kPrivateUse1); + torch::Tensor input_ids = tensors.attr("input_ids").toTensor().to(at::kPrivateUse1); + torch::Tensor segment_ids = tensors.attr("segment_ids").toTensor().to(at::kPrivateUse1); + torch::Tensor input_mask = tensors.attr("input_mask").toTensor().to(at::kPrivateUse1); + int batchSize = input_ids.size(0) - 1; + + std::map userInputs={ + {"input_ids", input_ids}, + {"segment_ids", segment_ids}, + {"input_mask", input_mask} + }; + + std::vector outputs = runDebertaModelInference(userInputs, batchSize); + + for(auto &out: outputs)out = out.to(torch::kCPU); + torch_npu::finalize_npu(); + + std::cout << outputs[0] << std::endl; + std::cout << outputs[1] 
<< std::endl; + return 0; +} \ No newline at end of file diff --git a/torch_npu/_inductor/patch/aoti_torch_npu.cpp b/torch_npu/_inductor/patch/aoti_torch_npu.cpp new file mode 100644 index 0000000000..6bf0249cbc --- /dev/null +++ b/torch_npu/_inductor/patch/aoti_torch_npu.cpp @@ -0,0 +1,127 @@ +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace { +enum class DeviceType : int8_t { + CPU = 0, + CUDA = 1, // CUDA. + MKLDNN = 2, // Reserved for explicit MKLDNN + OPENGL = 3, // OpenGL + OPENCL = 4, // OpenCL + IDEEP = 5, // IDEEP. + HIP = 6, // AMD HIP + FPGA = 7, // FPGA + MAIA = 8, // ONNX Runtime / Microsoft + XLA = 9, // XLA / TPU + Vulkan = 10, // Vulkan + Metal = 11, // Metal + XPU = 12, // XPU + MPS = 13, // MPS + Meta = 14, // Meta (tensors with no data) + HPU = 15, // HPU / HABANA + VE = 16, // SX-Aurora / NEC + Lazy = 17, // Lazy Tensors + IPU = 18, // Graphcore IPU + MTIA = 19, // Meta training and inference devices + PrivateUse1 = 20, // PrivateUse1 device + // NB: If you add more devices: + // - Change the implementations of DeviceTypeName and isValidDeviceType + // in DeviceType.cpp + // - Change the number below + COMPILE_TIME_MAX_DEVICE_TYPES = 21, + }; +} + +#ifdef __cplusplus +extern "C" { +#endif +int32_t aoti_torch_device_type_npu() { + return (int32_t)DeviceType::PrivateUse1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +namespace { + static c10::Device c10_device(int32_t device_type, int32_t device_index) { + if (device_type == aoti_torch_device_type_cpu()) { + return c10::Device(static_cast(device_type)); + } else { + return c10::Device( + static_cast(device_type), + static_cast(device_index)); + } + } +} // namespace + +AOTITorchError aoti_torch_create_tensor_from_blob_npu( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + AtenTensorHandle* ret_new_tensor) { +AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + c10::IntArrayRef sizes(sizes_ptr, ndim); + c10::IntArrayRef strides(strides_ptr, ndim); + c10::Device device = c10_device(device_type, device_index); + c10::TensorOptions options = c10::TensorOptions().device(device).dtype( + static_cast(dtype)); + *ret_new_tensor = torch::aot_inductor::new_tensor_handle( + // data == nullptr can happen for a 0-size tensor + (data != nullptr) ? at_npu::native::from_blob(data, sizes, strides, storage_offset, options, device) + : at::empty_strided(sizes, strides, options)); + // (data != nullptr) ? 
c10_npu::native::for_blob(data, sizes) + // .strides(strides) + // .storage_offset(storage_offset) + // .options(options) + // .make_tensor() + // : at::empty_strided(sizes, strides, options)); +}); +} + +AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + AtenTensorHandle* ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size) { +AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + if (layout == static_cast(at::kMkldnn)) { + throw std::runtime_error("do not support mkldnn on npu."); + } else { + aoti_torch_create_tensor_from_blob_npu( + data, + ndim, + sizes_ptr, + strides_ptr, + storage_offset, + dtype, + device_type, + device_index, + ret_new_tensor); + } +}); +} + + diff --git a/torch_npu/_inductor/patch/ascend_aot_package.py b/torch_npu/_inductor/patch/ascend_aot_package.py new file mode 100644 index 0000000000..d55d698388 --- /dev/null +++ b/torch_npu/_inductor/patch/ascend_aot_package.py @@ -0,0 +1,533 @@ +import torch +import torch_npu + +import os +import re +import sys +import inductor_npu +from torch_npu.contrib import transfer_to_npu +import torch._inductor.package as pkg + +from typing import Dict, Any + +import importlib +import json +import shutil +import shlex +import subprocess + +from abc import ABC, abstractmethod + +def modify_class_name(module_code: str) -> str: + """ replace '' with 'testModule' """ + modified_code = re.sub( + r'class \(torch\.nn\.Module\):', + "class testModule(torch.nn.Module):", + module_code, + count=1 + ) + header = """ +import torch +from torch import device +import torch_npu +import xpu_graph + +from xpu_graph.passes.patterns.targets.npu.triton_kernel.fused_brc_permute_sum import fused_brc_permute_sum +from xpu_graph.passes.patterns.targets.npu.triton_kernel.fused_div_mul_sum import fused_div_mul_sum + +import os +import inductor_npu +from torch_npu.contrib import transfer_to_npu\n\n +""" + return header + modified_code + +# analysis forward func string and generate input tensors +def generate_inputs(code: str) -> Dict[str, torch.Tensor]: + # arg0_1: "i64[11, 12, 256, 256]" + pattern = r"(arg\d+_\d+): \"([if]\d+)\s*\[(.*?)\]\"" + + # 使用正则表达式查找所有匹配项 + matches = re.findall(pattern, code) + + from torch._dynamo.testing import rand_strided + # 解析结果 + fake_params = {} + dtype_map = { + "i64": torch.int64, + "i32": torch.int32, + "i16": torch.int16, + "i8": torch.int8, + "i1": torch.bool, + "f16": torch.float16, + "f32": torch.float32, + "bf16": torch.bfloat16, + } + for match in matches: + param_name = match[0] + dtype = match[1] + shape = tuple(int(dim) for dim in match[2].split(',')) + + fake_params[param_name] = torch.zeros(shape,dtype=dtype_map[dtype],device="npu") + + return fake_params + +def import_from_path(input_path): + module_name = os.path.basename(input_path).replace('.py', '') + spec = importlib.util.spec_from_file_location(module_name, input_path) + if not spec: + raise ImportError(f"can not create package: {input_path}") + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + +def process_and_run_model(input_path: str, do_aoti = False): + # 1. read and replace class + with open(input_path) as f: + code = f.read() + modified_code = modify_class_name(code) + + # 2. 
generate inputs + forward_str = re.search(r"def forward\(.*?\):\n", modified_code).group() + fake_inputs = generate_inputs(forward_str) + + # 3. declare module + # module_dict = {} + output_path = os.path.join(os.path.dirname(input_path),"decorated_graph.py") + with open(output_path, "w") as f: + f.write(modified_code) + + module = import_from_path(output_path) + + # 4. create a module object + model = module.testModule().to("npu") + + # 5. run module + with torch.no_grad(): + if do_aoti: + exported = torch.export.export(model, tuple(fake_inputs.values())) + output_path = torch._inductor.aoti_compile_and_package( + exported, + # [Optional] Specify the generated shared library path. If not specified, + # the generated artifact is stored in your system temp directory. + package_path=os.path.join(os.path.dirname(input_path),"origin.pt2"), + ) + else: + print(model(*(fake_inputs.values()))) + + return output_path + +class OpCodeGenerator(ABC): + @abstractmethod + def generate(self, num, opname, arglist, outbuf): + pass + +class LibraryOpGenerator(OpCodeGenerator): + def generate(self, node_id, opname, arglist, outbuf): + NewLines = [] + # 生成输入Tensor变量声明 + new_input_argnames = [] + for i, arg in enumerate(arglist): + if isinstance(arg, str): + new_input = f"node{node_id}_{opname}_{i}" + new_input_argnames.append(new_input) + NewLines.append(f" at::Tensor {new_input} = *reinterpret_cast({arg}.get());") + elif isinstance(arg, int): + new_input_argnames.append(str(arg)) + else : + raise TypeError(f"can not generate unsupport argtype: {type(arg).__name__}") + + # 生成函数调用 + func_arg_str = ", ".join(new_input_argnames) + + output_argname = f"{outbuf}_tensor" + NewLines.append(f" auto {output_argname} = at::{opname}({func_arg_str});") + # 生成输出句柄 + NewLines.append(f" RAIIAtenTensorHandle {outbuf}(reinterpret_cast(new at::Tensor({output_argname})));") + + return NewLines + +class CustomOpGenerator(OpCodeGenerator): + def __init__(self, kernel_path, kernel_name, scalar_params=None, grid_size=(44, 1, 1)): + self.kernel_path = kernel_path + self.kernel_name = kernel_name + self.scalar_params = scalar_params or [] + self.grid_size = grid_size + + def generate(self, num, opname, arglist, outbuf): + code = [] + kernel_var = f"kernels.{self.kernel_name}" + + # Kernel加载逻辑 + code.append(f"if ({kernel_var} == nullptr) {{") + code.append(f" {kernel_var} = loadKernel(\"{self.kernel_path}\", \"{self.kernel_name}\", 1);") + code.append("}") + + # 网格配置 + code.append(f"Grid {self.kernel_name}_grid = Grid({self.grid_size[0]}, {self.grid_size[1]}, {self.grid_size[2]});") + + # 生成Tensor指针变量 + for i, arg in enumerate(arglist): + code.append(f"void* var_{i} = reinterpret_cast({arg}.data_ptr());") + + # 生成标量参数 + for name, value in self.scalar_params: + code.append(f"int {name} = {value};") + + # FFT地址获取 + code.append("rtError_t ret;") + code.append("void* ffts_addr = nullptr;") + code.append("uint32_t ffts_len;") + code.append("ret = rtGetC2cCtrlAddr((uint64_t*)&ffts_addr, &ffts_len);") + code.append("if (ret != RT_ERROR_NONE) return;") + code.append("void* workspace_addr = nullptr;") + + # 生成参数结构体 + code.append("struct __attribute__((packed)) {") + code.append(" void* ffts_addr __attribute__((aligned(8)));") + code.append(" void* workspace_addr __attribute__((aligned(8)));") + for i in range(len(arglist)): + code.append(f" void* var_{i} __attribute__((aligned(8)));") + for name, _ in self.scalar_params: + code.append(f" int {name} __attribute__((aligned(4)));") + code.append(" int32_t gridX __attribute__((aligned(4)));") + 
code.append(" int32_t gridY __attribute__((aligned(4)));") + code.append(" int32_t gridZ __attribute__((aligned(4)));") + code.append("} kernel_args = {") + code.append(" ffts_addr, workspace_addr,") + code.append(" " + ", ".join([f"var_{i}" for i in range(len(arglist))]) + ",") + code.append(" " + ", ".join([name for name, _ in self.scalar_params]) + ",") + code.append(f" {self.kernel_name}_grid.grid_x, {self.kernel_name}_grid.grid_y, {self.kernel_name}_grid.grid_z") + code.append("};") + + # 内核启动 + code.append(f"if ({self.kernel_name}_grid.is_non_zero()) {{") + code.append(f' launchKernel("{self.kernel_name}", {kernel_var}, stream, ' + f'{self.kernel_name}_grid.grid_x, {self.kernel_name}_grid.grid_y, ' + f'{self.kernel_name}_grid.grid_z, &kernel_args, sizeof(kernel_args));') + code.append("}") + return code + + +def MakePath(directory, name): + return os.path.abspath(os.path.join(directory, name)) + +class FallbackData: + def __init__(self, json_path: str): + with open(json_path, 'r') as f: + self.nodes = json.load(f)['nodes'] + + def get_node_info(self, node_id: int): + if node_id < 0 or node_id >= len(self.nodes): + raise ValueError(f"Invalid node_id: {node_id}. Total nodes: {len(self.nodes)}") + + # 提取目标节点 + node = self.nodes[node_id]['node'] + + # 遍历所有输入参数 + arg_types = [] + for input_item in node['inputs']: + arg = input_item['arg'] + arg_type = next(iter(arg.keys())) # as_tensor | as_float | as_int + arg_types.append(arg_type) + + return node['target'], arg_types + +class CodeManager: + OP_REGISTRY = { + "aten::addmm": {"revertlines": 2, "generator": LibraryOpGenerator()}, + "aten::gather": {"revertlines": 2, "generator": LibraryOpGenerator()}, + "torch_npu_triton::fused_div_mul_sum": { + "revertlines": 2, + "generator": CustomOpGenerator( + kernel_path="/path/to/custom_op.cubin", + kernel_name="custom_op_name", + grid_size=(), + ) + }, + } + + def __init__(self, directory, cpp_name, json_name): + self.code_list = [] + self.cpp_path = MakePath(directory, cpp_name) + self.proxy_data = FallbackData(MakePath(directory, json_name)) + + def clear(self): + self.code_list.clear() + + def pop_lines(self, num): + for _ in range(num): + self.code_list.pop() + + def append_lines(self, newlines): + self.code_list.extend(newlines) + + def save_new_file(self, new_file_path): + with open(new_file_path, "w", encoding="utf-8") as f: + for line in self.code_list: + f.write(line + "\n") + + def extract_proxy_executor_line(self, line: str, argtypes: list[str]): + # 匹配 int64_t vector + int64_pattern = r'std::vector\s*{\s*([^}]+?)\s*}\s*\.data\(\)' + int64_match = re.search(int64_pattern, line) + + # 匹配 AtenTensorHandle vector + aten_pattern = r'std::vector\s*{\s*([^}]+?)\s*}\s*\.data\(\)' + aten_match = re.search(aten_pattern, line) + + # 提取 int64_t 元素 + int64_list = [] + if int64_match: + int64_content = int64_match.group(1) + int64_list = [e.strip() for e in int64_content.split(',')] + + # 提取 AtenTensorHandle 元素 + aten_list = [] + if aten_match: + aten_content = aten_match.group(1) + aten_list = [e.strip() for e in aten_content.split(',')] + + arglist = [] + iptr = 0 + aptr = 0 + for argtype in argtypes: + if argtype == "as_tensor": + arglist.append(aten_list[aptr]) + aptr+=1 + elif argtype == "as_int": + arglist.append(int(int64_list[iptr])) + iptr+=1 + else: + raise ValueError(f"meeting unsupported argtype:{argtype}") + if aptr != len(aten_list) - 1: + raise ValueError(f"mismatched argtype length and arglist length!") + + return arglist, aten_list[aptr] + + def process_cpp_file(self): + 
fallbackOpPrefixPattern = re.compile( + r'^\s*aoti_torch_proxy_executor_call_function\(\s*proxy_executor\s*,\s*(\d+),' + ) + + compileCmdPattern = re.compile( + r'^//\sg\+\+\s+\S+\.cpp' + ) + + linkCmdPattern = re.compile( + r'^//\sg\+\+\s+\S+\.o' + ) + + loadKernelPattern = re.compile( + r'loadKernel\(\"(.*/)([^/]+\.cubin)\"' + ) + + with open(self.cpp_path, 'r', encoding='utf-8') as f: + for line in f: + # 保留原始行 + self.code_list.append(line.rstrip('\n')) + + if compileCmdPattern.search(line): + originCmd = line.replace("// ", "", 1) + continue + + if linkCmdPattern.search(line): + linkCmd = line.replace("// ", "", 1) + continue + + # if loadKernelPattern.search(line): + # self.pop_lines(1) + # modified_line = re.sub(loadKernelPattern, r'loadKernel("\2"', line) + # self.code_list.append(modified_line) + # continue + + # 检查是否匹配代理函数调用 + match = fallbackOpPrefixPattern.search(line) + if not match: + continue + + # 提取 node_id 并获取算子信息 + node_id = int(match.group(1)) + target, argtypes = self.proxy_data.get_node_info(node_id) + + # 检查算子是否已注册 + if target not in CodeManager.OP_REGISTRY: + continue + + # 提取参数和输出缓冲区 + arglist, outbuf = self.extract_proxy_executor_line(line, argtypes) + + revertlines = CodeManager.OP_REGISTRY[target]["revertlines"] + generator = CodeManager.OP_REGISTRY[target]["generator"] + self.pop_lines(revertlines) + + new_lines = generator.generate(node_id, target.split("::")[-1], arglist, outbuf) + self.append_lines(new_lines) + + return originCmd, linkCmd + + +class AOTIPkgManager: + def __init__(self,pt2_path, weight_path, new_name_prefix): + self.binfiles = [] # .cubin + self.wrapper_name = None # xxx.cpp + self.proxy_json_name = None # xxx.json + self.metadata_json_name = None # xxx_metadata.json + self.weight_name = None # .o + self.weight_path = weight_path + self.shared_library_name = None # .so + self.extract_dir = self.extract_pt2(pt2_path) + self.new_name_prefix = new_name_prefix + self.classify_files(self.extract_dir) + self.code_manager = CodeManager( + self.extract_dir, + self.wrapper_name, + self.proxy_json_name + ) + + def classify_files(self, directory): + from pathlib import Path + + path = Path(directory) + + for file in path.glob("*"): + if file.suffix == ".cubin": + self.binfiles.append(file.name) + elif file.suffix == ".cpp": + self.wrapper_name = file.name + elif file.suffix == ".json": + if file.stem.endswith("_metadata"): + self.metadata_json_name = file.name + else: + self.proxy_json_name = file.name + elif file.suffix == ".o": + self.weight_name = file.name + elif file.suffix == ".so": + self.shared_library_name = file.name + print(f"[INFO] binfiles: cnt={len(self.binfiles)}, {self.binfiles}") + print(f"[INFO] wrapper_name = {self.wrapper_name}") + print(f"[INFO] metadata_json_name = {self.metadata_json_name}") + print(f"[INFO] proxy_json_name = {self.proxy_json_name}") + print(f"[INFO] weight_name = {self.weight_name}") + print(f"[INFO] shared_library_name = {self.shared_library_name}") + + + def extract_pt2(self, pt2_path: str) -> None: + """ + unzip /*.pt2 to /pt2tmp + and return /pt2tmp/data/aotinductor/model + """ + + self.pt2_dir = os.path.dirname(pt2_path) + extract_dir = os.path.join(self.pt2_dir, "pt2tmp") + extract_dir = os.path.abspath(extract_dir) + + if os.path.exists(extract_dir): + def handle_error(func, path, exc_info): + import stat + if not os.access(path, os.W_OK): + os.chmod(path, stat.S_IWUSR) + func(path) + else: + raise + shutil.rmtree(extract_dir, onerror=handle_error) + os.makedirs(extract_dir, exist_ok=True) + + import 
zipfile + with zipfile.ZipFile(pt2_path, 'r') as zip_ref: + zip_ref.extractall(extract_dir) + + return os.path.join(extract_dir,"data/aotinductor/model") + + def rewrite_cpp_wrapper(self): + self.new_cpp_path = MakePath(self.extract_dir, self.new_name_prefix+".cpp") + # self.weight_path = MakePath(self.extract_dir, self.weight_name) + self.new_so_path = MakePath(self.extract_dir, self.new_name_prefix+".so") + + old_compile_cmd, old_link_cmd = self.code_manager.process_cpp_file() + self.code_manager.save_new_file(self.new_cpp_path) + + compile_list = shlex.split(old_compile_cmd) + link_list = shlex.split(old_link_cmd) + + compile_list[1] = self.new_cpp_path + tmp_path = MakePath(self.extract_dir, "tmp.o") + compile_list[-1] = tmp_path + + link_list[1] = tmp_path + if link_list[2].endswith(".o"): + link_list[2] = self.weight_path + link_list[-1] = self.new_so_path + + print("[INFO] after rewrite_cpp_wrapper:") + print(f"[INFO] new_cpp_path = {self.new_cpp_path}") + print(f"[INFO] compile_list = {compile_list}") + print(f"[INFO] link_list = {link_list}") + + return compile_list, link_list + + def recompile(self, compile_cmd, link_cmd): + try: + subprocess.run(compile_cmd, check=True) + except Exception as e: + raise e + + try: + subprocess.run(link_cmd, check=True) + except Exception as e: + raise e + + def repackage(self, input_arg_map_path, args_aoti_path): + new_proxy_json_path = MakePath(self.extract_dir, self.new_name_prefix + ".json") + new_metadata_json_path = MakePath(self.extract_dir, self.new_name_prefix + "_metadata.json") + + shutil.copy(MakePath(self.extract_dir,self.proxy_json_name), new_proxy_json_path) + shutil.copy(MakePath(self.extract_dir, self.metadata_json_name),new_metadata_json_path) + + file_list = [ + self.new_cpp_path, + self.new_so_path, + new_proxy_json_path, + new_metadata_json_path, + ] + + if len(input_arg_map_path)>3: + file_list.append(input_arg_map_path) + + if len(args_aoti_path)>3: + file_list.append(args_aoti_path) + + for filename in self.binfiles: + file_list.append(MakePath(self.extract_dir, filename)) + + new_pkg_path = MakePath(self.pt2_dir, self.new_name_prefix + ".pt2") + pkg.package_aoti(new_pkg_path, file_list) + print(f"[INFO] OUTPUT NEW AOTI PACKAGE TO: {new_pkg_path}") + + def make_new_pt2(self, input_arg_map_path="", args_aoti_path=""): + compile_cmd, link_cmd = self.rewrite_cpp_wrapper() + self.recompile(compile_cmd, link_cmd) + return self.repackage(input_arg_map_path, args_aoti_path) + + +if __name__ == "__main__": + batch_size_list = [1, 2, 4, 8, 12, 16, 20, 24, 28, 32] + for batch_size in batch_size_list: + decorated_path = process_and_run_model( + f"/host/zcl/deberta_pkgs/fx_graph_readable_{batch_size}.py", + do_aoti=True + ) + print(f"finished run outputcode, dump into {decorated_path}") + + aoti_manager = AOTIPkgManager( + pt2_path = decorated_path, + weight_path = "/host/aoti_weights/weight.o", + new_name_prefix = f"deberta_{batch_size}" + ) + + aoti_manager.make_new_pt2( + f"/host/aoti_weights/weight_args_{batch_size}.pt", + f"/host/aoti_weights/args_map_{batch_size}.json" + ) + + print("[INFO] ---------- DONE BATCH {batch_size} ----------") \ No newline at end of file diff --git a/torch_npu/_inductor/patch/c_shim_npu.cpp b/torch_npu/_inductor/patch/c_shim_npu.cpp new file mode 100644 index 0000000000..7db50caba9 --- /dev/null +++ b/torch_npu/_inductor/patch/c_shim_npu.cpp @@ -0,0 +1,69 @@ + + +// WARNING: THIS FILE IS AUTOGENERATED BY torchgen. DO NOT MODIFY BY HAND. 
+// See https://github.com/pytorch/pytorch/blob/7e86a7c0155295539996e0cf422883571126073e/torchgen/gen.py#L2424-L2436 for details +#include +#include + +// 基础支持 +#include +#include +#include +#include + +// NPU扩展 +// #include +#include "torch_npu/torch_npu.h" +// #include "torch_npu/csrc/framework/utils/NpuUtils.h" +// #include + +// 算子定义 +#include +#include +#include + +// 模板工具 +#include +#include // 智能指针支持 + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#include +#include +#else + +#endif + +using namespace torch::aot_inductor; + + +AOTITorchError aoti_torch_npu_index_Tensor(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle* ret0) { + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto tmp_result = at::index( + *tensor_handle_to_tensor_pointer(self), c10::List<::std::optional>(c10::ArrayRef<::std::optional>(pointer_to_list<::std::optional>(indices, indices_len_))) + ); + *ret0 = new_tensor_handle(std::move(tmp_result));; + }); +} + +AOTITorchError aoti_torch_npu_cat(const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim, AtenTensorHandle* ret0) { + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto tmp_result = at::cat( + pointer_to_list(tensors, tensors_len_), dim + ); + *ret0 = new_tensor_handle(std::move(tmp_result));; + }); +} + +AOTITorchError aoti_torch_npu_convolution(AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, AtenTensorHandle* ret0) { + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto tmp_result = at::compositeexplicitautograd::convolution_symint( + *tensor_handle_to_tensor_pointer(input), *tensor_handle_to_tensor_pointer(weight), pointer_to_optional(bias), pointer_to_list(stride, stride_len_), pointer_to_list(padding, padding_len_), pointer_to_list(dilation, dilation_len_), transposed, pointer_to_list(output_padding, output_padding_len_), groups + ); + *ret0 = new_tensor_handle(std::move(tmp_result));; + }); +} + +// -------------------------------- split line -------------------------------- \ No newline at end of file diff --git a/torch_npu/_inductor/patch/model_container_runner_npu.cpp b/torch_npu/_inductor/patch/model_container_runner_npu.cpp new file mode 100644 index 0000000000..e8a80a50e3 --- /dev/null +++ b/torch_npu/_inductor/patch/model_container_runner_npu.cpp @@ -0,0 +1,56 @@ +#include +#include +#include + +namespace torch::inductor { + +AOTIModelContainerRunnerNpu::AOTIModelContainerRunnerNpu( + const std::string& model_so_path, + size_t num_models, + const std::string& device_str, + const std::string& cubin_dir) + : AOTIModelContainerRunner( + model_so_path, + num_models, + device_str, + cubin_dir) { + std::cerr <<"[DEBUG] in create func" << std::endl; + } + +AOTIModelContainerRunnerNpu::~AOTIModelContainerRunnerNpu() = default; + + +std::vector AOTIModelContainerRunnerNpu::run( + const std::vector& inputs, void* stream_handle){ + c10_npu::NPUStream npu_stream = c10_npu::getCurrentNPUStream(); + std::cerr<<"[DEBUG] before ModelContainer run, stream = "<(npu_stream.stream())); +} + +std::vector AOTIModelContainerRunnerNpu::run_with_npu_stream( + std::vector& inputs, + c10_npu::NPUStream npu_stream) { + return AOTIModelContainerRunner::run( + inputs, reinterpret_cast(npu_stream.stream())); +} + + +std::unique_ptr 
create_aoti_runner_npu( + const std::string& model_so_path, + size_t num_models, + const std::string& device_str, + const std::string& cubin_dir) { + std::cout <<"[DEBUG] in create_aoti_runner_npu" << std::endl; + return std::make_unique( + model_so_path, num_models, device_str, cubin_dir); +} + +void RegistNpu() { + std::cout << "[DEBUG] start regist npu" << std::endl; + RegisterAOTIModelRunner register_npu_runner("npu", &create_aoti_runner_npu); + std::cout << "[DEBUG] end regist npu" << std::endl; +} + + +} // namespace torch::inductor diff --git a/torch_npu/_inductor/patch/torch_changes.patch b/torch_npu/_inductor/patch/torch_changes.patch new file mode 100644 index 0000000000..46ebd614ad --- /dev/null +++ b/torch_npu/_inductor/patch/torch_changes.patch @@ -0,0 +1,1138 @@ +diff --git a/_inductor/codecache.py b/_inductor/codecache.py +index de72c7e..f7b52c4 100644 +--- a/_inductor/codecache.py ++++ b/_inductor/codecache.py +@@ -1469,6 +1469,7 @@ class AotCodeCompiler: + generated_files.append(input_path) + + output_code_log.info("Output code written to: %s", input_path) ++ print("Output code written to: %s", input_path) + trace_structured( + "graph_dump", + lambda: { +@@ -1544,6 +1545,7 @@ class AotCodeCompiler: + output_dir=object_output_dir, + BuildOption=object_build_options, + ) ++ # import pdb;pdb.set_trace() + compile_cmd = object_builder.get_command_line() + consts_o = object_builder.get_target_file_path() + if fbcode_aot_cpu_re: +@@ -1675,10 +1677,12 @@ class AotCodeCompiler: + output_dir=object_output_dir, + BuildOption=object_build_options, + ) ++ # import pdb;pdb.set_trace() + compile_cmd = object_builder.get_command_line() + output_o = object_builder.get_target_file_path() + + log.debug("aot compilation command: %s", compile_cmd) ++ print(f"aot compilation command: {compile_cmd}") + if not config.aot_inductor.package_cpp_only: + if fbcode_aot_cpu_re: + output_o = os.path.splitext(input_path)[0] + ".o" +@@ -1686,7 +1690,7 @@ class AotCodeCompiler: + os.chmod(output_o, 0o644) + else: + run_command_and_check(compile_cmd) +- ++ # import pdb;pdb.set_trace() + if config.aot_inductor.package_cpp_only: + compile_flags = os.path.splitext(input_path)[0] + "_compile_flags.json" + object_build_options.save_flags_to_file(compile_flags) +@@ -1713,24 +1717,26 @@ class AotCodeCompiler: + kernels_o = " ".join(kernels_o) + + output_name, output_dir = get_name_and_dir_from_output_file_path(output_so) ++ + so_build_options = CppTorchDeviceOptions( + vec_isa=picked_vec_isa, + device_type=device_type, + aot_mode=graph.aot_mode, + use_absolute_path=use_absolute_path, + ) +- + so_builder = CppBuilder( + name=output_name, + sources=[output_o, consts_o, kernels_o], + output_dir=output_dir, + BuildOption=so_build_options, + ) ++ + link_cmd = so_builder.get_command_line() ++ shutil.copy(consts_o, "/host/aoti_weights/weight.o") + output_so = so_builder.get_target_file_path() + + log.debug("aot linkage command: %s", link_cmd) +- ++ print(f"aot linkage command: {link_cmd}") + # Append cmds to the end of codegen-ed wrapper file + with open(input_path, "a") as f: + f.write("\n") +@@ -2000,6 +2006,7 @@ class CppCodeCache: + # And then pass the command_line to below write function as extra parameter to + # guarantee the source code hash contains ISA difference. 
+ vec_isa_cmd = repr(command_gen.get_command_line()) ++ # import pdb;pdb.set_trace() + key, input_path = write(source_code, "cpp", extra=vec_isa_cmd) + + if key not in cls.cache: +diff --git a/_inductor/codegen/aoti_runtime/interface.cpp b/_inductor/codegen/aoti_runtime/interface.cpp +index b270ccb..f9e0a7f 100644 +--- a/_inductor/codegen/aoti_runtime/interface.cpp ++++ b/_inductor/codegen/aoti_runtime/interface.cpp +@@ -1,10 +1,15 @@ + // Definition of AOTI runtime interface functions + ++#include + #include + #include ++#include ++#include + + #include + #include ++#include ++#include + #include + #include + +@@ -55,7 +60,7 @@ AOTIRuntimeError AOTInductorModelContainerCreate( + return AOTInductorModelContainerCreateWithDevice( + container_handle, + num_models, +- is_cpu ? "cpu" : "cuda", ++ is_cpu ? "cpu" : "npu", + cubin_dir); + } + +diff --git a/_inductor/codegen/cpp_utils.py b/_inductor/codegen/cpp_utils.py +index 4a62f92..849476f 100644 +--- a/_inductor/codegen/cpp_utils.py ++++ b/_inductor/codegen/cpp_utils.py +@@ -82,6 +82,7 @@ DEVICE_TO_ATEN = { + "cpu": "at::kCPU", + "cuda": "at::kCUDA", + "xpu": "at::kXPU", ++ "npu": "at::kNPU", + } + + LAYOUT_TO_ATEN = { +diff --git a/_inductor/codegen/cpp_wrapper_cpu.py b/_inductor/codegen/cpp_wrapper_cpu.py +index f92da71..532c38d 100644 +--- a/_inductor/codegen/cpp_wrapper_cpu.py ++++ b/_inductor/codegen/cpp_wrapper_cpu.py +@@ -190,7 +190,6 @@ class CppWrapperCpu(PythonWrapperCodegen): + #include + #include + #include +- #include + + #include + typedef at::Half half; +diff --git a/_inductor/codegen/wrapper.py b/_inductor/codegen/wrapper.py +index 4da5e4c..ff7f724 100644 +--- a/_inductor/codegen/wrapper.py ++++ b/_inductor/codegen/wrapper.py +@@ -402,9 +402,8 @@ class EnterDeviceContextManagerLine(WrapperLine): + # associated with a device, so we never expect the device to change. + # CUDAStreamGuard sets the stream and the device. + if self.last_seen_device_guard_index is None: +- code.writeline( +- f"{V.graph.device_ops.cpp_aoti_stream_guard()} stream_guard(stream, this->device_idx_);" +- ) ++ code.writeline(f"c10_npu::NPUStream npuStream = c10_npu::getCurrentNPUStream(this->device_idx_);") ++ code.writeline(f"if(stream != npuStream.stream()){{std::cerr<<\"stream not equal to npuStream!!!\"< List[str]: + ] + # TODO: this is to avoid FC breakage for fbcode. 
When using newly + # generated model.so on an older verion of PyTorch, need to use +- # the v1 version for aoti_torch_create_tensor_from_blob ++ # the v1 version for aoti_torch_create_tensor_from_blob_npu + create_tensor_from_blob_v1 = "AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1" + + fb_internal_macros.append(create_tensor_from_blob_v1) +@@ -786,6 +787,15 @@ def _get_torch_related_args( + + return include_dirs, libraries_dirs, libraries + ++def _get_torch_npu_related_args( ++ include_pytorch: bool, aot_mode: bool ++): ++ from torch_npu.utils._inductor import _TORCH_NPU_PATH, TORCH_NPU_LIB_PATH ++ ++ include_dirs = [os.path.join(_TORCH_NPU_PATH, "include"), "/host/zcl/aoti_files"] ++ libraries_dirs = ["/host/zcl/pta_v2.6/libtorch_npu/lib" ,"/host/zcl/aoti_files"] ++ libraries = ["torch_npu", "aoti_npu", "aoti_runner_npu", "aoti_npuops"] ++ return include_dirs, libraries_dirs, libraries + + def _get_python_include_dirs() -> List[str]: + include_dir = Path(sysconfig.get_path("include")) +@@ -1043,6 +1053,12 @@ def get_cpp_torch_options( + torch_libraries, + ) = _get_torch_related_args(include_pytorch=include_pytorch, aot_mode=aot_mode) + ++ ( ++ torch_npu_include_dirs, ++ torch_npu_libraries_dirs, ++ torch_npu_libraries, ++ ) = _get_torch_npu_related_args(include_pytorch=include_pytorch, aot_mode=aot_mode) ++ + python_include_dirs, python_libraries_dirs = _get_python_related_args() + + ( +@@ -1070,12 +1086,13 @@ def get_cpp_torch_options( + sys_libs_include_dirs + + python_include_dirs + + torch_include_dirs ++ + torch_npu_include_dirs + + omp_include_dir_paths + ) + cflags = sys_libs_cflags + omp_cflags + ldflags = omp_ldflags +- libraries_dirs = python_libraries_dirs + torch_libraries_dirs + omp_lib_dir_paths +- libraries = torch_libraries + omp_lib ++ libraries_dirs = python_libraries_dirs + torch_libraries_dirs + omp_lib_dir_paths + torch_npu_libraries_dirs ++ libraries = torch_libraries + omp_lib + torch_npu_libraries + passthough_args = ( + sys_libs_passthough_args + + isa_ps_args_build_flags +@@ -1233,6 +1250,12 @@ def get_cpp_torch_device_options( + cflags += ["fsycl", "Wno-unsupported-floating-point-opt"] + libraries += ["c10_xpu", "sycl", "ze_loader", "torch_xpu"] + ++ if device_type == "npu": ++ definations.append(" USE_NPU") ++ definations.append(" BUILD_LIBTORCH=ON") ++ # cflags += [""] ++ libraries += ["runtime", "ascendcl"] ++ + if aot_mode: + if config.is_fbcode(): + from torch._inductor.codecache import cpp_prefix_path +@@ -1306,7 +1329,6 @@ class CppTorchDeviceOptions(CppTorchOptions): + device_libraries_dirs: List[str] = [] + device_libraries: List[str] = [] + device_passthough_args: List[str] = [] +- + ( + device_definations, + device_include_dirs, +@@ -1325,6 +1347,7 @@ class CppTorchDeviceOptions(CppTorchOptions): + _append_list(self._libraries_dirs, device_libraries_dirs) + _append_list(self._libraries, device_libraries) + _append_list(self._passthough_args, device_passthough_args) ++ + self._finalize_options() + + def _finalize_options(self) -> None: +@@ -1448,7 +1471,8 @@ class CppBuilder: + self._cflags_args += f"/{cflag} " + else: + self._cflags_args += f"-{cflag} " +- ++ # if self._compile_only: ++ # import pdb;pdb.set_trace() + for defination in BuildOption.get_definations(): + if _IS_WINDOWS: + self._definations_args += f"/D {defination} " +diff --git a/_inductor/graph.py b/_inductor/graph.py +index 3a5942f..8e3018b 100644 +--- a/_inductor/graph.py ++++ b/_inductor/graph.py +@@ -1860,7 +1860,7 @@ class GraphLowering(torch.fx.Interpreter): + """ + For GPU, Triton kernels 
are autotuned and stored as cubin files + """ +- if any(device in self.device_types for device in ["cuda", "xpu"]): ++ if any(device in self.device_types for device in ["cuda", "xpu", 'npu']): + if config.triton.autotune_at_compile_time: + # If autotune_at_compile_time is True, we can do the codegen in one-pass + # TODO: once autotune_at_compile_time is stable, we should delete the else branch +diff --git a/_inductor/utils.py b/_inductor/utils.py +index d5c096a..fcceb62 100644 +--- a/_inductor/utils.py ++++ b/_inductor/utils.py +@@ -64,6 +64,7 @@ GPU_TYPES = ["cuda", "xpu"] + @functools.lru_cache(None) + def get_gpu_type(): + avail_gpus = [x for x in GPU_TYPES if getattr(torch, x).is_available()] ++ # import pdb;pdb.set_trace() + assert len(avail_gpus) <= 1 + gpu_type = "cuda" if len(avail_gpus) == 0 else avail_gpus.pop() + return gpu_type +@@ -1944,7 +1945,7 @@ def get_cloned_parameter_buffer_name(name: str): + + def is_gpu(device: Optional[str]): + assert isinstance(device, str) or device is None, device +- return device in GPU_TYPES ++ return device in GPU_TYPES or device == "npu" + + + def device_need_guard(device: str): +diff --git a/include/torch/csrc/inductor/aoti_runner/model_container_runner_npu.h b/include/torch/csrc/inductor/aoti_runner/model_container_runner_npu.h +new file mode 100644 +index 0000000..848cab6 +--- /dev/null ++++ b/include/torch/csrc/inductor/aoti_runner/model_container_runner_npu.h +@@ -0,0 +1,32 @@ ++#pragma once ++ ++#include ++#include ++ ++namespace torch::inductor { ++ ++// NOTICE: Following APIs are subject to change due to active development ++// We provide NO BC guarantee for these APIs ++class TORCH_API AOTIModelContainerRunnerNpu : public AOTIModelContainerRunner { ++ public: ++ // @param device_str: cuda device string, e.g. "cuda", "cuda:0" ++ AOTIModelContainerRunnerNpu( ++ const std::string& model_so_path, ++ size_t num_models = 1, ++ const std::string& device_str = "npu", ++ const std::string& cubin_dir = ""); ++ ++ ~AOTIModelContainerRunnerNpu(); ++ ++ std::vector run( ++ const std::vector& inputs, ++ void* stream_handle = nullptr) override; ++ ++ std::vector run_with_npu_stream( ++ std::vector& inputs, ++ c10_npu::NPUStream npu_stream); ++}; ++ ++void RegistNpu(); ++ ++} // namespace torch::inductor +\ No newline at end of file +diff --git a/include/torch/csrc/inductor/aoti_runtime/arrayref_tensor.h b/include/torch/csrc/inductor/aoti_runtime/arrayref_tensor.h +index e2f2957..9730c60 100644 +--- a/include/torch/csrc/inductor/aoti_runtime/arrayref_tensor.h ++++ b/include/torch/csrc/inductor/aoti_runtime/arrayref_tensor.h +@@ -229,7 +229,7 @@ class ArrayRefTensor { + + AtenTensorHandle borrowAsTensor() const { + AtenTensorHandle result = nullptr; +- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_v2( ++ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( + data(), + sizes_.size(), + sizes_.data(), +diff --git a/include/torch/csrc/inductor/aoti_runtime/device_utils.h b/include/torch/csrc/inductor/aoti_runtime/device_utils.h +index 7b48f49..b1aa844 100644 +--- a/include/torch/csrc/inductor/aoti_runtime/device_utils.h ++++ b/include/torch/csrc/inductor/aoti_runtime/device_utils.h +@@ -50,6 +50,32 @@ using DeviceStreamType = sycl::queue*; + + } // namespace torch::aot_inductor + ++#elif defined(USE_NPU) ++ ++#include "third_party/acl/inc/acl/acl_base.h" ++#include "third_party/acl/inc/acl/acl_rt.h" ++ ++// DCK_TODO: do we need to support 32bit os. 
++typedef void* NPUdeviceptr; ++ ++typedef void* NPUfunction; ++ ++#define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ ++ do { \ ++ const aclError code = EXPR; \ ++ if (code != ACL_SUCCESS) { \ ++ throw std::runtime_error( \ ++ std::string("NPU error core: ") + std::to_string(code) \ ++ + std::string(" ") + std::string(__FILE__) + std::string(":") + std::to_string(__LINE__)); \ ++ } \ ++ } while (0) ++ ++namespace torch::aot_inductor { ++ ++using DeviceStreamType = aclrtStream; ++ ++} // namespace torch::aot_inductor ++ + #else + + #define AOTI_RUNTIME_DEVICE_CHECK(EXPR) \ +diff --git a/include/torch/csrc/inductor/aoti_runtime/model.h b/include/torch/csrc/inductor/aoti_runtime/model.h +index a8ec3a6..2c3ca04 100644 +--- a/include/torch/csrc/inductor/aoti_runtime/model.h ++++ b/include/torch/csrc/inductor/aoti_runtime/model.h +@@ -59,6 +59,22 @@ GPUPtr RAII_gpuMalloc(size_t num_bytes) { + + #endif // USE_CUDA + ++#ifdef USE_NPU ++ ++using NPUPtr = std::unique_ptr>; ++ ++NPUPtr RAII_npuMalloc(size_t num_bytes) { ++ void* data_ptr; ++ // DCK_TODO: aclrtMalloc bytes cannot be 0, how to adapt. ++ if (num_bytes == 0) num_bytes = 4; ++ // DCK_TODO: ACL_MEM_MALLOC_NORMAL_ONLY ? ++ AOTI_RUNTIME_DEVICE_CHECK(aclrtMalloc((void**)&data_ptr, num_bytes, ACL_MEM_MALLOC_NORMAL_ONLY)); ++ auto deleter = [](void* ptr) { AOTI_RUNTIME_DEVICE_CHECK(aclrtFree(ptr)); }; ++ return NPUPtr(data_ptr, deleter); ++} ++ ++#endif // USE_NPU ++ + #ifdef USE_XPU + + using GPUPtr = std::unique_ptr>; +@@ -92,9 +108,10 @@ inline void parse_device_str( + const std::string& device_str, + int32_t& device_type, + int32_t& device_idx) { +- std::regex re("(cpu|cuda|xpu)(:([0-9]+))?"); ++ std::regex re("(cpu|cuda|xpu|npu)(:([0-9]+))?"); + std::smatch sm; + bool matched = std::regex_match(device_str, sm, re); ++ std::cout <<"wz 1" << std::endl; + AOTI_RUNTIME_CHECK(matched, "Invalid device: " + device_str); + + if (sm[1].str() == "cpu") { +@@ -104,8 +121,13 @@ inline void parse_device_str( + #ifdef USE_XPU + } else if (sm[1].str() == "xpu") { + device_type = aoti_torch_device_type_xpu(); ++#endif ++#ifdef USE_NPU ++ } else if (sm[1].str() == "npu") { ++ device_type = aoti_torch_device_type_npu(); + #endif + } else { ++ std::cout <<"wz 1" << std::endl; + AOTI_RUNTIME_CHECK(false, "Invalid device: " + device_str); + } + +@@ -153,6 +175,14 @@ class AOTInductorModelBase { + aoti_torch_set_current_xpu_device(device_idx_); + } + #endif // USE_XPU ++#ifdef USE_NPU ++ if (device_idx_ == -1) { ++ // DCK_TODO: which device to set WZ_TODO: match CUDA ++ std::cout << "wzdebugsetnpu device0" << std::endl; ++ AOTI_RUNTIME_DEVICE_CHECK(aclrtSetDevice(0)); ++ AOTI_RUNTIME_DEVICE_CHECK(aclrtGetDevice(&device_idx_)); ++ } ++#endif // USE_NPU + } + + // NOLINTNEXTLINE(modernize-use-equals-default) +@@ -172,6 +202,15 @@ class AOTInductorModelBase { + delete *run_finished_; + } + #endif // USE_XPU ++#ifdef USE_NPU ++ if (run_finished_) { ++ auto code = aclrtDestroyEvent(*run_finished_); ++ if (code != ACL_SUCCESS) { ++ std::cerr << "Failed to destroy NPU event in AOTInductor model erorr code: " ++ << code << std::endl; ++ } ++ } ++#endif // USE_NPU + } + + AOTInductorModelBase(AOTInductorModelBase&&) = delete; +@@ -201,6 +240,12 @@ class AOTInductorModelBase { + delete *run_finished_; + run_finished_.reset(); + } ++#elif defined(USE_NPU) ++ if (!run_finished_) { ++ aclrtEvent run_finished; ++ AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); ++ run_finished_.emplace(run_finished); ++ } + #else // !USE_CUDA && !USE_XPU + run_finished_ = false; + #endif 
+@@ -213,6 +258,8 @@ class AOTInductorModelBase { + #elif defined(USE_XPU) + run_finished_ = std::make_optional(new sycl::event( + static_cast(stream)->ext_oneapi_submit_barrier())); ++#elif defined(USE_NPU) ++ AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); + #else // !USE_CUDA && !USE_XPU + run_finished_ = true; + #endif // USE_CUDA +@@ -234,6 +281,12 @@ class AOTInductorModelBase { + delete *run_finished_; + run_finished_.reset(); + } ++#elif defined(USE_NPU) ++ if (!run_finished_) { ++ aclrtEvent run_finished; ++ AOTI_RUNTIME_DEVICE_CHECK(aclrtCreateEvent(&run_finished)); ++ run_finished_.emplace(run_finished); ++ } + #else // !USE_CUDA && !USE_XPU + run_finished_ = false; + #endif +@@ -250,6 +303,8 @@ class AOTInductorModelBase { + run_finished_ = std::make_optional(new sycl::event( + static_cast(stream)->ext_oneapi_submit_barrier())); + ++#elif defined(USE_NPU) ++ AOTI_RUNTIME_DEVICE_CHECK(aclrtRecordEvent(*run_finished_, stream)); + #else // !USE_CUDA && !USE_XPU + run_finished_ = true; + #endif // USE_CUDA +@@ -267,6 +322,8 @@ class AOTInductorModelBase { + compute_gpu_constant_blob(blob_size, constants_internal_offset); + #if defined(USE_CUDA) || defined(USE_XPU) + constant_blob_ = RAII_gpuMalloc(blob_size); ++#elif defined(USE_NPU) ++ constant_blob_ = RAII_npuMalloc(blob_size); + #endif + } + if (!include_weights) { +@@ -276,7 +333,7 @@ class AOTInductorModelBase { + size_t bytes_read = 0; + for (size_t i = 0; i < num_constants; i++) { + bool from_folded = this->constant_from_folded(i); +-#if not defined(USE_XPU) && not defined(USE_CUDA) ++#if not defined(USE_XPU) && not defined(USE_CUDA) && not defined(USE_NPU) + if (from_folded) { + // We do not reallocate and copy for CPU. + continue; +@@ -306,11 +363,11 @@ class AOTInductorModelBase { + AtenTensorHandle tensor_handle = nullptr; + #ifdef AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 + // When opaque_metadata_size is not 0, we need to have the +- // aoti_torch_create_tensor_from_blob_v2 available ++ // aoti_torch_create_tensor_from_blob_npu_v2 available + AOTI_RUNTIME_CHECK( + opaque_metadata_size == 0, + "Expect opaque_metadata_size to be 0 when AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1 is defined"); +- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob( ++ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( + internal_ptr, + ndim, + size, +@@ -321,7 +378,7 @@ class AOTInductorModelBase { + device_idx_, + &tensor_handle)); + #else +- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_v2( ++ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu_v2( + internal_ptr, + ndim, + size, +@@ -347,6 +404,11 @@ class AOTInductorModelBase { + return std::move(constant_blob_); + } + #endif ++#ifdef USE_NPU ++ NPUPtr&& release_constant_blob() { ++ return std::move(constant_blob_); ++ } ++#endif + + std::shared_ptr> get_constants_array() { + return constants_; +@@ -361,7 +423,7 @@ class AOTInductorModelBase { + size_t bytes_read, + size_t data_size, + bool skip_copy) { +-#if defined(USE_CUDA) || defined(USE_XPU) ++#if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) + auto* constants_ptr = static_cast(constant_blob_.get()); + uint8_t* internal_ptr = constants_ptr + constant_offset; + // Copy data to GPU memory +@@ -374,6 +436,13 @@ class AOTInductorModelBase { + ->memcpy(internal_ptr, _get_constants_start() + bytes_read, data_size) + .wait(); + ++#elif defined(USE_NPU) ++ AOTI_RUNTIME_DEVICE_CHECK(aclrtMemcpy( ++ internal_ptr, ++ data_size, ++ _get_constants_start() + 
bytes_read, ++ data_size, ++ ACL_MEMCPY_HOST_TO_DEVICE)); + #else + AOTI_RUNTIME_DEVICE_CHECK(cudaMemcpy( + internal_ptr, +@@ -394,7 +463,7 @@ class AOTInductorModelBase { + void compute_gpu_constant_blob( + size_t& blob_size, + std::vector& constants_internal_offset) { +-#if defined(USE_CUDA) || defined(USE_XPU) ++#if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) + size_t num_constants = this->num_constants(); + // Compute required blob size with 64-alignment if on GPU. + blob_size = 0; +@@ -544,6 +613,19 @@ class AOTInductorModelBase { + throw std::runtime_error( + std::string("The model did not finish successfully. Error: ") + + cudaGetErrorString(cudaGetLastError())); ++ ++#elif defined(USE_NPU) ++ if (!run_finished_) { ++ throw std::runtime_error{"Model NPU event was not initialized"}; ++ } ++ aclrtEventRecordedStatus recordStatus = ACL_EVENT_RECORDED_STATUS_NOT_READY; ++ AOTI_RUNTIME_DEVICE_CHECK(aclrtQueryEventStatus(*run_finished_, &recordStatus)); ++ ++ if (recordStatus == ACL_EVENT_RECORDED_STATUS_COMPLETE) { ++ return true; ++ } else { ++ return false; ++ } + #elif defined(USE_XPU) + if (!run_finished_) { + throw std::runtime_error{"Model XPU event was not initialized"}; +@@ -648,6 +730,12 @@ class AOTInductorModelBase { + GPUPtr constant_blob_; + #endif // USE_CUDA + ++#ifdef USE_NPU ++ // Holds the blob storage for constants' at::Tensor for CUDA. ++ NPUPtr constant_blob_; ++#endif // USE_NPU ++ ++ + #ifdef USE_MMAP_SELF + uint8_t* self_mmap = NULL; + #endif +@@ -666,6 +754,8 @@ class AOTInductorModelBase { + std::optional run_finished_; + #elif defined(USE_XPU) + std::optional run_finished_; ++#elif defined(USE_NPU) ++ std::optional run_finished_; + #else // !USE_CUDA + bool run_finished_{}; + #endif +diff --git a/include/torch/csrc/inductor/aoti_runtime/model_container.h b/include/torch/csrc/inductor/aoti_runtime/model_container.h +index d94ee86..29cb503 100644 +--- a/include/torch/csrc/inductor/aoti_runtime/model_container.h ++++ b/include/torch/csrc/inductor/aoti_runtime/model_container.h +@@ -52,7 +52,7 @@ class AOTInductorModelContainer { + output_names_.emplace_back(model->output_name(static_cast(i))); + } + model->load_constants(); +-#if defined(USE_CUDA) || defined(USE_XPU) ++#if defined(USE_CUDA) || defined(USE_XPU) || defined(USE_NPU) + constant_blob_ = model->release_constant_blob(); + constants_internal_offset_.resize(model->num_constants()); + model->compute_gpu_constant_blob(blob_size_, constants_internal_offset_); +@@ -299,6 +299,13 @@ class AOTInductorModelContainer { + ->memcpy(internal_constants_ptr, user_constant_ptr, constant_size) + .wait(); + ++#elif defined(USE_NPU) ++AOTI_RUNTIME_DEVICE_CHECK(aclrtMemcpy( ++ internal_constants_ptr, ++ constant_size, ++ user_constant_ptr, ++ constant_size, ++ ACL_MEMCPY_HOST_TO_DEVICE)); + #else + AOTI_RUNTIME_DEVICE_CHECK(cudaMemcpy( + internal_constants_ptr, +@@ -316,7 +323,7 @@ class AOTInductorModelContainer { + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_strides(tensor, &stride)); + AOTI_TORCH_ERROR_CODE_CHECK( + aoti_torch_get_storage_offset(tensor, &offset)); +- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob( ++ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( + internal_constants_ptr, + models_[0]->constant_ndim(idx), + models_[0]->constant_shape(idx), +@@ -325,6 +332,8 @@ class AOTInductorModelContainer { + models_[0]->constant_dtype(idx), + #ifdef USE_XPU + aoti_torch_device_type_xpu(), ++#elif defined(USE_NPU) ++ aoti_torch_device_type_npu(), + #else + 
aoti_torch_device_type_cuda(), + #endif +@@ -418,6 +427,17 @@ class AOTInductorModelContainer { + std::vector constants_internal_offset_; + #endif // USE_CUDA + ++#ifdef USE_NPU ++ // Holds the blob storage for constants' at::Tensor for CUDA. ++ NPUPtr constant_blob_; ++ NPUPtr constant_blob_secondary_; ++ ++ // Let's place this within USE_NPU at the moment before we fully support ++ // update for CPU cases. ++ size_t blob_size_; ++ std::vector constants_internal_offset_; ++#endif // USE_NPU ++ + // Determine which constants is being used for the model. + // If true, + // constants_map_secondary/constant_blob_secondary/constants_array_secondary +@@ -485,6 +505,20 @@ class AOTInductorModelContainer { + } + #endif // USE_CUDA + ++#ifdef USE_NPU ++ void* get_constant_blob_ptr(bool get_inactive) { ++ if ((get_inactive && use_secondary_) || ++ (!get_inactive && !use_secondary_)) { ++ return constant_blob_.get(); ++ } else { ++ if (!constant_blob_secondary_) { ++ constant_blob_secondary_ = RAII_npuMalloc(blob_size_); ++ } ++ return constant_blob_secondary_.get(); ++ } ++ } ++#endif // USE_NPU ++ + std::shared_ptr get_constants_map(bool get_inactive) { + if ((get_inactive && use_secondary_) || + (!get_inactive && !use_secondary_)) { +diff --git a/include/torch/csrc/inductor/aoti_runtime/thread_local.h b/include/torch/csrc/inductor/aoti_runtime/thread_local.h +index fd931c9..a614bf4 100644 +--- a/include/torch/csrc/inductor/aoti_runtime/thread_local.h ++++ b/include/torch/csrc/inductor/aoti_runtime/thread_local.h +@@ -66,7 +66,7 @@ struct ThreadLocalCachedOutputTensor> { + // NOLINTNEXTLINE(*arrays*) + storage_ = std::make_unique(t.numel()); + AtenTensorHandle handle = nullptr; +- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob( ++ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_create_tensor_from_blob_npu( + storage_.get(), + t.sizes().size(), + t.sizes().data(), +diff --git a/include/torch/csrc/inductor/aoti_runtime/utils_npu.h b/include/torch/csrc/inductor/aoti_runtime/utils_npu.h +new file mode 100644 +index 0000000..2a3d8ea +--- /dev/null ++++ b/include/torch/csrc/inductor/aoti_runtime/utils_npu.h +@@ -0,0 +1,114 @@ ++#pragma once ++ ++#ifdef USE_CUDA ++// WARNING: Be careful when adding new includes here. This header will be used ++// in model.so, and should not refer to any aten/c10 headers except the stable ++// C ABI defined in torch/csrc/inductor/aoti_torch/c/shim.h. The same rule ++// applies to other files under torch/csrc/inductor/aoti_runtime/. 
++#include ++ ++#include ++#include ++ ++namespace torch::aot_inductor { ++ ++inline void delete_cuda_guard(void* ptr) { ++ AOTI_TORCH_ERROR_CODE_CHECK( ++ aoti_torch_delete_cuda_guard(reinterpret_cast(ptr))); ++} ++ ++inline void delete_cuda_stream_guard(void* ptr) { ++ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_delete_cuda_stream_guard( ++ reinterpret_cast(ptr))); ++} ++ ++class AOTICudaGuard { ++ public: ++ AOTICudaGuard(int32_t device_index) : guard_(nullptr, delete_cuda_guard) { ++ CUDAGuardHandle ptr = nullptr; ++ AOTI_TORCH_ERROR_CODE_CHECK( ++ aoti_torch_create_cuda_guard(device_index, &ptr)); ++ guard_.reset(ptr); ++ } ++ ++ void set_index(int32_t device_index) { ++ AOTI_TORCH_ERROR_CODE_CHECK( ++ aoti_torch_cuda_guard_set_index(guard_.get(), device_index)); ++ } ++ ++ private: ++ std::unique_ptr guard_; ++}; ++ ++class AOTICudaStreamGuard { ++ public: ++ AOTICudaStreamGuard(cudaStream_t stream, int32_t device_index) ++ : guard_(nullptr, delete_cuda_stream_guard) { ++ CUDAStreamGuardHandle ptr = nullptr; ++ AOTI_TORCH_ERROR_CODE_CHECK( ++ aoti_torch_create_cuda_stream_guard(stream, device_index, &ptr)); ++ guard_.reset(ptr); ++ } ++ ++ private: ++ std::unique_ptr guard_; ++}; ++ ++} // namespace torch::aot_inductor ++#endif // USE_CUDA ++ ++#ifdef USE_NPU ++// WARNING: Be careful when adding new includes here. This header will be used ++// in model.so, and should not refer to any aten/c10 headers except the stable ++// C ABI defined in torch/csrc/inductor/aoti_torch/c/shim.h. The same rule ++// applies to other files under torch/csrc/inductor/aoti_runtime/. ++#include ++ ++#include ++ ++namespace torch::aot_inductor { ++ ++inline void delete_npu_guard(void* ptr) { ++ AOTI_TORCH_ERROR_CODE_CHECK( ++ aoti_torch_delete_npu_guard(reinterpret_cast(ptr))); ++} ++ ++inline void delete_npu_stream_guard(void* ptr) { ++ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_delete_npu_stream_guard( ++ reinterpret_cast(ptr))); ++} ++ ++class AOTINpuGuard { ++ public: ++ AOTINpuGuard(int32_t device_index) : guard_(nullptr, delete_npu_guard) { ++ NPUGuardHandle ptr = nullptr; ++ AOTI_TORCH_ERROR_CODE_CHECK( ++ aoti_torch_create_npu_guard(device_index, &ptr)); ++ guard_.reset(ptr); ++ } ++ ++ void set_index(int32_t device_index) { ++ AOTI_TORCH_ERROR_CODE_CHECK( ++ aoti_torch_npu_guard_set_index(guard_.get(), device_index)); ++ } ++ ++ private: ++ std::unique_ptr guard_; ++}; ++ ++class AOTINpuStreamGuard { ++ public: ++ AOTINpuStreamGuard(aclrtStream stream, int32_t device_index) ++ : guard_(nullptr, delete_npu_stream_guard) { ++ NpuStreamGuardHandle ptr = nullptr; ++ AOTI_TORCH_ERROR_CODE_CHECK( ++ aoti_torch_create_npu_stream_guard(stream, device_index, &ptr)); ++ guard_.reset(ptr); ++ } ++ ++ private: ++ std::unique_ptr guard_; ++}; ++ ++} // namespace torch::aot_inductor ++#endif // USE_NPU +diff --git a/include/torch/csrc/inductor/aoti_torch/c/shim.h b/include/torch/csrc/inductor/aoti_torch/c/shim.h +index 4c6c9af..e66a0b9 100644 +--- a/include/torch/csrc/inductor/aoti_torch/c/shim.h ++++ b/include/torch/csrc/inductor/aoti_torch/c/shim.h +@@ -88,6 +88,139 @@ using AOTITorchError = int32_t; + #define AOTI_TORCH_SUCCESS 0 + #define AOTI_TORCH_FAILURE 1 + ++ ++ ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__adaptive_avg_pool2d(AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__adaptive_avg_pool2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT 
AOTITorchError aoti_torch_npu__adaptive_avg_pool3d(AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__adaptive_avg_pool3d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__addmm_activation(AtenTensorHandle self, AtenTensorHandle mat1, AtenTensorHandle mat2, double beta, double alpha, int32_t use_gelu, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__cdist_backward(AtenTensorHandle grad, AtenTensorHandle x1, AtenTensorHandle x2, double p, AtenTensorHandle cdist, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__cdist_forward(AtenTensorHandle x1, AtenTensorHandle x2, double p, int64_t* compute_mode, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__efficientzerotensor(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__embedding_bag(AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, int32_t scale_grad_by_freq, int64_t mode, int32_t sparse, AtenTensorHandle* per_sample_weights, int32_t include_last_offset, int64_t padding_idx, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__embedding_bag_dense_backward(AtenTensorHandle grad, AtenTensorHandle indices, AtenTensorHandle offset2bag, AtenTensorHandle bag_size, AtenTensorHandle maximum_indices, int64_t num_weights, int32_t scale_grad_by_freq, int64_t mode, AtenTensorHandle* per_sample_weights, int64_t padding_idx, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__embedding_bag_forward_only(AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, int32_t scale_grad_by_freq, int64_t mode, int32_t sparse, AtenTensorHandle* per_sample_weights, int32_t include_last_offset, int64_t padding_idx, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__embedding_bag_per_sample_weights_backward(AtenTensorHandle grad, AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, AtenTensorHandle offset2bag, int64_t mode, int64_t padding_idx, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__fft_c2c(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int32_t forward, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__fft_r2c(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int32_t onesided, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__fused_moving_avg_obs_fq_helper(AtenTensorHandle self, AtenTensorHandle observer_on, AtenTensorHandle fake_quant_on, AtenTensorHandle running_min, AtenTensorHandle running_max, AtenTensorHandle scale, AtenTensorHandle zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int32_t per_row_fake_quant, int32_t symmetric_quant, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__fused_moving_avg_obs_fq_helper_functional(AtenTensorHandle self, AtenTensorHandle observer_on, AtenTensorHandle fake_quant_on, 
AtenTensorHandle running_min, AtenTensorHandle running_max, AtenTensorHandle scale, AtenTensorHandle zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int32_t per_row_fake_quant, int32_t symmetric_quant, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3, AtenTensorHandle* ret4, AtenTensorHandle* ret5); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__histogramdd_from_bin_cts(AtenTensorHandle self, const int64_t* bins, int64_t bins_len_, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__pdist_backward(AtenTensorHandle grad, AtenTensorHandle self, double p, AtenTensorHandle pdist, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__pdist_forward(AtenTensorHandle self, double p, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__scaled_dot_product_flash_attention_for_npu(AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, double dropout_p, int32_t is_causal, AtenTensorHandle* attn_mask, double* scale, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__scaled_dot_product_flash_attention_for_npu_backward(AtenTensorHandle grad_out, AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, AtenTensorHandle out, AtenTensorHandle logsumexp, double dropout_p, int32_t is_causal, AtenTensorHandle* attn_mask, double* scale, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__segment_reduce_backward(AtenTensorHandle grad, AtenTensorHandle output, AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* offsets, int64_t axis, double* initial, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__to_sparse(AtenTensorHandle self, int32_t* layout, const int64_t** blocksize, int64_t blocksize_len_, int64_t* dense_dim, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__trilinear(AtenTensorHandle i1, AtenTensorHandle i2, AtenTensorHandle i3, const int64_t* expand1, int64_t expand1_len_, const int64_t* expand2, int64_t expand2_len_, const int64_t* expand3, int64_t expand3_len_, const int64_t* sumdim, int64_t sumdim_len_, int64_t unroll_dim, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu__weight_int8pack_mm(AtenTensorHandle self, AtenTensorHandle mat2, AtenTensorHandle scales, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_adaptive_max_pool2d(AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_adaptive_max_pool2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle indices, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_adaptive_max_pool3d(AtenTensorHandle self, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_adaptive_max_pool3d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle indices, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_add_Scalar(AtenTensorHandle self, double other, double alpha, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError 
aoti_torch_npu_add_Tensor(AtenTensorHandle self, AtenTensorHandle other, double alpha, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_addbmm(AtenTensorHandle self, AtenTensorHandle batch1, AtenTensorHandle batch2, double beta, double alpha, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_addmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat1, AtenTensorHandle mat2, double beta, double alpha); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_addmv(AtenTensorHandle self, AtenTensorHandle mat, AtenTensorHandle vec, double beta, double alpha, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_angle(AtenTensorHandle self, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_avg_pool2d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_avg_pool2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_avg_pool3d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_avg_pool3d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_baddbmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle batch1, AtenTensorHandle batch2, double beta, double alpha); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_bernoulli__Tensor(AtenTensorHandle self, AtenTensorHandle p, AtenGeneratorHandle* generator); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_bernoulli__float(AtenTensorHandle self, double p, AtenGeneratorHandle* generator); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_bmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_bucketize_Tensor(AtenTensorHandle self, AtenTensorHandle boundaries, int32_t out_int32, int32_t right, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_cat(const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_cholesky_inverse(AtenTensorHandle self, int32_t upper, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_cholesky_solve(AtenTensorHandle self, AtenTensorHandle input2, int32_t upper, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_convolution(AtenTensorHandle input, AtenTensorHandle weight, AtenTensorHandle* bias, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t 
padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_convolution_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle weight, const int64_t** bias_sizes, int64_t bias_sizes_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t transposed, const int64_t* output_padding, int64_t output_padding_len_, int64_t groups, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_cummax(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_cummin(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_cumprod(AtenTensorHandle self, int64_t dim, int32_t* dtype, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_cumsum(AtenTensorHandle self, int64_t dim, int32_t* dtype, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_exponential(AtenTensorHandle self, double lambd, AtenGeneratorHandle* generator, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_fractional_max_pool2d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle random_samples, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_fractional_max_pool2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle indices, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_fractional_max_pool3d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle random_samples, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_fractional_max_pool3d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle indices, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_gcd(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_geqrf(AtenTensorHandle self, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_grid_sampler_2d_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_histc(AtenTensorHandle self, int64_t bins, double min, double max, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_histogram_bin_ct(AtenTensorHandle self, int64_t bins, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density, AtenTensorHandle* ret0, 
AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_index_Tensor(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_index_put(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_index_reduce(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source, const char* reduce, int32_t include_self, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_kthvalue(AtenTensorHandle self, int64_t k, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_logcumsumexp(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_lu_unpack(AtenTensorHandle LU_data, AtenTensorHandle LU_pivots, int32_t unpack_data, int32_t unpack_pivots, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_masked_scatter(AtenTensorHandle self, AtenTensorHandle mask, AtenTensorHandle source, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_masked_scatter_backward(AtenTensorHandle grad_output, AtenTensorHandle mask, const int64_t* sizes, int64_t sizes_len_, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_masked_select(AtenTensorHandle self, AtenTensorHandle mask, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_max_pool2d_with_indices(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_max_pool2d_with_indices_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode, AtenTensorHandle indices, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_max_pool3d_with_indices(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_max_pool3d_with_indices_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, const int64_t* dilation, int64_t dilation_len_, int32_t ceil_mode, AtenTensorHandle indices, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_max_unpool2d(AtenTensorHandle self, AtenTensorHandle indices, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_max_unpool3d(AtenTensorHandle self, AtenTensorHandle indices, const int64_t* output_size, int64_t output_size_len_, const int64_t* stride, int64_t stride_len_, const 
int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_median(AtenTensorHandle self, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_mode(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_mul_Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_mul_Tensor(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_nanmedian(AtenTensorHandle self, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_native_dropout(AtenTensorHandle input, double p, int32_t* train, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_nonzero(AtenTensorHandle self, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_normal_functional(AtenTensorHandle self, double mean, double std, AtenGeneratorHandle* generator, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_ormqr(AtenTensorHandle self, AtenTensorHandle input2, AtenTensorHandle input3, int32_t left, int32_t transpose, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_polar(AtenTensorHandle abs, AtenTensorHandle angle, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_pow_Scalar(double self, AtenTensorHandle exponent, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_pow_Tensor_Scalar(AtenTensorHandle self, double exponent, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_pow_Tensor_Tensor(AtenTensorHandle self, AtenTensorHandle exponent, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_rand(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_rand_generator(const int64_t* size, int64_t size_len_, AtenGeneratorHandle* generator, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_randint(int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_randint_generator(int64_t high, const int64_t* size, int64_t size_len_, AtenGeneratorHandle* generator, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_randint_low(int64_t low, int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_randint_low_out(AtenTensorHandle out, int64_t low, int64_t high, const int64_t* size, int64_t size_len_); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_randn(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, 
AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_randn_generator(const int64_t* size, int64_t size_len_, AtenGeneratorHandle* generator, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_randperm(int64_t n, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t* output_size, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_replication_pad1d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_replication_pad2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_reshape(AtenTensorHandle self, const int64_t* shape, int64_t shape_len_, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_scatter_src_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_scatter_value_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_scatter_reduce_two_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src, const char* reduce, int32_t include_self); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_searchsorted_Scalar(AtenTensorHandle sorted_sequence, double self, int32_t out_int32, int32_t right, const char** side, AtenTensorHandle* sorter, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_searchsorted_Tensor(AtenTensorHandle sorted_sequence, AtenTensorHandle self, int32_t out_int32, int32_t right, const char** side, AtenTensorHandle* sorter, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_segment_reduce(AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* indices, AtenTensorHandle* offsets, int64_t axis, int32_t unsafe, double* initial, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_slice_Tensor(AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_soft_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_sort(AtenTensorHandle self, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_sort_stable(AtenTensorHandle self, int32_t* stable, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_to_sparse(AtenTensorHandle self, int32_t* layout, const int64_t** 
blocksize, int64_t blocksize_len_, int64_t* dense_dim, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_topk(AtenTensorHandle self, int64_t k, int64_t dim, int32_t largest, int32_t sorted, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_triangular_solve(AtenTensorHandle self, AtenTensorHandle A, int32_t upper, int32_t transpose, int32_t unitriangular, AtenTensorHandle* ret0, AtenTensorHandle* ret1); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_uniform(AtenTensorHandle self, double from, double to, AtenGeneratorHandle* generator, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_upsample_bicubic2d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_upsample_linear1d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_upsample_trilinear3d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_d, double* scales_h, double* scales_w, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_view_dtype(AtenTensorHandle self, int32_t dtype, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_view_as_complex(AtenTensorHandle self, AtenTensorHandle* ret0); ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_npu_view_as_real(AtenTensorHandle self, AtenTensorHandle* ret0); ++ ++ ++ ++ + // Getter functions for retrieving various constants from the runtime, that + // can subsequently be passed to other aoti_* functions. 
By hiding these + // behind functions, the precise value of device/dtype is NOT part of the +@@ -97,6 +230,7 @@ using AOTITorchError = int32_t; + AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_cpu(); + AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_cuda(); + AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_xpu(); ++AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_npu(); + AOTI_TORCH_EXPORT int32_t aoti_torch_device_type_privateuse1(); + + AOTI_TORCH_EXPORT int32_t aoti_torch_dtype_float8_e5m2(); +@@ -293,7 +427,7 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_empty_strided( + AtenTensorHandle* ret_new_tensor // returns new reference + ); + +-AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_npu( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, +@@ -305,7 +439,7 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob( + AtenTensorHandle* ret // returns new reference + ); + +-AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_v2( ++AOTI_TORCH_EXPORT AOTITorchError aoti_torch_create_tensor_from_blob_npu_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, +diff --git a/utils/_triton.py b/utils/_triton.py +index 1609a3f..4977801 100644 +--- a/utils/_triton.py ++++ b/utils/_triton.py +@@ -19,7 +19,14 @@ def has_triton_package() -> bool: + def has_triton_tma(): + if has_triton_package(): + import torch +- ++ try: ++ from triton.tools.experimental_descriptor import ( # noqa: F401 ++ create_1d_tma_descriptor, ++ create_2d_tma_descriptor, ++ ) ++ return True ++ except ImportError: ++ pass + if ( + torch.cuda.is_available() + and torch.cuda.get_device_capability() >= (9, 0) +@@ -80,6 +87,7 @@ def has_triton() -> bool: + return True + + triton_supported_devices = { ++ "npu": _return_true, + "cuda": cuda_extra_check, + "xpu": _return_true, + "cpu": cpu_extra_check, + +diff --git a/utils/cpp_extension.py b/utils/cpp_extension.py +index b4a70dc..41cd7a2 100644 +--- a/utils/cpp_extension.py ++++ b/utils/cpp_extension.py +@@ -141,6 +141,22 @@ def _find_rocm_home() -> Optional[str]: + file=sys.stderr) + return rocm_home + ++def _find_npu_home() -> Optional[str]: ++ """Find the NPU install path.""" ++ # Guess #1 ++ npu_home = os.environ.get('ASCEND_HOME_PATH') or os.environ.get('ASCEND_TOOLKIT_HOME') or os.environ.get('TOOLCHAIN_HOME') ++ if npu_home is None: ++ npu_home = '/usr/local/Ascend/ascend-toolkit/latest' ++ if not os.path.exists(npu_home): ++ npu_home = None ++ if not npu_home: ++ print(f"Warning ASCEND_HOME_PATH not found") ++ # TODO NPU runtime check ++ # if npu_home and not torch.cuda.is_available(): ++ # print(f"No CUDA runtime is found, using CUDA_HOME='{cuda_home}'", ++ # file=sys.stderr) ++ return npu_home ++ + def _find_sycl_home() -> Optional[str]: + """Find the OneAPI install path.""" + # Guess #1 +@@ -239,6 +255,7 @@ CUDA_HOME = _find_cuda_home() if torch.cuda._is_compiled() else None + CUDNN_HOME = os.environ.get('CUDNN_HOME') or os.environ.get('CUDNN_PATH') + SYCL_HOME = _find_sycl_home() if torch.xpu._is_compiled() else None + ++NPU_HOME = _find_npu_home() + # PyTorch releases have the version pattern major.minor.patch, whereas when + # PyTorch is built from source, we append the git commit hash, which gives + # it the below pattern. 
+@@ -1235,6 +1252,14 @@ def include_paths(device_type: str = "cpu") -> List[str]: + paths.append(os.path.join(CUDNN_HOME, 'include')) + elif device_type == "xpu": + paths.append(_join_sycl_home('include')) ++ elif device_type == "npu": ++ npu_home_include = _join_npu_home('x86_64-linux/include') ++ paths.append(npu_home_include) ++ npu_exp_include = _join_npu_home('x86_64-linux/include/experiment') ++ paths.append(npu_exp_include) ++ npu_home_prof_include = _join_npu_home('x86_64-linux/include/experiment/msprof') ++ paths.append(npu_home_prof_include) ++ paths.append("") + return paths + + +@@ -1281,6 +1306,10 @@ def library_paths(device_type: str = "cpu") -> List[str]: + lib_dir = 'lib' + + paths.append(_join_sycl_home(lib_dir)) ++ elif device_type == "npu": ++ npu_home_lib = _join_npu_home('lib64') ++ paths.append(npu_home_lib) ++ paths.append("/host/zcl/aoti_files") + + return paths + +@@ -2532,3 +2561,9 @@ def _is_cuda_file(path: str) -> bool: + if IS_HIP_EXTENSION: + valid_ext.append('.hip') + return os.path.splitext(path)[1] in valid_ext ++ ++def _join_npu_home(*paths) -> str: ++ if NPU_HOME is None: ++ raise OSError('ASCEND_HOME_PATH environment variable is not set. ' ++ 'Please set it to your CUDA install root. suggest using set_env.sh') ++ return os.path.join(NPU_HOME, *paths) +\ No newline at end of file \ No newline at end of file diff --git a/torch_npu/_inductor/patch/torch_npu_changes.patch b/torch_npu/_inductor/patch/torch_npu_changes.patch new file mode 100644 index 0000000000..73da7279c5 --- /dev/null +++ b/torch_npu/_inductor/patch/torch_npu_changes.patch @@ -0,0 +1,14 @@ +diff --git a/utils/_inductor.py b/utils/_inductor.py +index 9a36ddb..5a3c874 100755 +--- a/utils/_inductor.py ++++ b/utils/_inductor.py +@@ -1,5 +1,9 @@ ++import os + from torch._inductor.codegen.common import DeviceOpOverrides, register_device_op_overrides + ++_HERE = os.path.abspath(__file__) ++_TORCH_NPU_PATH = os.path.dirname(os.path.dirname(_HERE)) ++TORCH_NPU_LIB_PATH = "/host/zcl/pta_v2.6/libtorch_npu/lib" + + class NPUDeviceOpOverrides(DeviceOpOverrides): + def import_get_raw_stream_as(self, name): -- Gitee From a4c0a840d43a1bf4765aa234b63422c230085b2a Mon Sep 17 00:00:00 2001 From: kaixin Date: Wed, 30 Apr 2025 01:34:55 +0800 Subject: [PATCH 346/358] move FA pass after the register_decompse statements --- torch_npu/_inductor/__init__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index c513d215b8..f3020bea83 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -1,4 +1,3 @@ - import torch from torch._inductor.codegen.common import register_backend_for_device, register_device_op_overrides from torch._dynamo.device_interface import register_interface_for_device, get_interface_for_device @@ -9,17 +8,14 @@ from torch_npu.utils._inductor import NPUDeviceOpOverrides from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device from torch_npu.npu.utils import device_count -from .lowering import _register_npu_inductor_fallbacks, make_reduction +from .lowering import make_reduction from .decomposition import _register_npu_inductor_decompositons from .utils import get_current_raw_stream from .config import log as npulog from .config import aggresive_autotune, num_vector_core from .npu_choices import should_use_persistent_reduction from . import config as npu_config -#register fx_pass should be put behind of _register_npu_inductor_decompositons -from . 
import codegen -from . import npu_fusion_attention_graph -from . import dynamo_embedding_backward_dispatch + from .runtime import _load_cached_autotuning @@ -79,6 +75,8 @@ for i in range(16) : register_interface_for_device(f"npu:{i}", NewNpuInterface) device = get_interface_for_device("npu") +from . import codegen + inductor_lowering.make_reduction = make_reduction @@ -94,6 +92,10 @@ else: _register_npu_inductor_fallbacks() _register_npu_inductor_decompositons() +#register fx_pass should be put behind of _register_npu_inductor_decompositons + +from . import npu_fusion_attention_graph +from . import dynamo_embedding_backward_dispatch def _replace_benchmark_all_configs(): from torch._inductor.triton_heuristics import CachingAutotuner -- Gitee From e77045466460e5cf5b6cbd2b04600a593a4312b1 Mon Sep 17 00:00:00 2001 From: kaixin Date: Wed, 30 Apr 2025 14:44:15 +0800 Subject: [PATCH 347/358] merged in cppwrapper implementation --- test/_inductor/run_ut.sh | 4 + torch_npu/_inductor/__init__.py | 47 +- torch_npu/_inductor/codegen/cppwrapper.py | 737 ++++++++++++++++++++++ torch_npu/_inductor/lowering.py | 2 + torch_npu/_inductor/npu_device.py | 247 ++++++++ 5 files changed, 997 insertions(+), 40 deletions(-) create mode 100644 torch_npu/_inductor/codegen/cppwrapper.py create mode 100644 torch_npu/_inductor/npu_device.py diff --git a/test/_inductor/run_ut.sh b/test/_inductor/run_ut.sh index bdbe08e8df..c68e41b8cd 100644 --- a/test/_inductor/run_ut.sh +++ b/test/_inductor/run_ut.sh @@ -11,6 +11,9 @@ mkdir -p ${WORKSPACE}TritonNpu cd ${WORKSPACE}TritonNpu git clone https://gitee.com/ascend/triton-ascend.git -b master +# clear inductor cache +rm -rf /tmp/torchinductor_* + if [ -d ${WORKSPACE}TritonNpu/triton-ascend/triton ];then rm -rf ${WORKSPACE}TritonNpu/triton-ascend/triton fi @@ -37,6 +40,7 @@ LLVM_INCLUDE_DIRS=$LLVM_SYSPATH/include \ LLVM_LIBRARY_DIR=$LLVM_SYSPATH/lib \ LLVM_SYSPATH=$LLVM_SYSPATH \ TRITON_BUILD_WITH_CLANG_LLD=true \ +TRITON_BUILD_PROTON=OFF \ pip install -e ${WORKSPACE}TritonNpu/triton-ascend/triton/python --no-build-isolation -vvv pip list diff --git a/torch_npu/_inductor/__init__.py b/torch_npu/_inductor/__init__.py index f3020bea83..fc3e21dc93 100644 --- a/torch_npu/_inductor/__init__.py +++ b/torch_npu/_inductor/__init__.py @@ -3,6 +3,7 @@ from torch._inductor.codegen.common import register_backend_for_device, register from torch._dynamo.device_interface import register_interface_for_device, get_interface_for_device from torch._inductor import lowering as inductor_lowering from torch._inductor.choices import InductorChoices +from .npu_device import NewNPUDeviceOpOverrides, NewNpuInterface from torch._inductor.runtime import autotune_cache from torch_npu.utils._inductor import NPUDeviceOpOverrides from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device @@ -18,58 +19,24 @@ from . 
import config as npu_config from .runtime import _load_cached_autotuning - npulog.info("perform torch_npu._inductor patch") - +import torch +from torch_npu.utils._inductor import NPUDeviceOpOverrides +from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device +from torch_npu.npu.utils import device_count def _inductor_register_backend_for_device(): from .codegen.schduling import NPUTritonScheduling from .codegen.wrapper import NPUWrapperCodeGen - register_backend_for_device('npu', NPUTritonScheduling, NPUWrapperCodeGen) + from .codegen.cppwrapper import CppWrapperNpu + register_backend_for_device('npu', NPUTritonScheduling, NPUWrapperCodeGen, CppWrapperNpu) _inductor_register_backend_for_device() - -## Override original inductor device overrides in torch_npu -class NewNPUDeviceOpOverrides(NPUDeviceOpOverrides): - def import_get_raw_stream_as(self, name): - return f"from torch_npu._inductor import get_current_raw_stream as {name}" - - - def _inductor_register_device_op_overrides(): register_device_op_overrides('npu', NewNPUDeviceOpOverrides()) _inductor_register_device_op_overrides() - - -## Override original dynamo device interface in torch_npu -class NewNpuInterface(NpuInterface): - - @staticmethod - def is_available() -> bool: - return device_count() > 0 - - @staticmethod - def get_compute_capability(mydevice=None): - # npu has no concept of cc. triton-npu compiler depends on subarch instead - return torch.npu.get_device_name(mydevice) - - @staticmethod - def exchange_device(device_id: int) -> int: - curr_device = current_device() - set_device(device_id) - return curr_device - - @staticmethod - def maybe_exchange_device(device_id: int) -> int: - return device_id - - @staticmethod - def is_bf16_supported(including_emulation: bool = False): - return True - - register_interface_for_device("npu", NewNpuInterface) for i in range(16) : register_interface_for_device(f"npu:{i}", NewNpuInterface) diff --git a/torch_npu/_inductor/codegen/cppwrapper.py b/torch_npu/_inductor/codegen/cppwrapper.py new file mode 100644 index 0000000000..ffa695c910 --- /dev/null +++ b/torch_npu/_inductor/codegen/cppwrapper.py @@ -0,0 +1,737 @@ +# mypy: allow-untyped-defs +import functools +import os +from itertools import chain, count, zip_longest +from typing import Any, Callable, List, Optional, Tuple, TYPE_CHECKING, Union + +import sympy +import torch +from torch import dtype as torch_dtype +from torch._inductor.codecache import get_cpp_wrapper_cubin_path_name +from torch._inductor.runtime.runtime_utils import dynamo_timed +from torch._inductor.runtime.triton_heuristics import grid as default_grid_fn + +from torch._inductor import config +from torch._inductor.codecache import CudaKernelParamCache +from torch._inductor.ir import IRNode, TensorBox +from torch._inductor.utils import DeferredLineBase +from torch._inductor.virtualized import V +from torch._inductor.codegen.aoti_hipify_utils import maybe_hipify_code_wrapper +from torch._inductor.codegen.common import get_device_op_overrides +from torch._inductor.codegen.cpp_utils import cexpr, DTYPE_TO_CPP +from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu +from torch._inductor.codegen.multi_kernel import MultiKernelCall +from torch._inductor.codegen.wrapper import PythonWrapperCodegen, SymbolicCallArg + +from ..config import npu_block as NPU_ALIGN_BYTES + +if TYPE_CHECKING: + from torch._inductor.graph import GraphLowering + +def checkIfTrue(value, msg): + if not value : + raise RuntimeError(msg) + return True + +class 
DeferredNpuKernelLine(DeferredLineBase): + """ + When using cpp wrapper, NPU kernel load and launch needs to wait for Triton kernels + to be tuned and stored as cubin files, so use a deferred line to backfill those information + """ + + def __init__( + self, + kernel_name: str, + line_template: str, + keys: Tuple[str, ...], + additional_files: List[str], + ): + super().__init__(line_template) + checkIfTrue(not isinstance(line_template, DeferredLineBase), "line template can not be DeferredLineBase") + self.additional_files = additional_files + self.kernel_name = kernel_name + self.line_template = line_template + self.keys = keys + + def __call__(self): + if self.kernel_name.startswith("multi_kernel_"): + # MultiKernel will select one kernel after running the autotune block + self.kernel_name = MultiKernelCall.lookup_choice(self.kernel_name) + params = CudaKernelParamCache.get(self.kernel_name) + checkIfTrue(params is not None, f"{self.kernel_name} not found in CudaKernelParamCache") + + for key in self.keys: + checkIfTrue(key in params, f"{key} not found in CudaKernelParamCache[{self.kernel_name}]") + + if key == get_cpp_wrapper_cubin_path_name(): + checkIfTrue(os.path.exists(params[key]), f"{params[key]} does not exist") + self.additional_files.append(params[key]) + + return self.line_template % tuple(params[key] for key in self.keys) + + def _new_line(self, line): + return DeferredNpuKernelLine( + self.kernel_name, line, self.keys, self.additional_files + ) + + +class DeferredNpuDefaultGrid: + """ + A container for the default grid, which may be used by DeferredNpuGridLine + """ + + def __init__( + self, + kernel_name: str, + grid, + grid_callable: Optional[Callable[..., Any]] = None, + **grid_extra_kwargs, + ): + self.kernel_name = kernel_name + self.grid = grid + self.grid_callable = grid_callable + self.grid_extra_kwargs = grid_extra_kwargs + + def __iter__(self): + # DeferredNpuDefaultGrid can be passed to the base class, PythonWrapperCodegen, + # to generate the autotune code block, and thus we need this iterator + return iter(self.grid) + + def _process_grid(self, grid: Union[List[Any], Tuple[Any, ...]]): + if isinstance(grid, (list, tuple)): + return [self._process_grid(e) for e in grid] + else: + return grid.inner_expr if isinstance(grid, SymbolicCallArg) else grid + + def __call__(self): + if self.kernel_name.startswith("multi_kernel_"): + # MultiKernel will select one kernel after running the autotune block + self.kernel_name = MultiKernelCall.lookup_choice(self.kernel_name) + + grid = self.grid + checkIfTrue(isinstance(grid, (list, tuple)), f"expected {grid=} to be a list") + + grid = self._process_grid(grid) + + checkIfTrue(self.grid_callable is not None, "grid_callable can't be None") + + if not self.grid_extra_kwargs: + grid_fn = self.grid_callable(*grid) + else: + grid_fn = self.grid_callable(*grid, **self.grid_extra_kwargs) + + params = CudaKernelParamCache.get(self.kernel_name) + checkIfTrue(params is not None, f"{self.kernel_name} not found in CudaKernelParamCache") + + return grid_fn(params["meta"]) + + +class DeferredNpuGridLine(DeferredLineBase): + """ + When using cpp wrapper, NPU kernel load and launch needs to wait for Triton kernels + to be tuned and stored as cubin files, so use a deferred line to backfill those information + """ + + def __init__( + self, + kernel_name: str, + grid_var: str, + grid, + autotune_configs, + ): + super().__init__("") + self.kernel_name = kernel_name + self.grid_var = grid_var + self.grid = grid + self.autotune_configs = 
autotune_configs + + def __call__(self): + if self.kernel_name.startswith("multi_kernel_"): + # MultiKernel will select one kernel after running the autotune block + self.kernel_name = MultiKernelCall.lookup_choice(self.kernel_name) + + params = CudaKernelParamCache.get(self.kernel_name) + + checkIfTrue(params is not None, f"{self.kernel_name} not found in CudaKernelParamCache") + + if self.autotune_configs is not None: + # This indicates the Triton kernel is a user-defined one. + grid = None + if len(self.grid) == 1: + grid = self.grid[0] + else: + for i, c in enumerate(self.autotune_configs): + if all(arg == params["meta"][key] for key, arg in c.kwargs.items()): + grid = self.grid[i] + break + checkIfTrue(grid is not None, "grid can not be None") + grid_args_str = ", ".join( + [cexpr(V.graph.sizevars.simplify(item)) for item in grid] + ) + else: + launch_grid = (params['grid_x'], params['grid_y'], params['grid_z']) + grid_args_str = ", ".join( + [cexpr(item) for item in launch_grid] + ) + + return f"\n Grid {self.grid_var} = Grid({grid_args_str});\n" + + def _new_line(self, line): + return DeferredNpuGridLine( + self.kernel_name, self.grid_var, self.grid, self.autotune_configs + ) + + +class CppWrapperNpu(CppWrapperCpu): + """ + Generates cpp wrapper for running on NPU and calls CUDA kernels + """ + + def __init__(self) -> None: + self.device = 'npu' + self.device_codegen = get_device_op_overrides(self.device) + super().__init__() + self.grid_id = count() + + @staticmethod + def create( + is_subgraph: bool, subgraph_name: str, parent_wrapper: PythonWrapperCodegen + ): + # TODO - support subgraph codegen by lifting functions. Check the + # comment at CppWrapperCpu `codegen_subgraph` function. + return CppWrapperNpu() + + def write_header(self): + if V.graph.is_const_graph: + # We do not write header for constant graph, it will be written by main module. + return + + super().write_header() + self.header.splice("#include ") + self.header.splice("#include ") + self.header.splice(self.device_codegen.abi_compatible_header()) + self.header.splice( + maybe_hipify_code_wrapper(self.device_codegen.kernel_driver()) + ) + self.header.splice("#include ") + self.header.splice("#include \"experiment/runtime/runtime/rt.h\"") + + def write_get_raw_stream(self, device_idx: int, graph=None) -> str: + name = f"stream{device_idx}" + self.writeline( + maybe_hipify_code_wrapper( + f"{self.device_codegen.cpp_stream_type()} {name};" + ) + ) + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK({self.device_codegen.aoti_get_stream()}({device_idx}, (void**)&{name}));" + ) + return name + + def codegen_inputs(self): + # See Note: [Input Alignment handling in Inductor] + # + # JIT Inductor does not guard on input alignment. It relies on copy_misaligned_inputs to + # copy misaligned inputs to aligned buffers. For AOTInductor, we expect users to use it + # as non-Python deployment for its best performance, so implicitly copying misaligned inputs + # to aligned buffers is going to bring a surprising performance hit. Instead, we check input + # alignment and throw an error if any input is misaligned. 
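The guard generated just below relies on NPU_ALIGN_BYTES (imported as npu_block above) being a power of two, so masking the data pointer with NPU_ALIGN_BYTES - 1 exposes any misaligned low bits. A small standalone sketch of the same test follows; the concrete alignment value and the example tensor are illustrative only.

    import torch

    ALIGN = 512  # assumption: stands in for the npu_block value; any power of two works

    def is_aligned(t: torch.Tensor, align: int = ALIGN) -> bool:
        # Same predicate as the emitted C++ check, with the condition inverted:
        # (data_ptr & (align - 1)) == 0  <=>  the address is a multiple of `align`.
        return (t.data_ptr() & (align - 1)) == 0

    x = torch.empty(1024)
    print(is_aligned(x))  # True only if the allocator returned a suitably aligned base address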
+ if V.graph.aot_mode and V.graph.inputs_to_check: + for idx in V.graph.inputs_to_check: + input_name = V.graph.graph_input_names[idx] + checkIfTrue(input_name in V.graph.graph_inputs, f"{input_name} not found in graph inputs") + + value = V.graph.graph_inputs[input_name] + checkIfTrue(isinstance(value, TensorBox), f"{input_name} is expected to be tensor but found as {type(value)}") + + self.prefix.splice( + f""" + if ((long({input_name}.data_ptr()) & ({NPU_ALIGN_BYTES} -1)) != 0) {{ + throw std::runtime_error("{input_name} is not aligned to {NPU_ALIGN_BYTES} bytes"); + }} + """ + ) + + super().codegen_inputs() + + def define_kernel( + self, + kernel_name: str, + kernel_body: str, + metadata: Optional[str] = None, + gpu=True, + ): + if gpu: + if config.triton.autotune_at_compile_time: + # Call PythonWrapperCodegen to create the autotune code block + PythonWrapperCodegen.define_kernel( + self, kernel_name, kernel_body, metadata, gpu + ) + else: + return CppWrapperCpu.define_kernel( + self, kernel_name, kernel_body, metadata, gpu + ) + + def generate(self, is_inference): + with dynamo_timed("CppWrapperNpu.generate", log_pt2_compile_event=True): + self.prefix.writeline("\n") + if not V.graph.aot_mode: + for kernel in chain( + sorted(self.src_to_kernel.values()), + sorted( + [entry[0] for entry in self.user_defined_kernel_cache.values()] + ), + ): + self.prefix.writeline( + maybe_hipify_code_wrapper( + f"static {self.device_codegen.cpp_kernel_type()} {kernel} = nullptr;" + ) + ) + self.prefix.writeline("\n") + return super().generate(is_inference) + + def generate_user_defined_triton_kernel( + self, + kernel_name: str, + raw_args: List[Any], + grid: List[Any], + configs, + triton_meta, + constexprs, + ): + if ( + config.triton.autotune_at_compile_time + and kernel_name not in self.kernel_autotune_names + ): + # Call PythonWrapperCodegen to create the autotune code block + PythonWrapperCodegen.generate_user_defined_triton_kernel( + self, + kernel_name, + raw_args, + grid, + configs, + triton_meta, + constexprs, + ) + + # in C++ wrapper, we don't pass constexpr args, as they don't + # get added as parameters to the PTX code compiled from the + # user-defined Triton kernel (only non-constexpr args do) + raw_args = [ + raw_arg for i, raw_arg in enumerate(raw_args) if i not in constexprs + ] + args = [self.val_to_arg_str(v) for v in raw_args] + arg_types = [ + arg.get_dtype() if isinstance(arg, IRNode) else type(arg) + for arg in raw_args + ] + + # Call self.generate_kernel_call to generate the real kernel call in cpp + self.generate_kernel_call( + kernel_name, + args, + arg_types=arg_types, + raw_args=raw_args, + grid=grid, + gpu=True, + triton=True, + triton_meta=triton_meta, + autotune_configs=configs, + ) + + + @functools.lru_cache(None) # noqa: B019 + def generate_load_kernel_once( + self, + kernel_name: str, + device_index, + graph: "GraphLowering", # for per-graph caching + ): + """ + typedef struct { + const char *name; //mangled_name + const char *kernelPath; //get_cpp_wrapper_cubin_path_name() + int shared; // shared_mem + int device; // device_index + } LoadKernelArgs; + """ + + # keys = ("mangled_name", get_cpp_wrapper_cubin_path_name() , "shared_mem") + keys = (get_cpp_wrapper_cubin_path_name(), "mangled_name", "shared_mem") + kernel_var_name = f"kernels.{kernel_name}" if V.graph.aot_mode else kernel_name + self.writeline(f"if ({kernel_var_name} == nullptr) {{") + deferred_gpu_kernel_line = DeferredNpuKernelLine( + kernel_name, + # " " + kernel_var_name + r' = loadKernel("%s", "%s", 
%s, {});'.format( + # device_index + # ), + " " + kernel_var_name + r' = loadKernel("%s", "%s", %s);', + keys, + self.additional_files, + ) + self.writeline(deferred_gpu_kernel_line) + self.writeline("}") + return kernel_var_name + + def codegen_tensor_item_npu( + self, dtype: torch.dtype, tensor: str, scalar: str, indented_buffer=None + ): + dtype_str = str(dtype).split(".")[-1] + writer = indented_buffer or self + + if dtype == torch.float16 or dtype == torch.bfloat16: + scalar_tmp = f"{scalar}_tmp" + writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar_tmp};") + writer.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar_tmp}));" + ) + writer.writeline(f"float {scalar} = float({scalar_tmp});") + struct_data = f'float {scalar} __attribute__((aligned(4)));' + arg_data = f'static_cast({scalar})' + else: + writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar};") + writer.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar}));" + ) + struct_data = f'{DTYPE_TO_CPP[dtype]} {scalar} __attribute__((aligned(sizeof({DTYPE_TO_CPP[dtype]} ))));' + arg_data = f'static_cast<{DTYPE_TO_CPP[dtype]}>({scalar})' + + return struct_data, arg_data + + def generate_args_decl(self, call_args, arg_types, arg_signatures, kernel_id, grid_var): + new_args: list[str] = [] + + # Add more cases for other types as needed + signature2dtype = { + "i1": "int32_t", + "i8": "int8_t", + "i16": "int16_t", + "i32": "int32_t", + "i64": "int64_t", + "u32": "uint32_t", + "u64": "uint64_t", + "fp16": "float", + "bf16": "float", + "fp32": "float", + "f32": "float", + "fp64": "double", + } + + kernel_args_var = f"kernel_args_var_{kernel_id}" + + rtError_t_str = f'ret_{kernel_id}' + ffts_addr_str = f'ffts_addr_{kernel_id}' + ffts_len_str = f'ffts_len_{kernel_id}' + workspace_addr_str = f'workspace_addr_{kernel_id}' + + before_strucr_str =\ + f"\n rtError_t {rtError_t_str};\n" +\ + f" void* {ffts_addr_str} = NULL;\n" +\ + f" uint32_t {ffts_len_str};\n" +\ + f" {rtError_t_str} = rtGetC2cCtrlAddr((uint64_t*)&{ffts_addr_str}, &{ffts_len_str});\n" +\ + f" if ({rtError_t_str} != RT_ERROR_NONE) return;\n" +\ + f" void* {workspace_addr_str} = NULL;\n\n" + + struct_def_head = f' struct __attribute__((packed)) {{\n void* ffts_addr __attribute__((aligned(8)));\n void* workspace_addr __attribute__((aligned(8)));\n' + struct_def_end = f'\n int32_t gridX __attribute__((aligned(4))); int32_t gridY __attribute__((aligned(4))); int32_t gridZ __attribute__((aligned(4)));\n }}' + + struct_arg_head = f' {kernel_args_var} = {{\n static_cast({ffts_addr_str}),\n static_cast({workspace_addr_str}),\n' + struct_arg_end = f'\n static_cast({grid_var}.grid_x), static_cast({grid_var}.grid_y), static_cast({grid_var}.grid_z)\n }};\n' + + struct_def_body = ' ' + struct_arg_body = ' ' + + def process_args(arg, arg_type, arg_signature=None): + var_name = f"var_{next(self.arg_var_id)}" + # ignore nvTmaDesc, as host-side TMA descriptors need + # to be passed to the compiled Triton kernel by value + if isinstance(arg_type, torch_dtype) and arg_signature != "nvTmaDesc": + if arg.endswith(".item()"): # scalar + # Need to declare a scalar in this case + arg = arg[:-7] + # TODO: override to return dtype + struct_data, arg_data = self.codegen_tensor_item_npu( + arg_type, + arg, + var_name, + ) + else: + # TODO: void* + device_ptr_type = self.device_codegen.cpp_device_ptr() + self.writeline( + maybe_hipify_code_wrapper( + f"{device_ptr_type} {var_name} = reinterpret_cast<{device_ptr_type}>({arg}.data_ptr());" + 
) + ) + struct_data = f'void* {var_name} __attribute__((aligned(8)));' + arg_data = f'static_cast({var_name})' + + elif arg_type in (sympy.Integer, int): + # TODO: int + self.writeline(f"int {var_name} = {cexpr(arg)};") + struct_data = f'int {var_name} __attribute__((aligned(4)));' + arg_data = f'static_cast({var_name})' + + elif arg_type in (sympy.Float, float): + # TODO: float + self.writeline(f"float {var_name} = {cexpr(arg)};") + struct_data = f'float {var_name} __attribute__((aligned(4)));' + arg_data = f'static_cast({var_name})' + + # For symbolic call arguments, examine the arg signatures from triton meta + # to explicitly cast to the right type + # Reason: `auto` can infer unexpected type against kernel input signature. + elif ( + isinstance(arg_type, type(SymbolicCallArg)) + and arg_signature is not None + and arg_signature in signature2dtype.keys() + ): + # TODO: * or scalar symbolic type,currently only support scalar symbolic type + self.writeline( + f"{signature2dtype[arg_signature]} {var_name} = {cexpr(arg)};" + ) + struct_data = f'{signature2dtype[arg_signature]} {var_name} __attribute__((aligned(sizeof({signature2dtype[arg_signature]}))));' + arg_data = f'static_cast<{signature2dtype[arg_signature]}>({var_name})' + else: + raise TypeError("Infer arg_type to cpp failed!") + # self.writeline(f"auto {var_name} = {cexpr(arg)};") + + nonlocal struct_def_body + nonlocal struct_arg_body + struct_def_body += struct_data + ' ' + struct_arg_body += arg_data + ', ' + + for arg, arg_type, arg_signature in zip_longest( + call_args, arg_types, arg_signatures + ): + process_args(arg, arg_type, arg_signature) + + return kernel_args_var, before_strucr_str +\ + struct_def_head + struct_def_body + struct_def_end +\ + struct_arg_head + struct_arg_body + struct_arg_end + + def generate_default_grid( + self, + kernel_name: str, + grid_args: List[Any], + gpu: bool = True, + grid_callable: Optional[Callable[..., Any]] = default_grid_fn, + **grid_extra_kwargs, + ): + """ + Generate grid configs for launching a CUDA kernel using the grid + function from triton_heuristics. Because its computation needs + to read kernel config after autotune, it is done in a deferred way + using DeferredNpuDefaultGrid. 
+ """ + checkIfTrue(gpu, "CppWrapperNpu.generate_default_grid does not support non-NPU") + return DeferredNpuDefaultGrid( + kernel_name, grid_args, grid_callable, **grid_extra_kwargs + ) + + def generate_kernel_call_npu( + self, + kernel_name: str, + call_args, + grid=None, + device_index=None, + npu=True, + triton=True, + arg_types=None, + raw_args=None, + grid_fn: str = "grid", + triton_meta=None, + autotune_configs=None, + grid_extra_kwargs="", + ): + if ( + config.triton.autotune_at_compile_time + and kernel_name not in self.kernel_autotune_names + ): + # Call PythonWrapperCodegen to create the autotune code block + PythonWrapperCodegen.generate_kernel_call( + self, + kernel_name, + call_args, + grid, + device_index, + npu, + triton, + arg_types, + raw_args, + grid_fn, + triton_meta, + autotune_configs, + grid_extra_kwargs, + ) + + if device_index is None: + current_device = V.graph.get_current_device_or_throw() + device_index = current_device.index + + stream = ( + "stream" + if V.graph.aot_mode + else self.write_get_raw_stream(device_index, V.graph) + ) + + if triton: + device_index, call_args = self.prepare_triton_kernel_call( + device_index, call_args + ) + kernel_var_name = self.generate_load_kernel_once(kernel_name, device_index, V.graph) + + # args with value 1 are added into equal_to_1 and constants + # in triton_meta (in the Python codegen) which makes them + # inlined in the PTX and compiled CUBIN + arg_signatures = [] + if ( + triton_meta is not None + and triton_meta.get("configs") + and triton_meta.get("signature") + ): + equal_to_1 = triton_meta["configs"][0].equal_to_1 + call_args = [ + arg for i, arg in enumerate(call_args) if i not in equal_to_1 + ] + arg_types = [t for i, t in enumerate(arg_types) if i not in equal_to_1] + # extract the arg signatures from triton_meta + arg_signatures = triton_meta["signature"].values() + arg_signatures = [ + v for i, v in enumerate(arg_signatures) if i not in equal_to_1 + ] + + current_kernel_id = next(self.kernel_callsite_id) + current_grid_id = next(self.grid_id) + + # >>>>> gen grids + grid_var = f"{kernel_name}_grid_{current_grid_id}" + self.writeline( + DeferredNpuGridLine(kernel_name, grid_var, grid, autotune_configs) + ) + # <<<<< + + # >>>>> gen kernel args + kernel_args_var, call_args_str = self.generate_args_decl( + call_args, arg_types, arg_signatures, current_kernel_id, grid_var + ) + self.writeline(f"{call_args_str}") + # <<<<< + + kernel_var_name = ( + f"kernels.{kernel_name}" if V.graph.aot_mode else kernel_name + ) + # add debug printer code for all triton kernel related calls + debug_printer_manager = V.graph.wrapper_code.debug_printer + debug_printer_manager.set_printer_args( + call_args, kernel_name, arg_types, None + ) + with debug_printer_manager: + + ''' + typedef struct { + const char* kernelName; // f"'{kernel_name}'" <- 'triton_unk_fused_sigmoid_1' + const void* func; // kernel_var_name <- kernels.triton_unk_fused_sigmoid_1 + rtStream_t stream; // stream + int gridX; // f"{grid_var}.grid_x", + int gridY; // f"{grid_var}.grid_y", + int gridZ; // f"{grid_var}.grid_z", + int *profilerRegistered; //nullptr + void *kernelArgs; // f'static_cast(&{kernel_args_var})' + int32_t kernelArgsSize; // f'sizeof({kernel_args_var})' + } LaunchKernelArgs; + ''' + + self.writeline(f"if ({grid_var}.is_non_zero()) {{") + self.writeline( + DeferredNpuKernelLine( + kernel_name, + r" launchKernel({}, {}, {}, {}, {}, {}, {}, {});".format(\ + f'"{kernel_name}"', + kernel_var_name, + stream, + f"{grid_var}.grid_x", + 
f"{grid_var}.grid_y", + f"{grid_var}.grid_z", + f"static_cast(&{kernel_args_var})", + f'sizeof({kernel_args_var})', + ), + tuple(), + self.additional_files, + ), + ) + + self.writeline("}\n") + else: + casted = [] + for arg_type, arg in zip(arg_types, call_args): + new_arg = arg + if arg_type.endswith("*") and arg != "nullptr": + new_arg = f"{arg}.data_ptr()" + casted.append(f"({arg_type}){new_arg}") + call_args_str = ", ".join(casted) + self.writeline(f"kernels.{kernel_name}({call_args_str}, {stream});") + + + def generate_kernel_call( + self, + kernel_name: str, + call_args, + grid=None, + device_index=None, + gpu=True, + triton=True, + arg_types=None, + raw_args=None, + grid_fn: str = "grid", + triton_meta=None, + autotune_configs=None, + grid_extra_kwargs="", + ): + """ + Override the default value of argument 'gpu' to True here. + generate_kernel_call can still be called with gpu=False because of + a mix of cpu kernels and gpu kernels. + """ + + """ + To fit with NPU: we write a new function 'generate_kernel_call_npu + and make a new parameter called 'npu', which always equals to 'gpu', + because 'gpu' parameter means 'not cpu' in upper logic + """ + + if not gpu: + # Even in CppWrapperNpu, we may see cpp kernels + return CppWrapperCpu.generate_kernel_call( + self, + kernel_name, + call_args, + grid, + device_index, + gpu, + triton, + arg_types, + raw_args, + grid_fn, + triton_meta, + autotune_configs, + grid_extra_kwargs, + ) + + self.generate_kernel_call_npu( + kernel_name, + call_args, + grid, + device_index, + gpu, + triton, + arg_types, + raw_args, + grid_fn, + triton_meta, + autotune_configs, + grid_extra_kwargs, + ) + + def make_zero_buffer(self, name): + return f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_zero_({name}.get()));" diff --git a/torch_npu/_inductor/lowering.py b/torch_npu/_inductor/lowering.py index caac343535..c31736ddca 100644 --- a/torch_npu/_inductor/lowering.py +++ b/torch_npu/_inductor/lowering.py @@ -89,6 +89,8 @@ def _init_set(input_list, output_set): GENERATE_LIST = [ + prims.iota, + aten.full, aten.mul, aten.add, aten.sub, diff --git a/torch_npu/_inductor/npu_device.py b/torch_npu/_inductor/npu_device.py new file mode 100644 index 0000000000..2a41cc978e --- /dev/null +++ b/torch_npu/_inductor/npu_device.py @@ -0,0 +1,247 @@ +import torch +from torch_npu.utils._inductor import NPUDeviceOpOverrides +from torch_npu.utils._dynamo_device import NpuInterface, current_device, set_device +from torch_npu.npu.utils import device_count + +## Override original inductor device overrides in torch_npu +class NewNPUDeviceOpOverrides(NPUDeviceOpOverrides): + def import_get_raw_stream_as(self, name): + return f"from torch_npu._inductor import get_current_raw_stream as {name}" + + def set_device(self, device_idx): + return f"torch.npu.set_device({device_idx})" + + def synchronize(self): + return """ + stream = torch.npu.current_stream() + stream.synchronize() + """ + + def device_guard(self, device_idx): + return f"torch.npu._DeviceGuard({device_idx})" + + def cpp_aoti_device_guard(self): + raise NotImplementedError + + def cpp_aoti_stream_guard(self): + return "AOTICudaStreamGuard" + + def kernel_driver(self): + source_codes = """ + static std::unordered_map registered_names; + static std::unordered_map> func_stubs; + + namespace { + + struct Grid { + Grid(uint32_t x, uint32_t y, uint32_t z) + : grid_x(x), grid_y(y), grid_z(z) {} + uint32_t grid_x; + uint32_t grid_y; + uint32_t grid_z; + + bool is_non_zero() { + return grid_x > 0 && grid_y > 0 && grid_z > 0; + } + }; + + } // 
anonymous namespace + + extern "C" { + typedef int (* callback)(unsigned int type, void* data, unsigned int len); + extern int MsprofReportApi(unsigned int agingFlag, const MsprofApi *api); + extern unsigned long int MsprofSysCycleTime(); + extern int MsprofRegisterCallback(unsigned int moduleId, callback handle); + static unsigned int __MsprofFlagL0 = 0; + static unsigned int __MsprofFlagL1 = 0; + + int ProfCtrlHandle(unsigned int CtrlType, void* CtrlData, unsigned int DataLen) { + if ((CtrlData == nullptr) || (DataLen == 0U)) { + return 1; + } + + if (CtrlType == 1) { + MsprofCommandHandle* handle = (MsprofCommandHandle *)(CtrlData); + if (handle->type >= 6) // 6 is not used here + return 1; + if (handle->type == 1) { // init - 0 , start - 1 + __MsprofFlagL0 = ((0x00000800ULL & handle->profSwitch) == 0x00000800ULL) ? 1 : 0; + __MsprofFlagL1 = ((0x00000002ULL & handle->profSwitch) == 0x00000002ULL) ? 1 : 0; + } + } + return 0; + } + } + + std::vector stringSplit(const std::string& s) { + std::vector tokens; + std::istringstream iss(s); + std::string token; + while (iss >> token) { + tokens.push_back(token); + } + return tokens; + } + + static inline void * loadKernel( + std::string filePath, + const std::string &nameFuncMode, + uint32_t sharedMemBytes, + const std::optional &cubinDir = std::nullopt) { + if (cubinDir) { + std::filesystem::path p1{*cubinDir}; + std::filesystem::path p2{filePath}; + filePath = (p1 / p2.filename()).string(); + } + auto splitNameMode = stringSplit(nameFuncMode); + if (splitNameMode.size() != 2) { + throw std::runtime_error(std::string("funcName not right: ") + nameFuncMode); + } + auto funcName = splitNameMode[0]; + auto kernel_mode_str = splitNameMode[1]; + std::ifstream file(std::string(filePath), std::ios::binary | std::ios::ate); + if (!file.is_open()) { + throw std::runtime_error(std::string("open npubin failed")); + } + + std::streamsize data_size = file.tellg(); + + file.seekg(0, std::ios::beg); + char* buffer = new char[data_size]; + if (!file.read(buffer, data_size)) { + throw std::runtime_error(std::string("read npubin failed")); + } + + rtError_t rtRet; + + rtDevBinary_t devbin; + devbin.data = buffer; + devbin.length = data_size; + const std::string kernel_mode{kernel_mode_str}; + if (kernel_mode == "aiv") + devbin.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC; + else + devbin.magic = RT_DEV_BINARY_MAGIC_ELF; + devbin.version = 0; + + int device = 0; + rtRet = rtSetDevice(device); + if (rtRet != RT_ERROR_NONE) { + throw std::runtime_error(std::string("rtSetDevice failed, 0x") + std::to_string(rtRet)); + } + + void *devbinHandle = NULL; + rtRet = rtDevBinaryRegister(&devbin, &devbinHandle); + if (rtRet != RT_ERROR_NONE) { + throw std::runtime_error(std::string("rtDevBinaryRegister failed, 0x") + std::to_string(rtRet)); + } + + const char* name = funcName.c_str(); + + std::string stubName(name); + stubName += "_" + std::to_string(registered_names[name]); + registered_names[name]++; + auto registered = func_stubs.emplace(stubName, std::make_unique(0)); + void *func_stub_handle = registered.first->second.get(); + rtRet = rtFunctionRegister(devbinHandle, func_stub_handle, stubName.c_str(), + (void *)name, 0); + if (rtRet != RT_ERROR_NONE) { + throw std::runtime_error(std::string("rtFunctionRegister failed, stubName = ") + stubName + + std::string(" , 0x") + std::to_string(rtRet)); + } + + return func_stub_handle; + } + + static void launchKernel(std::string kernelName, const void* func, rtStream_t stream, int gridX, int gridY, int gridZ, void *kernelArgs, 
int32_t kernelArgsSize) {{ + std::string name = ""; + name.append(kernelName); + char *kargs = new char[kernelArgsSize]; + memcpy(kargs, kernelArgs, kernelArgsSize); + auto launch_call = [=]() {{ + uint32_t blockNum = gridX * gridY * gridZ; + + rtError_t ret; + unsigned long int beginTime = 0; + unsigned long int endTime = 0; + unsigned long int opName = 0; + unsigned int threadId = 0; + const char* kernelNameC = kernelName.c_str(); + size_t length = name.length(); + {{ + beginTime = MsprofSysCycleTime(); + }} + ret = rtKernelLaunch(func, blockNum, kargs, kernelArgsSize, NULL, stream); + delete[] kargs; + return ret; + }}; + at_npu::native::OpCommand cmd; + cmd.Name(name.c_str()) + .SetCustomHandler(launch_call) + .Run(); + }} + """ + return source_codes + + def abi_compatible_header(self): + return """ + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + #include + #include "experiment/runtime/runtime/rt.h" + """ + + def cpp_stream_type(self): + return "cudaStream_t" + + def aoti_get_stream(self): + return "aoti_torch_get_current_cuda_stream" + + def cpp_kernel_type(self): + return "void *" + + def cpp_device_ptr(self): + return "void*" + +## Override original dynamo device interface in torch_npu +class NewNpuInterface(NpuInterface): + + @staticmethod + def is_available() -> bool: + return device_count() > 0 + + @staticmethod + def get_compute_capability(device=None): + # npu has no concept of cc. triton-npu compiler depends on subarch instead + return torch.npu.get_device_name(device) + + @staticmethod + def exchange_device(device: int) -> int: + curr_device = current_device() + set_device(device) + return curr_device + + @staticmethod + def maybe_exchange_device(device: int) -> int: + return device + + @staticmethod + def is_bf16_supported(including_emulation: bool = False): + return True + + # @staticmethod + # def get_device_properties(device=None): + # props = NpuInterface.get_device_properties(device) + # setattr(props, "multi_processor_count", num_vector_core ) + # return props \ No newline at end of file -- Gitee From e8bfc6313ed3936c779d4cd1af1d5f0fcefdb557 Mon Sep 17 00:00:00 2001 From: wl1259 Date: Wed, 7 May 2025 19:45:38 +0800 Subject: [PATCH 348/358] fix inter_core repeat compute --- torch_npu/_inductor/codegen/triton.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index a25421917a..372409a6b8 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -157,8 +157,10 @@ class IterationRangesEntryNPUIndex(IterationRangesEntry): self.codegen = self._codegen # axis mask def _codegen_mask(self): + if self.is_tiling_axis : - upper = f"{self.name}_numel" + BLOCK_NAME = f"{self.name.upper()}BLOCK" + upper = f"min({BLOCK_NAME}+{self.symbol()}_offset, {self.name}_numel)" if self.is_split_axis else f"{self.name}_numel" line = f"{self.name}_mask = {self.name} < {upper}" self.writeline(line) for var in self.var_directions.keys(): -- Gitee From f3c0173e42dad357c5b90a7f57f2dbc8c0ce0b4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Sat, 10 May 2025 08:35:47 +0000 Subject: [PATCH 349/358] move modify 'fix iota bugs' from inductor-ascend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- torch_npu/_inductor/codegen/triton.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
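The triton.py hunk just below widens NPUIndexTritonKernel's "empty body" test from loads-only to loads-and-stores. Graphs built from value-producing ops such as prims.iota or aten.full (enabled for lowering earlier in this series) emit stores without any loads, so a loads-only check mistakes a real kernel body for an empty one. A minimal, hedged repro sketch; the "npu" device string and the backend registration via torch_npu._inductor are assumptions of this sketch, not part of the patch:

    # torch.arange typically decomposes to prims.iota: the generated kernel body
    # has stores but no loads, which a loads-only check would treat as empty.
    import torch
    import torch_npu
    import torch_npu._inductor  # assumed to register the NPU Inductor backend

    @torch.compile(backend="inductor")
    def iota_like(n: int):
        return torch.arange(n, device="npu", dtype=torch.float32) * 2.0

    print(iota_like(128))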
diff --git a/torch_npu/_inductor/codegen/triton.py b/torch_npu/_inductor/codegen/triton.py index 372409a6b8..dd86d9f723 100644 --- a/torch_npu/_inductor/codegen/triton.py +++ b/torch_npu/_inductor/codegen/triton.py @@ -837,7 +837,7 @@ class NPUIndexTritonKernel(TritonKernel): reduction_1d = is_1d_reduction() do_indent = False # do nothing except for writing porintwise - if len(self.loads._lines) == 0: + if len(self.loads._lines) == 0 and len(self.stores._lines) == 0: do_indent = False indexing_code = None #loop_body(index, indexing_code, is_last_axis, do_indent = do_indent) @@ -859,7 +859,7 @@ class NPUIndexTritonKernel(TritonKernel): # tiling axis and but not last tiling elif range.is_tiling_axis : do_indent = False - if len(self.loads._lines) == 0: + if len(self.loads._lines) == 0 and len(self.stores._lines) == 0: do_indent = False indexing_code = None if self.numof_reduction_axis() <= 1 : -- Gitee From 5f41ab60174ea940b012b6dff094d514258185d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Sun, 11 May 2025 07:26:27 +0000 Subject: [PATCH 350/358] update torch_npu/_inductor/patch/ascend_aot_package.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- .../_inductor/patch/ascend_aot_package.py | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/torch_npu/_inductor/patch/ascend_aot_package.py b/torch_npu/_inductor/patch/ascend_aot_package.py index d55d698388..88955c917a 100644 --- a/torch_npu/_inductor/patch/ascend_aot_package.py +++ b/torch_npu/_inductor/patch/ascend_aot_package.py @@ -4,7 +4,7 @@ import torch_npu import os import re import sys -import inductor_npu +import torch_npu._inductor from torch_npu.contrib import transfer_to_npu import torch._inductor.package as pkg @@ -36,7 +36,7 @@ from xpu_graph.passes.patterns.targets.npu.triton_kernel.fused_brc_permute_sum i from xpu_graph.passes.patterns.targets.npu.triton_kernel.fused_div_mul_sum import fused_div_mul_sum import os -import inductor_npu +import torch_npu._inductor from torch_npu.contrib import transfer_to_npu\n\n """ return header + modified_code @@ -124,7 +124,7 @@ class OpCodeGenerator(ABC): class LibraryOpGenerator(OpCodeGenerator): def generate(self, node_id, opname, arglist, outbuf): - NewLines = [] + NewLines = [" // PATCHED_CODE :"] # 生成输入Tensor变量声明 new_input_argnames = [] for i, arg in enumerate(arglist): @@ -237,6 +237,7 @@ class CodeManager: OP_REGISTRY = { "aten::addmm": {"revertlines": 2, "generator": LibraryOpGenerator()}, "aten::gather": {"revertlines": 2, "generator": LibraryOpGenerator()}, + "aten::gelu": {"revertlines": 2, "generator": LibraryOpGenerator()}, "torch_npu_triton::fused_div_mul_sum": { "revertlines": 2, "generator": CustomOpGenerator( @@ -360,7 +361,7 @@ class CodeManager: revertlines = CodeManager.OP_REGISTRY[target]["revertlines"] generator = CodeManager.OP_REGISTRY[target]["generator"] self.pop_lines(revertlines) - + print(f"[DEBUG] Trigger patch at proxy node {node_id}, opname = {target}") new_lines = generator.generate(node_id, target.split("::")[-1], arglist, outbuf) self.append_lines(new_lines) @@ -467,6 +468,7 @@ class AOTIPkgManager: return compile_list, link_list def recompile(self, compile_cmd, link_cmd): + try: subprocess.run(compile_cmd, check=True) except Exception as e: @@ -511,7 +513,18 @@ class AOTIPkgManager: if __name__ == "__main__": + # aoti_manager = AOTIPkgManager( + # pt2_path = "/host/zcl/workspace/aoti_modules/tutorial/model_0510.pt2", + # 
weight_path = "/host/aoti_weights/weight.o", + # new_name_prefix = f"tutorial_patched" + # ) + + # aoti_manager.make_new_pt2() + + # exit() + batch_size_list = [1, 2, 4, 8, 12, 16, 20, 24, 28, 32] + # batch_size_list = [12] for batch_size in batch_size_list: decorated_path = process_and_run_model( f"/host/zcl/deberta_pkgs/fx_graph_readable_{batch_size}.py", @@ -530,4 +543,4 @@ if __name__ == "__main__": f"/host/aoti_weights/args_map_{batch_size}.json" ) - print("[INFO] ---------- DONE BATCH {batch_size} ----------") \ No newline at end of file + print(f"[INFO] ---------- DONE BATCH {batch_size} ----------") \ No newline at end of file -- Gitee From e304de330fb6537da483a5251ffb74d8ed6ad7fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Sun, 11 May 2025 07:28:21 +0000 Subject: [PATCH 351/358] update torch_npu/_inductor/patch/_runner.cpp. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- torch_npu/_inductor/patch/_runner.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/torch_npu/_inductor/patch/_runner.cpp b/torch_npu/_inductor/patch/_runner.cpp index adab65e162..73548b65c4 100644 --- a/torch_npu/_inductor/patch/_runner.cpp +++ b/torch_npu/_inductor/patch/_runner.cpp @@ -12,6 +12,7 @@ #include #include #include +#include void removeWhitespace(std::string& str) { std::string result; @@ -55,6 +56,7 @@ void loadDebertaWeights(std::vector &inputs, std::string weightAr if(inputs.size()==num)return; inputs.reserve(num); torch::jit::script::Module weightTensors = torch::jit::load(weightArgPath); + //FIXME for(int i=0;i runDebertaModelInference( const std::string pt2Path = paths.at("pt2Path"); const std::string weightArgPath = paths.at("weightArgPath"); const std::string argMapPath = paths.at("argMapPath"); + + std::cerr<<"pt path : "< inputs; - loadDebertaWeights(inputs, weightArgPath, extractValue(json_str, "input_arg_length")); + loadDebertaWeights(inputs, weightArgPath, extractValue(json_str, "input_arg_length")); + //FIXME inputs[extractValue(json_str, "input_ids")] = userInputs.at("input_ids"); inputs[extractValue(json_str, "segment_ids")] = userInputs.at("segment_ids"); inputs[extractValue(json_str, "input_mask")] = userInputs.at("input_mask"); + + torch::inductor::AOTIModelPackageLoader loader(pt2Path); torch::inductor::AOTIModelContainerRunner* runner = loader.get_runner(); std::vector outputs = runner->run(inputs); + return outputs; } @@ -105,7 +115,7 @@ int main() { // Status QSEngineInterfaceInherit::Infer(QSIOTensor& in, QSIOTensor& out, void* stream) //QSIOTensor -> userInputs - torch::jit::script::Module tensors = torch::jit::load("/host/zcl/deberta_aoti/deberta_inputs.pth"); + torch::jit::script::Module tensors = torch::jit::load("/host/deberta_files/inputs/deberta_inputs_32.pth"); // tensors.to(at::kPrivateUse1); torch::Tensor input_ids = tensors.attr("input_ids").toTensor().to(at::kPrivateUse1); -- Gitee From f934fba3c5b44bb825c96b8fe04eed31dc3b1cfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Sun, 11 May 2025 07:29:21 +0000 Subject: [PATCH 352/358] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=5Frunner=E7=9A=84CMa?= =?UTF-8?q?keList?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- torch_npu/_inductor/patch/CMakeLists.txt | 36 ++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 torch_npu/_inductor/patch/CMakeLists.txt diff --git 
a/torch_npu/_inductor/patch/CMakeLists.txt b/torch_npu/_inductor/patch/CMakeLists.txt new file mode 100644 index 0000000000..a0de479ff2 --- /dev/null +++ b/torch_npu/_inductor/patch/CMakeLists.txt @@ -0,0 +1,36 @@ +# CMAKE_PREFIX_PATH=/home/wangmingfa/miniconda3/envs/wz_torch260/lib/python3.9/site-packages/torch/share/cmake cmake -DCMAKE_BUILD_TYPE=Debug .. +# make +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +project(deberta_aoti) + +find_package(Torch REQUIRED) +# npu +include_directories("/usr/local/lib/python3.11/dist-packages/torch_npu/include") +include_directories("/host/zcl/pta_v2.6/libtorch_npu/include") +include_directories("/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/include") +# include_directories("/usr/local/Ascend/T115/ascend-toolkit/latest/x86_64-linux/include") + +link_directories("/host/zcl/aoti_files") +link_directories("/host/zcl/pta_v2.6/libtorch_npu/lib") +link_directories("/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/lib64") +# link_directories("/usr/local/Ascend/T115/ascend-toolkit/latest/x86_64-linux/lib64") +link_directories("/usr/lib/x86_64-linux-gnu") + +message("-----${TORCH_LIBRARIES}") + +add_executable(deberta_aoti + _runner.cpp + # test_libtorch.cpp +) + +SET(CMAKE_BUILD_TYPE "Debug") + +## npu +target_link_libraries(deberta_aoti aoti_npu) +target_link_libraries(deberta_aoti aoti_runner_npu) +target_link_libraries(deberta_aoti aoti_npuops) +target_link_libraries(deberta_aoti torch_npu) +target_link_libraries(deberta_aoti ascendcl) + +target_link_libraries(deberta_aoti "${TORCH_LIBRARIES}") +set_property(TARGET deberta_aoti PROPERTY CXX_STANDARD 17) \ No newline at end of file -- Gitee From 8d8ad32cc4e1f7ed9d27565a3e6ad0240a127aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Mon, 12 May 2025 04:55:16 +0000 Subject: [PATCH 353/358] upload aoti deploy bash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- .../_inductor/patch/deploy_aoti_model.sh | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 torch_npu/_inductor/patch/deploy_aoti_model.sh diff --git a/torch_npu/_inductor/patch/deploy_aoti_model.sh b/torch_npu/_inductor/patch/deploy_aoti_model.sh new file mode 100644 index 0000000000..5ae9a4a535 --- /dev/null +++ b/torch_npu/_inductor/patch/deploy_aoti_model.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -euo pipefail # 严格模式:任何错误立即终止脚本 + +# 参数校验 +if [[ $# -ne 1 ]]; then + echo "[ERROR] Incorrect usage detected!" 
+ echo "[INFO] Usage: $0 " + exit 1 +fi + +input_path="$1" +output_path=/host/deberta_files + +[[ -d "$input_path" ]] || { echo "[ERROR] Invalid directory: $input_path"; exit 1; } + +for file in "$input_path"/deberta_*.pt2; do + # 提取文件名中的数字编号(如deberta_12.pt2 → 12) + base_name=$(basename "$file" .pt2) + num="${base_name##*_}" + + # 构建目标路径(如output_path/batch_12) + target_dir="$output_path/batch_$num" + if [[ -d "$target_dir" ]]; then + echo "[INFO] Delete old path: $target_dir" + rm -rf "$target_dir" || { echo "[ERROR] Delete old path failed"; exit 1; } + fi + + mkdir -p "$target_dir" || { echo "[ERROR] Create directory failed: $target_dir"; exit 1; } + + # 复制文件到目标目录 + cp -v "$file" "$target_dir/" || { echo "[ERROR] Failed to copy $file to $target_dir"; exit 1; } + + # 解压文件并处理错误 + zip_file="$target_dir/$(basename "$file")" + unzip "$zip_file" -d "$target_dir" || { echo "[ERROR] failed to unzip $zip_file"; exit 1; } +done + +echo "[SUCCESS] All AOTI pt2 files have deployed to $output_path" \ No newline at end of file -- Gitee From 4e65789fb7733c61ee41e43cac9ee21ece17d5a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Mon, 12 May 2025 04:56:20 +0000 Subject: [PATCH 354/358] update torch_npu/_inductor/patch/ascend_aot_package.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- .../_inductor/patch/ascend_aot_package.py | 77 ++++++++++++------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/torch_npu/_inductor/patch/ascend_aot_package.py b/torch_npu/_inductor/patch/ascend_aot_package.py index 88955c917a..133217fa84 100644 --- a/torch_npu/_inductor/patch/ascend_aot_package.py +++ b/torch_npu/_inductor/patch/ascend_aot_package.py @@ -18,6 +18,13 @@ import subprocess from abc import ABC, abstractmethod + +def MakePath(directory, name): + return os.path.abspath(os.path.join(directory, name)) + +DEPLOY_KERNEL_PATH = "/host/deberta_files" + + def modify_class_name(module_code: str) -> str: """ replace '' with 'testModule' """ modified_code = re.sub( @@ -125,7 +132,7 @@ class OpCodeGenerator(ABC): class LibraryOpGenerator(OpCodeGenerator): def generate(self, node_id, opname, arglist, outbuf): NewLines = [" // PATCHED_CODE :"] - # 生成输入Tensor变量声明 + # generate input declare new_input_argnames = [] for i, arg in enumerate(arglist): if isinstance(arg, str): @@ -137,12 +144,12 @@ class LibraryOpGenerator(OpCodeGenerator): else : raise TypeError(f"can not generate unsupport argtype: {type(arg).__name__}") - # 生成函数调用 + # generate func call func_arg_str = ", ".join(new_input_argnames) output_argname = f"{outbuf}_tensor" NewLines.append(f" auto {output_argname} = at::{opname}({func_arg_str});") - # 生成输出句柄 + # generate output tensor NewLines.append(f" RAIIAtenTensorHandle {outbuf}(reinterpret_cast(new at::Tensor({output_argname})));") return NewLines @@ -208,10 +215,6 @@ class CustomOpGenerator(OpCodeGenerator): code.append("}") return code - -def MakePath(directory, name): - return os.path.abspath(os.path.join(directory, name)) - class FallbackData: def __init__(self, json_path: str): with open(json_path, 'r') as f: @@ -248,9 +251,10 @@ class CodeManager: }, } - def __init__(self, directory, cpp_name, json_name): + def __init__(self, directory, cpp_name, json_name, batch_size): self.code_list = [] self.cpp_path = MakePath(directory, cpp_name) + self.batch_name = f"batch_{batch_size}" self.proxy_data = FallbackData(MakePath(directory, json_name)) def clear(self): @@ -323,6 +327,8 @@ class CodeManager: 
r'loadKernel\(\"(.*/)([^/]+\.cubin)\"' ) + kernelPathPattern = r'/tmp/torchinductor_root/[^/]+' + with open(self.cpp_path, 'r', encoding='utf-8') as f: for line in f: # 保留原始行 @@ -336,11 +342,16 @@ class CodeManager: linkCmd = line.replace("// ", "", 1) continue - # if loadKernelPattern.search(line): - # self.pop_lines(1) - # modified_line = re.sub(loadKernelPattern, r'loadKernel("\2"', line) - # self.code_list.append(modified_line) - # continue + if loadKernelPattern.search(line): + self.pop_lines(1) + modified_line = re.sub( + kernelPathPattern, + MakePath(DEPLOY_KERNEL_PATH, f"{self.batch_name}/data/aotinductor/model"), + line + ) + self.code_list.append(modified_line) + print(f"[INFO] patched loadKernel line : {modified_line}") + continue # 检查是否匹配代理函数调用 match = fallbackOpPrefixPattern.search(line) @@ -369,21 +380,28 @@ class CodeManager: class AOTIPkgManager: - def __init__(self,pt2_path, weight_path, new_name_prefix): + def __init__(self,pt2_path, weight_path, new_name_prefix, batch): self.binfiles = [] # .cubin self.wrapper_name = None # xxx.cpp self.proxy_json_name = None # xxx.json self.metadata_json_name = None # xxx_metadata.json self.weight_name = None # .o - self.weight_path = weight_path self.shared_library_name = None # .so - self.extract_dir = self.extract_pt2(pt2_path) + + self.weight_path = weight_path + self.batch_size = batch self.new_name_prefix = new_name_prefix + + self.extract_dir = self.extract_pt2(pt2_path) self.classify_files(self.extract_dir) + self.new_cpp_path = MakePath(self.extract_dir, self.new_name_prefix+".cpp") + self.new_so_path = MakePath(self.extract_dir, self.new_name_prefix+".so") + self.code_manager = CodeManager( self.extract_dir, self.wrapper_name, - self.proxy_json_name + self.proxy_json_name, + self.batch_size, ) def classify_files(self, directory): @@ -441,10 +459,6 @@ class AOTIPkgManager: return os.path.join(extract_dir,"data/aotinductor/model") def rewrite_cpp_wrapper(self): - self.new_cpp_path = MakePath(self.extract_dir, self.new_name_prefix+".cpp") - # self.weight_path = MakePath(self.extract_dir, self.weight_name) - self.new_so_path = MakePath(self.extract_dir, self.new_name_prefix+".so") - old_compile_cmd, old_link_cmd = self.code_manager.process_cpp_file() self.code_manager.save_new_file(self.new_cpp_path) @@ -479,7 +493,7 @@ class AOTIPkgManager: except Exception as e: raise e - def repackage(self, input_arg_map_path, args_aoti_path): + def repackage(self, input_arg_map_path, args_aoti_path, const_S_path): new_proxy_json_path = MakePath(self.extract_dir, self.new_name_prefix + ".json") new_metadata_json_path = MakePath(self.extract_dir, self.new_name_prefix + "_metadata.json") @@ -498,6 +512,9 @@ class AOTIPkgManager: if len(args_aoti_path)>3: file_list.append(args_aoti_path) + + if len(const_S_path)>3: + file_list.append(const_S_path) for filename in self.binfiles: file_list.append(MakePath(self.extract_dir, filename)) @@ -506,10 +523,10 @@ class AOTIPkgManager: pkg.package_aoti(new_pkg_path, file_list) print(f"[INFO] OUTPUT NEW AOTI PACKAGE TO: {new_pkg_path}") - def make_new_pt2(self, input_arg_map_path="", args_aoti_path=""): + def make_new_pt2(self, input_arg_map_path="", args_aoti_path="", const_S_path=""): compile_cmd, link_cmd = self.rewrite_cpp_wrapper() self.recompile(compile_cmd, link_cmd) - return self.repackage(input_arg_map_path, args_aoti_path) + return self.repackage(input_arg_map_path, args_aoti_path, const_S_path) if __name__ == "__main__": @@ -524,7 +541,7 @@ if __name__ == "__main__": # exit() batch_size_list = [1, 
2, 4, 8, 12, 16, 20, 24, 28, 32] - # batch_size_list = [12] + # batch_size_list = [32] for batch_size in batch_size_list: decorated_path = process_and_run_model( f"/host/zcl/deberta_pkgs/fx_graph_readable_{batch_size}.py", @@ -533,14 +550,16 @@ if __name__ == "__main__": print(f"finished run outputcode, dump into {decorated_path}") aoti_manager = AOTIPkgManager( - pt2_path = decorated_path, - weight_path = "/host/aoti_weights/weight.o", - new_name_prefix = f"deberta_{batch_size}" + pt2_path = decorated_path, + weight_path = "/host/aoti_weights/weight.o", + new_name_prefix = f"deberta_{batch_size}", + batch=batch_size ) aoti_manager.make_new_pt2( f"/host/aoti_weights/weight_args_{batch_size}.pt", - f"/host/aoti_weights/args_map_{batch_size}.json" + f"/host/aoti_weights/args_map_{batch_size}.json", + f"/host/aoti_weights/consts.S" ) print(f"[INFO] ---------- DONE BATCH {batch_size} ----------") \ No newline at end of file -- Gitee From 6f005dc56cc72aee2c9ae05d6ade0326aeae0136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Thu, 15 May 2025 07:41:16 +0000 Subject: [PATCH 355/358] add torch_npu/_inductor/patch/__init__.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- torch_npu/_inductor/patch/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 torch_npu/_inductor/patch/__init__.py diff --git a/torch_npu/_inductor/patch/__init__.py b/torch_npu/_inductor/patch/__init__.py new file mode 100644 index 0000000000..e69de29bb2 -- Gitee From d3d8c8ca14e76200ba183f6e9751894f0e6c7eb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Thu, 15 May 2025 07:42:09 +0000 Subject: [PATCH 356/358] update torch_npu/_inductor/patch/ascend_aot_package.py. 
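The patch below reworks AOTIPkgManager so repackaging no longer hard-codes weight and args paths; combined with the batch-aware constructor introduced two patches earlier, the intended flow looks roughly like this hedged sketch (all paths mirror the ones in the removed __main__ driver and are placeholders, not values mandated by the patches):

    # Hedged usage sketch of the refactored AOTIPkgManager entry points.
    from torch_npu._inductor.patch.ascend_aot_package import AOTIPkgManager

    manager = AOTIPkgManager(
        pt2_path="/host/zcl/deberta_pkgs/origin.pt2",   # placeholder .pt2 produced by AOTI
        weight_path="/host/aoti_weights/weight.o",
        new_name_prefix="deberta_1",
        batch=1,
    )
    # Rewrites the cpp wrapper, recompiles, relinks against the external weight object,
    # then repackages everything (plus extra_files) into a new .pt2.
    new_pkg_path = manager.make_new_pt2(
        new_pt2_directory="/host/deberta_files",
        extra_files=["/host/aoti_weights/weight_args_1.pt"],
    )
    print(new_pkg_path)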
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- .../_inductor/patch/ascend_aot_package.py | 135 ++++++++---------- 1 file changed, 59 insertions(+), 76 deletions(-) diff --git a/torch_npu/_inductor/patch/ascend_aot_package.py b/torch_npu/_inductor/patch/ascend_aot_package.py index 133217fa84..016c1fdc09 100644 --- a/torch_npu/_inductor/patch/ascend_aot_package.py +++ b/torch_npu/_inductor/patch/ascend_aot_package.py @@ -6,7 +6,7 @@ import re import sys import torch_npu._inductor from torch_npu.contrib import transfer_to_npu -import torch._inductor.package as pkg +import torch._inductor.package as inductor_package from typing import Dict, Any @@ -15,16 +15,17 @@ import json import shutil import shlex import subprocess +import logging from abc import ABC, abstractmethod +DEBUG_MODE = os.getenv("DEBUG", 0) def MakePath(directory, name): return os.path.abspath(os.path.join(directory, name)) DEPLOY_KERNEL_PATH = "/host/deberta_files" - def modify_class_name(module_code: str) -> str: """ replace '' with 'testModule' """ modified_code = re.sub( @@ -120,7 +121,7 @@ def process_and_run_model(input_path: str, do_aoti = False): package_path=os.path.join(os.path.dirname(input_path),"origin.pt2"), ) else: - print(model(*(fake_inputs.values()))) + logging.info(model(*(fake_inputs.values()))) return output_path @@ -241,14 +242,6 @@ class CodeManager: "aten::addmm": {"revertlines": 2, "generator": LibraryOpGenerator()}, "aten::gather": {"revertlines": 2, "generator": LibraryOpGenerator()}, "aten::gelu": {"revertlines": 2, "generator": LibraryOpGenerator()}, - "torch_npu_triton::fused_div_mul_sum": { - "revertlines": 2, - "generator": CustomOpGenerator( - kernel_path="/path/to/custom_op.cubin", - kernel_name="custom_op_name", - grid_size=(), - ) - }, } def __init__(self, directory, cpp_name, json_name, batch_size): @@ -327,13 +320,29 @@ class CodeManager: r'loadKernel\(\"(.*/)([^/]+\.cubin)\"' ) - kernelPathPattern = r'/tmp/torchinductor_root/[^/]+' + kernelPathPattern = r'/tmp/(?:.*/)*([^/]+\.cubin)' + + launchKernelPattern = re.compile( + r'launchKernel\(\"' + ) + launch_cnt=0 with open(self.cpp_path, 'r', encoding='utf-8') as f: for line in f: # 保留原始行 self.code_list.append(line.rstrip('\n')) + if launchKernelPattern.search(line) and DEBUG_MODE: + self.code_list.append(" {") + self.code_list.append(" aclError error_flag = c10_npu::npuSynchronizeDevice();") + self.code_list.append(" if(error_flag!=ACL_SUCCESS){") + self.code_list.append(f" std::cerr<<\"[DEBUG] failed to synchronize TT_kernel {launch_cnt}\"< None: @@ -474,10 +484,10 @@ class AOTIPkgManager: link_list[2] = self.weight_path link_list[-1] = self.new_so_path - print("[INFO] after rewrite_cpp_wrapper:") - print(f"[INFO] new_cpp_path = {self.new_cpp_path}") - print(f"[INFO] compile_list = {compile_list}") - print(f"[INFO] link_list = {link_list}") + logging.info(" after rewrite_cpp_wrapper:") + logging.info(f" new_cpp_path = {self.new_cpp_path}") + logging.info(f" compile_list = {compile_list}") + logging.info(f" link_list = {link_list}") return compile_list, link_list @@ -493,7 +503,8 @@ class AOTIPkgManager: except Exception as e: raise e - def repackage(self, input_arg_map_path, args_aoti_path, const_S_path): + + def repackage(self, new_pt2_directory, extra_files): new_proxy_json_path = MakePath(self.extract_dir, self.new_name_prefix + ".json") new_metadata_json_path = MakePath(self.extract_dir, self.new_name_prefix + "_metadata.json") @@ -507,59 +518,31 @@ class AOTIPkgManager: 
new_metadata_json_path, ] - if len(input_arg_map_path)>3: - file_list.append(input_arg_map_path) - - if len(args_aoti_path)>3: - file_list.append(args_aoti_path) - - if len(const_S_path)>3: - file_list.append(const_S_path) - for filename in self.binfiles: file_list.append(MakePath(self.extract_dir, filename)) - - new_pkg_path = MakePath(self.pt2_dir, self.new_name_prefix + ".pt2") - pkg.package_aoti(new_pkg_path, file_list) - print(f"[INFO] OUTPUT NEW AOTI PACKAGE TO: {new_pkg_path}") - def make_new_pt2(self, input_arg_map_path="", args_aoti_path="", const_S_path=""): - compile_cmd, link_cmd = self.rewrite_cpp_wrapper() - self.recompile(compile_cmd, link_cmd) - return self.repackage(input_arg_map_path, args_aoti_path, const_S_path) - - -if __name__ == "__main__": - # aoti_manager = AOTIPkgManager( - # pt2_path = "/host/zcl/workspace/aoti_modules/tutorial/model_0510.pt2", - # weight_path = "/host/aoti_weights/weight.o", - # new_name_prefix = f"tutorial_patched" - # ) - - # aoti_manager.make_new_pt2() - - # exit() + from pathlib import Path + for extra_file in extra_files: + try: + Path(extra_file).resolve(strict=True) + except Exception as e: + raise e + file_list.append(extra_file) + + if len(new_pt2_directory)==0: + new_pkg_path = MakePath(self.pt2_dir, self.new_name_prefix + ".pt2") + else: + new_pkg_path = MakePath(new_pt2_directory, self.new_name_prefix + ".pt2") - batch_size_list = [1, 2, 4, 8, 12, 16, 20, 24, 28, 32] - # batch_size_list = [32] - for batch_size in batch_size_list: - decorated_path = process_and_run_model( - f"/host/zcl/deberta_pkgs/fx_graph_readable_{batch_size}.py", - do_aoti=True - ) - print(f"finished run outputcode, dump into {decorated_path}") + inductor_package.package_aoti(new_pkg_path, file_list) + logging.info(f" OUTPUT NEW AOTI PACKAGE TO: {new_pkg_path}") - aoti_manager = AOTIPkgManager( - pt2_path = decorated_path, - weight_path = "/host/aoti_weights/weight.o", - new_name_prefix = f"deberta_{batch_size}", - batch=batch_size - ) + return new_pkg_path - aoti_manager.make_new_pt2( - f"/host/aoti_weights/weight_args_{batch_size}.pt", - f"/host/aoti_weights/args_map_{batch_size}.json", - f"/host/aoti_weights/consts.S" - ) - print(f"[INFO] ---------- DONE BATCH {batch_size} ----------") \ No newline at end of file + def make_new_pt2(self, new_pt2_directory="", extra_files = []): + compile_cmd, link_cmd = self.rewrite_cpp_wrapper() + self.recompile(compile_cmd, link_cmd) + new_pkg_path = self.repackage(new_pt2_directory, extra_files) + logging.info(f" ---------- SUCCESS MAKE NEW PT2 BATCH {self.batch_size} ----------") + return new_pkg_path \ No newline at end of file -- Gitee From 1707f957bae4ed36421b0bd4507efedde2fbaa64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E7=AB=8B?= Date: Thu, 15 May 2025 07:43:09 +0000 Subject: [PATCH 357/358] update torch_npu/_inductor/patch/_runner.cpp. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 张春立 --- torch_npu/_inductor/patch/_runner.cpp | 50 +++++++++++++++++++++------ 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/torch_npu/_inductor/patch/_runner.cpp b/torch_npu/_inductor/patch/_runner.cpp index 73548b65c4..555c8f32e2 100644 --- a/torch_npu/_inductor/patch/_runner.cpp +++ b/torch_npu/_inductor/patch/_runner.cpp @@ -6,13 +6,17 @@ #include #include #include + +#include "torch_npu/csrc/core/npu/register/OptionRegister.h" +#include "torch_npu/csrc/core/npu/register/OptionsManager.h" + #include #include #include #include #include -#include +#include void removeWhitespace(std::string& str) { std::string result; @@ -89,21 +93,31 @@ std::vector runDebertaModelInference( std::cerr<<"weightArgPath : "< inputs; + std::vector inputs(4); - loadDebertaWeights(inputs, weightArgPath, extractValue(json_str, "input_arg_length")); + // loadDebertaWeights(inputs, weightArgPath, extractValue(json_str, "input_arg_length")); //FIXME - inputs[extractValue(json_str, "input_ids")] = userInputs.at("input_ids"); - inputs[extractValue(json_str, "segment_ids")] = userInputs.at("segment_ids"); - inputs[extractValue(json_str, "input_mask")] = userInputs.at("input_mask"); + // inputs[extractValue(json_str, "input_ids")] = userInputs.at("input_ids"); + // inputs[extractValue(json_str, "segment_ids")] = userInputs.at("segment_ids"); + // inputs[extractValue(json_str, "input_mask")] = userInputs.at("input_mask"); - + inputs[0] = userInputs.at("input_ids"); + inputs[1] = userInputs.at("segment_ids"); + inputs[2] = userInputs.at("input_mask"); + inputs[3] = userInputs.at("batching_index"); torch::inductor::AOTIModelPackageLoader loader(pt2Path); torch::inductor::AOTIModelContainerRunner* runner = loader.get_runner(); std::vector outputs = runner->run(inputs); + + + // aclError error_flag = c10_npu::npuSynchronizeDevice(); + // if(error_flag!=ACL_SUCCESS){ + // std::cout<<"fxxk 0"< userInputs - torch::jit::script::Module tensors = torch::jit::load("/host/deberta_files/inputs/deberta_inputs_32.pth"); + c10_npu::option::SetOption("ALLOW_INTERNAL_FORMAT","disable"); + + torch::jit::script::Module tensors = torch::jit::load("/host/deberta_files/inputs/deberta_inputs_1.pth"); // tensors.to(at::kPrivateUse1); torch::Tensor input_ids = tensors.attr("input_ids").toTensor().to(at::kPrivateUse1); torch::Tensor segment_ids = tensors.attr("segment_ids").toTensor().to(at::kPrivateUse1); torch::Tensor input_mask = tensors.attr("input_mask").toTensor().to(at::kPrivateUse1); + torch::Tensor batching_index = tensors.attr("batching_index").toTensor().to(at::kPrivateUse1); + + // std::cout< userInputs={ {"input_ids", input_ids}, {"segment_ids", segment_ids}, - {"input_mask", input_mask} + {"input_mask", input_mask}, + {"batching_index", batching_index} }; std::vector outputs = runDebertaModelInference(userInputs, batchSize); + // aclError error_flag = c10_npu::npuSynchronizeDevice(); + // if(error_flag!=ACL_SUCCESS){ + // std::cout<<"fxxk"< Date: Fri, 16 May 2025 08:00:26 +0000 Subject: [PATCH 358/358] not to lowering aten.slice_scatter, which is not support in current version Signed-off-by: zhuceHW --- torch_npu/_inductor/lowering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_npu/_inductor/lowering.py b/torch_npu/_inductor/lowering.py index c31736ddca..fb1dd5ce1e 100644 --- a/torch_npu/_inductor/lowering.py +++ b/torch_npu/_inductor/lowering.py @@ -141,7 +141,6 @@ GENERATE_LIST = [ # 
npu.npu_dtype_cast npu_dtype_cast, aten.select_scatter, - aten.slice_scatter, prims.broadcast_in_dim, prims.maximum, aten.ne, @@ -187,6 +186,7 @@ LOWERING_OVERLOAD_OP = [ aten.nll_loss_forward, aten.gather, aten.cat, + aten.slice_scatter, #aten.clone, cast permute_reshape will fail if enable this ] -- Gitee
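The final patch stops lowering aten.slice_scatter (moving it from GENERATE_LIST to LOWERING_OVERLOAD_OP) because, per the commit subject, the lowering is unsupported in this version. A hedged smoke test that a graph containing the op still compiles and matches eager results; the "npu" device and the backend wiring via torch_npu._inductor are assumptions of the sketch:

    import torch
    import torch_npu
    import torch_npu._inductor

    def fn(dst, src):
        # writes src into dst[:, 2:6] and returns a new tensor, leaving dst untouched
        return torch.slice_scatter(dst, src, dim=1, start=2, end=6)

    dst = torch.zeros(4, 8, device="npu")
    src = torch.ones(4, 4, device="npu")
    compiled = torch.compile(fn, backend="inductor")
    torch.testing.assert_close(compiled(dst, src), fn(dst, src))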