diff --git a/torch_npu/csrc/profiler/init.cpp b/torch_npu/csrc/profiler/init.cpp
index 94ad31d5a68a3e87af53020549b697740b56747e..e2427f8b8ec4d3d6f9802b1e1ab3315383d75253 100644
--- a/torch_npu/csrc/profiler/init.cpp
+++ b/torch_npu/csrc/profiler/init.cpp
@@ -16,6 +16,7 @@
 #include "torch_npu/csrc/profiler/profiler_python.h"
 #include "torch_npu/csrc/profiler/npu_profiler.h"
 #include "torch_npu/csrc/toolkit/profiler/common/utils.h"
+#include "torch_npu/csrc/toolkit/profiler/inc/mspti_adapter.h"
 #include "torch_npu/csrc/framework/interface/LibAscendHal.h"
 #include "torch_npu/csrc/core/npu/NPUException.h"
 
@@ -86,6 +87,10 @@ PyObject* profiler_initExtension(PyObject* _unused, PyObject *unused) {
     m.def("_get_monotonic", torch_npu::toolkit::profiler::Utils::GetClockMonotonicRawNs);
     m.def("_get_host_uid", torch_npu::toolkit::profiler::Utils::GetHostUid);
 
+    // mspti
+    m.def("_mspti_enable", torch_npu::toolkit::profiler::MsptiEnable);
+    m.def("_mspti_disable", torch_npu::toolkit::profiler::MsptiDisable);
+
     torch_npu::profiler::python_tracer::init();
     Py_RETURN_TRUE;
 }
diff --git a/torch_npu/csrc/toolkit/profiler/CMakeLists.txt b/torch_npu/csrc/toolkit/profiler/CMakeLists.txt
index 0ddc517c6e7434e2c761692b0ae4121079f9e14a..319c27c2327944a3a063664d04b8dcfe000382a1 100644
--- a/torch_npu/csrc/toolkit/profiler/CMakeLists.txt
+++ b/torch_npu/csrc/toolkit/profiler/CMakeLists.txt
@@ -1,5 +1,13 @@
 set(PROFILER_NAME npu_profiler)
 
+set(DEFAULT_ASCEND_HOME_PATH "/usr/local/Ascend/ascend_toolkit/latest")
+if(DEFINED ENV{ASCEND_HOME_PATH})
+    message("ASCEND_HOME_PATH: $ENV{ASCEND_HOME_PATH}")
+    set(DEFAULT_ASCEND_HOME_PATH $ENV{ASCEND_HOME_PATH})
+else()
+    message("NOT DEFINED ASCEND_HOME_PATH VARIABLES")
+endif()
+
 FILE(GLOB NPU_PROF_SRCS
     common/*.cpp
     src/*.cpp
@@ -15,9 +23,14 @@ add_library(${PROFILER_NAME} SHARED
 
 target_include_directories(${PROFILER_NAME} PRIVATE
     ${NPU_PROF_INC}
+    ${DEFAULT_ASCEND_HOME_PATH}/include
+)
+
+target_link_directories(${PROFILER_NAME} PRIVATE
+    ${DEFAULT_ASCEND_HOME_PATH}/lib64
 )
 
-target_link_libraries(${PROFILER_NAME} PRIVATE torch_cpu pthread)
+target_link_libraries(${PROFILER_NAME} PRIVATE torch_cpu pthread mspti ascendcl)
 
 target_compile_options(${PROFILER_NAME} PRIVATE
     ${TORCH_CXX_FLAGS}
diff --git a/torch_npu/csrc/toolkit/profiler/inc/mspti_adapter.h b/torch_npu/csrc/toolkit/profiler/inc/mspti_adapter.h
new file mode 100644
--- /dev/null
+++ b/torch_npu/csrc/toolkit/profiler/inc/mspti_adapter.h
@@ -0,0 +1,15 @@
+#pragma once
+
+namespace torch_npu {
+namespace toolkit {
+namespace profiler {
+
+// Subscribes to MSPTI and enables runtime callbacks plus marker/API/kernel activity collection.
+void MsptiEnable();
+
+// Disables the callbacks and activity kinds, flushes, unsubscribes and prints the record totals.
+void MsptiDisable();
+
+} // namespace profiler
+} // namespace toolkit
+} // namespace torch_npu
diff --git a/torch_npu/csrc/toolkit/profiler/src/mspti_adapter.cpp b/torch_npu/csrc/toolkit/profiler/src/mspti_adapter.cpp
new file mode 100644
--- /dev/null
+++ b/torch_npu/csrc/toolkit/profiler/src/mspti_adapter.cpp
@@ -0,0 +1,141 @@
+#include "torch_npu/csrc/toolkit/profiler/inc/mspti_adapter.h"
+
+#include "mspti/mspti.h"
+#include "mstx/ms_tools_ext.h"
+#include "acl/acl_rt.h"
+
+#include <atomic>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+namespace torch_npu {
+namespace toolkit {
+namespace profiler {
+
+// Subscriber handle filled in by msptiSubscribe(); reset to nullptr by MsptiDisable().
+msptiSubscriberHandle g_client{nullptr};
+// Running totals of the activity records counted in UserBufferComplete().
+std::atomic<uint64_t> g_totalApi{0};
+std::atomic<uint64_t> g_totalKernel{0};
+std::atomic<uint64_t> g_totalMarker{0};
+
+// Buffer-request callback: hands MSPTI a freshly malloc'ed 5 MiB buffer, which
+// UserBufferComplete() frees once MSPTI returns it.
+void UserBufferRequest(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
+    constexpr uint32_t SIZE = 5 * 1024 * 1024;
+    uint8_t *pBuffer = static_cast<uint8_t*>(malloc(SIZE));
+    *buffer = pBuffer;
+    // Do not advertise any capacity when the allocation failed.
+    *size = (pBuffer != nullptr) ? SIZE : 0;
+    *maxNumRecords = 0;
+}
+
+// Counts one API activity record.
+static void ShowApiInfo(msptiActivityApi* api) {
+    if (api == nullptr) {
+        printf("Api nullptr\n");
+        return;
+    }
+    g_totalApi++;
+}
+
+// Counts one kernel activity record.
+static void ShowKernelInfo(msptiActivityKernel* kernel) {
+    if (kernel == nullptr) {
+        printf("Kernel nullptr\n");
+        return;
+    }
+    g_totalKernel++;
+}
+
+// Counts one marker activity record.
+static void ShowMarkerInfo(msptiActivityMarker* marker) {
+    if (marker == nullptr) {
+        printf("Marker nullptr\n");
+        return;
+    }
+    g_totalMarker++;
+}
+
+// Buffer-complete callback: counts every record in the returned buffer by
+// kind, then frees the buffer that UserBufferRequest() allocated.
+void UserBufferComplete(uint8_t *buffer, size_t size, size_t validSize) {
+    msptiActivity *pRecord = nullptr;
+    // Stop on the first non-success status (MSPTI_ERROR_MAX_LIMIT_REACHED or any
+    // other error) so a failing iterator cannot spin forever.
+    while (validSize > 0 &&
+           msptiActivityGetNextRecord(buffer, validSize, &pRecord) == MSPTI_SUCCESS) {
+        if (pRecord == nullptr) {
+            break;
+        }
+        if (pRecord->kind == MSPTI_ACTIVITY_KIND_API) {
+            ShowApiInfo(reinterpret_cast<msptiActivityApi*>(pRecord));
+        } else if (pRecord->kind == MSPTI_ACTIVITY_KIND_KERNEL) {
+            ShowKernelInfo(reinterpret_cast<msptiActivityKernel*>(pRecord));
+        } else if (pRecord->kind == MSPTI_ACTIVITY_KIND_MARKER) {
+            ShowMarkerInfo(reinterpret_cast<msptiActivityMarker*>(pRecord));
+        }
+    }
+    // Release the malloc'ed buffer; without this every completed 5 MiB buffer
+    // leaks. free(nullptr) is a no-op, so a failed allocation is fine here.
+    free(buffer);
+}
+
+// Runtime-API callback: on entry emits an mstx mark and opens a range, on exit
+// emits a mark and closes the range opened on the same thread.
+void UserCallback(void* pUserData, msptiCallbackDomain domain, msptiCallbackId id, const msptiCallbackData* pCallbackInfo) {
+    if (pCallbackInfo == nullptr) {
+        return;
+    }
+    thread_local uint64_t markId = 0;
+    if (pCallbackInfo->callbackSite == MSPTI_API_ENTER) {
+        mstxMarkA("start", nullptr);
+        markId = mstxRangeStartA("Range start", nullptr);
+    } else if (pCallbackInfo->callbackSite == MSPTI_API_EXIT) {
+        mstxMarkA("end", nullptr);
+        mstxRangeEnd(markId);
+    }
+}
+
+// Subscribes to MSPTI, enables the runtime malloc/free/launch callbacks and
+// starts collecting marker, API and kernel activity records.
+void MsptiEnable() {
+    if (msptiSubscribe(&g_client, UserCallback, nullptr) == MSPTI_SUCCESS) {
+        msptiEnableCallback(1, g_client, MSPTI_CB_DOMAIN_RUNTIME, MSPTI_CBID_RUNTIME_MALLOC);
+        msptiEnableCallback(1, g_client, MSPTI_CB_DOMAIN_RUNTIME, MSPTI_CBID_RUNTIME_FREE);
+        msptiEnableCallback(1, g_client, MSPTI_CB_DOMAIN_RUNTIME, MSPTI_CBID_RUNTIME_LAUNCH);
+        msptiEnableCallback(1, g_client, MSPTI_CB_DOMAIN_RUNTIME, MSPTI_CBID_RUNTIME_FFTS_LAUNCH);
+    } else {
+        // Callbacks need a subscriber handle; the activity calls below do not take one.
+        printf("msptiSubscribe failed\n");
+    }
+    msptiActivityRegisterCallbacks(UserBufferRequest, UserBufferComplete);
+    msptiActivityEnable(MSPTI_ACTIVITY_KIND_MARKER);
+    msptiActivityEnable(MSPTI_ACTIVITY_KIND_API);
+    msptiActivityEnable(MSPTI_ACTIVITY_KIND_KERNEL);
+}
+
+// Disables the callbacks and activity kinds enabled by MsptiEnable(), flushes
+// outstanding activity buffers, unsubscribes and prints the record totals.
+void MsptiDisable() {
+    msptiEnableCallback(0, g_client, MSPTI_CB_DOMAIN_RUNTIME, MSPTI_CBID_RUNTIME_MALLOC);
+    msptiEnableCallback(0, g_client, MSPTI_CB_DOMAIN_RUNTIME, MSPTI_CBID_RUNTIME_FREE);
+    msptiEnableCallback(0, g_client, MSPTI_CB_DOMAIN_RUNTIME, MSPTI_CBID_RUNTIME_LAUNCH);
+    msptiEnableCallback(0, g_client, MSPTI_CB_DOMAIN_RUNTIME, MSPTI_CBID_RUNTIME_FFTS_LAUNCH);
+    msptiActivityDisable(MSPTI_ACTIVITY_KIND_MARKER);
+    msptiActivityDisable(MSPTI_ACTIVITY_KIND_API);
+    msptiActivityDisable(MSPTI_ACTIVITY_KIND_KERNEL);
+    msptiActivityFlushAll(1);
+    msptiUnsubscribe(g_client);
+    // Drop the handle so it is not reused after unsubscribing.
+    g_client = nullptr;
+    int32_t deviceId = 0;
+    aclrtGetDevice(&deviceId);
+    printf("[Total] device: %d, api: %" PRIu64 ", kernel: %" PRIu64 ", marker: %" PRIu64 "\n", deviceId, g_totalApi.load(), g_totalKernel.load(), g_totalMarker.load());
+}
+
+} // namespace profiler
+} // namespace toolkit
+} // namespace torch_npu
diff --git a/torch_npu/profiler/__init__.py b/torch_npu/profiler/__init__.py
--- a/torch_npu/profiler/__init__.py
+++ b/torch_npu/profiler/__init__.py
@@ -11,7 +11,8 @@
 from .experimental_config import _ExperimentalConfig, supported_profiler_level, supported_export_type, ProfilerLevel, AiCMetrics, ExportType
 from ._non_intrusive_profile import _NonIntrusiveProfile
+from .mspti import mspti
 
-__all__ = ["profile", "ProfilerActivity", "supported_activities", "tensorboard_trace_handler", "schedule",
+__all__ = ["profile", "mspti", "ProfilerActivity", "supported_activities", "tensorboard_trace_handler", "schedule",
            "ProfilerAction", "_ExperimentalConfig", "supported_profiler_level", "supported_ai_core_metrics",
            "supported_export_type", "ProfilerLevel", "AiCMetrics", "ExportType"]
 
diff --git a/torch_npu/profiler/mspti.py b/torch_npu/profiler/mspti.py
new file mode 100644
--- /dev/null
+++ b/torch_npu/profiler/mspti.py
@@ -0,0 +1,18 @@
+from torch_npu._C._profiler import _mspti_enable, _mspti_disable
+
+
+class mspti:
+    """Thin wrapper around the ``_mspti_enable`` / ``_mspti_disable`` bindings."""
+
+    def __init__(self):
+        pass
+
+    def start(self):
+        """Print a trace line, then call ``_mspti_enable``."""
+        print("_mspti_enable")
+        _mspti_enable()
+
+    def stop(self):
+        """Print a trace line, then call ``_mspti_disable``."""
+        print("_mspti_disable")
+        _mspti_disable()