From 74dd980a22f20bbbe3ebca646d2fa5e272425092 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 8 Jan 2025 06:30:16 -0800 Subject: [PATCH 01/52] perf/x86/intel/uncore: Clean up func_id mainline inclusion from mainline-v6.14-rc1 commit 3f710be02ea648001ba18fb2c9fa7765e743dec2 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3f710be02ea648001ba18fb2c9fa7765e743dec2 --------------------------- The below warning may be triggered on GNR when the PCIE uncore units are exposed. WARNING: CPU: 4 PID: 1 at arch/x86/events/intel/uncore.c:1169 uncore_pci_pmu_register+0x158/0x190 The current uncore driver assumes that all the devices in the same PMU have the exact same devfn. It's true for the previous platforms. But it doesn't work for the new PCIE uncore units on GNR. The assumption doesn't make sense. There is no reason to limit the devices from the same PMU to the same devfn. Also, the current code just throws the warning, but still registers the device. The WARN_ON_ONCE() should be removed. The func_id is used by the later event_init() to check if a event->pmu has valid devices. For cpu and mmio uncore PMUs, they are always valid. For pci uncore PMUs, it's set when the PMU is registered. It can be replaced by the pmu->registered. Clean up the func_id. Intel-SIG: commit 3f710be02ea6 perf/x86/intel/uncore: Clean up func_id Backport to support more uncore units on GNR Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Eric Hu Link: https://lkml.kernel.org/r/20250108143017.1793781-1-kan.liang@linux.intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore.c | 20 +++++++------------- arch/x86/events/intel/uncore.h | 1 - arch/x86/events/intel/uncore_snb.c | 2 +- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 4d856b51307f..b2453052dad8 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -744,7 +744,7 @@ static int uncore_pmu_event_init(struct perf_event *event) pmu = uncore_event_to_pmu(event); /* no device found for this pmu */ - if (pmu->func_id < 0) + if (!pmu->registered) return -ENOENT; /* Sampling not supported yet */ @@ -991,7 +991,7 @@ static void uncore_types_exit(struct intel_uncore_type **types) uncore_type_exit(*types); } -static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) +static int __init uncore_type_init(struct intel_uncore_type *type) { struct intel_uncore_pmu *pmus; size_t size; @@ -1004,7 +1004,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = setid ? 
i : -1; pmus[i].pmu_idx = i; pmus[i].type = type; pmus[i].boxes = kzalloc(size, GFP_KERNEL); @@ -1054,12 +1053,12 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) } static int __init -uncore_types_init(struct intel_uncore_type **types, bool setid) +uncore_types_init(struct intel_uncore_type **types) { int ret; for (; *types; types++) { - ret = uncore_type_init(*types, setid); + ret = uncore_type_init(*types); if (ret) return ret; } @@ -1159,11 +1158,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev, if (!box) return -ENOMEM; - if (pmu->func_id < 0) - pmu->func_id = pdev->devfn; - else - WARN_ON_ONCE(pmu->func_id != pdev->devfn); - atomic_inc(&box->refcnt); box->dieid = die; box->pci_dev = pdev; @@ -1409,7 +1403,7 @@ static int __init uncore_pci_init(void) goto err; } - ret = uncore_types_init(uncore_pci_uncores, false); + ret = uncore_types_init(uncore_pci_uncores); if (ret) goto errtype; @@ -1677,7 +1671,7 @@ static int __init uncore_cpu_init(void) { int ret; - ret = uncore_types_init(uncore_msr_uncores, true); + ret = uncore_types_init(uncore_msr_uncores); if (ret) goto err; @@ -1696,7 +1690,7 @@ static int __init uncore_mmio_init(void) struct intel_uncore_type **types = uncore_mmio_uncores; int ret; - ret = uncore_types_init(types, true); + ret = uncore_types_init(types); if (ret) goto err; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 027ef292c602..9ba2305c0d9a 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -125,7 +125,6 @@ struct intel_uncore_pmu { struct pmu pmu; char name[UNCORE_PMU_NAME_LEN]; int pmu_idx; - int func_id; bool registered; atomic_t activeboxes; cpumask_t cpu_mask; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 7fd4334e12a1..3725bb6571c4 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -881,7 +881,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event) pmu = uncore_event_to_pmu(event); /* no device found for this pmu */ - if (pmu->func_id < 0) + if (!pmu->registered) return -ENOENT; /* Sampling not supported yet */ -- Gitee From 05cbc1b546c39fbe8773c7df8feca2629f88682f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 8 Jan 2025 06:30:17 -0800 Subject: [PATCH 02/52] perf/x86/intel/uncore: Support more units on Granite Rapids mainline inclusion from mainline-v6.14-rc1 commit 6d642735cdb6cdb814d2b6c81652caa53ce04842 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d642735cdb6cdb814d2b6c81652caa53ce04842 --------------------------- The same CXL PMONs support is also avaiable on GNR. Apply spr_uncore_cxlcm and spr_uncore_cxldp to GNR as well. The other units were broken on early HW samples, so they were ignored in the early enabling patch. The issue has been fixed and verified on the later production HW. Add UPI, B2UPI, B2HOT, PCIEX16 and PCIEX8 for GNR. Intel-SIG: commit 6d642735cdb6 Support more units on Granite Rapids. 
Backport to support more uncore units on GNR Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Eric Hu Link: https://lkml.kernel.org/r/20250108143017.1793781-2-kan.liang@linux.intel.com [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore_snbep.c | 48 ++++++++++++++++++---------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index dcfabf678807..41105cf1e632 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6680,17 +6680,8 @@ void spr_uncore_mmio_init(void) /* GNR uncore support */ #define UNCORE_GNR_NUM_UNCORE_TYPES 23 -#define UNCORE_GNR_TYPE_15 15 -#define UNCORE_GNR_B2UPI 18 -#define UNCORE_GNR_TYPE_21 21 -#define UNCORE_GNR_TYPE_22 22 int gnr_uncore_units_ignore[] = { - UNCORE_SPR_UPI, - UNCORE_GNR_TYPE_15, - UNCORE_GNR_B2UPI, - UNCORE_GNR_TYPE_21, - UNCORE_GNR_TYPE_22, UNCORE_IGNORE_END }; @@ -6699,6 +6690,31 @@ static struct intel_uncore_type gnr_uncore_ubox = { .attr_update = uncore_alias_groups, }; +static struct intel_uncore_type gnr_uncore_pciex8 = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "pciex8", +}; + +static struct intel_uncore_type gnr_uncore_pciex16 = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "pciex16", +}; + +static struct intel_uncore_type gnr_uncore_upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "upi", +}; + +static struct intel_uncore_type gnr_uncore_b2upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "b2upi", +}; + +static struct intel_uncore_type gnr_uncore_b2hot = { + .name = "b2hot", + .attr_update = uncore_alias_groups, +}; + static struct intel_uncore_type gnr_uncore_b2cmi = { SPR_UNCORE_PCI_COMMON_FORMAT(), .name = "b2cmi", @@ -6723,21 +6739,21 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { &gnr_uncore_ubox, &spr_uncore_imc, NULL, + &gnr_uncore_upi, NULL, NULL, NULL, + &spr_uncore_cxlcm, + &spr_uncore_cxldp, NULL, - NULL, - NULL, - NULL, - NULL, + &gnr_uncore_b2hot, &gnr_uncore_b2cmi, &gnr_uncore_b2cxl, - NULL, + &gnr_uncore_b2upi, NULL, &gnr_uncore_mdf_sbo, - NULL, - NULL, + &gnr_uncore_pciex16, + &gnr_uncore_pciex8, }; static struct freerunning_counters gnr_iio_freerunning[] = { -- Gitee From a43186906c6a7cf746e32f06702b46211b0d8c28 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:24 -0700 Subject: [PATCH 03/52] tools headers UAPI: Sync include/uapi/linux/perf_event.h header with the kernel mainline inclusion from mainline-v6.8-rc1 commit 76db7aab1fca6688ddf9f388157521c442e0ffb8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=76db7aab1fca6688ddf9f388157521c442e0ffb8 --------------------------- Sync the new sample type for the branch counters feature. 
Intel-SIG: commit 76db7aab1fca tools headers UAPI: Sync include/uapi/linux/perf_event.h header with the kernel Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tinghao Zhang Link: https://lore.kernel.org/r/20231025201626.3000228-6-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Jason Zeng --- tools/include/uapi/linux/perf_event.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 39c6a250dd1b..3a64499b0f5d 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_COUNTERS_SHIFT = 19, /* save occurrences of events on a branch */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -235,6 +237,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -982,6 +986,12 @@ enum perf_event_type { * { u64 nr; * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX * { u64 from, to, flags } lbr[nr]; + * # + * # The format of the counters is decided by the + * # "branch_counter_nr" and "branch_counter_width", + * # which are defined in the ABI. + * # + * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi @@ -1427,6 +1437,9 @@ struct perf_branch_entry { reserved:31; }; +/* Size of used info bits in struct perf_branch_entry */ +#define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 + union perf_sample_weight { __u64 full; #if defined(__LITTLE_ENDIAN_BITFIELD) -- Gitee From d0ea0bd17c6808cea7c7d982fecaac2720cc435e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:33 -0700 Subject: [PATCH 04/52] perf/x86/intel: Support the PEBS event mask mainline inclusion from mainline-v6.11-rc1 commit a23eb2fc1d818cdac9b31f032842d55483a6a040 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a23eb2fc1d818cdac9b31f032842d55483a6a040 --------------------------- The current perf assumes that the counters that support PEBS are contiguous. But it's not guaranteed with the new leaf 0x23 introduced. The counters are enumerated with a counter mask. There may be holes in the counter mask for future platforms or in a virtualization environment. Store the PEBS event mask rather than the maximum number of PEBS counters in the x86 PMU structures. 
Intel-SIG: commit a23eb2fc1d81 perf/x86/intel: Support the PEBS event mask Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-2-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 8 ++++---- arch/x86/events/intel/ds.c | 15 ++++++++------- arch/x86/events/perf_event.h | 15 +++++++++++++-- arch/x86/include/asm/intel_ds.h | 1 + 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fb63ccab535e..6df25d8b7b10 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4826,7 +4826,7 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) { intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed, &pmu->intel_ctrl, (1ULL << pmu->num_counters_fixed) - 1); - pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, 0, pmu->num_counters, 0, 0); @@ -6210,7 +6210,7 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) pmu->num_counters = x86_pmu.num_counters; pmu->num_counters_fixed = x86_pmu.num_counters_fixed; - pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, 0, pmu->num_counters, 0, 0); @@ -6323,7 +6323,7 @@ __init int intel_pmu_init(void) x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; - x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(x86_pmu.num_counters - 1, 0)); x86_pmu.pebs_capable = PEBS_COUNTER_MASK; /* @@ -6957,7 +6957,7 @@ __init int intel_pmu_init(void) pmu->num_counters_fixed = x86_pmu.num_counters_fixed; } - pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, 0, pmu->num_counters, 0, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 1c649edd5131..61383b15f31f 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1136,7 +1136,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sche static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; - int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events); + int max_pebs_events = intel_pmu_max_num_pebs(cpuc->pmu); int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); u64 threshold; int reserved; @@ -2151,6 +2151,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d void *base, *at, *top; short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + int max_pebs_events = intel_pmu_max_num_pebs(NULL); int bit, i, size; u64 mask; @@ -2162,8 +2163,8 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d ds->pebs_index 
= ds->pebs_buffer_base; - mask = (1ULL << x86_pmu.max_pebs_events) - 1; - size = x86_pmu.max_pebs_events; + mask = x86_pmu.pebs_events_mask; + size = max_pebs_events; if (x86_pmu.flags & PMU_FL_PEBS_ALL) { mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED; size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; @@ -2202,8 +2203,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d pebs_status = p->status = cpuc->pebs_enabled; bit = find_first_bit((unsigned long *)&pebs_status, - x86_pmu.max_pebs_events); - if (bit >= x86_pmu.max_pebs_events) + max_pebs_events); + + if (!(x86_pmu.pebs_events_mask & (1 << bit))) continue; /* @@ -2261,7 +2263,6 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events); int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct debug_store *ds = cpuc->ds; struct perf_event *event; @@ -2277,7 +2278,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d ds->pebs_index = ds->pebs_buffer_base; - mask = ((1ULL << max_pebs_events) - 1) | + mask = hybrid(cpuc->pmu, pebs_events_mask) | (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); size = INTEL_PMC_IDX_FIXED + num_counters_fixed; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 6e40b55f68da..ddb33f67e4e3 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -691,7 +691,7 @@ struct x86_hybrid_pmu { cpumask_t supported_cpus; union perf_capabilities intel_cap; u64 intel_ctrl; - int max_pebs_events; + u64 pebs_events_mask; int num_counters; int num_counters_fixed; struct event_constraint unconstrained; @@ -859,7 +859,7 @@ struct x86_pmu { pebs_ept :1; int pebs_record_size; int pebs_buffer_size; - int max_pebs_events; + u64 pebs_events_mask; void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); @@ -1656,6 +1656,17 @@ static inline int is_ht_workaround_enabled(void) return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); } +static inline u64 intel_pmu_pebs_mask(u64 cntr_mask) +{ + return MAX_PEBS_EVENTS_MASK & cntr_mask; +} + +static inline int intel_pmu_max_num_pebs(struct pmu *pmu) +{ + static_assert(MAX_PEBS_EVENTS == 32); + return fls((u32)hybrid(pmu, pebs_events_mask)); +} + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 2f9eeb5c3069..5dbeac48a5b9 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -9,6 +9,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS_FMT4 8 #define MAX_PEBS_EVENTS 32 +#define MAX_PEBS_EVENTS_MASK GENMASK_ULL(MAX_PEBS_EVENTS - 1, 0) #define MAX_FIXED_PEBS_EVENTS 16 /* -- Gitee From 29ac3f48a493f5b4b6b7dba71cbd7308f45141a5 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:34 -0700 Subject: [PATCH 05/52] perf/x86: Support counter mask mainline inclusion from mainline-v6.11-rc1 commit 722e42e45c2f1c6d1adec7813651dba5139f52f4 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=722e42e45c2f1c6d1adec7813651dba5139f52f4 
--------------------------- The current perf assumes that both GP and fixed counters are contiguous. But it's not guaranteed on newer Intel platforms or in a virtualization environment. Use the counter mask to replace the number of counters for both GP and the fixed counters. For the other ARCHs or old platforms which don't support a counter mask, using GENMASK_ULL(num_counter - 1, 0) to replace. There is no functional change for them. The interface to KVM is not changed. The number of counters still be passed to KVM. It can be updated later separately. Intel-SIG: commit 722e42e45c2f perf/x86: Support counter mask Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-3-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Conflicts: arch/x86/events/amd/core.c arch/x86/events/core.c arch/x86/events/zhaoxin/core.c [jz: - Resolve conflict due to f73cefa3b72e "perf/x86: Fix smp_processor_id()-in -preemptible warnings" was backported before this commit. - Also resolve indentation conflict due to 4da97fa2c083 "perf/zhaoxin/uncore: Enhance uncore support and fix related bugs"] Signed-off-by: Jason Zeng --- arch/x86/events/amd/core.c | 24 ++--- arch/x86/events/core.c | 106 ++++++++++----------- arch/x86/events/intel/core.c | 164 ++++++++++++++++----------------- arch/x86/events/intel/ds.c | 19 ++-- arch/x86/events/intel/knc.c | 2 +- arch/x86/events/intel/p4.c | 10 +- arch/x86/events/intel/p6.c | 2 +- arch/x86/events/perf_event.h | 47 ++++++++-- arch/x86/events/zhaoxin/core.c | 24 ++--- 9 files changed, 209 insertions(+), 189 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index f5e5906ae36c..e7edce877878 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -435,7 +435,7 @@ static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, * be removed on one CPU at a time AND PMU is disabled * when we come here */ - for (i = 0; i < x86_pmu.num_counters; i++) { + for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (cmpxchg(nb->owners + i, event, NULL) == event) break; } @@ -502,7 +502,7 @@ __amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *ev * because of successive calls to x86_schedule_events() from * hw_perf_group_sched_in() without hw_perf_enable() */ - for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { + for_each_set_bit(idx, c->idxmsk, x86_pmu_max_num_counters(NULL)) { if (new == -1 || hwc->idx == idx) /* assign free slot, prefer hwc->idx */ old = cmpxchg(nb->owners + idx, NULL, event); @@ -545,7 +545,7 @@ static struct amd_nb *amd_alloc_nb(int cpu) /* * initialize all possible NB constraints */ - for (i = 0; i < x86_pmu.num_counters; i++) { + for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { __set_bit(i, nb->event_constraints[i].idxmsk); nb->event_constraints[i].weight = 1; } @@ -738,7 +738,7 @@ static void amd_pmu_check_overflow(void) * counters are always enabled when this function is called and * ARCH_PERFMON_EVENTSEL_INT is always set. 
*/ - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -758,7 +758,7 @@ static void amd_pmu_enable_all(int added) amd_brs_enable_all(); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { /* only activate events which are marked as active */ if (!test_bit(idx, cpuc->active_mask)) continue; @@ -951,7 +951,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) /* Clear any reserved bits set by buggy microcode */ status &= amd_pmu_global_cntr_mask; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -1291,7 +1291,7 @@ static __initconst const struct x86_pmu amd_pmu = { .addr_offset = amd_pmu_addr_offset, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS, + .cntr_mask64 = GENMASK_ULL(AMD64_NUM_COUNTERS - 1, 0), .add = amd_pmu_add_event, .del = amd_pmu_del_event, .cntval_bits = 48, @@ -1390,7 +1390,7 @@ static int __init amd_core_pmu_init(void) */ x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; - x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + x86_pmu.cntr_mask64 = GENMASK_ULL(AMD64_NUM_COUNTERS_CORE - 1, 0); /* Check for Performance Monitoring v2 support */ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { @@ -1400,9 +1400,9 @@ static int __init amd_core_pmu_init(void) x86_pmu.version = 2; /* Find the number of available Core PMCs */ - x86_pmu.num_counters = ebx.split.num_core_pmc; + x86_pmu.cntr_mask64 = GENMASK_ULL(ebx.split.num_core_pmc - 1, 0); - amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + amd_pmu_global_cntr_mask = x86_pmu.cntr_mask64; /* Update PMC handling functions */ x86_pmu.enable_all = amd_pmu_v2_enable_all; @@ -1430,12 +1430,12 @@ static int __init amd_core_pmu_init(void) * even numbered counter that has a consecutive adjacent odd * numbered counter following it. 
*/ - for (i = 0; i < x86_pmu.num_counters - 1; i += 2) + for (i = 0; i < x86_pmu_max_num_counters(NULL) - 1; i += 2) even_ctr_mask |= BIT_ULL(i); pair_constraint = (struct event_constraint) __EVENT_CONSTRAINT(0, even_ctr_mask, 0, - x86_pmu.num_counters / 2, 0, + x86_pmu_max_num_counters(NULL) / 2, 0, PERF_X86_EVENT_PAIR); x86_pmu.get_event_constraints = amd_get_event_constraints_f17h; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 606d8bb43d58..46e1f78af3f3 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -191,29 +191,31 @@ static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC -static inline int get_possible_num_counters(void) +static inline u64 get_possible_counter_mask(void) { - int i, num_counters = x86_pmu.num_counters; + u64 cntr_mask = x86_pmu.cntr_mask64; + int i; if (!is_hybrid()) - return num_counters; + return cntr_mask; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) - num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters); + cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64; - return num_counters; + return cntr_mask; } static bool reserve_pmc_hardware(void) { - int i, num_counters = get_possible_num_counters(); + u64 cntr_mask = get_possible_counter_mask(); + int i, end; - for (i = 0; i < num_counters; i++) { + for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } - for (i = 0; i < num_counters; i++) { + for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } @@ -221,13 +223,14 @@ static bool reserve_pmc_hardware(void) return true; eventsel_fail: - for (i--; i >= 0; i--) + end = i; + for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_evntsel_nmi(x86_pmu_config_addr(i)); - - i = num_counters; + i = X86_PMC_IDX_MAX; perfctr_fail: - for (i--; i >= 0; i--) + end = i; + for_each_set_bit(i, (unsigned long *)&cntr_mask, end) release_perfctr_nmi(x86_pmu_event_addr(i)); return false; @@ -235,9 +238,10 @@ static bool reserve_pmc_hardware(void) static void release_pmc_hardware(void) { - int i, num_counters = get_possible_num_counters(); + u64 cntr_mask = get_possible_counter_mask(); + int i; - for (i = 0; i < num_counters; i++) { + for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } @@ -250,7 +254,8 @@ static void release_pmc_hardware(void) {} #endif -bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) +bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, + unsigned long *fixed_cntr_mask) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; @@ -261,7 +266,7 @@ bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) * Check to see if the BIOS enabled any of the counters, if so * complain and bail. 
*/ - for (i = 0; i < num_counters; i++) { + for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) { reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) @@ -275,12 +280,12 @@ bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) } } - if (num_counters_fixed) { + if (*(u64 *)fixed_cntr_mask) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; - for (i = 0; i < num_counters_fixed; i++) { + for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(i, pmu)) continue; if (val & (0x03ULL << i*4)) { @@ -681,7 +686,7 @@ void x86_pmu_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; u64 val; @@ -738,7 +743,7 @@ void x86_pmu_enable_all(int added) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) @@ -977,7 +982,6 @@ EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - int num_counters = hybrid(cpuc->pmu, num_counters); struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; @@ -1053,7 +1057,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) { - int gpmax = num_counters; + int gpmax = x86_pmu_max_num_counters(cpuc->pmu); /* * Do not allow scheduling of more than half the available @@ -1074,7 +1078,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * the extra Merge events needed by large increment events. 
*/ if (x86_pmu.flags & PMU_FL_PAIR) { - gpmax = num_counters - cpuc->n_pair; + gpmax -= cpuc->n_pair; WARN_ON(gpmax <= 0); } @@ -1159,12 +1163,10 @@ static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { - int num_counters = hybrid(cpuc->pmu, num_counters); - int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct perf_event *event; int n, max_count; - max_count = num_counters + num_counters_fixed; + max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu); /* current number of events already accepted */ n = cpuc->n_events; @@ -1521,23 +1523,23 @@ static void x86_pmu_start(struct perf_event *event, int flags) void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - u64 pebs, debugctl; - struct cpu_hw_events *cpuc; + unsigned long *cntr_mask, *fixed_cntr_mask; struct event_constraint *pebs_constraints; - int num_counters, num_counters_fixed, cpu, idx; + struct cpu_hw_events *cpuc; + u64 pebs, debugctl; + int cpu, idx; guard(irqsave)(); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_events, cpu); - num_counters = hybrid(cpuc->pmu, num_counters); - num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); + cntr_mask = hybrid(cpuc->pmu, cntr_mask); + fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); - if (!num_counters) + if (!*(u64 *)cntr_mask) return; - if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -1560,7 +1562,7 @@ void perf_event_print_debug(void) } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); - for (idx = 0; idx < num_counters; idx++) { + for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) { rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrl(x86_pmu_event_addr(idx), pmc_count); @@ -1573,7 +1575,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < num_counters_fixed; idx++) { + for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); @@ -1686,7 +1688,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) */ apic_write(APIC_LVTPC, APIC_DM_NMI); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -2042,18 +2044,15 @@ static void _x86_pmu_read(struct perf_event *event) static_call(x86_pmu_update)(event); } -void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, - u64 intel_ctrl) +void x86_pmu_show_pmu_cap(struct pmu *pmu) { pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", num_counters); + pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %lu\n", - hweight64((((1ULL << num_counters_fixed) - 1) - << INTEL_PMC_IDX_FIXED) & intel_ctrl)); - pr_info("... event mask: %016Lx\n", intel_ctrl); + pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); + pr_info("... 
event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) @@ -2090,7 +2089,7 @@ static int __init init_hw_perf_events(void) pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) + if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask)) goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -2101,14 +2100,14 @@ static int __init init_hw_perf_events(void) quirk->func(); if (!x86_pmu.intel_ctrl) - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters, 0, 0); + __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64, + 0, x86_pmu_num_counters(NULL), 0, 0); x86_pmu_format_group.attrs = x86_pmu.format_attrs; @@ -2117,11 +2116,8 @@ static int __init init_hw_perf_events(void) pmu.attr_update = x86_pmu.attr_update; - if (!is_hybrid()) { - x86_pmu_show_pmu_cap(x86_pmu.num_counters, - x86_pmu.num_counters_fixed, - x86_pmu.intel_ctrl); - } + if (!is_hybrid()) + x86_pmu_show_pmu_cap(NULL); if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; @@ -2485,7 +2481,7 @@ void perf_clear_dirty_counters(void) for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { if (i >= INTEL_PMC_IDX_FIXED) { /* Metrics and fake events don't have corresponding HW counters. */ - if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed)) + if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask))) continue; wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0); @@ -3051,8 +3047,8 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) * base PMU holds the correct number of counters for P-cores. 
*/ cap->version = x86_pmu.version; - cap->num_counters_gp = x86_pmu.num_counters; - cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->num_counters_gp = x86_pmu_num_counters(NULL); + cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); cap->bit_width_gp = x86_pmu.cntval_bits; cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 6df25d8b7b10..26eee1a93865 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2869,23 +2869,23 @@ static void intel_pmu_reset(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); - int num_counters = hybrid(cpuc->pmu, num_counters); + unsigned long *cntr_mask = hybrid(cpuc->pmu, cntr_mask); + unsigned long *fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); unsigned long flags; int idx; - if (!num_counters) + if (!*(u64 *)cntr_mask) return; local_irq_save(flags); pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); - for (idx = 0; idx < num_counters; idx++) { + for_each_set_bit(idx, cntr_mask, INTEL_PMC_MAX_GENERIC) { wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < num_counters_fixed; idx++) { + for_each_set_bit(idx, fixed_cntr_mask, INTEL_PMC_MAX_FIXED) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); @@ -2935,8 +2935,7 @@ static void x86_pmu_handle_guest_pebs(struct pt_regs *regs, !guest_pebs_idxs) return; - for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, - INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) { + for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, X86_PMC_IDX_MAX) { event = cpuc->events[bit]; if (!event->attr.precise_ip) continue; @@ -4279,7 +4278,7 @@ static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data) struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[idx]; arr[idx].msr = x86_pmu_config_addr(idx); @@ -4297,7 +4296,7 @@ static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data) arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; } - *nr = x86_pmu.num_counters; + *nr = x86_pmu_max_num_counters(cpuc->pmu); return arr; } @@ -4312,7 +4311,7 @@ static void core_pmu_enable_all(int added) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask) || @@ -4782,13 +4781,33 @@ static void flip_smm_bit(void *data) } } -static void intel_pmu_check_num_counters(int *num_counters, - int *num_counters_fixed, - u64 *intel_ctrl, u64 fixed_mask); +static void intel_pmu_check_counters_mask(u64 *cntr_mask, + u64 *fixed_cntr_mask, + u64 *intel_ctrl) +{ + unsigned int bit; + + bit = fls64(*cntr_mask); + if (bit > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + bit, INTEL_PMC_MAX_GENERIC); + *cntr_mask &= GENMASK_ULL(INTEL_PMC_MAX_GENERIC - 1, 0); + } + *intel_ctrl = *cntr_mask; + + bit = fls64(*fixed_cntr_mask); + if (bit > INTEL_PMC_MAX_FIXED) { + 
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + bit, INTEL_PMC_MAX_FIXED); + *fixed_cntr_mask &= GENMASK_ULL(INTEL_PMC_MAX_FIXED - 1, 0); + } + + *intel_ctrl |= *fixed_cntr_mask << INTEL_PMC_IDX_FIXED; +} static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints, - int num_counters, - int num_counters_fixed, + u64 cntr_mask, + u64 fixed_cntr_mask, u64 intel_ctrl); static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs); @@ -4811,11 +4830,10 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, &eax, &ebx, &ecx, &edx); - pmu->num_counters = fls(eax); - pmu->num_counters_fixed = fls(ebx); + pmu->cntr_mask64 = eax; + pmu->fixed_cntr_mask64 = ebx; } - if (!intel_pmu_broken_perf_cap()) { /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); @@ -4824,12 +4842,12 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) { - intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed, - &pmu->intel_ctrl, (1ULL << pmu->num_counters_fixed) - 1); - pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); + intel_pmu_check_counters_mask(&pmu->cntr_mask64, &pmu->fixed_cntr_mask64, + &pmu->intel_ctrl); + pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64); pmu->unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, - 0, pmu->num_counters, 0, 0); + __EVENT_CONSTRAINT(0, pmu->cntr_mask64, + 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); if (pmu->intel_cap.perf_metrics) pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; @@ -4842,8 +4860,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) pmu->pmu.capabilities &= ~PERF_PMU_CAP_AUX_OUTPUT; intel_pmu_check_event_constraints(pmu->event_constraints, - pmu->num_counters, - pmu->num_counters_fixed, + pmu->cntr_mask64, + pmu->fixed_cntr_mask64, pmu->intel_ctrl); intel_pmu_check_extra_regs(pmu->extra_regs); @@ -4904,7 +4922,7 @@ static bool init_hybrid_pmu(int cpu) intel_pmu_check_hybrid_pmus(pmu); - if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed)) + if (!check_hw_exists(&pmu->pmu, pmu->cntr_mask, pmu->fixed_cntr_mask)) return false; pr_info("%s PMU driver: ", pmu->name); @@ -4914,8 +4932,7 @@ static bool init_hybrid_pmu(int cpu) pr_cont("\n"); - x86_pmu_show_pmu_cap(pmu->num_counters, pmu->num_counters_fixed, - pmu->intel_ctrl); + x86_pmu_show_pmu_cap(&pmu->pmu); end: cpumask_set_cpu(cpu, &pmu->supported_cpus); @@ -6095,29 +6112,9 @@ static const struct attribute_group *hybrid_attr_update[] = { static struct attribute *empty_attrs; -static void intel_pmu_check_num_counters(int *num_counters, - int *num_counters_fixed, - u64 *intel_ctrl, u64 fixed_mask) -{ - if (*num_counters > INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - *num_counters, INTEL_PMC_MAX_GENERIC); - *num_counters = INTEL_PMC_MAX_GENERIC; - } - *intel_ctrl = (1ULL << *num_counters) - 1; - - if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - *num_counters_fixed, INTEL_PMC_MAX_FIXED); - *num_counters_fixed = INTEL_PMC_MAX_FIXED; - } - - *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED; -} - static void 
intel_pmu_check_event_constraints(struct event_constraint *event_constraints, - int num_counters, - int num_counters_fixed, + u64 cntr_mask, + u64 fixed_cntr_mask, u64 intel_ctrl) { struct event_constraint *c; @@ -6154,10 +6151,9 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con * generic counters */ if (!use_fixed_pseudo_encoding(c->code)) - c->idxmsk64 |= (1ULL << num_counters) - 1; + c->idxmsk64 |= cntr_mask; } - c->idxmsk64 &= - ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed)); + c->idxmsk64 &= cntr_mask | (fixed_cntr_mask << INTEL_PMC_IDX_FIXED); c->weight = hweight64(c->idxmsk64); } } @@ -6208,12 +6204,12 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) pmu->pmu_type = intel_hybrid_pmu_type_map[bit].id; pmu->name = intel_hybrid_pmu_type_map[bit].name; - pmu->num_counters = x86_pmu.num_counters; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed; - pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); + pmu->cntr_mask64 = x86_pmu.cntr_mask64; + pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64; + pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64); pmu->unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, - 0, pmu->num_counters, 0, 0); + __EVENT_CONSTRAINT(0, pmu->cntr_mask64, + 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; if (pmu->pmu_type & hybrid_small) { @@ -6316,14 +6312,14 @@ __init int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntr_mask64 = GENMASK_ULL(eax.split.num_counters - 1, 0); x86_pmu.cntval_bits = eax.split.bit_width; x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; - x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(x86_pmu.num_counters - 1, 0)); + x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(x86_pmu.cntr_mask64); x86_pmu.pebs_capable = PEBS_COUNTER_MASK; /* @@ -6333,12 +6329,10 @@ __init int intel_pmu_init(void) if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); - x86_pmu.num_counters_fixed = - max((int)edx.split.num_counters_fixed, assume); - - fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1; + x86_pmu.fixed_cntr_mask64 = + GENMASK_ULL(max((int)edx.split.num_counters_fixed, assume) - 1, 0); } else if (version >= 5) - x86_pmu.num_counters_fixed = fls(fixed_mask); + x86_pmu.fixed_cntr_mask64 = fixed_mask; if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -6938,11 +6932,13 @@ __init int intel_pmu_init(void) pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; intel_pmu_init_glc(&pmu->pmu); if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { - pmu->num_counters = x86_pmu.num_counters + 2; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1; + pmu->cntr_mask64 <<= 2; + pmu->cntr_mask64 |= 0x3; + pmu->fixed_cntr_mask64 <<= 1; + pmu->fixed_cntr_mask64 |= 0x1; } else { - pmu->num_counters = x86_pmu.num_counters; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + pmu->cntr_mask64 = x86_pmu.cntr_mask64; + pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64; } /* @@ -6952,15 +6948,16 @@ __init int intel_pmu_init(void) * mistakenly add extra counters for P-cores. Correct the number of * counters here. 
*/ - if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) { - pmu->num_counters = x86_pmu.num_counters; - pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + if ((x86_pmu_num_counters(&pmu->pmu) > 8) || (x86_pmu_num_counters_fixed(&pmu->pmu) > 4)) { + pmu->cntr_mask64 = x86_pmu.cntr_mask64; + pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64; } - pmu->pebs_events_mask = intel_pmu_pebs_mask(GENMASK_ULL(pmu->num_counters - 1, 0)); + pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64); pmu->unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, - 0, pmu->num_counters, 0, 0); + __EVENT_CONSTRAINT(0, pmu->cntr_mask64, + 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); + pmu->extra_regs = intel_glc_extra_regs; /* Initialize Atom core specific PerfMon capabilities.*/ @@ -7027,9 +7024,9 @@ __init int intel_pmu_init(void) * The constraints may be cut according to the CPUID enumeration * by inserting the EVENT_CONSTRAINT_END. */ - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; - intel_v5_gen_event_constraints[x86_pmu.num_counters_fixed].weight = -1; + if (fls64(x86_pmu.fixed_cntr_mask64) > INTEL_PMC_MAX_FIXED) + x86_pmu.fixed_cntr_mask64 &= GENMASK_ULL(INTEL_PMC_MAX_FIXED - 1, 0); + intel_v5_gen_event_constraints[fls64(x86_pmu.fixed_cntr_mask64)].weight = -1; x86_pmu.event_constraints = intel_v5_gen_event_constraints; pr_cont("generic architected perfmon, "); name = "generic_arch_v5+"; @@ -7056,18 +7053,17 @@ __init int intel_pmu_init(void) x86_pmu.attr_update = hybrid_attr_update; } - intel_pmu_check_num_counters(&x86_pmu.num_counters, - &x86_pmu.num_counters_fixed, - &x86_pmu.intel_ctrl, - (u64)fixed_mask); + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, + &x86_pmu.fixed_cntr_mask64, + &x86_pmu.intel_ctrl); /* AnyThread may be deprecated on arch perfmon v5 or later */ if (x86_pmu.intel_cap.anythread_deprecated) x86_pmu.format_attrs = intel_arch_formats_attr; intel_pmu_check_event_constraints(x86_pmu.event_constraints, - x86_pmu.num_counters, - x86_pmu.num_counters_fixed, + x86_pmu.cntr_mask64, + x86_pmu.fixed_cntr_mask64, x86_pmu.intel_ctrl); /* * Access LBR MSR may cause #GP under certain circumstances. 
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 61383b15f31f..debd224bf6f3 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1137,7 +1137,6 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; int max_pebs_events = intel_pmu_max_num_pebs(cpuc->pmu); - int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); u64 threshold; int reserved; @@ -1145,7 +1144,7 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) return; if (x86_pmu.flags & PMU_FL_PEBS_ALL) - reserved = max_pebs_events + num_counters_fixed; + reserved = max_pebs_events + x86_pmu_max_num_counters_fixed(cpuc->pmu); else reserved = max_pebs_events; @@ -2166,8 +2165,8 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d mask = x86_pmu.pebs_events_mask; size = max_pebs_events; if (x86_pmu.flags & PMU_FL_PEBS_ALL) { - mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED; - size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; + mask |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED; + size = INTEL_PMC_IDX_FIXED + x86_pmu_max_num_counters_fixed(NULL); } if (unlikely(base >= top)) { @@ -2263,11 +2262,10 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct debug_store *ds = cpuc->ds; struct perf_event *event; void *base, *at, *top; - int bit, size; + int bit; u64 mask; if (!x86_pmu.pebs_active) @@ -2279,11 +2277,10 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d ds->pebs_index = ds->pebs_buffer_base; mask = hybrid(cpuc->pmu, pebs_events_mask) | - (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); - size = INTEL_PMC_IDX_FIXED + num_counters_fixed; + (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, size); + intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); return; } @@ -2293,11 +2290,11 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; pebs_status &= mask; - for_each_set_bit(bit, (unsigned long *)&pebs_status, size) + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) counts[bit]++; } - for_each_set_bit(bit, (unsigned long *)&mask, size) { + for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { if (counts[bit] == 0) continue; diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c index 618001c208e8..034a1f6a457c 100644 --- a/arch/x86/events/intel/knc.c +++ b/arch/x86/events/intel/knc.c @@ -303,7 +303,7 @@ static const struct x86_pmu knc_pmu __initconst = { .apic = 1, .max_period = (1ULL << 39) - 1, .version = 0, - .num_counters = 2, + .cntr_mask64 = 0x3, .cntval_bits = 40, .cntval_mask = (1ULL << 40) - 1, .get_event_constraints = x86_get_event_constraints, diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 35936188db01..844bc4fc4724 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -919,7 +919,7 @@ static void p4_pmu_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, 
X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[idx]; if (!test_bit(idx, cpuc->active_mask)) continue; @@ -998,7 +998,7 @@ static void p4_pmu_enable_all(int added) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[idx]; if (!test_bit(idx, cpuc->active_mask)) continue; @@ -1040,7 +1040,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) cpuc = this_cpu_ptr(&cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { int overflow; if (!test_bit(idx, cpuc->active_mask)) { @@ -1353,7 +1353,7 @@ static __initconst const struct x86_pmu p4_pmu = { * though leave it restricted at moment assuming * HT is on */ - .num_counters = ARCH_P4_MAX_CCCR, + .cntr_mask64 = GENMASK_ULL(ARCH_P4_MAX_CCCR - 1, 0), .apic = 1, .cntval_bits = ARCH_P4_CNTRVAL_BITS, .cntval_mask = ARCH_P4_CNTRVAL_MASK, @@ -1395,7 +1395,7 @@ __init int p4_pmu_init(void) * * Solve this by zero'ing out the registers to mimic a reset. */ - for (i = 0; i < x86_pmu.num_counters; i++) { + for_each_set_bit(i, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { reg = x86_pmu_config_addr(i); wrmsrl_safe(reg, 0ULL); } diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c index 408879b0c0d4..a6cffb4f4ef5 100644 --- a/arch/x86/events/intel/p6.c +++ b/arch/x86/events/intel/p6.c @@ -214,7 +214,7 @@ static __initconst const struct x86_pmu p6_pmu = { .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, - .num_counters = 2, + .cntr_mask64 = 0x3, /* * Events have 40 bits implemented. However they are designed such * that bits [32-39] are sign extensions of bit 31. As such the diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ddb33f67e4e3..0050cee7b490 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -692,8 +692,14 @@ struct x86_hybrid_pmu { union perf_capabilities intel_cap; u64 intel_ctrl; u64 pebs_events_mask; - int num_counters; - int num_counters_fixed; + union { + u64 cntr_mask64; + unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 fixed_cntr_mask64; + unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; struct event_constraint unconstrained; u64 hw_cache_event_ids @@ -781,8 +787,14 @@ struct x86_pmu { int (*rdpmc_index)(int index); u64 (*event_map)(int); int max_events; - int num_counters; - int num_counters_fixed; + union { + u64 cntr_mask64; + unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 fixed_cntr_mask64; + unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; int cntval_bits; u64 cntval_mask; union { @@ -1133,8 +1145,8 @@ static inline int x86_pmu_rdpmc_index(int index) return x86_pmu.rdpmc_index ? 
x86_pmu.rdpmc_index(index) : index; } -bool check_hw_exists(struct pmu *pmu, int num_counters, - int num_counters_fixed); +bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask, + unsigned long *fixed_cntr_mask); int x86_add_exclusive(unsigned int what); @@ -1205,8 +1217,27 @@ void x86_pmu_enable_event(struct perf_event *event); int x86_pmu_handle_irq(struct pt_regs *regs); -void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, - u64 intel_ctrl); +void x86_pmu_show_pmu_cap(struct pmu *pmu); + +static inline int x86_pmu_num_counters(struct pmu *pmu) +{ + return hweight64(hybrid(pmu, cntr_mask64)); +} + +static inline int x86_pmu_max_num_counters(struct pmu *pmu) +{ + return fls64(hybrid(pmu, cntr_mask64)); +} + +static inline int x86_pmu_num_counters_fixed(struct pmu *pmu) +{ + return hweight64(hybrid(pmu, fixed_cntr_mask64)); +} + +static inline int x86_pmu_max_num_counters_fixed(struct pmu *pmu) +{ + return fls64(hybrid(pmu, fixed_cntr_mask64)); +} extern struct event_constraint emptyconstraint; diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c index 8ece3a70588e..2091c7c203ec 100644 --- a/arch/x86/events/zhaoxin/core.c +++ b/arch/x86/events/zhaoxin/core.c @@ -569,14 +569,14 @@ __init int zhaoxin_pmu_init(void) x86_pmu = zhaoxin_pmu; pr_info("Version check pass!\n"); - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.cntval_bits = eax.split.bit_width; - x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; - x86_pmu.events_maskl = ebx.full; - x86_pmu.events_mask_len = eax.split.mask_length; - - x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_pmu.version = version; + x86_pmu.cntr_mask64 = GENMASK_ULL(eax.split.num_counters - 1, 0); + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.fixed_cntr_mask64 = GENMASK_ULL(edx.split.num_counters_fixed - 1, 0); x86_add_quirk(zhaoxin_arch_events_quirk); switch (boot_cpu_data.x86) { @@ -669,13 +669,13 @@ __init int zhaoxin_pmu_init(void) return -ENODEV; } - x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1; - x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; + x86_pmu.intel_ctrl |= x86_pmu.fixed_cntr_mask64 << INTEL_PMC_IDX_FIXED; if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; + c->idxmsk64 |= x86_pmu.cntr_mask64; + c->weight += x86_pmu_num_counters(NULL); } } -- Gitee From f60b0d66954e5872a6d7736dcae486e81859974f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:35 -0700 Subject: [PATCH 06/52] perf/x86: Add Lunar Lake and Arrow Lake support mainline inclusion from mainline-v6.11-rc1 commit a932aa0e868f92c5219d84b58c1131fc05ddcf79 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a932aa0e868f92c5219d84b58c1131fc05ddcf79 --------------------------- From PMU's perspective, Lunar Lake and Arrow Lake are similar to the previous generation Meteor Lake. Both are hybrid platforms, with e-core and p-core. 
The key differences include: - The e-core supports 3 new fixed counters - The p-core supports an updated PEBS Data Source format - More GP counters (Updated event constraint table) - New Architectural performance monitoring V6 (New Perfmon MSRs aliasing, umask2, eq). - New PEBS format V6 (Counters Snapshotting group) - New RDPMC metrics clear mode The legacy features, the 3 new fixed counters and updated event constraint table are enabled in this patch. The new PEBS data source format, the architectural performance monitoring V6, the PEBS format V6, and the new RDPMC metrics clear mode are supported in the following patches. Intel-SIG: commit a932aa0e868f perf/x86: Add Lunar Lake and Arrow Lake support Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-4-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 117 ++++++++++++++++++++++++++++++ arch/x86/events/intel/ds.c | 24 ++++++ arch/x86/events/perf_event.h | 2 + arch/x86/include/asm/perf_event.h | 4 + 4 files changed, 147 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 26eee1a93865..7317f9367281 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -211,6 +211,17 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_skt_event_constraints[] __read_mostly = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */ + FIXED_EVENT_CONSTRAINT(0x0073, 4), /* TOPDOWN_BAD_SPECULATION.ALL */ + FIXED_EVENT_CONSTRAINT(0x019c, 5), /* TOPDOWN_FE_BOUND.ALL */ + FIXED_EVENT_CONSTRAINT(0x02c2, 6), /* TOPDOWN_RETIRING.ALL */ + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_skl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -360,6 +371,55 @@ static struct extra_reg intel_rwc_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct event_constraint intel_lnc_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF_TSC_P */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + + INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), + INTEL_UEVENT_CONSTRAINT(0x0175, 0x4), + + INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff), + INTEL_EVENT_CONSTRAINT(0x3c, 0x3ff), + /* + * Generally event codes < 0x90 are restricted to counters 0-3. 
+ * The 0x2E and 0x3C are exception, which has no restriction. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), + INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), + INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x10a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8), + INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3), + INTEL_EVENT_CONSTRAINT(0xce, 0x1), + + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), + /* + * Generally event codes >= 0x90 are likely to have no restrictions. + * The exception are defined as above. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0x3ff), + + EVENT_CONSTRAINT_END +}; + + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -5912,6 +5972,23 @@ static struct attribute *adl_hybrid_events_attrs[] = { NULL, }; +EVENT_ATTR_STR_HYBRID(topdown-retiring, td_retiring_lnl, "event=0xc2,umask=0x02;event=0x00,umask=0x80", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-fe-bound, td_fe_bound_lnl, "event=0x9c,umask=0x01;event=0x00,umask=0x82", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-be-bound, td_be_bound_lnl, "event=0xa4,umask=0x02;event=0x00,umask=0x83", hybrid_big_small); + +static struct attribute *lnl_hybrid_events_attrs[] = { + EVENT_PTR(slots_adl), + EVENT_PTR(td_retiring_lnl), + EVENT_PTR(td_bad_spec_adl), + EVENT_PTR(td_fe_bound_lnl), + EVENT_PTR(td_be_bound_lnl), + EVENT_PTR(td_heavy_ops_adl), + EVENT_PTR(td_br_mis_adl), + EVENT_PTR(td_fetch_lat_adl), + EVENT_PTR(td_mem_bound_adl), + NULL +}; + /* Must be in IDX order */ EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small); EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small); @@ -6269,6 +6346,21 @@ static __always_inline void intel_pmu_init_grt(struct pmu *pmu) hybrid(pmu, extra_regs) = intel_grt_extra_regs; } +static __always_inline void intel_pmu_init_lnc(struct pmu *pmu) +{ + intel_pmu_init_glc(pmu); + hybrid(pmu, event_constraints) = intel_lnc_event_constraints; + hybrid(pmu, pebs_constraints) = intel_lnc_pebs_event_constraints; + hybrid(pmu, extra_regs) = intel_rwc_extra_regs; +} + +static __always_inline void intel_pmu_init_skt(struct pmu *pmu) +{ + intel_pmu_init_grt(pmu); + hybrid(pmu, event_constraints) = intel_skt_event_constraints; + hybrid(pmu, extra_regs) = intel_cmt_extra_regs; +} + __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; @@ -6999,6 +7091,31 @@ __init int intel_pmu_init(void) name = "meteorlake_hybrid"; break; + case INTEL_FAM6_LUNARLAKE_M: + case INTEL_FAM6_ARROWLAKE: + intel_pmu_init_hybrid(hybrid_big_small); + + x86_pmu.get_event_constraints = mtl_get_event_constraints; + x86_pmu.hw_config = adl_hw_config; + + td_attr = lnl_hybrid_events_attrs; + mem_attr = mtl_hybrid_mem_attrs; + tsx_attr = adl_hybrid_tsx_attrs; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; + + /* Initialize big core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; + intel_pmu_init_lnc(&pmu->pmu); + + /* Initialize Atom core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; + intel_pmu_init_skt(&pmu->pmu); + + pr_cont("Lunarlake Hybrid events, "); + name = "lunarlake_hybrid"; + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index debd224bf6f3..c4186759641a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1085,6 +1085,30 @@ struct event_constraint intel_glc_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_lnc_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. 
+ */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 0050cee7b490..65bca423b265 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1577,6 +1577,8 @@ extern struct event_constraint intel_icl_pebs_event_constraints[]; extern struct event_constraint intel_glc_pebs_event_constraints[]; +extern struct event_constraint intel_lnc_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 066721067f8a..ebe7f2c3cd59 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -315,6 +315,10 @@ struct x86_pmu_capability { #define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3) #define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS) +/* TOPDOWN_BAD_SPECULATION.ALL: fixed counter 4 (Atom only) */ +/* TOPDOWN_FE_BOUND.ALL: fixed counter 5 (Atom only) */ +/* TOPDOWN_RETIRING.ALL: fixed counter 6 (Atom only) */ + static inline bool use_fixed_pseudo_encoding(u64 code) { return !(code & 0xff); -- Gitee From 11b540b11298b0670f03dec547a101e639065af0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:36 -0700 Subject: [PATCH 07/52] perf/x86/intel: Rename model-specific pebs_latency_data functions mainline inclusion from mainline-v6.11-rc1 commit 090262439f66df03d4e9d0e52e14104b729e2ef8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=090262439f66df03d4e9d0e52e14104b729e2ef8 --------------------------- The model-specific pebs_latency_data functions of ADL and MTL use the "small" as a postfix to indicate the e-core. The postfix is too generic for a model-specific function. It cannot provide useful information that can directly map it to a specific uarch, which can facilitate the development and maintenance. Use the abbr of the uarch to rename the model-specific functions. 
Intel-SIG: commit 090262439f66 perf/x86/intel: Rename model-specific pebs_latency_data functions Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-5-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 8 ++++---- arch/x86/events/intel/ds.c | 20 ++++++++++---------- arch/x86/events/perf_event.h | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7317f9367281..4734b67e9db3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6639,7 +6639,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_GRACEMONT: intel_pmu_init_grt(NULL); intel_pmu_pebs_data_source_grt(); - x86_pmu.pebs_latency_data = adl_latency_data_small; + x86_pmu.pebs_latency_data = grt_latency_data; x86_pmu.get_event_constraints = tnt_get_event_constraints; td_attr = tnt_events_attrs; mem_attr = grt_mem_attrs; @@ -6653,7 +6653,7 @@ __init int intel_pmu_init(void) intel_pmu_init_grt(NULL); x86_pmu.extra_regs = intel_cmt_extra_regs; intel_pmu_pebs_data_source_cmt(); - x86_pmu.pebs_latency_data = mtl_latency_data_small; + x86_pmu.pebs_latency_data = cmt_latency_data; x86_pmu.get_event_constraints = cmt_get_event_constraints; td_attr = cmt_events_attrs; mem_attr = grt_mem_attrs; @@ -7009,7 +7009,7 @@ __init int intel_pmu_init(void) */ intel_pmu_init_hybrid(hybrid_big_small); - x86_pmu.pebs_latency_data = adl_latency_data_small; + x86_pmu.pebs_latency_data = grt_latency_data; x86_pmu.get_event_constraints = adl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type; @@ -7066,7 +7066,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_METEORLAKE_L: intel_pmu_init_hybrid(hybrid_big_small); - x86_pmu.pebs_latency_data = mtl_latency_data_small; + x86_pmu.pebs_latency_data = cmt_latency_data; x86_pmu.get_event_constraints = mtl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index c4186759641a..56f5749b7832 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -256,8 +256,8 @@ static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock) } /* Retrieve the latency data for e-core of ADL */ -static u64 __adl_latency_data_small(struct perf_event *event, u64 status, - u8 dse, bool tlb, bool lock, bool blk) +static u64 __grt_latency_data(struct perf_event *event, u64 status, + u8 dse, bool tlb, bool lock, bool blk) { u64 val; @@ -276,27 +276,27 @@ static u64 __adl_latency_data_small(struct perf_event *event, u64 status, return val; } -u64 adl_latency_data_small(struct perf_event *event, u64 status) +u64 grt_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; dse.val = status; - return __adl_latency_data_small(event, status, dse.ld_dse, - dse.ld_locked, dse.ld_stlb_miss, - dse.ld_data_blk); + return __grt_latency_data(event, status, dse.ld_dse, + dse.ld_locked, dse.ld_stlb_miss, + dse.ld_data_blk); } /* Retrieve the latency data for e-core of MTL */ -u64 mtl_latency_data_small(struct perf_event *event, u64 status) +u64 cmt_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; dse.val = status; - return __adl_latency_data_small(event, status, dse.mtl_dse, - dse.mtl_stlb_miss, dse.mtl_locked, - dse.mtl_fwd_blk); + return 
__grt_latency_data(event, status, dse.mtl_dse, + dse.mtl_stlb_miss, dse.mtl_locked, + dse.mtl_fwd_blk); } static u64 load_latency_data(struct perf_event *event, u64 status) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 65bca423b265..8885c0868ad4 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1543,9 +1543,9 @@ void intel_pmu_disable_bts(void); int intel_pmu_drain_bts_buffer(void); -u64 adl_latency_data_small(struct perf_event *event, u64 status); +u64 grt_latency_data(struct perf_event *event, u64 status); -u64 mtl_latency_data_small(struct perf_event *event, u64 status); +u64 cmt_latency_data(struct perf_event *event, u64 status); extern struct event_constraint intel_core2_pebs_event_constraints[]; -- Gitee From b06e5d15c1db9e988f018f1b9a822ffa587095c4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:37 -0700 Subject: [PATCH 08/52] perf/x86/intel: Support new data source for Lunar Lake mainline inclusion from mainline-v6.11-rc1 commit 608f6976c309793ceea37292c54b057dab091944 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=608f6976c309793ceea37292c54b057dab091944 --------------------------- A new PEBS data source format is introduced for the p-core of Lunar Lake. The data source field is extended to 8 bits with new encodings. A new layout is introduced into the union intel_x86_pebs_dse. Introduce the lnl_latency_data() to parse the new format. Enlarge the pebs_data_source[] accordingly to include new encodings. Only the mem load and the mem store events can generate the data source. Introduce INTEL_HYBRID_LDLAT_CONSTRAINT and INTEL_HYBRID_STLAT_CONSTRAINT to mark them. Add two new bits for the new cache-related data src, L2_MHB and MSC. The L2_MHB is short for L2 Miss Handling Buffer, which is similar to LFB (Line Fill Buffer), but to track the L2 Cache misses. The MSC stands for the memory-side cache. 
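As a rough, stand-alone sketch of what the wider field means for the decode path: the lookup stays a plain table index, only the index mask grows from the 4-bit GRT/CMT encoding space to the 8-bit one used here (the table contents and helper names below are illustrative; the two mask widths match this patch).

  #include <stdint.h>
  #include <stdio.h>

  #define PEBS_DSE_GRT_MASK  0x0f   /* 16-entry encoding space (GRT/CMT) */
  #define PEBS_DSE_LNC_MASK  0xff   /* 256-entry encoding space (LNC) */

  /* Stand-in for the per-PMU pebs_data_source[] table of perf_mem_data_src values. */
  static uint64_t data_source[0x100];

  static uint64_t decode_dse(uint64_t pebs_status, uint64_t mask)
  {
          /* The low bits of the PEBS data-source field index the table. */
          return data_source[pebs_status & mask];
  }

  int main(void)
  {
          data_source[0x11] = 0x1234;   /* placeholder value; 0x11 is "local memory hit" on LNC */

          /* A 4-bit decode would alias encoding 0x11 onto entry 0x1. */
          printf("grt-style: %#llx  lnc-style: %#llx\n",
                 (unsigned long long)decode_dse(0x11, PEBS_DSE_GRT_MASK),
                 (unsigned long long)decode_dse(0x11, PEBS_DSE_LNC_MASK));
          return 0;
  }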
Intel-SIG: commit 608f6976c309 perf/x86/intel: Support new data source for Lunar Lake Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-6-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict in perf_event.h and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 2 + arch/x86/events/intel/ds.c | 94 ++++++++++++++++++++++++++++++++- arch/x86/events/perf_event.h | 16 +++++- include/uapi/linux/perf_event.h | 6 ++- 4 files changed, 113 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 4734b67e9db3..e583687b8706 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -7095,6 +7095,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ARROWLAKE: intel_pmu_init_hybrid(hybrid_big_small); + x86_pmu.pebs_latency_data = lnl_latency_data; x86_pmu.get_event_constraints = mtl_get_event_constraints; x86_pmu.hw_config = adl_hw_config; @@ -7112,6 +7113,7 @@ __init int intel_pmu_init(void) pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; intel_pmu_init_skt(&pmu->pmu); + intel_pmu_pebs_data_source_lnl(); pr_cont("Lunarlake Hybrid events, "); name = "lunarlake_hybrid"; break; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 56f5749b7832..f05515effed2 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -62,6 +62,15 @@ union intel_x86_pebs_dse { unsigned int mtl_fwd_blk:1; unsigned int ld_reserved4:24; }; + struct { + unsigned int lnc_dse:8; + unsigned int ld_reserved5:2; + unsigned int lnc_stlb_miss:1; + unsigned int lnc_locked:1; + unsigned int lnc_data_blk:1; + unsigned int lnc_addr_blk:1; + unsigned int ld_reserved6:18; + }; }; @@ -76,7 +85,7 @@ union intel_x86_pebs_dse { #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) /* Version for Sandy Bridge and later */ -static u64 pebs_data_source[] = { +static u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = { P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ @@ -172,6 +181,40 @@ void __init intel_pmu_pebs_data_source_cmt(void) __intel_pmu_pebs_data_source_cmt(pebs_data_source); } +/* Version for Lion Cove and later */ +static u64 lnc_pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX] = { + P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA), /* 0x00: ukn L3 */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 hit */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x02: L1 hit */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x03: LFB/L1 Miss Handling Buffer hit */ + 0, /* 0x04: Reserved */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x05: L2 Hit */ + OP_LH | LEVEL(L2_MHB) | P(SNOOP, NONE), /* 0x06: L2 Miss Handling Buffer Hit */ + 0, /* 0x07: Reserved */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x08: L3 Hit */ + 0, /* 0x09: Reserved */ + 0, /* 0x0a: Reserved */ + 0, /* 0x0b: Reserved */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD), /* 0x0c: L3 Hit Snoop Fwd */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0d: L3 Hit Snoop HitM */ + 0, /* 0x0e: Reserved */ + P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x0f: L3 Miss Snoop HitM */ + OP_LH | LEVEL(MSC) | P(SNOOP, NONE), /* 0x10: Memory-side Cache Hit */ + OP_LH 
| P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, NONE), /* 0x11: Local Memory Hit */ +}; + +void __init intel_pmu_pebs_data_source_lnl(void) +{ + u64 *data_source; + + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source; + memcpy(data_source, lnc_pebs_data_source, sizeof(lnc_pebs_data_source)); + + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source; + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); + __intel_pmu_pebs_data_source_cmt(data_source); +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; @@ -263,7 +306,7 @@ static u64 __grt_latency_data(struct perf_event *event, u64 status, WARN_ON_ONCE(hybrid_pmu(event->pmu)->pmu_type == hybrid_big); - dse &= PERF_PEBS_DATA_SOURCE_MASK; + dse &= PERF_PEBS_DATA_SOURCE_GRT_MASK; val = hybrid_var(event->pmu, pebs_data_source)[dse]; pebs_set_tlb_lock(&val, tlb, lock); @@ -299,6 +342,51 @@ u64 cmt_latency_data(struct perf_event *event, u64 status) dse.mtl_fwd_blk); } +static u64 lnc_latency_data(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + union perf_mem_data_src src; + u64 val; + + dse.val = status; + + /* LNC core latency data */ + val = hybrid_var(event->pmu, pebs_data_source)[status & PERF_PEBS_DATA_SOURCE_MASK]; + if (!val) + val = P(OP, LOAD) | LEVEL(NA) | P(SNOOP, NA); + + if (dse.lnc_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + if (dse.lnc_locked) + val |= P(LOCK, LOCKED); + + if (dse.lnc_data_blk) + val |= P(BLK, DATA); + if (dse.lnc_addr_blk) + val |= P(BLK, ADDR); + if (!dse.lnc_data_blk && !dse.lnc_addr_blk) + val |= P(BLK, NA); + + src.val = val; + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + src.mem_op = P(OP, STORE); + + return src.val; +} + +u64 lnl_latency_data(struct perf_event *event, u64 status) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->pmu_type == hybrid_small) + return cmt_latency_data(event, status); + + return lnc_latency_data(event, status); +} + static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; @@ -1089,6 +1177,8 @@ struct event_constraint intel_lnc_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3ff), + INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 8885c0868ad4..172b139052a0 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -483,6 +483,14 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID) +#define INTEL_HYBRID_LDLAT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID|PERF_X86_EVENT_PEBS_LD_HSW) + +#define INTEL_HYBRID_STLAT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID|PERF_X86_EVENT_PEBS_ST_HSW) + /* Event constraint, but match on all event flags too. 
*/ #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS) @@ -662,8 +670,10 @@ enum { x86_lbr_exclusive_max, }; -#define PERF_PEBS_DATA_SOURCE_MAX 0x10 +#define PERF_PEBS_DATA_SOURCE_MAX 0x100 #define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1) +#define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10 +#define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1) enum hybrid_cpu_type { HYBRID_INTEL_NONE, @@ -1547,6 +1557,8 @@ u64 grt_latency_data(struct perf_event *event, u64 status); u64 cmt_latency_data(struct perf_event *event, u64 status); +u64 lnl_latency_data(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; @@ -1668,6 +1680,8 @@ void intel_pmu_pebs_data_source_mtl(void); void intel_pmu_pebs_data_source_cmt(void); +void intel_pmu_pebs_data_source_lnl(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3a64499b0f5d..4842c36fdf80 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1349,12 +1349,14 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ +/* 0x7 available */ #define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ #define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ #define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ -- Gitee From 0e5e8fd2333c780e889ceef470c45182b175ed70 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:38 -0700 Subject: [PATCH 09/52] perf/x86: Add config_mask to represent EVENTSEL bitmask mainline inclusion from mainline-v6.11-rc1 commit e8fb5d6e765838e913253ef7c9b6fd8ec76c8d53 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e8fb5d6e765838e913253ef7c9b6fd8ec76c8d53 --------------------------- Different vendors may support different fields in EVENTSEL MSR, such as Intel would introduce new fields umask2 and eq bits in EVENTSEL MSR since Perfmon version 6. However, a fixed mask X86_RAW_EVENT_MASK is used to filter the attr.config. Introduce a new config_mask to record the real supported EVENTSEL bitmask. Only apply it to the existing code now. No functional change. 
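Conceptually, the valid-bits check on attr.config just moves from one global constant to a per-PMU mask; a minimal stand-alone sketch of that idea follows (struct and helper names are illustrative, and the mask values are placeholders rather than the real X86_RAW_EVENT_MASK; the umask2/eq bit positions are those defined by the follow-up PERFEVTSEL patch).

  #include <stdint.h>
  #include <stdio.h>

  #define RAW_EVENT_MASK_LEGACY  0x00000000ffffffffULL   /* placeholder for X86_RAW_EVENT_MASK */
  #define EVENTSEL_UMASK2        (0xffULL << 40)          /* umask2, bits 47:40 */
  #define EVENTSEL_EQ            (1ULL << 36)             /* eq flag */

  struct fake_pmu {
          uint64_t config_mask;   /* EVENTSEL bits this PMU actually supports */
  };

  /* Mirrors the idea of x86_pmu_get_event_config(): keep only supported bits. */
  static uint64_t get_event_config(const struct fake_pmu *pmu, uint64_t attr_config)
  {
          return attr_config & pmu->config_mask;
  }

  int main(void)
  {
          struct fake_pmu legacy = { .config_mask = RAW_EVENT_MASK_LEGACY };
          struct fake_pmu v6 = { .config_mask = RAW_EVENT_MASK_LEGACY |
                                                EVENTSEL_UMASK2 | EVENTSEL_EQ };
          uint64_t cfg = (0xaaULL << 40) | 0x08c4;   /* umask2 bits plus a legacy event/umask */

          /* The legacy PMU silently drops the umask2 bits; the v6 PMU keeps them. */
          printf("legacy: %#llx  v6: %#llx\n",
                 (unsigned long long)get_event_config(&legacy, cfg),
                 (unsigned long long)get_event_config(&v6, cfg));
          return 0;
  }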
Intel-SIG: commit e8fb5d6e7658 perf/x86: Add config_mask to represent EVENTSEL bitmask Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-7-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 5 ++++- arch/x86/events/intel/core.c | 1 + arch/x86/events/perf_event.h | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 46e1f78af3f3..8f0982639a28 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -626,7 +626,7 @@ int x86_pmu_hw_config(struct perf_event *event) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; if (event->attr.type == event->pmu->type) - event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + event->hw.config |= x86_pmu_get_event_config(event); if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; @@ -2102,6 +2102,9 @@ static int __init init_hw_perf_events(void) if (!x86_pmu.intel_ctrl) x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; + if (!x86_pmu.config_mask) + x86_pmu.config_mask = X86_RAW_EVENT_MASK; + perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e583687b8706..68a21893cccb 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6284,6 +6284,7 @@ static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) pmu->cntr_mask64 = x86_pmu.cntr_mask64; pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64; pmu->pebs_events_mask = intel_pmu_pebs_mask(pmu->cntr_mask64); + pmu->config_mask = X86_RAW_EVENT_MASK; pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, pmu->cntr_mask64, 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 172b139052a0..79f789bb88d0 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -702,6 +702,7 @@ struct x86_hybrid_pmu { union perf_capabilities intel_cap; u64 intel_ctrl; u64 pebs_events_mask; + u64 config_mask; union { u64 cntr_mask64; unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; @@ -797,6 +798,7 @@ struct x86_pmu { int (*rdpmc_index)(int index); u64 (*event_map)(int); int max_events; + u64 config_mask; union { u64 cntr_mask64; unsigned long cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; @@ -1249,6 +1251,11 @@ static inline int x86_pmu_max_num_counters_fixed(struct pmu *pmu) return fls64(hybrid(pmu, fixed_cntr_mask64)); } +static inline u64 x86_pmu_get_event_config(struct perf_event *event) +{ + return event->attr.config & hybrid(event->pmu, config_mask); +} + extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; -- Gitee From be2f68257b1e5f318f7385820f3ea767c417f928 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:39 -0700 Subject: [PATCH 10/52] perf/x86/intel: Support PERFEVTSEL extension mainline inclusion from mainline-v6.11-rc1 commit dce0c74d2d180ce21d074b4f977821a567ab0020 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dce0c74d2d180ce21d074b4f977821a567ab0020 --------------------------- Two new fields (the unit mask2, 
and the equal flag) are added in the IA32_PERFEVTSELx MSRs. They can be enumerated by the CPUID.23H.0.EBX. Update the config_mask in x86_pmu and x86_hybrid_pmu for the true layout of the PERFEVTSEL. Expose the new formats into sysfs if they are available. The umask extension reuses the same format attr name "umask" as the previous umask. Add umask2_show to determine/display the correct format for the current machine. Intel-SIG: commit dce0c74d2d18 perf/x86/intel: Support PERFEVTSEL extension Co-developed-by: Dapeng Mi Signed-off-by: Dapeng Mi Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-8-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 69 +++++++++++++++++++++++++++++-- arch/x86/include/asm/perf_event.h | 4 ++ 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 68a21893cccb..b207214507f1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4730,8 +4730,55 @@ PMU_FORMAT_ATTR(pc, "config:19" ); PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ PMU_FORMAT_ATTR(inv, "config:23" ); PMU_FORMAT_ATTR(cmask, "config:24-31" ); -PMU_FORMAT_ATTR(in_tx, "config:32"); -PMU_FORMAT_ATTR(in_tx_cp, "config:33"); +PMU_FORMAT_ATTR(in_tx, "config:32" ); +PMU_FORMAT_ATTR(in_tx_cp, "config:33" ); +PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */ + +static ssize_t umask2_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + u64 mask = hybrid(dev_get_drvdata(dev), config_mask) & ARCH_PERFMON_EVENTSEL_UMASK2; + + if (mask == ARCH_PERFMON_EVENTSEL_UMASK2) + return sprintf(page, "config:8-15,40-47\n"); + + /* Roll back to the old format if umask2 is not supported. */ + return sprintf(page, "config:8-15\n"); +} + +static struct device_attribute format_attr_umask2 = + __ATTR(umask, 0444, umask2_show, NULL); + +static struct attribute *format_evtsel_ext_attrs[] = { + &format_attr_umask2.attr, + &format_attr_eq.attr, + NULL +}; + +static umode_t +evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + u64 mask; + + /* + * The umask and umask2 have different formats but share the + * same attr name. In update mode, the previous value of the + * umask is unconditionally removed before is_visible. If + * umask2 format is not enumerated, it's impossible to roll + * back to the old format. + * Does the check in umask2_show rather than is_visible. + */ + if (i == 0) + return attr->mode; + + mask = hybrid(dev_get_drvdata(dev), config_mask); + if (i == 1) + return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? 
attr->mode : 0; + + return 0; +} static struct attribute *intel_arch_formats_attr[] = { &format_attr_event.attr, @@ -4884,8 +4931,14 @@ static inline bool intel_pmu_broken_perf_cap(void) static void update_pmu_cap(struct x86_hybrid_pmu *pmu) { - unsigned int sub_bitmaps = cpuid_eax(ARCH_PERFMON_EXT_LEAF); - unsigned int eax, ebx, ecx, edx; + unsigned int sub_bitmaps, eax, ebx, ecx, edx; + + cpuid(ARCH_PERFMON_EXT_LEAF, &sub_bitmaps, &ebx, &ecx, &edx); + + if (ebx & ARCH_PERFMON_EXT_UMASK2) + pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; + if (ebx & ARCH_PERFMON_EXT_EQ) + pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, @@ -5932,6 +5985,12 @@ static struct attribute_group group_format_extra_skl = { .is_visible = exra_is_visible, }; +static struct attribute_group group_format_evtsel_ext = { + .name = "format", + .attrs = format_evtsel_ext_attrs, + .is_visible = evtsel_ext_is_visible, +}; + static struct attribute_group group_default = { .attrs = intel_pmu_attrs, .is_visible = default_is_visible, @@ -5945,6 +6004,7 @@ static const struct attribute_group *attr_update[] = { &group_caps_lbr, &group_format_extra, &group_format_extra_skl, + &group_format_evtsel_ext, &group_default, NULL, }; @@ -6182,6 +6242,7 @@ static const struct attribute_group *hybrid_attr_update[] = { &group_caps_gen, &group_caps_lbr, &hybrid_group_format_extra, + &group_format_evtsel_ext, &group_default, &hybrid_group_cpus, NULL, diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ebe7f2c3cd59..335202d94804 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -32,6 +32,8 @@ #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL #define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35) +#define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) +#define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) #define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 @@ -193,6 +195,8 @@ union cpuid10_edx { * detection/enumeration details: */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 +#define ARCH_PERFMON_EXT_UMASK2 0x1 +#define ARCH_PERFMON_EXT_EQ 0x2 #define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 -- Gitee From 729027f920e7490e85424c4eddeaa94219aa1e57 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 Jun 2024 07:35:40 -0700 Subject: [PATCH 11/52] perf/x86/intel: Support Perfmon MSRs aliasing mainline inclusion from mainline-v6.11-rc1 commit 149fd4712bcd492a031945f92e5ce19879f62311 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=149fd4712bcd492a031945f92e5ce19879f62311 --------------------------- The architectural performance monitoring V6 supports a new range of counters' MSRs in the 19xxH address range. They include all the GP counter MSRs, the GP control MSRs, and the fixed counter MSRs. The step between each sibling counter is 4. Add intel_pmu_addr_offset() to calculate the correct offset. Add fixedctr in struct x86_pmu to store the address of the fixed counter 0. It can be used to calculate the rest of the fixed counters. The MSR address of the fixed counter control is not changed. 
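For reference, the resulting address arithmetic is easy to check in isolation; a minimal stand-alone sketch is below (only the MSR constants and the step of 4 come from this patch, the helper names are illustrative).

  #include <stdio.h>

  /* Constants introduced by this patch in msr-index.h. */
  #define MSR_IA32_PMC_V6_GP0_CTR    0x1900
  #define MSR_IA32_PMC_V6_GP0_CFG_A  0x1901
  #define MSR_IA32_PMC_V6_FX0_CTR    0x1980
  #define MSR_IA32_PMC_V6_STEP       4

  /* Mirrors intel_pmu_v6_addr_offset(): sibling counters are 4 MSRs apart. */
  static unsigned int v6_gp_ctr(int idx) { return MSR_IA32_PMC_V6_GP0_CTR   + MSR_IA32_PMC_V6_STEP * idx; }
  static unsigned int v6_gp_cfg(int idx) { return MSR_IA32_PMC_V6_GP0_CFG_A + MSR_IA32_PMC_V6_STEP * idx; }
  static unsigned int v6_fx_ctr(int idx) { return MSR_IA32_PMC_V6_FX0_CTR   + MSR_IA32_PMC_V6_STEP * idx; }

  int main(void)
  {
          /* e.g. GP counter 2: counter MSR 0x1908, config MSR 0x1909; fixed counter 1: 0x1984 */
          printf("GP2 ctr=%#x cfg=%#x  FX1 ctr=%#x\n",
                 v6_gp_ctr(2), v6_gp_cfg(2), v6_fx_ctr(1));
          return 0;
  }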
Intel-SIG: commit 149fd4712bcd perf/x86/intel: Support Perfmon MSRs aliasing Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20240626143545.480761-9-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 7 +++---- arch/x86/events/intel/core.c | 17 ++++++++++++++++- arch/x86/events/perf_event.h | 7 +++++++ arch/x86/include/asm/msr-index.h | 6 ++++++ 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8f0982639a28..471eaa46d55f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1238,8 +1238,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, fallthrough; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + - (idx - INTEL_PMC_IDX_FIXED); + hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED); hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | INTEL_PMC_FIXED_RDPMC_BASE; break; @@ -1578,7 +1577,7 @@ void perf_event_print_debug(void) for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + rdmsrl(x86_pmu_fixed_ctr_addr(idx), pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); @@ -2487,7 +2486,7 @@ void perf_clear_dirty_counters(void) if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask))) continue; - wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0); + wrmsrl(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0); } else { wrmsrl(x86_pmu_event_addr(i), 0); } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index b207214507f1..84eb95e0f2fb 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2948,7 +2948,7 @@ static void intel_pmu_reset(void) for_each_set_bit(idx, fixed_cntr_mask, INTEL_PMC_MAX_FIXED) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; - wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + wrmsrl_safe(x86_pmu_fixed_ctr_addr(idx), 0ull); } if (ds) @@ -5289,6 +5289,7 @@ static __initconst const struct x86_pmu core_pmu = { .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .fixedctr = MSR_ARCH_PERFMON_FIXED_CTR0, .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, @@ -5342,6 +5343,7 @@ static __initconst const struct x86_pmu intel_pmu = { .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .fixedctr = MSR_ARCH_PERFMON_FIXED_CTR0, .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, @@ -6316,6 +6318,11 @@ static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs) } } +static inline int intel_pmu_v6_addr_offset(int index, bool eventsel) +{ + return MSR_IA32_PMC_V6_STEP * index; +} + static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = { { hybrid_small, "cpu_atom" }, { hybrid_big, "cpu_core" }, @@ -7285,6 +7292,14 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } + /* Support V6+ MSR Aliasing */ + if (x86_pmu.version >= 6) { + 
x86_pmu.perfctr = MSR_IA32_PMC_V6_GP0_CTR; + x86_pmu.eventsel = MSR_IA32_PMC_V6_GP0_CFG_A; + x86_pmu.fixedctr = MSR_IA32_PMC_V6_FX0_CTR; + x86_pmu.addr_offset = intel_pmu_v6_addr_offset; + } + if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 79f789bb88d0..f9dae477d5e8 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -794,6 +794,7 @@ struct x86_pmu { int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; unsigned perfctr; + unsigned fixedctr; int (*addr_offset)(int index, bool eventsel); int (*rdpmc_index)(int index); u64 (*event_map)(int); @@ -1152,6 +1153,12 @@ static inline unsigned int x86_pmu_event_addr(int index) x86_pmu.addr_offset(index, false) : index); } +static inline unsigned int x86_pmu_fixed_ctr_addr(int index) +{ + return x86_pmu.fixedctr + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, false) : index); +} + static inline int x86_pmu_rdpmc_index(int index) { return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9860a3cb69d0..ee6fe147c828 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -570,6 +570,12 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* V6 PMON MSR range */ +#define MSR_IA32_PMC_V6_GP0_CTR 0x1900 +#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_STEP 4 + /* * AMD64 MSRs. Not complete. See the architecture manual for a more * complete list. -- Gitee From 3f4e0ffc5a04e2335bd802ed7729001386c7b886 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 Nov 2024 05:55:02 -0800 Subject: [PATCH 12/52] perf/x86/intel/ds: Clarify adaptive PEBS processing mainline inclusion from mainline-v6.13-rc3 commit 7087bfb0adc9a12ec3b463b1d38072c5efce5d6c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7087bfb0adc9a12ec3b463b1d38072c5efce5d6c --------------------------- Modify the pebs_basic and pebs_meminfo structs to make the bitfields more explicit to ease readability of the code. Intel-SIG: commit 7087bfb0adc9 perf/x86/intel/ds: Clarify adaptive PEBS processing Co-developed-by: Stephane Eranian Signed-off-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241119135504.1463839-3-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 43 ++++++++++++++----------------- arch/x86/include/asm/perf_event.h | 16 ++++++++++-- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index f05515effed2..acf538996953 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1895,8 +1895,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, } #define PEBS_LATENCY_MASK 0xffff -#define PEBS_CACHE_LATENCY_OFFSET 32 -#define PEBS_RETIRE_LATENCY_OFFSET 32 /* * With adaptive PEBS the layout depends on what fields are configured. 
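A small stand-alone sketch of the equivalence between the new bitfields and the old shift/mask arithmetic (the raw value is made up; the layout relies on the little-endian bitfield ABI the kernel already depends on for these records).

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  /* Same first-qword layout as the reworked struct pebs_basic. */
  struct basic_qword0 {
          uint64_t format_group:32;     /* was: low 32 bits of format_size */
          uint64_t retire_latency:16;   /* was: (format_size >> 32) & 0xffff */
          uint64_t format_size:16;      /* was: format_size >> 48 (record size) */
  };

  int main(void)
  {
          uint64_t raw = (0x30ULL << 48) | (0x002aULL << 32) | 0x00000005ULL; /* made-up record */
          struct basic_qword0 b;

          /* The bitfield view and the old shift/mask arithmetic agree. */
          memcpy(&b, &raw, sizeof(b));
          printf("size=%u latency=%u group=%#x | shifts: %llu %llu %#llx\n",
                 (unsigned)b.format_size, (unsigned)b.retire_latency, (unsigned)b.format_group,
                 (unsigned long long)(raw >> 48),
                 (unsigned long long)((raw >> 32) & 0xffff),
                 (unsigned long long)(raw & 0xffffffff));
          return 0;
  }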
@@ -1910,8 +1908,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_basic *basic = __pebs; void *next_record = basic + 1; - u64 sample_type; - u64 format_size; + u64 sample_type, format_group; struct pebs_meminfo *meminfo = NULL; struct pebs_gprs *gprs = NULL; struct x86_perf_regs *perf_regs; @@ -1923,7 +1920,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_regs->xmm_regs = NULL; sample_type = event->attr.sample_type; - format_size = basic->format_size; + format_group = basic->format_group; perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; @@ -1945,7 +1942,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY) - data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK; + data->weight.var3_w = basic->retire_latency; else data->weight.var3_w = 0; } @@ -1955,12 +1952,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * But PERF_SAMPLE_TRANSACTION needs gprs->ax. * Save the pointer here but process later. */ - if (format_size & PEBS_DATACFG_MEMINFO) { + if (format_group & PEBS_DATACFG_MEMINFO) { meminfo = next_record; next_record = meminfo + 1; } - if (format_size & PEBS_DATACFG_GP) { + if (format_group & PEBS_DATACFG_GP) { gprs = next_record; next_record = gprs + 1; @@ -1973,14 +1970,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, adaptive_pebs_save_regs(regs, gprs); } - if (format_size & PEBS_DATACFG_MEMINFO) { + if (format_group & PEBS_DATACFG_MEMINFO) { if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { - u64 weight = meminfo->latency; + u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ? + meminfo->cache_latency : meminfo->mem_latency; - if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { - data->weight.var2_w = weight & PEBS_LATENCY_MASK; - weight >>= PEBS_CACHE_LATENCY_OFFSET; - } + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) + data->weight.var2_w = meminfo->instr_latency; /* * Although meminfo::latency is defined as a u64, @@ -1988,12 +1984,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * in practice on Ice Lake and earlier platforms. 
*/ if (sample_type & PERF_SAMPLE_WEIGHT) { - data->weight.full = weight ?: + data->weight.full = latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); } else { - data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: + data->weight.var1_dw = (u32)latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } @@ -2014,16 +2011,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } - if (format_size & PEBS_DATACFG_XMMS) { + if (format_group & PEBS_DATACFG_XMMS) { struct pebs_xmm *xmm = next_record; next_record = xmm + 1; perf_regs->xmm_regs = xmm->xmm; } - if (format_size & PEBS_DATACFG_LBRS) { + if (format_group & PEBS_DATACFG_LBRS) { struct lbr_entry *lbr = next_record; - int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) + int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; next_record = next_record + num_lbr * sizeof(struct lbr_entry); @@ -2033,11 +2030,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } - WARN_ONCE(next_record != __pebs + (format_size >> 48), - "PEBS record size %llu, expected %llu, config %llx\n", - format_size >> 48, + WARN_ONCE(next_record != __pebs + basic->format_size, + "PEBS record size %u, expected %llu, config %llx\n", + basic->format_size, (u64)(next_record - __pebs), - basic->format_size); + format_group); } static inline void * diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 335202d94804..79fbcb5c04b8 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -430,7 +430,9 @@ static inline bool is_topdown_idx(int idx) */ struct pebs_basic { - u64 format_size; + u64 format_group:32, + retire_latency:16, + format_size:16; u64 ip; u64 applicable_counters; u64 tsc; @@ -439,7 +441,17 @@ struct pebs_basic { struct pebs_meminfo { u64 address; u64 aux; - u64 latency; + union { + /* pre Alder Lake */ + u64 mem_latency; + /* Alder Lake and later */ + struct { + u64 instr_latency:16; + u64 pad2:16; + u64 cache_latency:16; + u64 pad3:16; + }; + }; u64 tsx_tuning; }; -- Gitee From 0a6afe0cd10c7af58ef95a0313093d6d2aac6b43 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 Nov 2024 05:55:03 -0800 Subject: [PATCH 13/52] perf/x86/intel/ds: Factor out functions for PEBS records processing mainline inclusion from mainline-v6.14-rc1 commit 3c00ed344cef4dbb57d8769b961af414132a173a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3c00ed344cef4dbb57d8769b961af414132a173a --------------------------- Factor out functions to process normal and the last PEBS records, which can be shared with the later patch. Move the event updating related codes (intel_pmu_save_and_restart()) to the end, where all samples have been processed. For the current usage, it doesn't matter when perf updates event counts and reset the counter. Because all counters are stopped when the PEBS buffer is drained. Drop the return of the !intel_pmu_save_and_restart(event) check. Because it never happen. The intel_pmu_save_and_restart(event) only returns 0, when !hwc->event_base or the period_left > 0. - The !hwc->event_base is impossible for the PEBS event, since the PEBS event is only available on GP and fixed counters, which always have a valid hwc->event_base. - The check only happens for the case of non-AUTO_RELOAD and single PEBS, which implies that the event must be overflowed. 
The period_left must be always <= 0 for an overflowed event after the x86_pmu_update(). Intel-SIG: commit 3c00ed344cef perf/x86/intel/ds: Factor out functions for PEBS records processing Co-developed-by: "Peter Zijlstra (Intel)" Signed-off-by: "Peter Zijlstra (Intel)" Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241119135504.1463839-4-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 109 +++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 42 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index acf538996953..9818d26483b9 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2136,46 +2136,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) return 0; } +typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *, + struct perf_sample_data *, struct pt_regs *); + +static struct pt_regs dummy_iregs; + static __always_inline void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, + struct pt_regs *regs, struct perf_sample_data *data, - void *base, void *top, - int bit, int count, - void (*setup_sample)(struct perf_event *, - struct pt_regs *, - void *, - struct perf_sample_data *, - struct pt_regs *)) + void *at, + setup_fn setup_sample) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - struct x86_perf_regs perf_regs; - struct pt_regs *regs = &perf_regs.regs; - void *at = get_next_pebs_record_by_bit(base, top, bit); - static struct pt_regs dummy_iregs; - - if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - /* - * Now, auto-reload is only enabled in fixed period mode. - * The reload value is always hwc->sample_period. - * May need to change it, if auto-reload is enabled in - * freq mode later. - */ - intel_pmu_save_and_restart_reload(event, count); - } else if (!intel_pmu_save_and_restart(event)) - return; - - if (!iregs) - iregs = &dummy_iregs; + setup_sample(event, iregs, at, data, regs); + perf_event_output(event, data, regs); +} - while (count > 1) { - setup_sample(event, iregs, at, data, regs); - perf_event_output(event, data, regs); - at += cpuc->pebs_record_size; - at = get_next_pebs_record_by_bit(at, top, bit); - count--; - } +static __always_inline void +__intel_pmu_pebs_last_event(struct perf_event *event, + struct pt_regs *iregs, + struct pt_regs *regs, + struct perf_sample_data *data, + void *at, + int count, + setup_fn setup_sample) +{ + struct hw_perf_event *hwc = &event->hw; setup_sample(event, iregs, at, data, regs); if (iregs == &dummy_iregs) { @@ -2194,6 +2181,44 @@ __intel_pmu_pebs_event(struct perf_event *event, if (perf_event_overflow(event, data, regs)) x86_pmu_stop(event, 0); } + + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. 
+ */ + intel_pmu_save_and_restart_reload(event, count); + } else + intel_pmu_save_and_restart(event); +} + +static __always_inline void +__intel_pmu_pebs_events(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data, + void *base, void *top, + int bit, int count, + setup_fn setup_sample) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + void *at = get_next_pebs_record_by_bit(base, top, bit); + int cnt = count; + + if (!iregs) + iregs = &dummy_iregs; + + while (cnt > 1) { + __intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample); + at += cpuc->pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + cnt--; + } + + __intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample); } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data) @@ -2230,8 +2255,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_ return; } - __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n, - setup_pebs_fixed_sample_data); + __intel_pmu_pebs_events(event, iregs, data, at, top, 0, n, + setup_pebs_fixed_sample_data); } static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) @@ -2362,9 +2387,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } if (counts[bit]) { - __intel_pmu_pebs_event(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_fixed_sample_data); + __intel_pmu_pebs_events(event, iregs, data, base, + top, bit, counts[bit], + setup_pebs_fixed_sample_data); } } } @@ -2416,9 +2441,9 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d if (WARN_ON_ONCE(!event->attr.precise_ip)) continue; - __intel_pmu_pebs_event(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_adaptive_sample_data); + __intel_pmu_pebs_events(event, iregs, data, base, + top, bit, counts[bit], + setup_pebs_adaptive_sample_data); } } -- Gitee From 7176fe58afa477aa388e53a05f89ec2c9546d9ab Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 19 Nov 2024 05:55:04 -0800 Subject: [PATCH 14/52] perf/x86/intel/ds: Simplify the PEBS records processing for adaptive PEBS mainline inclusion from mainline-v6.14-rc1 commit ae55e308bde2267df79c4475daa85e174b7ab4c8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ae55e308bde2267df79c4475daa85e174b7ab4c8 --------------------------- The current code may iterate all the PEBS records in the DS area several times. The first loop is to find all active events and calculate the available records for each event. Then iterate the whole buffer again and again to process available records until all active events are processed. The algorithm is inherited from the old generations. The old PEBS hardware does not deal well with the situation when events happen near each other. SW has to drop the error records. Multiple iterations are required. The hardware limit has been addressed on newer platforms with adaptive PEBS. A simple one-iteration algorithm is introduced. The samples are output by record order with the patch, rather than the event order. It doesn't impact the post-processing. The perf tool always sorts the records by time before presenting them to the end user. In an NMI, the last record has to be specially handled. 
Add a last[] variable to track the last unprocessed record of each event. Test: 11 PEBS events are used in the perf test. Only the basic information is collected. perf record -e instructions:up,...,instructions:up -c 2000003 benchmark The ftrace is used to record the duration of the intel_pmu_drain_pebs_icl(). The average duration reduced from 62.04us to 57.94us. A small improvement can be observed with the new algorithm. Also, the implementation becomes simpler and more straightforward. Intel-SIG: commit ae55e308bde2 perf/x86/intel/ds: Simplify the PEBS records processing for adaptive PEBS Suggested-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20241119135504.1463839-5-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 43 +++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9818d26483b9..bdb4f05fcd87 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2397,8 +2397,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS]; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; + struct pebs_basic *basic; struct perf_event *event; void *base, *at, *top; int bit; @@ -2420,30 +2424,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d return; } - for (at = base; at < top; at += cpuc->pebs_record_size) { + if (!iregs) + iregs = &dummy_iregs; + + /* Process all but the last event for each counter. 
*/ + for (at = base; at < top; at += basic->format_size) { u64 pebs_status; - pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; - pebs_status &= mask; + basic = at; + if (basic->format_size != cpuc->pebs_record_size) + continue; + + pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask; + for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) { + event = cpuc->events[bit]; - for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) - counts[bit]++; + if (WARN_ON_ONCE(!event) || + WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + if (counts[bit]++) { + __intel_pmu_pebs_event(event, iregs, regs, data, last[bit], + setup_pebs_adaptive_sample_data); + } + last[bit] = at; + } } for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) { - if (counts[bit] == 0) + if (!counts[bit]) continue; event = cpuc->events[bit]; - if (WARN_ON_ONCE(!event)) - continue; - - if (WARN_ON_ONCE(!event->attr.precise_ip)) - continue; - __intel_pmu_pebs_events(event, iregs, data, base, - top, bit, counts[bit], - setup_pebs_adaptive_sample_data); + __intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit], + counts[bit], setup_pebs_adaptive_sample_data); } } -- Gitee From ea1bb7753ebfcd415def2353ab4c6819ea59543f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 16 Dec 2024 12:45:02 -0800 Subject: [PATCH 15/52] perf/x86/intel/ds: Add PEBS format 6 mainline inclusion from mainline-v6.13-rc5 commit b8c3a2502a205321fe66c356f4b70cabd8e1a5fc category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b8c3a2502a205321fe66c356f4b70cabd8e1a5fc --------------------------- The only difference between 5 and 6 is the new counters snapshotting group, without the following counters snapshotting enabling patches, it's impossible to utilize the feature in a PEBS record. It's safe to share the same code path with format 5. Add format 6, so the end user can at least utilize the legacy PEBS features. 
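For reference, the change below is only a new case label that shares the format 5 path; a trimmed sketch of the resulting dispatch in intel_ds_init() (not the complete switch) is:

	switch (format) {
	...
	case 6:		/* new: counters snapshotting group, otherwise the same layout rules */
	case 5:
		x86_pmu.pebs_ept = 1;
		fallthrough;
	case 4:
		/* adaptive PEBS setup, unchanged */
		...
	}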
Intel-SIG: commit b8c3a2502a20 perf/x86/intel/ds: Add PEBS format 6 Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241216204505.748363-1-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index bdb4f05fcd87..93c77db5e20e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2523,6 +2523,7 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; + case 6: case 5: x86_pmu.pebs_ept = 1; fallthrough; -- Gitee From c676a6f060a88b724f5c45612773ae8df65f9625 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 11 Dec 2024 08:03:17 -0800 Subject: [PATCH 16/52] perf/x86/intel: Support RDPMC metrics clear mode mainline inclusion from mainline-v6.14-rc1 commit 0e45818ec1896c2b4aee0ec6721022ad625ea531 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0e45818ec1896c2b4aee0ec6721022ad625ea531 --------------------------- The new RDPMC enhancement, metrics clear mode, is to clear the PERF_METRICS-related resources as well as the fixed-function performance monitoring counter 3 after the read is performed. It is available for ring 3. The feature is enumerated by the IA32_PERF_CAPABILITIES.RDPMC_CLEAR_METRICS[bit 19]. To enable the feature, the IA32_FIXED_CTR_CTRL.METRICS_CLEAR_EN[bit 14] must be set. Two ways were considered to enable the feature. - Expose a knob in the sysfs globally. One user may affect the measurement of other users when changing the knob. The solution is dropped. - Introduce a new event format, metrics_clear, for the slots event to disable/enable the feature only for the current process. Users can utilize the feature as needed. The latter solution is implemented in the patch. The current KVM doesn't support the perf metrics yet. For virtualization, the feature can be enabled later separately. Intel-SIG: commit 0e45818ec189 perf/x86/intel: Support RDPMC metrics clear mode Suggested-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20241211160318.235056-1-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 20 +++++++++++++++++++- arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 4 ++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 84eb95e0f2fb..cbe62aa3d9e6 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2811,6 +2811,9 @@ static void intel_pmu_enable_fixed(struct perf_event *event) return; idx = INTEL_PMC_IDX_FIXED_SLOTS; + + if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR) + bits |= INTEL_FIXED_3_METRICS_CLEAR; } intel_set_masks(event, idx); @@ -4147,7 +4150,12 @@ static int intel_pmu_hw_config(struct perf_event *event) * is used in a metrics group, it too cannot support sampling. 
*/ if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) { - if (event->attr.config1 || event->attr.config2) + /* The metrics_clear can only be set for the slots event */ + if (event->attr.config1 && + (!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR))) + return -EINVAL; + + if (event->attr.config2) return -EINVAL; /* @@ -4734,6 +4742,8 @@ PMU_FORMAT_ATTR(in_tx, "config:32" ); PMU_FORMAT_ATTR(in_tx_cp, "config:33" ); PMU_FORMAT_ATTR(eq, "config:36" ); /* v6 + */ +PMU_FORMAT_ATTR(metrics_clear, "config1:0"); /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ + static ssize_t umask2_show(struct device *dev, struct device_attribute *attr, char *page) @@ -4753,6 +4763,7 @@ static struct device_attribute format_attr_umask2 = static struct attribute *format_evtsel_ext_attrs[] = { &format_attr_umask2.attr, &format_attr_eq.attr, + &format_attr_metrics_clear.attr, NULL }; @@ -4777,6 +4788,13 @@ evtsel_ext_is_visible(struct kobject *kobj, struct attribute *attr, int i) if (i == 1) return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0; + /* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ + if (i == 2) { + union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap); + + return intel_cap.rdpmc_metrics_clear ? attr->mode : 0; + } + return 0; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f9dae477d5e8..b04685130cc7 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -631,6 +631,7 @@ union perf_capabilities { u64 pebs_output_pt_available:1; u64 pebs_timing_info:1; u64 anythread_deprecated:1; + u64 rdpmc_metrics_clear:1; }; u64 capabilities; }; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 79fbcb5c04b8..23aa088010a3 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -41,6 +41,7 @@ #define INTEL_FIXED_0_USER (1ULL << 1) #define INTEL_FIXED_0_ANYTHREAD (1ULL << 2) #define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3) +#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2) #define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) @@ -380,6 +381,9 @@ static inline bool use_fixed_pseudo_encoding(u64 code) #define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND #define INTEL_TD_METRIC_NUM 8 +#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0 +#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT) + static inline bool is_metric_idx(int idx) { return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM; -- Gitee From 0a16d61f2e74fe1a9aac9bfdad09b88f9beaf5ad Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 21 Jan 2025 07:23:03 -0800 Subject: [PATCH 17/52] perf/x86/intel: Support PEBS counters snapshotting mainline inclusion from mainline-v6.15-rc1 commit e02e9b0374c378aab016ae8ace60d9d98ab8caa6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e02e9b0374c378aab016ae8ace60d9d98ab8caa6 --------------------------- The counters snapshotting is a new adaptive PEBS extension, which can capture programmable counters, fixed-function counters, and performance metrics in a PEBS record. The feature is available in the PEBS format V6. The target counters can be configured in the new fields of MSR_PEBS_CFG. Then the PEBS HW will generate the bit mask of counters (Counters Group Header) followed by the content of all the requested counters into a PEBS record. 
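To make the record layout concrete: the requested counters are announced by a small group header and followed by their values. The sketch below mirrors the pebs_cntr_header that this patch adds to asm/perf_event.h; the field comments are editorial.

	struct pebs_cntr_header {
		u32 cntr;	/* bitmask of captured general-purpose counters */
		u32 fixed;	/* bitmask of captured fixed-function counters */
		u32 metrics;	/* INTEL_CNTR_METRICS (0x3) when SLOTS + PERF_METRICS are included */
		u32 reserved;
	};

	/* The header is followed by one u64 per set bit in cntr and fixed, then
	 * two u64 values (SLOTS, PERF_METRICS) when metrics == INTEL_CNTR_METRICS. */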
The current Linux perf sample read feature can read all events in the group when any event in the group is overflowed. But the rdpmc in the NMI/overflow handler has a small gap from overflow. Also, there is some overhead for each rdpmc read. The counters snapshotting feature can be used as an accurate and low-overhead replacement. Extend intel_update_topdown_event() to accept the value from PEBS records. Add a new PEBS_CNTR flag to indicate a sample read group that utilizes the counters snapshotting feature. When the group is scheduled, the PEBS configure can be updated accordingly. To prevent the case that a PEBS record value might be in the past relative to what is already in the event, perf always stops the PMU and drains the PEBS buffer before updating the corresponding event->count. Intel-SIG: commit e02e9b0374c3 perf/x86/intel: Support PEBS counters snapshotting Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250121152303.3128733-4-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi [jz: resolve conflict due to b0823d5fbacb ("perf/x86/intel: Fix crash in icl_update_topdown_event()") already backported] Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 13 ++ arch/x86/events/intel/core.c | 75 ++++++++--- arch/x86/events/intel/ds.c | 191 +++++++++++++++++++++++++++-- arch/x86/events/perf_event.h | 13 ++ arch/x86/events/perf_event_flags.h | 2 +- arch/x86/include/asm/perf_event.h | 15 +++ 6 files changed, 284 insertions(+), 25 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 471eaa46d55f..15c4f9c18fe9 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -94,6 +94,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter); +DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup); + /* * This one is magic, it will get called even when PMU init fails (because * there is no PMU), in which case it should simply return NULL. @@ -1298,6 +1300,15 @@ static void x86_pmu_enable(struct pmu *pmu) if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; + + /* + * The late setup (after counters are scheduled) + * is required for some cases, e.g., PEBS counters + * snapshotting. Because an accurate counter index + * is needed. + */ + static_call_cond(x86_pmu_late_setup)(); + /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() @@ -2036,6 +2047,8 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); + + static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); } static void _x86_pmu_read(struct perf_event *event) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cbe62aa3d9e6..dc41d4c293fd 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2700,7 +2700,7 @@ static void update_saved_topdown_regs(struct perf_event *event, u64 slots, * modify by a NMI. PMU has to be disabled before calling this function. 
*/ -static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end, u64 *val) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *other; @@ -2708,13 +2708,24 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) bool reset = true; int idx; - /* read Fixed counter 3 */ - rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); - if (!slots) - return 0; + if (!val) { + /* read Fixed counter 3 */ + rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); + if (!slots) + return 0; - /* read PERF_METRICS */ - rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + /* read PERF_METRICS */ + rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + } else { + slots = val[0]; + metrics = val[1]; + /* + * Don't reset the PERF_METRICS and Fixed counter 3 + * for each PEBS record read. Utilize the RDPMC metrics + * clear mode. + */ + reset = false; + } for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) @@ -2757,17 +2768,19 @@ static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) return slots; } -static u64 icl_update_topdown_event(struct perf_event *event) +static u64 icl_update_topdown_event(struct perf_event *event, u64 *val) { return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + - x86_pmu.num_topdown_events - 1); + x86_pmu.num_topdown_events - 1, + val); } -DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); static void intel_pmu_read_event(struct perf_event *event) { - if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN)) { + if (event->hw.flags & (PERF_X86_EVENT_AUTO_RELOAD | PERF_X86_EVENT_TOPDOWN) || + is_pebs_counter_event_group(event)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); bool pmu_enabled = cpuc->enabled; @@ -2779,8 +2792,12 @@ static void intel_pmu_read_event(struct perf_event *event) if (pmu_enabled) intel_pmu_disable_all(); - if (is_topdown_count(event)) - static_call(intel_pmu_update_topdown_event)(event); + /* + * If the PEBS counters snapshotting is enabled, + * the topdown event is available in PEBS records. + */ + if (is_topdown_count(event) && !is_pebs_counter_event_group(event)) + static_call(intel_pmu_update_topdown_event)(event, NULL); else intel_pmu_drain_pebs_buffer(); @@ -2923,7 +2940,7 @@ static int intel_pmu_set_period(struct perf_event *event) static u64 intel_pmu_update(struct perf_event *event) { if (unlikely(is_topdown_count(event))) - return static_call(intel_pmu_update_topdown_event)(event); + return static_call(intel_pmu_update_topdown_event)(event, NULL); return x86_perf_event_update(event); } @@ -3089,7 +3106,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; - static_call(intel_pmu_update_topdown_event)(NULL); + static_call(intel_pmu_update_topdown_event)(NULL, NULL); } /* @@ -3107,6 +3124,27 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) if (!test_bit(bit, cpuc->active_mask)) continue; + /* + * There may be unprocessed PEBS records in the PEBS buffer, + * which still stores the previous values. + * Process those records first before handling the latest value. 
+ * For example, + * A is a regular counter + * B is a PEBS event which reads A + * C is a PEBS event + * + * The following can happen: + * B-assist A=1 + * C A=2 + * B-assist A=3 + * A-overflow-PMI A=4 + * C-assist-PMI (PEBS buffer) A=5 + * + * The PEBS buffer has to be drained before handling the A-PMI + */ + if (is_pebs_counter_event_group(event)) + x86_pmu.drain_pebs(regs, &data); + if (!intel_pmu_save_and_restart(event)) continue; @@ -4135,6 +4173,13 @@ static int intel_pmu_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT; } + if ((event->attr.sample_type & PERF_SAMPLE_READ) && + (x86_pmu.intel_cap.pebs_format >= 6) && + x86_pmu.intel_cap.pebs_baseline && + is_sampling_event(event) && + event->attr.precise_ip) + event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 93c77db5e20e..a9776bede243 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1272,6 +1272,19 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) ds->pebs_interrupt_threshold = threshold; } +#define PEBS_DATACFG_CNTRS(x) \ + ((x >> PEBS_DATACFG_CNTR_SHIFT) & PEBS_DATACFG_CNTR_MASK) + +#define PEBS_DATACFG_CNTR_BIT(x) \ + (((1ULL << x) & PEBS_DATACFG_CNTR_MASK) << PEBS_DATACFG_CNTR_SHIFT) + +#define PEBS_DATACFG_FIX(x) \ + ((x >> PEBS_DATACFG_FIX_SHIFT) & PEBS_DATACFG_FIX_MASK) + +#define PEBS_DATACFG_FIX_BIT(x) \ + (((1ULL << (x)) & PEBS_DATACFG_FIX_MASK) \ + << PEBS_DATACFG_FIX_SHIFT) + static void adaptive_pebs_record_size_update(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1286,10 +1299,58 @@ static void adaptive_pebs_record_size_update(void) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); + if (pebs_data_cfg & (PEBS_DATACFG_METRICS | PEBS_DATACFG_CNTR)) { + sz += sizeof(struct pebs_cntr_header); + + /* Metrics base and Metrics Data */ + if (pebs_data_cfg & PEBS_DATACFG_METRICS) + sz += 2 * sizeof(u64); + + if (pebs_data_cfg & PEBS_DATACFG_CNTR) { + sz += (hweight64(PEBS_DATACFG_CNTRS(pebs_data_cfg)) + + hweight64(PEBS_DATACFG_FIX(pebs_data_cfg))) * + sizeof(u64); + } + } cpuc->pebs_record_size = sz; } +static void __intel_pmu_pebs_update_cfg(struct perf_event *event, + int idx, u64 *pebs_data_cfg) +{ + if (is_metric_event(event)) { + *pebs_data_cfg |= PEBS_DATACFG_METRICS; + return; + } + + *pebs_data_cfg |= PEBS_DATACFG_CNTR; + + if (idx >= INTEL_PMC_IDX_FIXED) + *pebs_data_cfg |= PEBS_DATACFG_FIX_BIT(idx - INTEL_PMC_IDX_FIXED); + else + *pebs_data_cfg |= PEBS_DATACFG_CNTR_BIT(idx); +} + + +static void intel_pmu_late_setup(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + u64 pebs_data_cfg = 0; + int i; + + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + if (!is_pebs_counter_event_group(event)) + continue; + __intel_pmu_pebs_update_cfg(event, cpuc->assign[i], &pebs_data_cfg); + } + + if (pebs_data_cfg & ~cpuc->pebs_data_cfg) + cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW; +} + #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_WEIGHT_TYPE | \ @@ -1894,12 +1955,89 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +static void intel_perf_event_update_pmc(struct perf_event *event, u64 pmc) +{ + int shift 
= 64 - x86_pmu.cntval_bits; + struct hw_perf_event *hwc; + u64 delta, prev_pmc; + + /* + * A recorded counter may not have an assigned event in the + * following cases. The value should be dropped. + * - An event is deleted. There is still an active PEBS event. + * The PEBS record doesn't shrink on pmu::del(). + * If the counter of the deleted event once occurred in a PEBS + * record, PEBS still records the counter until the counter is + * reassigned. + * - An event is stopped for some reason, e.g., throttled. + * During this period, another event is added and takes the + * counter of the stopped event. The stopped event is assigned + * to another new and uninitialized counter, since the + * x86_pmu_start(RELOAD) is not invoked for a stopped event. + * The PEBS__DATA_CFG is updated regardless of the event state. + * The uninitialized counter can be recorded in a PEBS record. + * But the cpuc->events[uninitialized_counter] is always NULL, + * because the event is stopped. The uninitialized value is + * safely dropped. + */ + if (!event) + return; + + hwc = &event->hw; + prev_pmc = local64_read(&hwc->prev_count); + + /* Only update the count when the PMU is disabled */ + WARN_ON(this_cpu_read(cpu_hw_events.enabled)); + local64_set(&hwc->prev_count, pmc); + + delta = (pmc << shift) - (prev_pmc << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); +} + +static inline void __setup_pebs_counter_group(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct pebs_cntr_header *cntr, + void *next_record) +{ + int bit; + + for_each_set_bit(bit, (unsigned long *)&cntr->cntr, INTEL_PMC_MAX_GENERIC) { + intel_perf_event_update_pmc(cpuc->events[bit], *(u64 *)next_record); + next_record += sizeof(u64); + } + + for_each_set_bit(bit, (unsigned long *)&cntr->fixed, INTEL_PMC_MAX_FIXED) { + /* The slots event will be handled with perf_metric later */ + if ((cntr->metrics == INTEL_CNTR_METRICS) && + (bit + INTEL_PMC_IDX_FIXED == INTEL_PMC_IDX_FIXED_SLOTS)) { + next_record += sizeof(u64); + continue; + } + intel_perf_event_update_pmc(cpuc->events[bit + INTEL_PMC_IDX_FIXED], + *(u64 *)next_record); + next_record += sizeof(u64); + } + + /* HW will reload the value right after the overflow. */ + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + local64_set(&event->hw.prev_count, (u64)-event->hw.sample_period); + + if (cntr->metrics == INTEL_CNTR_METRICS) { + static_call(intel_pmu_update_topdown_event) + (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS], + (u64 *)next_record); + next_record += 2 * sizeof(u64); + } +} + #define PEBS_LATENCY_MASK 0xffff /* * With adaptive PEBS the layout depends on what fields are configured. */ - static void setup_pebs_adaptive_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -2030,6 +2168,28 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } } + if (format_group & (PEBS_DATACFG_CNTR | PEBS_DATACFG_METRICS)) { + struct pebs_cntr_header *cntr = next_record; + unsigned int nr; + + next_record += sizeof(struct pebs_cntr_header); + /* + * The PEBS_DATA_CFG is a global register, which is the + * superset configuration for all PEBS events. + * For the PEBS record of non-sample-read group, ignore + * the counter snapshot fields. 
+ */ + if (is_pebs_counter_event_group(event)) { + __setup_pebs_counter_group(cpuc, event, cntr, next_record); + data->sample_flags |= PERF_SAMPLE_READ; + } + + nr = hweight32(cntr->cntr) + hweight32(cntr->fixed); + if (cntr->metrics == INTEL_CNTR_METRICS) + nr += 2; + next_record += nr * sizeof(u64); + } + WARN_ONCE(next_record != __pebs + basic->format_size, "PEBS record size %u, expected %llu, config %llx\n", basic->format_size, @@ -2183,13 +2343,21 @@ __intel_pmu_pebs_last_event(struct perf_event *event, } if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - /* - * Now, auto-reload is only enabled in fixed period mode. - * The reload value is always hwc->sample_period. - * May need to change it, if auto-reload is enabled in - * freq mode later. - */ - intel_pmu_save_and_restart_reload(event, count); + if ((is_pebs_counter_event_group(event))) { + /* + * The value of each sample has been updated when setup + * the corresponding sample data. + */ + perf_event_update_userpage(event); + } else { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. + */ + intel_pmu_save_and_restart_reload(event, count); + } } else intel_pmu_save_and_restart(event); } @@ -2524,6 +2692,11 @@ void __init intel_ds_init(void) break; case 6: + if (x86_pmu.intel_cap.pebs_baseline) { + x86_pmu.large_pebs_flags |= PERF_SAMPLE_READ; + x86_pmu.late_setup = intel_pmu_late_setup; + } + fallthrough; case 5: x86_pmu.pebs_ept = 1; fallthrough; @@ -2548,7 +2721,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); } - pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + pr_cont("PEBS fmt%d%c%s, ", format, pebs_type, pebs_qual); if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) { pr_cont("PEBS-via-PT, "); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index b04685130cc7..677317d5d627 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -122,6 +122,11 @@ static inline bool is_branch_counters_group(struct perf_event *event) return check_leader_group(event->group_leader, PERF_X86_EVENT_BRANCH_COUNTERS); } +static inline bool is_pebs_counter_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -793,6 +798,7 @@ struct x86_pmu { u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + void (*late_setup)(void); unsigned eventsel; unsigned perfctr; unsigned fixedctr; @@ -1101,6 +1107,7 @@ extern struct x86_pmu x86_pmu __read_mostly; DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); DECLARE_STATIC_CALL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); +DECLARE_STATIC_CALL(x86_pmu_late_setup, *x86_pmu.late_setup); static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { @@ -1142,6 +1149,12 @@ extern u64 __read_mostly hw_cache_extra_regs u64 x86_perf_event_update(struct perf_event *event); +static inline u64 intel_pmu_topdown_event_update(struct perf_event *event, u64 *val) +{ + return x86_perf_event_update(event); +} +DECLARE_STATIC_CALL(intel_pmu_update_topdown_event, intel_pmu_topdown_event_update); + static inline unsigned int x86_pmu_config_addr(int index) { return 
x86_pmu.eventsel + (x86_pmu.addr_offset ? diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 6c977c19f2cd..1d9e385649b5 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -9,7 +9,7 @@ PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ - /* 0x00080 */ +PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 23aa088010a3..c133704bed0f 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -149,6 +149,12 @@ #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) #define PEBS_DATACFG_LBR_SHIFT 24 +#define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_CNTR_SHIFT 32 +#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) +#define PEBS_DATACFG_FIX_SHIFT 48 +#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) +#define PEBS_DATACFG_METRICS BIT_ULL(5) /* Steal the highest bit of pebs_data_cfg for SW usage */ #define PEBS_UPDATE_DS_SW BIT_ULL(63) @@ -468,6 +474,15 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +struct pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + +#define INTEL_CNTR_METRICS 0x3 + /* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ -- Gitee From 411bfbf5ad2c1b95622fc6f81a01eac6daeef93f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 29 Jan 2025 07:48:19 -0800 Subject: [PATCH 18/52] perf/x86/intel: Fix ARCH_PERFMON_NUM_COUNTER_LEAF mainline inclusion from mainline-v6.14-rc3 commit 47a973fd75639fe80d59f9e1860113bb2a0b112b category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47a973fd75639fe80d59f9e1860113bb2a0b112b --------------------------- The EAX of the CPUID Leaf 023H enumerates the mask of valid sub-leaves. To tell the availability of the sub-leaf 1 (enumerate the counter mask), perf should check the bit 1 (0x2) of EAX, rather than bit 0 (0x1). The error is not user-visible on bare metal, because the sub-leaf 0 and the sub-leaf 1 are always available. However, it may bring issues in a virtualization environment when a VMM only enumerates the sub-leaf 0. Introduce the cpuid35_e?x to replace the macros, which makes the implementation style consistent.
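In code, the corrected check reads the sub-leaf bitmap through the new bitfield union rather than the old 0x1 constant. A simplified sketch of update_pmu_cap() (the EBX union and the other sub-leaves are omitted):

	union cpuid35_eax eax;
	unsigned int cntr, fixed_cntr, ebx, ecx, edx;

	cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx, &ecx, &edx);

	/* bit 1 of EAX, not bit 0, announces the counters sub-leaf */
	if (eax.split.cntr_subleaf)
		cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF,
			    &cntr, &fixed_cntr, &ecx, &edx);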
Intel-SIG: commit 47a973fd7563 perf/x86/intel: Fix ARCH_PERFMON_NUM_COUNTER_LEAF Fixes: eb467aaac21e ("perf/x86/intel: Support Architectural PerfMon Extension leaf") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250129154820.3755948-3-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 18 ++++++++++-------- arch/x86/include/asm/perf_event.h | 28 +++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dc41d4c293fd..e574c68c0c55 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4994,20 +4994,22 @@ static inline bool intel_pmu_broken_perf_cap(void) static void update_pmu_cap(struct x86_hybrid_pmu *pmu) { - unsigned int sub_bitmaps, eax, ebx, ecx, edx; + unsigned int cntr, fixed_cntr, ecx, edx; + union cpuid35_eax eax; + union cpuid35_ebx ebx; - cpuid(ARCH_PERFMON_EXT_LEAF, &sub_bitmaps, &ebx, &ecx, &edx); + cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); - if (ebx & ARCH_PERFMON_EXT_UMASK2) + if (ebx.split.umask2) pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; - if (ebx & ARCH_PERFMON_EXT_EQ) + if (ebx.split.eq) pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; - if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) { + if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, - &eax, &ebx, &ecx, &edx); - pmu->cntr_mask64 = eax; - pmu->fixed_cntr_mask64 = ebx; + &cntr, &fixed_cntr, &ecx, &edx); + pmu->cntr_mask64 = cntr; + pmu->fixed_cntr_mask64 = fixed_cntr; } if (!intel_pmu_broken_perf_cap()) { diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index c133704bed0f..80b01734ede8 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -202,11 +202,33 @@ union cpuid10_edx { * detection/enumeration details: */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 -#define ARCH_PERFMON_EXT_UMASK2 0x1 -#define ARCH_PERFMON_EXT_EQ 0x2 -#define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 +union cpuid35_eax { + struct { + unsigned int leaf0:1; + /* Counters Sub-Leaf */ + unsigned int cntr_subleaf:1; + /* Auto Counter Reload Sub-Leaf */ + unsigned int acr_subleaf:1; + /* Events Sub-Leaf */ + unsigned int events_subleaf:1; + unsigned int reserved:28; + } split; + unsigned int full; +}; + +union cpuid35_ebx { + struct { + /* UnitMask2 Supported */ + unsigned int umask2:1; + /* EQ-bit Supported */ + unsigned int eq:1; + unsigned int reserved:30; + } split; + unsigned int full; +}; + /* * Intel Architectural LBR CPUID detection/enumeration details: */ -- Gitee From 7741742c81ba18efebc28ba89e6385c560d7ed3a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 10 Mar 2025 11:15:36 -0700 Subject: [PATCH 19/52] perf: Extend per event callchain limit to branch stack mainline inclusion from mainline-v6.15-rc1 commit c53e14f1ea4a8f8ddd9b2cd850fcbc0d934b79f5 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c53e14f1ea4a8f8ddd9b2cd850fcbc0d934b79f5 --------------------------- The commit 97c79a38cd45 ("perf core: Per event callchain limit") introduced a per-event term to allow finer tuning of the depth of callchains to save space. It should be applied to the branch stack as well. 
For example, autoFDO collections require maximum LBR entries. In the meantime, other system-wide LBR users may only be interested in the latest a few number of LBRs. A per-event LBR depth would save the perf output buffer. The patch simply drops the uninterested branches, but HW still collects the maximum branches. There may be a model-specific optimization that can reduce the HW depth for some cases to reduce the overhead further. But it isn't included in the patch set. Because it's not useful for all cases. For example, ARCH LBR can utilize the PEBS and XSAVE to collect LBRs. The depth should have less impact on the collecting overhead. The model-specific optimization may be implemented later separately. Intel-SIG: commit c53e14f1ea4a perf: Extend per event callchain limit to branch stack Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310181536.3645382-1-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- include/linux/perf_event.h | 3 +++ include/uapi/linux/perf_event.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 826fb16906fe..20f9c0f2be81 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1324,6 +1324,9 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, if (branch_sample_hw_index(event)) size += sizeof(u64); + + brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr); + size += brs->nr * sizeof(struct perf_branch_entry); /* diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 4842c36fdf80..008403fe0b0b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -385,6 +385,8 @@ enum perf_event_read_format { * * @sample_max_stack: Max number of frame pointers in a callchain, * should be < /proc/sys/kernel/perf_event_max_stack + * Max number of entries of branch stack + * should be < hardware limit */ struct perf_event_attr { -- Gitee From f12f9a2a936f7141ae15bb2d2956a5b2cc83f85a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:13 -0700 Subject: [PATCH 20/52] perf/x86: Add dynamic constraint mainline inclusion from mainline-v6.16-rc1 commit 4dfe3232cc04325a09e96f6c7f9546ba6c0b132b category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4dfe3232cc04325a09e96f6c7f9546ba6c0b132b --------------------------- More and more features require a dynamic event constraint, e.g., branch counter logging, auto counter reload, Arch PEBS, etc. Add a generic flag, PMU_FL_DYN_CONSTRAINT, to indicate the case. It avoids keeping adding the individual flag in intel_cpuc_prepare(). Add a variable dyn_constraint in the struct hw_perf_event to track the dynamic constraint of the event. Apply it if it's updated. Apply the generic dynamic constraint for branch counter logging. Many features on and after V6 require dynamic constraint. So unconditionally set the flag for V6+. 
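The pattern a feature follows is deliberately small, as the branch counter conversion in this patch shows; condensed from the hunks below (constraint allocation details omitted):

	/* hw_config(): narrow the event to the counters the feature supports */
	event->hw.dyn_constraint &= x86_pmu.lbr_counters;

	/* intel_get_event_constraints(): only applied when narrowed from ~0ULL */
	if (event->hw.dyn_constraint != ~0ULL) {
		c2 = dyn_constraint(cpuc, c2, idx);
		c2->idxmsk64 &= event->hw.dyn_constraint;
		c2->weight = hweight64(c2->idxmsk64);
	}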
Intel-SIG: commit 4dfe3232cc04 perf/x86: Add dynamic constraint Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-2-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 1 + arch/x86/events/intel/core.c | 21 +++++++++++++++------ arch/x86/events/intel/lbr.c | 2 +- arch/x86/events/perf_event.h | 1 + include/linux/perf_event.h | 1 + 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 15c4f9c18fe9..094ef82424f9 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -675,6 +675,7 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + event->hw.dyn_constraint = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e574c68c0c55..5d147ab2344e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3716,10 +3716,9 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (cpuc->excl_cntrs) return intel_get_excl_constraints(cpuc, event, idx, c2); - /* Not all counters support the branch counter feature. */ - if (branch_sample_counters(event)) { + if (event->hw.dyn_constraint != ~0ULL) { c2 = dyn_constraint(cpuc, c2, idx); - c2->idxmsk64 &= x86_pmu.lbr_counters; + c2->idxmsk64 &= event->hw.dyn_constraint; c2->weight = hweight64(c2->idxmsk64); } @@ -4117,15 +4116,19 @@ static int intel_pmu_hw_config(struct perf_event *event) leader = event->group_leader; if (branch_sample_call_stack(leader)) return -EINVAL; - if (branch_sample_counters(leader)) + if (branch_sample_counters(leader)) { num++; + leader->hw.dyn_constraint &= x86_pmu.lbr_counters; + } leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS; for_each_sibling_event(sibling, leader) { if (branch_sample_call_stack(sibling)) return -EINVAL; - if (branch_sample_counters(sibling)) + if (branch_sample_counters(sibling)) { num++; + sibling->hw.dyn_constraint &= x86_pmu.lbr_counters; + } } if (num > fls(x86_pmu.lbr_counters)) @@ -4903,7 +4906,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) goto err; } - if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) { + if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_DYN_CONSTRAINT)) { size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); @@ -6585,6 +6588,12 @@ __init int intel_pmu_init(void) pr_cont(" AnyThread deprecated, "); } + /* + * Many features on and after V6 require dynamic constraint, + * e.g., Arch PEBS, ACR. 
+ */ + if (version >= 6) + x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; /* * Install the hw-cache-events table: */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 78cd5084104e..5f677447b812 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1608,7 +1608,7 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_nr = lbr_nr; if (!!x86_pmu.lbr_counters) - x86_pmu.flags |= PMU_FL_BR_CNTR; + x86_pmu.flags |= PMU_FL_BR_CNTR | PMU_FL_DYN_CONSTRAINT; if (x86_pmu.lbr_mispred) static_branch_enable(&x86_lbr_mispred); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 677317d5d627..7332a1625d8e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1059,6 +1059,7 @@ do { \ #define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */ #define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */ +#define PMU_FL_DYN_CONSTRAINT 0x800 /* Needs dynamic constraint */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 20f9c0f2be81..e3e0ac171562 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -159,6 +159,7 @@ struct hw_perf_event { struct { /* hardware */ u64 config; u64 last_tag; + u64 dyn_constraint; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; -- Gitee From 28b611f0ba57c0236b7f977fe7880621c1e5e55e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:14 -0700 Subject: [PATCH 21/52] perf/x86/intel: Track the num of events needs late setup mainline inclusion from mainline-v6.16-rc1 commit 0a6557938d8f189024a03aca77e58763930840ee category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0a6557938d8f189024a03aca77e58763930840ee --------------------------- When a machine supports PEBS v6, perf unconditionally searches the cpuc->event_list[] for every event and check if the late setup is required, which is unnecessary. The late setup is only required for special events, e.g., events support counters snapshotting feature. Add n_late_setup to track the num of events that needs the late setup. Other features, e.g., auto counter reload feature, require the late setup as well. Add a wrapper, intel_pmu_pebs_late_setup, for the events that support counters snapshotting feature. 
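Condensed, the bookkeeping is a per-CPU counter plus an early return; the sketch below is taken from the hunks in this patch, with editorial comments:

	/* intel_pmu_add_event() (mirrored by a decrement in intel_pmu_del_event()) */
	if (is_pebs_counter_event_group(event))
		this_cpu_ptr(&cpu_hw_events)->n_late_setup++;

	void intel_pmu_late_setup(void)
	{
		struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

		if (!cpuc->n_late_setup)
			return;		/* common case: skip the event_list walk */

		intel_pmu_pebs_late_setup(cpuc);
	}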
Intel-SIG: commit 0a6557938d8f perf/x86/intel: Track the num of events needs late setup Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-3-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 14 ++++++++++++++ arch/x86/events/intel/ds.c | 3 +-- arch/x86/events/perf_event.h | 5 +++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 5d147ab2344e..90fe082b8e32 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2589,6 +2589,8 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); + if (is_pebs_counter_event_group(event)) + this_cpu_ptr(&cpu_hw_events)->n_late_setup--; } static int icl_set_topdown_event_period(struct perf_event *event) @@ -2900,12 +2902,24 @@ static void intel_pmu_enable_event(struct perf_event *event) } } +void intel_pmu_late_setup(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!cpuc->n_late_setup) + return; + + intel_pmu_pebs_late_setup(cpuc); +} + static void intel_pmu_add_event(struct perf_event *event) { if (event->attr.precise_ip) intel_pmu_pebs_add(event); if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); + if (is_pebs_counter_event_group(event)) + this_cpu_ptr(&cpu_hw_events)->n_late_setup++; } /* diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a9776bede243..a78556527447 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1333,9 +1333,8 @@ static void __intel_pmu_pebs_update_cfg(struct perf_event *event, } -static void intel_pmu_late_setup(void) +void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; u64 pebs_data_cfg = 0; int i; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7332a1625d8e..6f09e568f50b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -268,6 +268,7 @@ struct cpu_hw_events { struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; int n_excl; /* the number of exclusive events */ + int n_late_setup; /* the num of events needs late setup */ unsigned int txn_flags; int is_fake; @@ -1582,6 +1583,8 @@ void intel_pmu_disable_bts(void); int intel_pmu_drain_bts_buffer(void); +void intel_pmu_late_setup(void); + u64 grt_latency_data(struct perf_event *event, u64 status); u64 cmt_latency_data(struct perf_event *event, u64 status); @@ -1636,6 +1639,8 @@ void intel_pmu_pebs_disable_all(void); void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in); +void intel_pmu_pebs_late_setup(struct cpu_hw_events *cpuc); + void intel_pmu_drain_pebs_buffer(void); void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); -- Gitee From 3badf3a2f75856554f8451df3026d099fa655b7c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:15 -0700 Subject: [PATCH 22/52] perf: Extend the bit width of the arch-specific flag mainline inclusion from mainline-v6.16-rc1 commit c9449c8506a5df5052ef4d17867699517b10b55a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c9449c8506a5df5052ef4d17867699517b10b55a 
--------------------------- The auto counter reload feature requires an event flag to indicate an auto counter reload group, which can only be scheduled on specific counters that enumerated in CPUID. However, the hw_perf_event.flags has run out on X86. Two solutions were considered to address the issue. - Currently, 20 bits are reserved for the architecture-specific flags. Only the bit 31 is used for the generic flag. There is still plenty of space left. Reserve 8 more bits for the arch-specific flags. - Add a new X86 specific hw_perf_event.flags1 to support more flags. The former is implemented. Enough room is still left in the global generic flag. Intel-SIG: commit c9449c8506a5 perf: Extend the bit width of the arch-specific flag Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-4-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/perf_event_flags.h | 41 +++++++++++++++--------------- include/linux/perf_event.h | 2 +- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h index 1d9e385649b5..70078334e4a3 100644 --- a/arch/x86/events/perf_event_flags.h +++ b/arch/x86/events/perf_event_flags.h @@ -2,23 +2,24 @@ /* * struct hw_perf_event.flags flags */ -PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ -PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ -PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ -PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ -PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ -PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ -PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ -PERF_ARCH(PEBS_CNTR, 0x00080) /* PEBS counters snapshot */ -PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ -PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ -PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ -PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ -PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ -PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ -PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ -PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ -PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ -PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ -PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */ -PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(PEBS_LDLAT, 0x0000001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x0000002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x0000004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x0000008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x0000010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x0000020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x0000040) /* dynamic alloc'd constraint */ +PERF_ARCH(PEBS_CNTR, 0x0000080) /* PEBS counters snapshot */ +PERF_ARCH(EXCL_ACCT, 0x0000100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x0000200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x0000400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x0000800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x0001000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x0002000) /* 
Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x0004000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x0008000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x0010000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x0020000) /* ld and st lat for hybrid */ +PERF_ARCH(NEEDS_BRANCH_STACK, 0x0040000) /* require branch stack setup */ +PERF_ARCH(BRANCH_COUNTERS, 0x0080000) /* logs the counters in the extra space of each branch */ +PERF_ARCH(ACR, 0x0100000) /* Auto counter reload */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e3e0ac171562..f22255857f3f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -145,7 +145,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x000fffff +#define PERF_EVENT_FLAG_ARCH 0x0fffffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); -- Gitee From 1b271b0cb287abe8f8710b2436ec1bac22dd1c19 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:16 -0700 Subject: [PATCH 23/52] perf/x86/intel: Add CPUID enumeration for the auto counter reload mainline inclusion from mainline-v6.16-rc1 commit 1856c6c2f8416b1340652cccfa1fc302ac8d5ecd category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1856c6c2f8416b1340652cccfa1fc302ac8d5ecd --------------------------- The counters that support the auto counter reload feature can be enumerated in the CPUID Leaf 0x23 sub-leaf 0x2. Add acr_cntr_mask to store the mask of counters which are reloadable. Add acr_cause_mask to store the mask of counters which can cause reload. Since the e-core and p-core may have different numbers of counters, track the masks in the struct x86_hybrid_pmu as well. 
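Condensed from the update_pmu_cap() hunk in this patch, the enumeration is one extra CPUID sub-leaf read; the comments are editorial:

	if (eax.split.acr_subleaf) {
		/* CPUID.023H, sub-leaf 2 (ARCH_PERFMON_ACR_LEAF) */
		cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF,
			    &cntr, &fixed_cntr, &ecx, &edx);
		/* EAX/EBX: counters that can be reloaded (GP, then fixed) */
		pmu->acr_cntr_mask64 = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED);
		/* ECX/EDX: counters whose overflow can trigger a reload */
		pmu->acr_cause_mask64 = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED);
	}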
Intel-SIG: commit 1856c6c2f841 perf/x86/intel: Add CPUID enumeration for the auto counter reload Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-5-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 10 ++++++++++ arch/x86/events/perf_event.h | 17 +++++++++++++++++ arch/x86/include/asm/perf_event.h | 1 + 3 files changed, 28 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 90fe082b8e32..8a17dd989fb6 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5029,6 +5029,16 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) pmu->fixed_cntr_mask64 = fixed_cntr; } + if (eax.split.acr_subleaf) { + cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, + &cntr, &fixed_cntr, &ecx, &edx); + /* The mask of the counters which can be reloaded */ + pmu->acr_cntr_mask64 = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); + + /* The mask of the counters which can cause a reload of reloadable counters */ + pmu->acr_cause_mask64 = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); + } + if (!intel_pmu_broken_perf_cap()) { /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 6f09e568f50b..3252a83d4327 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -718,6 +718,15 @@ struct x86_hybrid_pmu { u64 fixed_cntr_mask64; unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; + + union { + u64 acr_cntr_mask64; + unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 acr_cause_mask64; + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; struct event_constraint unconstrained; u64 hw_cache_event_ids @@ -816,6 +825,14 @@ struct x86_pmu { u64 fixed_cntr_mask64; unsigned long fixed_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; + union { + u64 acr_cntr_mask64; + unsigned long acr_cntr_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; + union { + u64 acr_cause_mask64; + unsigned long acr_cause_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + }; int cntval_bits; u64 cntval_mask; union { diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 80b01734ede8..08d2853395cb 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -203,6 +203,7 @@ union cpuid10_edx { */ #define ARCH_PERFMON_EXT_LEAF 0x00000023 #define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 +#define ARCH_PERFMON_ACR_LEAF 0x2 union cpuid35_eax { struct { -- Gitee From bebe9e1ddb53dc4e9ed965ad61e932d97341aa3d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 27 Mar 2025 12:52:17 -0700 Subject: [PATCH 24/52] perf/x86/intel: Support auto counter reload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.16-rc1 commit ec980e4facef8110f6fce27e5b6344660117f01f category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ec980e4facef8110f6fce27e5b6344660117f01f --------------------------- The relative rates among two or more events are useful for performance analysis, e.g., a high branch miss rate may indicate a performance issue. 
Usually, the samples with a relative rate that exceeds some threshold are more useful. However, the traditional sampling takes samples of events separately. To get the relative rates among two or more events, a high sample rate is required, which can bring high overhead. Many samples taken in the non-hotspot area are also dropped (useless) in the post-process. The auto counter reload (ACR) feature takes samples when the relative rate of two or more events exceeds some threshold, which provides the fine-grained information at a low cost. To support the feature, two sets of MSRs are introduced. For a given counter IA32_PMC_GPn_CTR/IA32_PMC_FXm_CTR, bit fields in the IA32_PMC_GPn_CFG_B/IA32_PMC_FXm_CFG_B MSR indicate which counter(s) can cause a reload of that counter. The reload value is stored in the IA32_PMC_GPn_CFG_C/IA32_PMC_FXm_CFG_C. The details can be found at Intel SDM (085), Volume 3, 21.9.11 Auto Counter Reload. In the hw_config(), an ACR event is specially configured, because the cause/reloadable counter mask has to be applied to the dyn_constraint. Besides the HW limit, e.g., not support perf metrics, PDist and etc, a SW limit is applied as well. ACR events in a group must be contiguous. It facilitates the later conversion from the event idx to the counter idx. Otherwise, the intel_pmu_acr_late_setup() has to traverse the whole event list again to find the "cause" event. Also, add a new flag PERF_X86_EVENT_ACR to indicate an ACR group, which is set to the group leader. The late setup() is also required for an ACR group. It's to convert the event idx to the counter idx, and saved it in hw.config1. The ACR configuration MSRs are only updated in the enable_event(). The disable_event() doesn't clear the ACR CFG register. Add acr_cfg_b/acr_cfg_c in the struct cpu_hw_events to cache the MSR values. It can avoid a MSR write if the value is not changed. Expose an acr_mask to the sysfs. The perf tool can utilize the new format to configure the relation of events in the group. The bit sequence of the acr_mask follows the events enabled order of the group. Example: Here is the snippet of the mispredict.c. Since the array has a random numbers, jumps are random and often mispredicted. The mispredicted rate depends on the compared value. For the Loop1, ~11% of all branches are mispredicted. For the Loop2, ~21% of all branches are mispredicted. main() { ... for (i = 0; i < N; i++) data[i] = rand() % 256; ... /* Loop 1 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 64) sum += data[i]; ... ... /* Loop 2 */ for (k = 0; k < 50; k++) for (i = 0; i < N; i++) if (data[i] >= 128) sum += data[i]; ... } Usually, a code with a high branch miss rate means a bad performance. To understand the branch miss rate of the codes, the traditional method usually samples both branches and branch-misses events. E.g., perf record -e "{cpu_atom/branch-misses/ppu, cpu_atom/branch-instructions/u}" -c 1000000 -- ./mispredict [ perf record: Woken up 4 times to write data ] [ perf record: Captured and wrote 0.925 MB perf.data (5106 samples) ] The 5106 samples are from both events and spread in both Loops. In the post-process stage, a user can know that the Loop 2 has a 21% branch miss rate. Then they can focus on the samples of branch-misses events for the Loop 2. With this patch, the user can generate the samples only when the branch miss rate > 20%. 
For example, perf record -e "{cpu_atom/branch-misses,period=200000,acr_mask=0x2/ppu, cpu_atom/branch-instructions,period=1000000,acr_mask=0x3/u}" -- ./mispredict (Two different periods are applied to branch-misses and branch-instructions. The ratio is set to 20%. If the branch-instructions is overflowed first, the branch-miss rate < 20%. No samples should be generated. All counters should be automatically reloaded. If the branch-misses is overflowed first, the branch-miss rate > 20%. A sample triggered by the branch-misses event should be generated. Just the counter of the branch-instructions should be automatically reloaded. The branch-misses event should only be automatically reloaded when the branch-instructions is overflowed. So the "cause" event is the branch-instructions event. The acr_mask is set to 0x2, since the event index in the group of branch-instructions is 1. The branch-instructions event is automatically reloaded no matter which events are overflowed. So the "cause" events are the branch-misses and the branch-instructions event. The acr_mask should be set to 0x3.) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.098 MB perf.data (2498 samples) ] $perf report Percent │154: movl $0x0,-0x14(%rbp) │ ↓ jmp 1af │ for (i = j; i < N; i++) │15d: mov -0x10(%rbp),%eax │ mov %eax,-0x18(%rbp) │ ↓ jmp 1a2 │ if (data[i] >= 128) │165: mov -0x18(%rbp),%eax │ cltq │ lea 0x0(,%rax,4),%rdx │ mov -0x8(%rbp),%rax │ add %rdx,%rax │ mov (%rax),%eax │ ┌──cmp $0x7f,%eax 100.00 0.00 │ ├──jle 19e │ │sum += data[i]; The 2498 samples are all from the branch-misses events for the Loop 2. The number of samples and overhead is significantly reduced without losing any information. Intel-SIG: commit ec980e4facef perf/x86/intel: Support auto counter reload Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Thomas Falcon Link: https://lkml.kernel.org/r/20250327195217.2683619-6-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 226 ++++++++++++++++++++++++++++++- arch/x86/events/perf_event.h | 10 ++ arch/x86/include/asm/msr-index.h | 4 + include/linux/perf_event.h | 1 + 4 files changed, 239 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8a17dd989fb6..fcb4e24de2d1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2589,7 +2589,8 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_lbr_del(event); if (event->attr.precise_ip) intel_pmu_pebs_del(event); - if (is_pebs_counter_event_group(event)) + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) this_cpu_ptr(&cpu_hw_events)->n_late_setup--; } @@ -2868,6 +2869,52 @@ static void intel_pmu_enable_fixed(struct perf_event *event) cpuc->fixed_ctrl_val |= bits; } +static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int msr_b, msr_c; + + if (!mask && !cpuc->acr_cfg_b[idx]) + return; + + if (idx < INTEL_PMC_IDX_FIXED) { + msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; + msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + } else { + msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; + msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; + idx -= INTEL_PMC_IDX_FIXED; + } + + if (cpuc->acr_cfg_b[idx] != mask) { + wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + cpuc->acr_cfg_b[idx] = mask; + } + /* Only need to update the reload value when there is a valid config value. 
*/ + if (mask && cpuc->acr_cfg_c[idx] != reload) { + wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + cpuc->acr_cfg_c[idx] = reload; + } +} + +static void intel_pmu_enable_acr(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_acr_event_group(event) || !event->attr.config2) { + /* + * The disable doesn't clear the ACR CFG register. + * Check and clear the ACR CFG register. + */ + intel_pmu_config_acr(hwc->idx, 0, 0); + return; + } + + intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period); +} + +DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); + static void intel_pmu_enable_event(struct perf_event *event) { u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE; @@ -2882,9 +2929,12 @@ static void intel_pmu_enable_event(struct perf_event *event) if (branch_sample_counters(event)) enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR; intel_set_masks(event, idx); + static_call_cond(intel_pmu_enable_acr_event)(event); __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + static_call_cond(intel_pmu_enable_acr_event)(event); + fallthrough; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); break; @@ -2902,6 +2952,31 @@ static void intel_pmu_enable_event(struct perf_event *event) } } +static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) +{ + struct perf_event *event, *leader; + int i, j, idx; + + for (i = 0; i < cpuc->n_events; i++) { + leader = cpuc->event_list[i]; + if (!is_acr_event_group(leader)) + continue; + + /* The ACR events must be contiguous. */ + for (j = i; j < cpuc->n_events; j++) { + event = cpuc->event_list[j]; + if (event->group_leader != leader->group_leader) + break; + for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { + if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + return; + __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); + } + } + i = j - 1; + } +} + void intel_pmu_late_setup(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2910,6 +2985,7 @@ void intel_pmu_late_setup(void) return; intel_pmu_pebs_late_setup(cpuc); + intel_pmu_acr_late_setup(cpuc); } static void intel_pmu_add_event(struct perf_event *event) @@ -2918,7 +2994,8 @@ static void intel_pmu_add_event(struct perf_event *event) intel_pmu_pebs_add(event); if (intel_pmu_needs_branch_stack(event)) intel_pmu_lbr_add(event); - if (is_pebs_counter_event_group(event)) + if (is_pebs_counter_event_group(event) || + is_acr_event_group(event)) this_cpu_ptr(&cpu_hw_events)->n_late_setup++; } @@ -4073,6 +4150,39 @@ static u64 intel_pmu_freq_start_period(struct perf_event *event) return start; } +static inline bool intel_pmu_has_acr(struct pmu *pmu) +{ + return !!hybrid(pmu, acr_cause_mask64); +} + +static bool intel_pmu_is_acr_group(struct perf_event *event) +{ + /* The group leader has the ACR flag set */ + if (is_acr_event_group(event)) + return true; + + /* The acr_mask is set */ + if (event->attr.config2) + return true; + + return false; +} + +static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, + u64 *cause_mask, int *num) +{ + event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64); + *cause_mask |= event->attr.config2; + *num += 1; +} + +static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event, + int idx, u64 cause_mask) +{ + if (test_bit(idx, (unsigned long *)&cause_mask)) + event->hw.dyn_constraint &= hybrid(event->pmu, 
acr_cause_mask64); +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -4197,6 +4307,94 @@ static int intel_pmu_hw_config(struct perf_event *event) event->attr.precise_ip) event->group_leader->hw.flags |= PERF_X86_EVENT_PEBS_CNTR; + if (intel_pmu_has_acr(event->pmu) && intel_pmu_is_acr_group(event)) { + struct perf_event *sibling, *leader = event->group_leader; + struct pmu *pmu = event->pmu; + bool has_sw_event = false; + int num = 0, idx = 0; + u64 cause_mask = 0; + + /* Not support perf metrics */ + if (is_metric_event(event)) + return -EINVAL; + + /* Not support freq mode */ + if (event->attr.freq) + return -EINVAL; + + /* PDist is not supported */ + if (event->attr.config2 && event->attr.precise_ip > 2) + return -EINVAL; + + /* The reload value cannot exceeds the max period */ + if (event->attr.sample_period > x86_pmu.max_period) + return -EINVAL; + /* + * The counter-constraints of each event cannot be finalized + * unless the whole group is scanned. However, it's hard + * to know whether the event is the last one of the group. + * Recalculate the counter-constraints for each event when + * adding a new event. + * + * The group is traversed twice, which may be optimized later. + * In the first round, + * - Find all events which do reload when other events + * overflow and set the corresponding counter-constraints + * - Add all events, which can cause other events reload, + * in the cause_mask + * - Error out if the number of events exceeds the HW limit + * - The ACR events must be contiguous. + * Error out if there are non-X86 events between ACR events. + * This is not a HW limit, but a SW limit. + * With the assumption, the intel_pmu_acr_late_setup() can + * easily convert the event idx to counter idx without + * traversing the whole event list. + */ + if (!is_x86_event(leader)) + return -EINVAL; + + if (leader->attr.config2) + intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) { + if (!is_x86_event(sibling)) { + has_sw_event = true; + continue; + } + if (!sibling->attr.config2) + continue; + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num); + } + } + if (leader != event && event->attr.config2) { + if (has_sw_event) + return -EINVAL; + intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num); + } + + if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) || + num > hweight64(hybrid(event->pmu, acr_cntr_mask64))) + return -EINVAL; + /* + * In the second round, apply the counter-constraints for + * the events which can cause other events reload. 
+ */ + intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask); + + if (leader->nr_siblings) { + for_each_sibling_event(sibling, leader) + intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask); + } + + if (leader != event) + intel_pmu_set_acr_caused_constr(event, idx, cause_mask); + + leader->hw.flags |= PERF_X86_EVENT_ACR; + } + if ((event->attr.type == PERF_TYPE_HARDWARE) || (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; @@ -6043,6 +6241,21 @@ td_is_visible(struct kobject *kobj, struct attribute *attr, int i) return attr->mode; } +PMU_FORMAT_ATTR(acr_mask, "config2:0-63"); + +static struct attribute *format_acr_attrs[] = { + &format_attr_acr_mask.attr, + NULL +}; + +static umode_t +acr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + + return intel_pmu_has_acr(dev_get_drvdata(dev)) ? attr->mode : 0; +} + static struct attribute_group group_events_td = { .name = "events", .is_visible = td_is_visible, @@ -6085,6 +6298,12 @@ static struct attribute_group group_format_evtsel_ext = { .is_visible = evtsel_ext_is_visible, }; +static struct attribute_group group_format_acr = { + .name = "format", + .attrs = format_acr_attrs, + .is_visible = acr_is_visible, +}; + static struct attribute_group group_default = { .attrs = intel_pmu_attrs, .is_visible = default_is_visible, @@ -6099,6 +6318,7 @@ static const struct attribute_group *attr_update[] = { &group_format_extra, &group_format_extra_skl, &group_format_evtsel_ext, + &group_format_acr, &group_default, NULL, }; @@ -6337,6 +6557,7 @@ static const struct attribute_group *hybrid_attr_update[] = { &group_caps_lbr, &hybrid_group_format_extra, &group_format_evtsel_ext, + &group_format_acr, &group_default, &hybrid_group_cpus, NULL, @@ -6520,6 +6741,7 @@ static __always_inline void intel_pmu_init_skt(struct pmu *pmu) intel_pmu_init_grt(pmu); hybrid(pmu, event_constraints) = intel_skt_event_constraints; hybrid(pmu, extra_regs) = intel_cmt_extra_regs; + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); } __init int intel_pmu_init(void) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 3252a83d4327..2bacdd34b671 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -127,6 +127,11 @@ static inline bool is_pebs_counter_event_group(struct perf_event *event) return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; } +static inline bool is_acr_event_group(struct perf_event *event) +{ + return event->group_leader->hw.flags & PERF_X86_EVENT_ACR; +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ @@ -294,6 +299,10 @@ struct cpu_hw_events { u64 fixed_ctrl_val; u64 active_fixed_ctrl_val; + /* Intel ACR configuration */ + u64 acr_cfg_b[X86_PMC_IDX_MAX]; + u64 acr_cfg_c[X86_PMC_IDX_MAX]; + /* * Intel LBR bits */ @@ -1120,6 +1129,7 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ .pmu_type = _pmu, \ } +int is_x86_event(struct perf_event *event); struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ee6fe147c828..21b04d9c29e2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -573,7 +573,11 @@ /* V6 PMON MSR range */ #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 #define 
MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 #define MSR_IA32_PMC_V6_STEP 4 /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f22255857f3f..717ed8691f5a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -158,6 +158,7 @@ struct hw_perf_event { union { struct { /* hardware */ u64 config; + u64 config1; u64 last_tag; u64 dyn_constraint; unsigned long config_base; -- Gitee From 1878402345225972e482ad9621f6da9522b5105d Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 15 Apr 2025 10:41:34 +0000 Subject: [PATCH 25/52] perf/x86/intel: Don't clear perf metrics overflow bit unconditionally mainline inclusion from mainline-v6.15-rc3 commit a5f5e1238f4ff919816f69e77d2537a48911767b category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a5f5e1238f4ff919816f69e77d2537a48911767b --------------------------- The below code would always unconditionally clear other status bits like perf metrics overflow bit once PEBS buffer overflows: status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; This is incorrect. Perf metrics overflow bit should be cleared only when fixed counter 3 in PEBS counter group. Otherwise perf metrics overflow could be missed to handle. Intel-SIG: commit a5f5e1238f4f perf/x86/intel: Don't clear perf metrics overflow bit unconditionally Closes: https://lore.kernel.org/all/20250225110012.GK31462@noisy.programming.kicks-ass.net/ Fixes: 7b2c05a15d29 ("perf/x86/intel: Generic support for hardware TopDown metrics") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250415104135.318169-1-dapeng1.mi@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fcb4e24de2d1..95555a3dd5e3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3126,7 +3126,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int bit; int handled = 0; - u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); inc_irq_stat(apic_perf_irqs); @@ -3170,7 +3169,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) handled++; x86_pmu_handle_guest_pebs(regs, &data); static_call(x86_pmu_drain_pebs)(regs, &data); - status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; /* * PMI throttle may be triggered, which stops the PEBS event. @@ -3181,6 +3179,15 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (pebs_enabled != cpuc->pebs_enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + + /* + * Above PEBS handler (PEBS counters snapshotting) has updated fixed + * counter 3 and perf metrics counts if they are in counter group, + * unnecessary to update again. 
+ */ + if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] && + is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS])) + status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT; } /* @@ -3200,6 +3207,8 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) static_call(intel_pmu_update_topdown_event)(NULL, NULL); } + status &= hybrid(cpuc->pmu, intel_ctrl); + /* * Checkpointed counters can lead to 'spurious' PMIs because the * rollback caused by the PMI will have cleared the overflow status -- Gitee From 482900885f989cbff502b0337642b99466de5b46 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 15 Apr 2025 11:44:08 +0000 Subject: [PATCH 26/52] perf/x86/intel: Add PMU support for Clearwater Forest mainline inclusion from mainline-v6.16-rc1 commit 48d66c89dce1e3687174608a5f5c31d5961a9916 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=48d66c89dce1e3687174608a5f5c31d5961a9916 --------------------------- From the PMU's perspective, Clearwater Forest is similar to the previous generation Sierra Forest. The key differences are the ARCH PEBS feature and the new added 3 fixed counters for topdown L1 metrics events. The ARCH PEBS is supported in the following patches. This patch provides support for basic perfmon features and 3 new added fixed counters. Intel-SIG: commit 48d66c89dce1 perf/x86/intel: Add PMU support for Clearwater Forest Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20250415114428.341182-3-dapeng1.mi@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 95555a3dd5e3..da75d79c0f20 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2210,6 +2210,18 @@ static struct extra_reg intel_cmt_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_skt, "event=0x9c,umask=0x01"); +EVENT_ATTR_STR(topdown-retiring, td_retiring_skt, "event=0xc2,umask=0x02"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound_skt, "event=0xa4,umask=0x02"); + +static struct attribute *skt_events_attrs[] = { + EVENT_PTR(td_fe_bound_skt), + EVENT_PTR(td_retiring_skt), + EVENT_PTR(td_bad_spec_cmt), + EVENT_PTR(td_be_bound_skt), + NULL, +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -7060,6 +7072,18 @@ __init int intel_pmu_init(void) name = "crestmont"; break; + case INTEL_FAM6_ATOM_DARKMONT_X: + intel_pmu_init_skt(NULL); + intel_pmu_pebs_data_source_cmt(); + x86_pmu.pebs_latency_data = cmt_latency_data; + x86_pmu.get_event_constraints = cmt_get_event_constraints; + td_attr = skt_events_attrs; + mem_attr = grt_mem_attrs; + extra_attr = cmt_format_attr; + pr_cont("Darkmont events, "); + name = "darkmont"; + break; + case INTEL_FAM6_WESTMERE: case INTEL_FAM6_WESTMERE_EP: case INTEL_FAM6_WESTMERE_EX: -- Gitee From 402a1e4d37ad3d89e31a758d2be7c69736b0f933 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 15 Apr 2025 11:44:09 +0000 Subject: [PATCH 27/52] perf/x86/intel: Parse CPUID archPerfmonExt leaves for non-hybrid CPUs mainline inclusion from mainline-v6.16-rc1 commit 
25c623f41438fafc6f63c45e2e141d7bcff78299 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=25c623f41438fafc6f63c45e2e141d7bcff78299 --------------------------- CPUID archPerfmonExt (0x23) leaves are supported to enumerate CPU level's PMU capabilities on non-hybrid processors as well. This patch supports to parse archPerfmonExt leaves on non-hybrid processors. Architectural PEBS leverages archPerfmonExt sub-leaves 0x4 and 0x5 to enumerate the PEBS capabilities as well. This patch is a precursor of the subsequent arch-PEBS enabling patches. Intel-SIG: commit 25c623f41438 perf/x86/intel: Parse CPUID archPerfmonExt leaves for non-hybrid CPUs Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20250415114428.341182-4-dapeng1.mi@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index da75d79c0f20..53da2672ce4b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5228,7 +5228,7 @@ static inline bool intel_pmu_broken_perf_cap(void) return false; } -static void update_pmu_cap(struct x86_hybrid_pmu *pmu) +static void update_pmu_cap(struct pmu *pmu) { unsigned int cntr, fixed_cntr, ecx, edx; union cpuid35_eax eax; @@ -5237,30 +5237,30 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu) cpuid(ARCH_PERFMON_EXT_LEAF, &eax.full, &ebx.full, &ecx, &edx); if (ebx.split.umask2) - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_UMASK2; + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_UMASK2; if (ebx.split.eq) - pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ; + hybrid(pmu, config_mask) |= ARCH_PERFMON_EVENTSEL_EQ; if (eax.split.cntr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF, &cntr, &fixed_cntr, &ecx, &edx); - pmu->cntr_mask64 = cntr; - pmu->fixed_cntr_mask64 = fixed_cntr; + hybrid(pmu, cntr_mask64) = cntr; + hybrid(pmu, fixed_cntr_mask64) = fixed_cntr; } if (eax.split.acr_subleaf) { cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF, &cntr, &fixed_cntr, &ecx, &edx); /* The mask of the counters which can be reloaded */ - pmu->acr_cntr_mask64 = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); + hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED); /* The mask of the counters which can cause a reload of reloadable counters */ - pmu->acr_cause_mask64 = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); + hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED); } if (!intel_pmu_broken_perf_cap()) { /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */ - rdmsrl(MSR_IA32_PERF_CAPABILITIES, pmu->intel_cap.capabilities); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities); } } @@ -5342,7 +5342,7 @@ static bool init_hybrid_pmu(int cpu) goto end; if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) - update_pmu_cap(pmu); + update_pmu_cap(&pmu->pmu); intel_pmu_check_hybrid_pmus(pmu); @@ -6817,6 +6817,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_events_mask = intel_pmu_pebs_mask(x86_pmu.cntr_mask64); x86_pmu.pebs_capable = PEBS_COUNTER_MASK; + x86_pmu.config_mask = X86_RAW_EVENT_MASK; /* * Quirk: v2 perfmon does not report fixed-purpose events, so @@ -7594,6 +7595,18 @@ __init int 
intel_pmu_init(void) x86_pmu.attr_update = hybrid_attr_update; } + /* + * The archPerfmonExt (0x23) includes an enhanced enumeration of + * PMU architectural features with a per-core view. For non-hybrid, + * each core has the same PMU capabilities. It's good enough to + * update the x86_pmu from the booting CPU. For hybrid, the x86_pmu + * is used to keep the common capabilities. Still keep the values + * from the leaf 0xa. The core specific update will be done later + * when a new type is online. + */ + if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT)) + update_pmu_cap(NULL); + intel_pmu_check_counters_mask(&x86_pmu.cntr_mask64, &x86_pmu.fixed_cntr_mask64, &x86_pmu.intel_ctrl); -- Gitee From f43d0eba8cec7fb523614ad9144b75fcabed5c8d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:15 -0700 Subject: [PATCH 28/52] perf/x86/intel: Check the X86 leader for pebs_counter_event_group mainline inclusion from mainline-v6.15-rc5 commit e9988ad7b1744991118ac348a804f9395368a284 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e9988ad7b1744991118ac348a804f9395368a284 --------------------------- The PEBS counters snapshotting group also requires a group flag in the leader. The leader must be a X86 event. Intel-SIG: commit e9988ad7b174 perf/x86/intel: Check the X86 leader for pebs_counter_event_group Fixes: e02e9b0374c3 ("perf/x86/intel: Support PEBS counters snapshotting") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-3-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2bacdd34b671..037aed44c412 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -124,7 +124,7 @@ static inline bool is_branch_counters_group(struct perf_event *event) static inline bool is_pebs_counter_event_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; + return check_leader_group(event->group_leader, PERF_X86_EVENT_PEBS_CNTR); } static inline bool is_acr_event_group(struct perf_event *event) -- Gitee From aff6c6234acfc9437deb5e5cef870b83c2d4312b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:16 -0700 Subject: [PATCH 29/52] perf/x86/intel: Check the X86 leader for ACR group mainline inclusion from mainline-v6.16-rc1 commit efd448540e6243dbdaf0a7e1bcf49734e73f3c93 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=efd448540e6243dbdaf0a7e1bcf49734e73f3c93 --------------------------- The auto counter reload group also requires a group flag in the leader. The leader must be a X86 event. 
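check_leader_group() itself is not shown in this hunk or in the previous one; a plausible shape for it, stated here as an assumption rather than a quotation of the tree, is simply a flag test guarded by an is_x86_event() check on the leader:

/*
 * Assumed helper, not quoted from this series: only look at hw.flags
 * when the group leader really is an X86 event, since hw.flags is
 * PMU-specific and means something else for non-X86 leaders.
 */
static inline bool check_leader_group(struct perf_event *leader, int flags)
{
	return is_x86_event(leader) ? !!(leader->hw.flags & flags) : false;
}

Both is_pebs_counter_event_group() and is_acr_event_group() then reduce to a call of this helper, as the two one-line diffs show.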
Intel-SIG: commit efd448540e62 perf/x86/intel: Check the X86 leader for ACR group Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-4-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 037aed44c412..e5237c126e4b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -129,7 +129,7 @@ static inline bool is_pebs_counter_event_group(struct perf_event *event) static inline bool is_acr_event_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_ACR; + return check_leader_group(event->group_leader, PERF_X86_EVENT_ACR); } struct amd_nb { -- Gitee From 28700d4a8f5d0cce58a8314f4d3e94b4628afd8b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:17 -0700 Subject: [PATCH 30/52] perf/x86: Optimize the is_x86_event mainline inclusion from mainline-v6.16-rc1 commit 3e830f657f69ab6a4822d72ec2f364c6d51beef8 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3e830f657f69ab6a4822d72ec2f364c6d51beef8 --------------------------- The current is_x86_event has to go through the hybrid_pmus list to find the matched pmu, then check if it's a X86 PMU and a X86 event. It's not necessary. The X86 PMU has a unique type ID on a non-hybrid machine, and a unique capability type. They are good enough to do the check. Intel-SIG: commit 3e830f657f69 perf/x86: Optimize the is_x86_event Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-5-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 094ef82424f9..504d799683f8 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -758,15 +758,16 @@ void x86_pmu_enable_all(int added) int is_x86_event(struct perf_event *event) { - int i; - - if (!is_hybrid()) - return event->pmu == &pmu; - - for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { - if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) - return true; - } + /* + * For a non-hybrid platforms, the type of X86 pmu is + * always PERF_TYPE_RAW. + * For a hybrid platform, the PERF_PMU_CAP_EXTENDED_HW_TYPE + * is a unique capability for the X86 PMU. + * Use them to detect a X86 event. 
+ */ + if (event->pmu->type == PERF_TYPE_RAW || + event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE) + return true; return false; } -- Gitee From 207408adbfd2ea3a8a762dc2acc61456fa51eaf4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 31/52] tools/include: Sync uapi/linux/perf.h with the kernel sources mainline inclusion from mainline-v6.16-rc1 commit 8ec9497d3ef34fab216e277eca5035811f06b421 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8ec9497d3ef34fab216e277eca5035811f06b421 -------------------------------- To pick up changes from: 608f6976c309 perf/x86/intel: Support new data source for Lunar Lake This should be used to beautify perf syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Intel-SIG: commit 8ec9497d3ef3 tools/include: Sync uapi/linux/perf.h with the kernel sources Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Ian Rogers Cc: Adrian Hunter Cc: "Liang, Kan" Cc: linux-perf-users@vger.kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Jason Zeng --- tools/include/uapi/linux/perf_event.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 3a64499b0f5d..4842c36fdf80 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -1349,12 +1349,14 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ +/* 0x7 available */ #define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ #define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ #define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ #define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ #define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ -- Gitee From e585c57fb7d4464319857eaca0b3eb99dee6e605 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Apr 2025 17:11:18 -0700 Subject: [PATCH 32/52] tools headers: Update the uapi/linux/perf_event.h copy with the kernel sources mainline inclusion from mainline-v6.15-rc3 commit ae62977331fcbf5c9a4260c88d9f94450db2d99a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ae62977331fcbf5c9a4260c88d9f94450db2d99a -------------------------------- To pick up the changes in: c53e14f1ea4a8f8d perf: Extend per event callchain limit to branch stack Addressing this perf tools build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Please see tools/include/uapi/README for further details. 
Intel-SIG: commit ae62977331fc tools headers: Update the uapi/linux/perf_event.h copy with the kernel sources Acked-by: Ingo Molnar Tested-by: Venkat Rao Bagalkote Link: https://lore.kernel.org/r/20250410001125.391820-4-namhyung@kernel.org Signed-off-by: Namhyung Kim Signed-off-by: Jason Zeng --- tools/include/uapi/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 4842c36fdf80..008403fe0b0b 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -385,6 +385,8 @@ enum perf_event_read_format { * * @sample_max_stack: Max number of frame pointers in a callchain, * should be < /proc/sys/kernel/perf_event_max_stack + * Max number of entries of branch stack + * should be < hardware limit */ struct perf_event_attr { -- Gitee From c722f64cd300b26f7889fb6a4fa0bed5210dcd4a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 22 May 2025 09:51:22 +0200 Subject: [PATCH 33/52] perf/uapi: Clean up a bit mainline inclusion from mainline-v6.16-rc1 commit 44889ff67cee7b9ee2d305690ce7a5488b137a66 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=44889ff67cee7b9ee2d305690ce7a5488b137a66 --------------------------- When applying a recent commit to the header I noticed that we have accumulated quite a bit of historic noise in this header, so do a bit of spring cleaning: - Define bitfields in a vertically aligned fashion, like perf_event_mmap_page::capabilities already does. This makes it easier to see the distribution and sizing of bits within a word, at a glance. The following is much more readable: __u64 cap_bit0 : 1, cap_bit0_is_deprecated : 1, cap_user_rdpmc : 1, cap_user_time : 1, cap_user_time_zero : 1, cap_user_time_short : 1, cap_____res : 58; Than: __u64 cap_bit0:1, cap_bit0_is_deprecated:1, cap_user_rdpmc:1, cap_user_time:1, cap_user_time_zero:1, cap_user_time_short:1, cap_____res:58; So convert all bitfield definitions from the latter style to the former style. - Fix typos and grammar - Fix capitalization - Remove whitespace noise - Harmonize the definitions of various generations and groups of PERF_MEM_ ABI values. - Vertically align all definitions and assignments to the same column (48), as the first definition (enum perf_type_id), throughout the entire header. - And in general make the code and comments to be more in sync with each other and to be more readable overall. No change in functionality. Copy the changes over to tools/include/uapi/linux/perf_event.h. 
Intel-SIG: commit 44889ff67cee perf/uapi: Clean up a bit Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Cc: Ian Rogers Link: https://lore.kernel.org/r/20250521221529.2547099-1-irogers@google.com Signed-off-by: Jason Zeng --- include/uapi/linux/perf_event.h | 652 +++++++++++++------------- tools/include/uapi/linux/perf_event.h | 652 +++++++++++++------------- 2 files changed, 662 insertions(+), 642 deletions(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 008403fe0b0b..365184d931ad 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -39,18 +39,21 @@ enum perf_type_id { /* * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA * AA: hardware event ID * EEEEEEEE: PMU type ID + * * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB * BB: hardware cache ID * CC: hardware cache op ID * DD: hardware cache op result ID * EEEEEEEE: PMU type ID - * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + * + * If the PMU type ID is 0, PERF_TYPE_RAW will be applied. */ -#define PERF_PMU_TYPE_SHIFT 32 -#define PERF_HW_EVENT_MASK 0xffffffff +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff /* * Generalized performance event event_id types, used by the @@ -112,7 +115,7 @@ enum perf_hw_cache_op_result_id { /* * Special "software" events provided by the kernel, even if the hardware * does not support performance events. These events measure various - * physical and sw events of the kernel (and allow the profiling of them as + * physical and SW events of the kernel (and allow the profiling of them as * well): */ enum perf_sw_ids { @@ -167,8 +170,9 @@ enum perf_event_sample_format { }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) + /* - * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * Values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set. * * If the user does not pass priv level information via branch_sample_type, * the kernel uses the event's priv level. Branch and event priv levels do @@ -178,20 +182,20 @@ enum perf_event_sample_format { * of branches and therefore it supersedes all the other types. 
*/ enum perf_branch_sample_type_shift { - PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ - PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ - PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ - - PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ - PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ - PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ - PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ - PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ - PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ - PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ - PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* CALL/RET stack */ PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ @@ -210,96 +214,95 @@ enum perf_branch_sample_type_shift { }; enum perf_branch_sample_type { - PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, - PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, - PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, - PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, - PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, - PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, - PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, - PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, - PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, - PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, - PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, + 
PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, - PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, - PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, - PERF_SAMPLE_BRANCH_TYPE_SAVE = - 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, - PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, - PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, - PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, - PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; /* - * Common flow change classification + * Common control flow change classifications: */ enum { - PERF_BR_UNKNOWN = 0, /* unknown */ - PERF_BR_COND = 1, /* conditional */ - PERF_BR_UNCOND = 2, /* unconditional */ - PERF_BR_IND = 3, /* indirect */ - PERF_BR_CALL = 4, /* function call */ - PERF_BR_IND_CALL = 5, /* indirect function call */ - PERF_BR_RET = 6, /* function return */ - PERF_BR_SYSCALL = 7, /* syscall */ - PERF_BR_SYSRET = 8, /* syscall return */ - PERF_BR_COND_CALL = 9, /* conditional function call */ - PERF_BR_COND_RET = 10, /* conditional function return */ - PERF_BR_ERET = 11, /* exception return */ - PERF_BR_IRQ = 12, /* irq */ - PERF_BR_SERROR = 13, /* system error */ - PERF_BR_NO_TX = 14, /* not in transaction */ - PERF_BR_EXTEND_ABI = 15, /* extend ABI */ + PERF_BR_UNKNOWN = 0, /* Unknown */ + PERF_BR_COND = 1, /* Conditional */ + PERF_BR_UNCOND = 2, /* Unconditional */ + PERF_BR_IND = 3, /* Indirect */ + PERF_BR_CALL = 4, /* Function call */ + PERF_BR_IND_CALL = 5, /* Indirect function call */ + PERF_BR_RET = 6, /* Function return */ + PERF_BR_SYSCALL = 7, /* Syscall */ + PERF_BR_SYSRET = 8, /* Syscall return */ + PERF_BR_COND_CALL = 9, /* Conditional function call */ + PERF_BR_COND_RET = 10, /* Conditional function return */ + PERF_BR_ERET = 11, /* Exception return */ + PERF_BR_IRQ = 12, /* IRQ */ + PERF_BR_SERROR = 13, /* System error */ + PERF_BR_NO_TX = 14, /* Not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* Extend ABI */ PERF_BR_MAX, }; /* - * Common branch speculation outcome classification + * Common branch speculation outcome classifications: */ enum { - PERF_BR_SPEC_NA = 0, /* Not available */ - PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ - PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ - PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ PERF_BR_SPEC_MAX, }; enum { - PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ - PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ - PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ - PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ - PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ - PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ - 
PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ - PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ PERF_BR_NEW_MAX, }; enum { - PERF_BR_PRIV_UNKNOWN = 0, - PERF_BR_PRIV_USER = 1, - PERF_BR_PRIV_KERNEL = 2, - PERF_BR_PRIV_HV = 3, + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, }; -#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 -#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 -#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 -#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 -#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ @@ -310,9 +313,9 @@ enum { * Values to determine ABI of the registers dump. */ enum perf_sample_regs_abi { - PERF_SAMPLE_REGS_ABI_NONE = 0, - PERF_SAMPLE_REGS_ABI_32 = 1, - PERF_SAMPLE_REGS_ABI_64 = 2, + PERF_SAMPLE_REGS_ABI_NONE = 0, + PERF_SAMPLE_REGS_ABI_32 = 1, + PERF_SAMPLE_REGS_ABI_64 = 2, }; /* @@ -320,21 +323,21 @@ enum perf_sample_regs_abi { * abort events. Multiple bits can be set. */ enum { - PERF_TXN_ELISION = (1 << 0), /* From elision */ - PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ - PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ - PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ - PERF_TXN_RETRY = (1 << 4), /* Retry possible */ - PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ - PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ - PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ + PERF_TXN_ELISION = (1 << 0), /* From elision */ + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ + PERF_TXN_ASYNC = (1 << 3), /* Instruction is not related */ + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ - PERF_TXN_MAX = (1 << 8), /* non-ABI */ + PERF_TXN_MAX = (1 << 8), /* non-ABI */ - /* bits 32..63 are reserved for the abort code */ + /* Bits 32..63 are reserved for the abort code */ - PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), - PERF_TXN_ABORT_SHIFT = 32, + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), + PERF_TXN_ABORT_SHIFT = 32, }; /* @@ -369,24 +372,22 @@ enum perf_event_read_format { PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ }; -#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ -#define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ -#define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ -#define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ - /* add: sample_stack_user */ -#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ -#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ -#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ 
-#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ -#define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ +#define PERF_ATTR_SIZE_VER0 64 /* Size of first published 'struct perf_event_attr' */ +#define PERF_ATTR_SIZE_VER1 72 /* Add: config2 */ +#define PERF_ATTR_SIZE_VER2 80 /* Add: branch_sample_type */ +#define PERF_ATTR_SIZE_VER3 96 /* Add: sample_regs_user */ + /* Add: sample_stack_user */ +#define PERF_ATTR_SIZE_VER4 104 /* Add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* Add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */ +#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */ /* - * Hardware event_id to monitor via a performance monitoring event: - * - * @sample_max_stack: Max number of frame pointers in a callchain, - * should be < /proc/sys/kernel/perf_event_max_stack - * Max number of entries of branch stack - * should be < hardware limit + * 'struct perf_event_attr' contains various attributes that define + * a performance event - most of them hardware related configuration + * details, but also a lot of behavioral switches and values implemented + * by the kernel. */ struct perf_event_attr { @@ -396,7 +397,7 @@ struct perf_event_attr { __u32 type; /* - * Size of the attr structure, for fwd/bwd compat. + * Size of the attr structure, for forward/backwards compatibility. */ __u32 size; @@ -451,21 +452,21 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid : 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - write_backward : 1, /* Write ring buffer from end to beginning */ + write_backward : 1, /* write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ ksymbol : 1, /* include ksymbol events */ - bpf_event : 1, /* include bpf events */ + bpf_event : 1, /* include BPF events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ - build_id : 1, /* use build id in mmap2 events */ + build_id : 1, /* use build ID in mmap2 events */ inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ __reserved_1 : 26; union { - __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_events; /* wake up every n events */ __u32 wakeup_watermark; /* bytes before wakeup */ }; @@ -474,13 +475,13 @@ struct perf_event_attr { __u64 bp_addr; __u64 kprobe_func; /* for perf_kprobe */ __u64 uprobe_path; /* for perf_uprobe */ - __u64 config1; /* extension of config */ + __u64 config1; /* extension of config */ }; union { __u64 bp_len; - __u64 kprobe_addr; /* when kprobe_func == NULL */ + __u64 kprobe_addr; /* when kprobe_func == NULL */ __u64 probe_offset; /* for perf_[k,u]probe */ - __u64 config2; /* extension of config1 */ + __u64 config2; /* extension of config1 */ }; __u64 branch_sample_type; /* enum perf_branch_sample_type */ @@ -510,7 +511,16 @@ struct perf_event_attr { * Wakeup watermark for AUX area */ __u32 aux_watermark; + + /* + * Max number of frame pointers in a callchain, should be + * lower than /proc/sys/kernel/perf_event_max_stack. + * + * Max number of entries of branch stack should be lower + * than the hardware limit. 
+ */ __u16 sample_max_stack; + __u16 __reserved_2; __u32 aux_sample_size; __u32 __reserved_3; @@ -528,7 +538,7 @@ struct perf_event_attr { /* * Structure used by below PERF_EVENT_IOC_QUERY_BPF command - * to query bpf programs attached to the same perf tracepoint + * to query BPF programs attached to the same perf tracepoint * as the given perf event. */ struct perf_event_query_bpf { @@ -550,21 +560,21 @@ struct perf_event_query_bpf { /* * Ioctls that can be done on a perf event fd: */ -#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) -#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) -#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) -#define PERF_EVENT_IOC_RESET _IO ('$', 3) -#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) -#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) -#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) -#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) -#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) -#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW ('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW ('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR ('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW ('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW ('$', 9, __u32) #define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) -#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW ('$', 11, struct perf_event_attr *) enum perf_event_ioc_flags { - PERF_IOC_FLAG_GROUP = 1U << 0, + PERF_IOC_FLAG_GROUP = 1U << 0, }; /* @@ -575,7 +585,7 @@ struct perf_event_mmap_page { __u32 compat_version; /* lowest version this is compat with */ /* - * Bits needed to read the hw events in user-space. + * Bits needed to read the HW events in user-space. * * u32 seq, time_mult, time_shift, index, width; * u64 count, enabled, running; @@ -613,7 +623,7 @@ struct perf_event_mmap_page { __u32 index; /* hardware event identifier */ __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ - __u64 time_running; /* time event on cpu */ + __u64 time_running; /* time event on CPU */ union { __u64 capabilities; struct { @@ -641,7 +651,7 @@ struct perf_event_mmap_page { /* * If cap_usr_time the below fields can be used to compute the time - * delta since time_enabled (in ns) using rdtsc or similar. + * delta since time_enabled (in ns) using RDTSC or similar. * * u64 quot, rem; * u64 delta; @@ -714,7 +724,7 @@ struct perf_event_mmap_page { * after reading this value. * * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data, after issueing + * written by user-space to reflect the last read data, after issuing * an smp_mb() to separate the data read from the ->data_tail store. * In this case the kernel will not over-write unread data. * @@ -730,7 +740,7 @@ struct perf_event_mmap_page { /* * AUX area is defined by aux_{offset,size} fields that should be set - * by the userspace, so that + * by the user-space, so that * * aux_offset >= data_offset + data_size * @@ -804,7 +814,7 @@ struct perf_event_mmap_page { * Indicates that thread was preempted in TASK_RUNNING state. 
* * PERF_RECORD_MISC_MMAP_BUILD_ID: - * Indicates that mmap2 event carries build id data. + * Indicates that mmap2 event carries build ID data. */ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) @@ -815,26 +825,26 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) struct perf_event_header { - __u32 type; - __u16 misc; - __u16 size; + __u32 type; + __u16 misc; + __u16 size; }; struct perf_ns_link_info { - __u64 dev; - __u64 ino; + __u64 dev; + __u64 ino; }; enum { - NET_NS_INDEX = 0, - UTS_NS_INDEX = 1, - IPC_NS_INDEX = 2, - PID_NS_INDEX = 3, - USER_NS_INDEX = 4, - MNT_NS_INDEX = 5, - CGROUP_NS_INDEX = 6, - - NR_NAMESPACES, /* number of available namespaces */ + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ }; enum perf_event_type { @@ -850,11 +860,11 @@ enum perf_event_type { * optional fields being ignored. * * struct sample_id { - * { u32 pid, tid; } && PERF_SAMPLE_TID - * { u64 time; } && PERF_SAMPLE_TIME - * { u64 id; } && PERF_SAMPLE_ID - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID - * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 id; } && PERF_SAMPLE_IDENTIFIER * } && perf_event_attr::sample_id_all * @@ -865,7 +875,7 @@ enum perf_event_type { /* * The MMAP events record the PROT_EXEC mappings so that we can - * correlate userspace IPs to code. They have the following structure: + * correlate user-space IPs to code. They have the following structure: * * struct { * struct perf_event_header header; @@ -875,7 +885,7 @@ enum perf_event_type { * u64 len; * u64 pgoff; * char filename[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP = 1, @@ -885,7 +895,7 @@ enum perf_event_type { * struct perf_event_header header; * u64 id; * u64 lost; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_LOST = 2, @@ -896,7 +906,7 @@ enum perf_event_type { * * u32 pid, tid; * char comm[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_COMM = 3, @@ -907,7 +917,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_EXIT = 4, @@ -918,7 +928,7 @@ enum perf_event_type { * u64 time; * u64 id; * u64 stream_id; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_THROTTLE = 5, @@ -930,7 +940,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_FORK = 7, @@ -941,7 +951,7 @@ enum perf_event_type { * u32 pid, tid; * * struct read_format values; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_READ = 8, @@ -996,12 +1006,12 @@ enum perf_event_type { * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS * } && PERF_SAMPLE_BRANCH_STACK * - * { u64 abi; # enum perf_sample_regs_abi - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER * - * { u64 size; - * char data[size]; - * u64 dyn_size; } && PERF_SAMPLE_STACK_USER 
+ * { u64 size; + * char data[size]; + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * * { union perf_sample_weight * { @@ -1061,7 +1071,7 @@ enum perf_event_type { * }; * u32 prot, flags; * char filename[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP2 = 10, @@ -1070,12 +1080,12 @@ enum perf_event_type { * Records that new data landed in the AUX buffer part. * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u64 aux_offset; - * u64 aux_size; + * u64 aux_offset; + * u64 aux_size; * u64 flags; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_AUX = 11, @@ -1158,7 +1168,7 @@ enum perf_event_type { PERF_RECORD_KSYMBOL = 17, /* - * Record bpf events: + * Record BPF events: * enum perf_bpf_event_type { * PERF_BPF_EVENT_UNKNOWN = 0, * PERF_BPF_EVENT_PROG_LOAD = 1, @@ -1236,181 +1246,181 @@ enum perf_record_ksymbol_type { #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) enum perf_bpf_event_type { - PERF_BPF_EVENT_UNKNOWN = 0, - PERF_BPF_EVENT_PROG_LOAD = 1, - PERF_BPF_EVENT_PROG_UNLOAD = 2, - PERF_BPF_EVENT_MAX, /* non-ABI */ + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ }; -#define PERF_MAX_STACK_DEPTH 127 -#define PERF_MAX_CONTEXTS_PER_STACK 8 +#define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 enum perf_callchain_context { - PERF_CONTEXT_HV = (__u64)-32, - PERF_CONTEXT_KERNEL = (__u64)-128, - PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, - PERF_CONTEXT_GUEST = (__u64)-2048, - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, - PERF_CONTEXT_GUEST_USER = (__u64)-2560, + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, - PERF_CONTEXT_MAX = (__u64)-4095, + PERF_CONTEXT_MAX = (__u64)-4095, }; /** * PERF_RECORD_AUX::flags bits */ -#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ -#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ -#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ -#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_TRUNCATED 0x0001 /* Record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x0002 /* Snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x0004 /* Record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x0008 /* Sample collided with another */ #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ /* CoreSight PMU AUX buffer formats */ -#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ -#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ -#define PERF_FLAG_FD_NO_GROUP (1UL << 0) -#define PERF_FLAG_FD_OUTPUT (1UL << 1) -#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ -#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ +#define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#define PERF_FLAG_FD_OUTPUT (1UL << 1) +#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup ID, per-CPU mode only */ +#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ #if 
defined(__LITTLE_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_op:5, /* type of opcode */ - mem_lvl:14, /* memory hierarchy level */ - mem_snoop:5, /* snoop mode */ - mem_lock:2, /* lock instr */ - mem_dtlb:7, /* tlb access */ - mem_lvl_num:4, /* memory hierarchy level number */ - mem_remote:1, /* remote */ - mem_snoopx:2, /* snoop mode, ext */ - mem_blk:3, /* access blocked */ - mem_hops:3, /* hop level */ - mem_rsvd:18; + __u64 mem_op : 5, /* Type of opcode */ + mem_lvl : 14, /* Memory hierarchy level */ + mem_snoop : 5, /* Snoop mode */ + mem_lock : 2, /* Lock instr */ + mem_dtlb : 7, /* TLB access */ + mem_lvl_num : 4, /* Memory hierarchy level number */ + mem_remote : 1, /* Remote */ + mem_snoopx : 2, /* Snoop mode, ext */ + mem_blk : 3, /* Access blocked */ + mem_hops : 3, /* Hop level */ + mem_rsvd : 18; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:18, - mem_hops:3, /* hop level */ - mem_blk:3, /* access blocked */ - mem_snoopx:2, /* snoop mode, ext */ - mem_remote:1, /* remote */ - mem_lvl_num:4, /* memory hierarchy level number */ - mem_dtlb:7, /* tlb access */ - mem_lock:2, /* lock instr */ - mem_snoop:5, /* snoop mode */ - mem_lvl:14, /* memory hierarchy level */ - mem_op:5; /* type of opcode */ + __u64 mem_rsvd : 18, + mem_hops : 3, /* Hop level */ + mem_blk : 3, /* Access blocked */ + mem_snoopx : 2, /* Snoop mode, ext */ + mem_remote : 1, /* Remote */ + mem_lvl_num : 4, /* Memory hierarchy level number */ + mem_dtlb : 7, /* TLB access */ + mem_lock : 2, /* Lock instr */ + mem_snoop : 5, /* Snoop mode */ + mem_lvl : 14, /* Memory hierarchy level */ + mem_op : 5; /* Type of opcode */ }; }; #else -#error "Unknown endianness" +# error "Unknown endianness" #endif -/* type of opcode (load/store/prefetch,code) */ -#define PERF_MEM_OP_NA 0x01 /* not available */ -#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ -#define PERF_MEM_OP_STORE 0x04 /* store instruction */ -#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ -#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ -#define PERF_MEM_OP_SHIFT 0 +/* Type of memory opcode: */ +#define PERF_MEM_OP_NA 0x0001 /* Not available */ +#define PERF_MEM_OP_LOAD 0x0002 /* Load instruction */ +#define PERF_MEM_OP_STORE 0x0004 /* Store instruction */ +#define PERF_MEM_OP_PFETCH 0x0008 /* Prefetch */ +#define PERF_MEM_OP_EXEC 0x0010 /* Code (execution) */ +#define PERF_MEM_OP_SHIFT 0 /* - * PERF_MEM_LVL_* namespace being depricated to some extent in the + * The PERF_MEM_LVL_* namespace is being deprecated to some extent in * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. - * Supporting this namespace inorder to not break defined ABIs. + * We support this namespace in order to not break defined ABIs. 
* - * memory hierarchy (memory level, hit or miss) + * Memory hierarchy (memory level, hit or miss) */ -#define PERF_MEM_LVL_NA 0x01 /* not available */ -#define PERF_MEM_LVL_HIT 0x02 /* hit level */ -#define PERF_MEM_LVL_MISS 0x04 /* miss level */ -#define PERF_MEM_LVL_L1 0x08 /* L1 */ -#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ -#define PERF_MEM_LVL_L2 0x20 /* L2 */ -#define PERF_MEM_LVL_L3 0x40 /* L3 */ -#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ -#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ -#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ -#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ -#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ -#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ -#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ -#define PERF_MEM_LVL_SHIFT 5 - -#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ -#define PERF_MEM_REMOTE_SHIFT 37 - -#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ -#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ -#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ -#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ -#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ -/* 0x7 available */ -#define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ -#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ -#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ -#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ -#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ -#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ -#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ - -#define PERF_MEM_LVLNUM_SHIFT 33 - -/* snoop mode */ -#define PERF_MEM_SNOOP_NA 0x01 /* not available */ -#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ -#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ -#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ -#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ -#define PERF_MEM_SNOOP_SHIFT 19 - -#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ -#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ -#define PERF_MEM_SNOOPX_SHIFT 38 - -/* locked instruction */ -#define PERF_MEM_LOCK_NA 0x01 /* not available */ -#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ -#define PERF_MEM_LOCK_SHIFT 24 +#define PERF_MEM_LVL_NA 0x0001 /* Not available */ +#define PERF_MEM_LVL_HIT 0x0002 /* Hit level */ +#define PERF_MEM_LVL_MISS 0x0004 /* Miss level */ +#define PERF_MEM_LVL_L1 0x0008 /* L1 */ +#define PERF_MEM_LVL_LFB 0x0010 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x0020 /* L2 */ +#define PERF_MEM_LVL_L3 0x0040 /* L3 */ +#define PERF_MEM_LVL_LOC_RAM 0x0080 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x0100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x0200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x0400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x0800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +#define PERF_MEM_REMOTE_REMOTE 0x0001 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x0001 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x0002 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x0003 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ +#define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ +/* 0x007 available */ +#define PERF_MEM_LVLNUM_UNC 0x0008 /* 
Uncached */ +#define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ +#define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x000b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x000c /* LFB / L1 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_RAM 0x000d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x000e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x000f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + +/* Snoop mode */ +#define PERF_MEM_SNOOP_NA 0x0001 /* Not available */ +#define PERF_MEM_SNOOP_NONE 0x0002 /* No snoop */ +#define PERF_MEM_SNOOP_HIT 0x0004 /* Snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x0008 /* Snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x0010 /* Snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +#define PERF_MEM_SNOOPX_FWD 0x0001 /* Forward */ +#define PERF_MEM_SNOOPX_PEER 0x0002 /* Transfer from peer */ +#define PERF_MEM_SNOOPX_SHIFT 38 + +/* Locked instruction */ +#define PERF_MEM_LOCK_NA 0x0001 /* Not available */ +#define PERF_MEM_LOCK_LOCKED 0x0002 /* Locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 /* TLB access */ -#define PERF_MEM_TLB_NA 0x01 /* not available */ -#define PERF_MEM_TLB_HIT 0x02 /* hit level */ -#define PERF_MEM_TLB_MISS 0x04 /* miss level */ -#define PERF_MEM_TLB_L1 0x08 /* L1 */ -#define PERF_MEM_TLB_L2 0x10 /* L2 */ -#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ -#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ -#define PERF_MEM_TLB_SHIFT 26 +#define PERF_MEM_TLB_NA 0x0001 /* Not available */ +#define PERF_MEM_TLB_HIT 0x0002 /* Hit level */ +#define PERF_MEM_TLB_MISS 0x0004 /* Miss level */ +#define PERF_MEM_TLB_L1 0x0008 /* L1 */ +#define PERF_MEM_TLB_L2 0x0010 /* L2 */ +#define PERF_MEM_TLB_WK 0x0020 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x0040 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 /* Access blocked */ -#define PERF_MEM_BLK_NA 0x01 /* not available */ -#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ -#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ -#define PERF_MEM_BLK_SHIFT 40 - -/* hop level */ -#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ -#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ -#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ -#define PERF_MEM_HOPS_3 0x04 /* remote board */ +#define PERF_MEM_BLK_NA 0x0001 /* Not available */ +#define PERF_MEM_BLK_DATA 0x0002 /* Data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x0004 /* Address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + +/* Hop level */ +#define PERF_MEM_HOPS_0 0x0001 /* Remote core, same node */ +#define PERF_MEM_HOPS_1 0x0002 /* Remote node, same socket */ +#define PERF_MEM_HOPS_2 0x0003 /* Remote socket, same board */ +#define PERF_MEM_HOPS_3 0x0004 /* Remote board */ /* 5-7 available */ -#define PERF_MEM_HOPS_SHIFT 43 +#define PERF_MEM_HOPS_SHIFT 43 #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) /* - * single taken branch record layout: + * Layout of single taken branch records: * * from: source instruction (may not always be a branch insn) * to: branch target @@ -1429,37 +1439,37 @@ union perf_mem_data_src { struct perf_branch_entry { __u64 from; __u64 to; - __u64 mispred:1, /* target mispredicted */ - predicted:1,/* target predicted */ - in_tx:1, /* in transaction */ - abort:1, /* transaction abort */ - cycles:16, /* cycle count to last branch */ - type:4, /* branch type */ - spec:2, /* branch speculation info */ - new_type:4, /* additional branch type */ - priv:3, /* privilege level */ - reserved:31; + __u64 mispred : 1, /* 
target mispredicted */ + predicted : 1, /* target predicted */ + in_tx : 1, /* in transaction */ + abort : 1, /* transaction abort */ + cycles : 16, /* cycle count to last branch */ + type : 4, /* branch type */ + spec : 2, /* branch speculation info */ + new_type : 4, /* additional branch type */ + priv : 3, /* privilege level */ + reserved : 31; }; /* Size of used info bits in struct perf_branch_entry */ #define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 union perf_sample_weight { - __u64 full; + __u64 full; #if defined(__LITTLE_ENDIAN_BITFIELD) struct { - __u32 var1_dw; - __u16 var2_w; - __u16 var3_w; + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; }; #elif defined(__BIG_ENDIAN_BITFIELD) struct { - __u16 var3_w; - __u16 var2_w; - __u32 var1_dw; + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; }; #else -#error "Unknown endianness" +# error "Unknown endianness" #endif }; diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 008403fe0b0b..365184d931ad 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -39,18 +39,21 @@ enum perf_type_id { /* * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA * AA: hardware event ID * EEEEEEEE: PMU type ID + * * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB * BB: hardware cache ID * CC: hardware cache op ID * DD: hardware cache op result ID * EEEEEEEE: PMU type ID - * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + * + * If the PMU type ID is 0, PERF_TYPE_RAW will be applied. */ -#define PERF_PMU_TYPE_SHIFT 32 -#define PERF_HW_EVENT_MASK 0xffffffff +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff /* * Generalized performance event event_id types, used by the @@ -112,7 +115,7 @@ enum perf_hw_cache_op_result_id { /* * Special "software" events provided by the kernel, even if the hardware * does not support performance events. These events measure various - * physical and sw events of the kernel (and allow the profiling of them as + * physical and SW events of the kernel (and allow the profiling of them as * well): */ enum perf_sw_ids { @@ -167,8 +170,9 @@ enum perf_event_sample_format { }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) + /* - * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * Values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set. * * If the user does not pass priv level information via branch_sample_type, * the kernel uses the event's priv level. Branch and event priv levels do @@ -178,20 +182,20 @@ enum perf_event_sample_format { * of branches and therefore it supersedes all the other types. 
*/ enum perf_branch_sample_type_shift { - PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ - PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ - PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ - - PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ - PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ - PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ - PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ - PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ - PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ - PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ - PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* CALL/RET stack */ PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ @@ -210,96 +214,95 @@ enum perf_branch_sample_type_shift { }; enum perf_branch_sample_type { - PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, - PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, - PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, - PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, - PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, - PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, - PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, - PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, - PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, - PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, - PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, - PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, - PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, + 
PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, - PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, - PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, - PERF_SAMPLE_BRANCH_TYPE_SAVE = - 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, - PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, - PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, - PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, - PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; /* - * Common flow change classification + * Common control flow change classifications: */ enum { - PERF_BR_UNKNOWN = 0, /* unknown */ - PERF_BR_COND = 1, /* conditional */ - PERF_BR_UNCOND = 2, /* unconditional */ - PERF_BR_IND = 3, /* indirect */ - PERF_BR_CALL = 4, /* function call */ - PERF_BR_IND_CALL = 5, /* indirect function call */ - PERF_BR_RET = 6, /* function return */ - PERF_BR_SYSCALL = 7, /* syscall */ - PERF_BR_SYSRET = 8, /* syscall return */ - PERF_BR_COND_CALL = 9, /* conditional function call */ - PERF_BR_COND_RET = 10, /* conditional function return */ - PERF_BR_ERET = 11, /* exception return */ - PERF_BR_IRQ = 12, /* irq */ - PERF_BR_SERROR = 13, /* system error */ - PERF_BR_NO_TX = 14, /* not in transaction */ - PERF_BR_EXTEND_ABI = 15, /* extend ABI */ + PERF_BR_UNKNOWN = 0, /* Unknown */ + PERF_BR_COND = 1, /* Conditional */ + PERF_BR_UNCOND = 2, /* Unconditional */ + PERF_BR_IND = 3, /* Indirect */ + PERF_BR_CALL = 4, /* Function call */ + PERF_BR_IND_CALL = 5, /* Indirect function call */ + PERF_BR_RET = 6, /* Function return */ + PERF_BR_SYSCALL = 7, /* Syscall */ + PERF_BR_SYSRET = 8, /* Syscall return */ + PERF_BR_COND_CALL = 9, /* Conditional function call */ + PERF_BR_COND_RET = 10, /* Conditional function return */ + PERF_BR_ERET = 11, /* Exception return */ + PERF_BR_IRQ = 12, /* IRQ */ + PERF_BR_SERROR = 13, /* System error */ + PERF_BR_NO_TX = 14, /* Not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* Extend ABI */ PERF_BR_MAX, }; /* - * Common branch speculation outcome classification + * Common branch speculation outcome classifications: */ enum { - PERF_BR_SPEC_NA = 0, /* Not available */ - PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ - PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ - PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ PERF_BR_SPEC_MAX, }; enum { - PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ - PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ - PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ - PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ - PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ - PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ - 
PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ - PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ PERF_BR_NEW_MAX, }; enum { - PERF_BR_PRIV_UNKNOWN = 0, - PERF_BR_PRIV_USER = 1, - PERF_BR_PRIV_KERNEL = 2, - PERF_BR_PRIV_HV = 3, + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, }; -#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 -#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 -#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 -#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 -#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ @@ -310,9 +313,9 @@ enum { * Values to determine ABI of the registers dump. */ enum perf_sample_regs_abi { - PERF_SAMPLE_REGS_ABI_NONE = 0, - PERF_SAMPLE_REGS_ABI_32 = 1, - PERF_SAMPLE_REGS_ABI_64 = 2, + PERF_SAMPLE_REGS_ABI_NONE = 0, + PERF_SAMPLE_REGS_ABI_32 = 1, + PERF_SAMPLE_REGS_ABI_64 = 2, }; /* @@ -320,21 +323,21 @@ enum perf_sample_regs_abi { * abort events. Multiple bits can be set. */ enum { - PERF_TXN_ELISION = (1 << 0), /* From elision */ - PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ - PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ - PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ - PERF_TXN_RETRY = (1 << 4), /* Retry possible */ - PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ - PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ - PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ + PERF_TXN_ELISION = (1 << 0), /* From elision */ + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ + PERF_TXN_ASYNC = (1 << 3), /* Instruction is not related */ + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ - PERF_TXN_MAX = (1 << 8), /* non-ABI */ + PERF_TXN_MAX = (1 << 8), /* non-ABI */ - /* bits 32..63 are reserved for the abort code */ + /* Bits 32..63 are reserved for the abort code */ - PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), - PERF_TXN_ABORT_SHIFT = 32, + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), + PERF_TXN_ABORT_SHIFT = 32, }; /* @@ -369,24 +372,22 @@ enum perf_event_read_format { PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ }; -#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ -#define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ -#define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ -#define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ - /* add: sample_stack_user */ -#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ -#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ -#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ 
-#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ -#define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ +#define PERF_ATTR_SIZE_VER0 64 /* Size of first published 'struct perf_event_attr' */ +#define PERF_ATTR_SIZE_VER1 72 /* Add: config2 */ +#define PERF_ATTR_SIZE_VER2 80 /* Add: branch_sample_type */ +#define PERF_ATTR_SIZE_VER3 96 /* Add: sample_regs_user */ + /* Add: sample_stack_user */ +#define PERF_ATTR_SIZE_VER4 104 /* Add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* Add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */ +#define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */ /* - * Hardware event_id to monitor via a performance monitoring event: - * - * @sample_max_stack: Max number of frame pointers in a callchain, - * should be < /proc/sys/kernel/perf_event_max_stack - * Max number of entries of branch stack - * should be < hardware limit + * 'struct perf_event_attr' contains various attributes that define + * a performance event - most of them hardware related configuration + * details, but also a lot of behavioral switches and values implemented + * by the kernel. */ struct perf_event_attr { @@ -396,7 +397,7 @@ struct perf_event_attr { __u32 type; /* - * Size of the attr structure, for fwd/bwd compat. + * Size of the attr structure, for forward/backwards compatibility. */ __u32 size; @@ -451,21 +452,21 @@ struct perf_event_attr { comm_exec : 1, /* flag comm events that are due to an exec */ use_clockid : 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ - write_backward : 1, /* Write ring buffer from end to beginning */ + write_backward : 1, /* write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ ksymbol : 1, /* include ksymbol events */ - bpf_event : 1, /* include bpf events */ + bpf_event : 1, /* include BPF events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ - build_id : 1, /* use build id in mmap2 events */ + build_id : 1, /* use build ID in mmap2 events */ inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ __reserved_1 : 26; union { - __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_events; /* wake up every n events */ __u32 wakeup_watermark; /* bytes before wakeup */ }; @@ -474,13 +475,13 @@ struct perf_event_attr { __u64 bp_addr; __u64 kprobe_func; /* for perf_kprobe */ __u64 uprobe_path; /* for perf_uprobe */ - __u64 config1; /* extension of config */ + __u64 config1; /* extension of config */ }; union { __u64 bp_len; - __u64 kprobe_addr; /* when kprobe_func == NULL */ + __u64 kprobe_addr; /* when kprobe_func == NULL */ __u64 probe_offset; /* for perf_[k,u]probe */ - __u64 config2; /* extension of config1 */ + __u64 config2; /* extension of config1 */ }; __u64 branch_sample_type; /* enum perf_branch_sample_type */ @@ -510,7 +511,16 @@ struct perf_event_attr { * Wakeup watermark for AUX area */ __u32 aux_watermark; + + /* + * Max number of frame pointers in a callchain, should be + * lower than /proc/sys/kernel/perf_event_max_stack. + * + * Max number of entries of branch stack should be lower + * than the hardware limit. 
+ */ __u16 sample_max_stack; + __u16 __reserved_2; __u32 aux_sample_size; __u32 __reserved_3; @@ -528,7 +538,7 @@ struct perf_event_attr { /* * Structure used by below PERF_EVENT_IOC_QUERY_BPF command - * to query bpf programs attached to the same perf tracepoint + * to query BPF programs attached to the same perf tracepoint * as the given perf event. */ struct perf_event_query_bpf { @@ -550,21 +560,21 @@ struct perf_event_query_bpf { /* * Ioctls that can be done on a perf event fd: */ -#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) -#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) -#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) -#define PERF_EVENT_IOC_RESET _IO ('$', 3) -#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) -#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) -#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) -#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) -#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) -#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW ('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW ('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR ('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW ('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW ('$', 9, __u32) #define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) -#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW ('$', 11, struct perf_event_attr *) enum perf_event_ioc_flags { - PERF_IOC_FLAG_GROUP = 1U << 0, + PERF_IOC_FLAG_GROUP = 1U << 0, }; /* @@ -575,7 +585,7 @@ struct perf_event_mmap_page { __u32 compat_version; /* lowest version this is compat with */ /* - * Bits needed to read the hw events in user-space. + * Bits needed to read the HW events in user-space. * * u32 seq, time_mult, time_shift, index, width; * u64 count, enabled, running; @@ -613,7 +623,7 @@ struct perf_event_mmap_page { __u32 index; /* hardware event identifier */ __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ - __u64 time_running; /* time event on cpu */ + __u64 time_running; /* time event on CPU */ union { __u64 capabilities; struct { @@ -641,7 +651,7 @@ struct perf_event_mmap_page { /* * If cap_usr_time the below fields can be used to compute the time - * delta since time_enabled (in ns) using rdtsc or similar. + * delta since time_enabled (in ns) using RDTSC or similar. * * u64 quot, rem; * u64 delta; @@ -714,7 +724,7 @@ struct perf_event_mmap_page { * after reading this value. * * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data, after issueing + * written by user-space to reflect the last read data, after issuing * an smp_mb() to separate the data read from the ->data_tail store. * In this case the kernel will not over-write unread data. * @@ -730,7 +740,7 @@ struct perf_event_mmap_page { /* * AUX area is defined by aux_{offset,size} fields that should be set - * by the userspace, so that + * by the user-space, so that * * aux_offset >= data_offset + data_size * @@ -804,7 +814,7 @@ struct perf_event_mmap_page { * Indicates that thread was preempted in TASK_RUNNING state. 
* * PERF_RECORD_MISC_MMAP_BUILD_ID: - * Indicates that mmap2 event carries build id data. + * Indicates that mmap2 event carries build ID data. */ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) @@ -815,26 +825,26 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) struct perf_event_header { - __u32 type; - __u16 misc; - __u16 size; + __u32 type; + __u16 misc; + __u16 size; }; struct perf_ns_link_info { - __u64 dev; - __u64 ino; + __u64 dev; + __u64 ino; }; enum { - NET_NS_INDEX = 0, - UTS_NS_INDEX = 1, - IPC_NS_INDEX = 2, - PID_NS_INDEX = 3, - USER_NS_INDEX = 4, - MNT_NS_INDEX = 5, - CGROUP_NS_INDEX = 6, - - NR_NAMESPACES, /* number of available namespaces */ + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ }; enum perf_event_type { @@ -850,11 +860,11 @@ enum perf_event_type { * optional fields being ignored. * * struct sample_id { - * { u32 pid, tid; } && PERF_SAMPLE_TID - * { u64 time; } && PERF_SAMPLE_TIME - * { u64 id; } && PERF_SAMPLE_ID - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID - * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 id; } && PERF_SAMPLE_IDENTIFIER * } && perf_event_attr::sample_id_all * @@ -865,7 +875,7 @@ enum perf_event_type { /* * The MMAP events record the PROT_EXEC mappings so that we can - * correlate userspace IPs to code. They have the following structure: + * correlate user-space IPs to code. They have the following structure: * * struct { * struct perf_event_header header; @@ -875,7 +885,7 @@ enum perf_event_type { * u64 len; * u64 pgoff; * char filename[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP = 1, @@ -885,7 +895,7 @@ enum perf_event_type { * struct perf_event_header header; * u64 id; * u64 lost; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_LOST = 2, @@ -896,7 +906,7 @@ enum perf_event_type { * * u32 pid, tid; * char comm[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_COMM = 3, @@ -907,7 +917,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_EXIT = 4, @@ -918,7 +928,7 @@ enum perf_event_type { * u64 time; * u64 id; * u64 stream_id; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_THROTTLE = 5, @@ -930,7 +940,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_FORK = 7, @@ -941,7 +951,7 @@ enum perf_event_type { * u32 pid, tid; * * struct read_format values; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_READ = 8, @@ -996,12 +1006,12 @@ enum perf_event_type { * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS * } && PERF_SAMPLE_BRANCH_STACK * - * { u64 abi; # enum perf_sample_regs_abi - * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER * - * { u64 size; - * char data[size]; - * u64 dyn_size; } && PERF_SAMPLE_STACK_USER 
+ * { u64 size; + * char data[size]; + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * * { union perf_sample_weight * { @@ -1061,7 +1071,7 @@ enum perf_event_type { * }; * u32 prot, flags; * char filename[]; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_MMAP2 = 10, @@ -1070,12 +1080,12 @@ enum perf_event_type { * Records that new data landed in the AUX buffer part. * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u64 aux_offset; - * u64 aux_size; + * u64 aux_offset; + * u64 aux_size; * u64 flags; - * struct sample_id sample_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_AUX = 11, @@ -1158,7 +1168,7 @@ enum perf_event_type { PERF_RECORD_KSYMBOL = 17, /* - * Record bpf events: + * Record BPF events: * enum perf_bpf_event_type { * PERF_BPF_EVENT_UNKNOWN = 0, * PERF_BPF_EVENT_PROG_LOAD = 1, @@ -1236,181 +1246,181 @@ enum perf_record_ksymbol_type { #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) enum perf_bpf_event_type { - PERF_BPF_EVENT_UNKNOWN = 0, - PERF_BPF_EVENT_PROG_LOAD = 1, - PERF_BPF_EVENT_PROG_UNLOAD = 2, - PERF_BPF_EVENT_MAX, /* non-ABI */ + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ }; -#define PERF_MAX_STACK_DEPTH 127 -#define PERF_MAX_CONTEXTS_PER_STACK 8 +#define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 enum perf_callchain_context { - PERF_CONTEXT_HV = (__u64)-32, - PERF_CONTEXT_KERNEL = (__u64)-128, - PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, - PERF_CONTEXT_GUEST = (__u64)-2048, - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, - PERF_CONTEXT_GUEST_USER = (__u64)-2560, + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, - PERF_CONTEXT_MAX = (__u64)-4095, + PERF_CONTEXT_MAX = (__u64)-4095, }; /** * PERF_RECORD_AUX::flags bits */ -#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ -#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ -#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ -#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_TRUNCATED 0x0001 /* Record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x0002 /* Snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x0004 /* Record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x0008 /* Sample collided with another */ #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ /* CoreSight PMU AUX buffer formats */ -#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ -#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ -#define PERF_FLAG_FD_NO_GROUP (1UL << 0) -#define PERF_FLAG_FD_OUTPUT (1UL << 1) -#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ -#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ +#define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#define PERF_FLAG_FD_OUTPUT (1UL << 1) +#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup ID, per-CPU mode only */ +#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ #if 
defined(__LITTLE_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_op:5, /* type of opcode */ - mem_lvl:14, /* memory hierarchy level */ - mem_snoop:5, /* snoop mode */ - mem_lock:2, /* lock instr */ - mem_dtlb:7, /* tlb access */ - mem_lvl_num:4, /* memory hierarchy level number */ - mem_remote:1, /* remote */ - mem_snoopx:2, /* snoop mode, ext */ - mem_blk:3, /* access blocked */ - mem_hops:3, /* hop level */ - mem_rsvd:18; + __u64 mem_op : 5, /* Type of opcode */ + mem_lvl : 14, /* Memory hierarchy level */ + mem_snoop : 5, /* Snoop mode */ + mem_lock : 2, /* Lock instr */ + mem_dtlb : 7, /* TLB access */ + mem_lvl_num : 4, /* Memory hierarchy level number */ + mem_remote : 1, /* Remote */ + mem_snoopx : 2, /* Snoop mode, ext */ + mem_blk : 3, /* Access blocked */ + mem_hops : 3, /* Hop level */ + mem_rsvd : 18; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:18, - mem_hops:3, /* hop level */ - mem_blk:3, /* access blocked */ - mem_snoopx:2, /* snoop mode, ext */ - mem_remote:1, /* remote */ - mem_lvl_num:4, /* memory hierarchy level number */ - mem_dtlb:7, /* tlb access */ - mem_lock:2, /* lock instr */ - mem_snoop:5, /* snoop mode */ - mem_lvl:14, /* memory hierarchy level */ - mem_op:5; /* type of opcode */ + __u64 mem_rsvd : 18, + mem_hops : 3, /* Hop level */ + mem_blk : 3, /* Access blocked */ + mem_snoopx : 2, /* Snoop mode, ext */ + mem_remote : 1, /* Remote */ + mem_lvl_num : 4, /* Memory hierarchy level number */ + mem_dtlb : 7, /* TLB access */ + mem_lock : 2, /* Lock instr */ + mem_snoop : 5, /* Snoop mode */ + mem_lvl : 14, /* Memory hierarchy level */ + mem_op : 5; /* Type of opcode */ }; }; #else -#error "Unknown endianness" +# error "Unknown endianness" #endif -/* type of opcode (load/store/prefetch,code) */ -#define PERF_MEM_OP_NA 0x01 /* not available */ -#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ -#define PERF_MEM_OP_STORE 0x04 /* store instruction */ -#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ -#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ -#define PERF_MEM_OP_SHIFT 0 +/* Type of memory opcode: */ +#define PERF_MEM_OP_NA 0x0001 /* Not available */ +#define PERF_MEM_OP_LOAD 0x0002 /* Load instruction */ +#define PERF_MEM_OP_STORE 0x0004 /* Store instruction */ +#define PERF_MEM_OP_PFETCH 0x0008 /* Prefetch */ +#define PERF_MEM_OP_EXEC 0x0010 /* Code (execution) */ +#define PERF_MEM_OP_SHIFT 0 /* - * PERF_MEM_LVL_* namespace being depricated to some extent in the + * The PERF_MEM_LVL_* namespace is being deprecated to some extent in * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. - * Supporting this namespace inorder to not break defined ABIs. + * We support this namespace in order to not break defined ABIs. 
* - * memory hierarchy (memory level, hit or miss) + * Memory hierarchy (memory level, hit or miss) */ -#define PERF_MEM_LVL_NA 0x01 /* not available */ -#define PERF_MEM_LVL_HIT 0x02 /* hit level */ -#define PERF_MEM_LVL_MISS 0x04 /* miss level */ -#define PERF_MEM_LVL_L1 0x08 /* L1 */ -#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ -#define PERF_MEM_LVL_L2 0x20 /* L2 */ -#define PERF_MEM_LVL_L3 0x40 /* L3 */ -#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ -#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ -#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ -#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ -#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ -#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ -#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ -#define PERF_MEM_LVL_SHIFT 5 - -#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ -#define PERF_MEM_REMOTE_SHIFT 37 - -#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ -#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ -#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ -#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -#define PERF_MEM_LVLNUM_L2_MHB 0x05 /* L2 Miss Handling Buffer */ -#define PERF_MEM_LVLNUM_MSC 0x06 /* Memory-side Cache */ -/* 0x7 available */ -#define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ -#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ -#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ -#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ -#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB / L1 Miss Handling Buffer */ -#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ -#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ -#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ - -#define PERF_MEM_LVLNUM_SHIFT 33 - -/* snoop mode */ -#define PERF_MEM_SNOOP_NA 0x01 /* not available */ -#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ -#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ -#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ -#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ -#define PERF_MEM_SNOOP_SHIFT 19 - -#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ -#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ -#define PERF_MEM_SNOOPX_SHIFT 38 - -/* locked instruction */ -#define PERF_MEM_LOCK_NA 0x01 /* not available */ -#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ -#define PERF_MEM_LOCK_SHIFT 24 +#define PERF_MEM_LVL_NA 0x0001 /* Not available */ +#define PERF_MEM_LVL_HIT 0x0002 /* Hit level */ +#define PERF_MEM_LVL_MISS 0x0004 /* Miss level */ +#define PERF_MEM_LVL_L1 0x0008 /* L1 */ +#define PERF_MEM_LVL_LFB 0x0010 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x0020 /* L2 */ +#define PERF_MEM_LVL_L3 0x0040 /* L3 */ +#define PERF_MEM_LVL_LOC_RAM 0x0080 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x0100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x0200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x0400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x0800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +#define PERF_MEM_REMOTE_REMOTE 0x0001 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x0001 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x0002 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x0003 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x0004 /* L4 */ +#define PERF_MEM_LVLNUM_L2_MHB 0x0005 /* L2 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_MSC 0x0006 /* Memory-side Cache */ +/* 0x007 available */ +#define PERF_MEM_LVLNUM_UNC 0x0008 /* 
Uncached */ +#define PERF_MEM_LVLNUM_CXL 0x0009 /* CXL */ +#define PERF_MEM_LVLNUM_IO 0x000a /* I/O */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x000b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x000c /* LFB / L1 Miss Handling Buffer */ +#define PERF_MEM_LVLNUM_RAM 0x000d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x000e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x000f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + +/* Snoop mode */ +#define PERF_MEM_SNOOP_NA 0x0001 /* Not available */ +#define PERF_MEM_SNOOP_NONE 0x0002 /* No snoop */ +#define PERF_MEM_SNOOP_HIT 0x0004 /* Snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x0008 /* Snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x0010 /* Snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +#define PERF_MEM_SNOOPX_FWD 0x0001 /* Forward */ +#define PERF_MEM_SNOOPX_PEER 0x0002 /* Transfer from peer */ +#define PERF_MEM_SNOOPX_SHIFT 38 + +/* Locked instruction */ +#define PERF_MEM_LOCK_NA 0x0001 /* Not available */ +#define PERF_MEM_LOCK_LOCKED 0x0002 /* Locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 /* TLB access */ -#define PERF_MEM_TLB_NA 0x01 /* not available */ -#define PERF_MEM_TLB_HIT 0x02 /* hit level */ -#define PERF_MEM_TLB_MISS 0x04 /* miss level */ -#define PERF_MEM_TLB_L1 0x08 /* L1 */ -#define PERF_MEM_TLB_L2 0x10 /* L2 */ -#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ -#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ -#define PERF_MEM_TLB_SHIFT 26 +#define PERF_MEM_TLB_NA 0x0001 /* Not available */ +#define PERF_MEM_TLB_HIT 0x0002 /* Hit level */ +#define PERF_MEM_TLB_MISS 0x0004 /* Miss level */ +#define PERF_MEM_TLB_L1 0x0008 /* L1 */ +#define PERF_MEM_TLB_L2 0x0010 /* L2 */ +#define PERF_MEM_TLB_WK 0x0020 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x0040 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 /* Access blocked */ -#define PERF_MEM_BLK_NA 0x01 /* not available */ -#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ -#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ -#define PERF_MEM_BLK_SHIFT 40 - -/* hop level */ -#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ -#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ -#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ -#define PERF_MEM_HOPS_3 0x04 /* remote board */ +#define PERF_MEM_BLK_NA 0x0001 /* Not available */ +#define PERF_MEM_BLK_DATA 0x0002 /* Data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x0004 /* Address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + +/* Hop level */ +#define PERF_MEM_HOPS_0 0x0001 /* Remote core, same node */ +#define PERF_MEM_HOPS_1 0x0002 /* Remote node, same socket */ +#define PERF_MEM_HOPS_2 0x0003 /* Remote socket, same board */ +#define PERF_MEM_HOPS_3 0x0004 /* Remote board */ /* 5-7 available */ -#define PERF_MEM_HOPS_SHIFT 43 +#define PERF_MEM_HOPS_SHIFT 43 #define PERF_MEM_S(a, s) \ (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) /* - * single taken branch record layout: + * Layout of single taken branch records: * * from: source instruction (may not always be a branch insn) * to: branch target @@ -1429,37 +1439,37 @@ union perf_mem_data_src { struct perf_branch_entry { __u64 from; __u64 to; - __u64 mispred:1, /* target mispredicted */ - predicted:1,/* target predicted */ - in_tx:1, /* in transaction */ - abort:1, /* transaction abort */ - cycles:16, /* cycle count to last branch */ - type:4, /* branch type */ - spec:2, /* branch speculation info */ - new_type:4, /* additional branch type */ - priv:3, /* privilege level */ - reserved:31; + __u64 mispred : 1, /* 
target mispredicted */ + predicted : 1, /* target predicted */ + in_tx : 1, /* in transaction */ + abort : 1, /* transaction abort */ + cycles : 16, /* cycle count to last branch */ + type : 4, /* branch type */ + spec : 2, /* branch speculation info */ + new_type : 4, /* additional branch type */ + priv : 3, /* privilege level */ + reserved : 31; }; /* Size of used info bits in struct perf_branch_entry */ #define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 union perf_sample_weight { - __u64 full; + __u64 full; #if defined(__LITTLE_ENDIAN_BITFIELD) struct { - __u32 var1_dw; - __u16 var2_w; - __u16 var3_w; + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; }; #elif defined(__BIG_ENDIAN_BITFIELD) struct { - __u16 var3_w; - __u16 var2_w; - __u32 var1_dw; + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; }; #else -#error "Unknown endianness" +# error "Unknown endianness" #endif }; -- Gitee From 011f1a996d7ce990eee3f9d287348b4ccb6f15ef Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 11 Dec 2024 08:11:46 -0800 Subject: [PATCH 34/52] perf/x86/intel/uncore: Add Clearwater Forest support mainline inclusion from mainline-v6.13-rc5 commit b6ccddd6fe1fd49c7a82b6fbed01cccad21a29c7 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b6ccddd6fe1fd49c7a82b6fbed01cccad21a29c7 --------------------------- From the perspective of the uncore PMU, the Clearwater Forest is the same as the previous Sierra Forest. The only difference is the event list, which will be supported in the perf tool later. Intel-SIG: commit b6ccddd6fe1f perf/x86/intel/uncore: Add Clearwater Forest support Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241211161146.235253-1-kan.liang@linux.intel.com [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index b2453052dad8..84d6508e807b 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1894,6 +1894,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_DARKMONT_X, &gnr_uncore_init), {}, }; MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); -- Gitee From d1841d25612e78df7009e61a7e3200d7dabe9309 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 7 Jul 2025 13:17:47 -0700 Subject: [PATCH 35/52] perf/x86/intel/uncore: Support MSR portal for discovery tables mainline inclusion from mainline-v6.17-rc1 commit cf002dafedd06241175e4dbce39ba90a4b75822c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cf002dafedd06241175e4dbce39ba90a4b75822c --------------------------- Starting from the Panther Lake, the discovery table mechanism is also supported in client platforms. The difference is that the portal of the global discovery table is retrieved from an MSR. The layout of discovery tables are the same as the server platforms. Factor out __parse_discovery_table() to parse discover tables. The uncore PMON is Die scope. 
Need to parse the discovery tables for each die. Intel-SIG: commit cf002dafedd0 perf/x86/intel/uncore: Support MSR portal for discovery tables Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250707201750.616527-2-kan.liang@linux.intel.com [Dapeng Mi: Replace rdmsrq_safe_on_cpu() with rdmsrl_safe_on_cpu()] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore_discovery.c | 87 ++++++++++++++++++------ arch/x86/events/intel/uncore_discovery.h | 3 + 2 files changed, 70 insertions(+), 20 deletions(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 571e44b49691..79656d4d46df 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -273,32 +273,15 @@ uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore) return false; } -static int parse_discovery_table(struct pci_dev *dev, int die, - u32 bar_offset, bool *parsed, - int *ignore) +static int __parse_discovery_table(resource_size_t addr, int die, + bool *parsed, int *ignore) { struct uncore_global_discovery global; struct uncore_unit_discovery unit; void __iomem *io_addr; - resource_size_t addr; unsigned long size; - u32 val; int i; - pci_read_config_dword(dev, bar_offset, &val); - - if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64) - return -EINVAL; - - addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK); -#ifdef CONFIG_PHYS_ADDR_T_64BIT - if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { - u32 val2; - - pci_read_config_dword(dev, bar_offset + 4, &val2); - addr |= ((resource_size_t)val2) << 32; - } -#endif size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE; io_addr = ioremap(addr, size); if (!io_addr) @@ -341,7 +324,32 @@ static int parse_discovery_table(struct pci_dev *dev, int die, return 0; } -bool intel_uncore_has_discovery_tables(int *ignore) +static int parse_discovery_table(struct pci_dev *dev, int die, + u32 bar_offset, bool *parsed, + int *ignore) +{ + resource_size_t addr; + u32 val; + + pci_read_config_dword(dev, bar_offset, &val); + + if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64) + return -EINVAL; + + addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK); +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { + u32 val2; + + pci_read_config_dword(dev, bar_offset + 4, &val2); + addr |= ((resource_size_t)val2) << 32; + } +#endif + + return __parse_discovery_table(addr, die, parsed, ignore); +} + +static bool intel_uncore_has_discovery_tables_pci(int *ignore) { u32 device, val, entry_id, bar_offset; int die, dvsec = 0, ret = true; @@ -390,6 +398,45 @@ bool intel_uncore_has_discovery_tables(int *ignore) return ret; } +static bool intel_uncore_has_discovery_tables_msr(int *ignore) +{ + unsigned long *die_mask; + bool parsed = false; + int cpu, die; + u64 base; + + die_mask = kcalloc(BITS_TO_LONGS(uncore_max_dies()), + sizeof(unsigned long), GFP_KERNEL); + if (!die_mask) + return false; + + cpus_read_lock(); + for_each_online_cpu(cpu) { + die = topology_logical_die_id(cpu); + if (__test_and_set_bit(die, die_mask)) + continue; + + if (rdmsrl_safe_on_cpu(cpu, UNCORE_DISCOVERY_MSR, &base)) + continue; + + if (!base) + continue; + + __parse_discovery_table(base, die, &parsed, ignore); + } + + cpus_read_unlock(); + + kfree(die_mask); + return parsed; +} + +bool intel_uncore_has_discovery_tables(int *ignore) 
+{ + return intel_uncore_has_discovery_tables_msr(ignore) || + intel_uncore_has_discovery_tables_pci(ignore); +} + void intel_uncore_clear_discovery_tables(void) { struct intel_uncore_discovery_type *type, *next; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 0e94aa7db8e7..690f737e6837 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +/* Store the full address of the global discovery table */ +#define UNCORE_DISCOVERY_MSR 0x201e + /* Generic device ID of a discovery table device */ #define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 /* Capability ID for a discovery table device */ -- Gitee From 88688e7dfc40e94d961749d46be1d0f68297b788 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 7 Jul 2025 13:17:48 -0700 Subject: [PATCH 36/52] perf/x86/intel/uncore: Support customized MMIO map size mainline inclusion from mainline-v6.17-rc1 commit fca24bf2b6b619770d7f1222c0284791d7766239 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fca24bf2b6b619770d7f1222c0284791d7766239 --------------------------- For a server platform, the MMIO map size is always 0x4000. However, a client platform may have a smaller map size. Make the map size customizable. Intel-SIG: commit fca24bf2b6b6 perf/x86/intel/uncore: Support customized MMIO map size Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dapeng Mi Link: https://lore.kernel.org/r/20250707201750.616527-3-kan.liang@linux.intel.com Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- arch/x86/events/intel/uncore_discovery.c | 2 +- arch/x86/events/intel/uncore_snbep.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 79656d4d46df..a543a54fd714 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -650,7 +650,7 @@ void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) } addr = unit->addr; - box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) { pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n", type->type_id, unit->id, (unsigned long long)addr); diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 41105cf1e632..90f22a79c065 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6491,6 +6491,8 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type, to_type->get_topology = from_type->get_topology; if (from_type->cleanup_mapping) to_type->cleanup_mapping = from_type->cleanup_mapping; + if (from_type->mmio_map_size) + to_type->mmio_map_size = from_type->mmio_map_size; } static struct intel_uncore_type ** -- Gitee From 11973459b6f614920e7d8fab826f9c431c14eab2 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 11 Feb 2025 13:30:15 -0800 Subject: [PATCH 37/52] perf vendor events: Add Clearwaterforest events mainline inclusion from mainline-v6.15-rc1 commit e415c1493fa1e93afaec697385b8952d932c41bc category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e415c1493fa1e93afaec697385b8952d932c41bc --------------------------- Add events v1.00. Bring in the events from: https://github.com/intel/perfmon/tree/main/CWF/events Intel-SIG: commit e415c1493fa1 perf vendor events: Add Clearwaterforest events Co-developed-by: Caleb Biggers Signed-off-by: Caleb Biggers Acked-by: Kan Liang Signed-off-by: Ian Rogers Tested-by: Thomas Falcon Link: https://lore.kernel.org/r/20250211213031.114209-9-irogers@google.com Signed-off-by: Namhyung Kim [Dapeng Mi: resolve conflict and amend commit log] Signed-off-by: Dapeng Mi Signed-off-by: Jason Zeng --- .../arch/x86/clearwaterforest/cache.json | 144 ++++++++++++++++++ .../arch/x86/clearwaterforest/counter.json | 7 + .../arch/x86/clearwaterforest/frontend.json | 18 +++ .../arch/x86/clearwaterforest/memory.json | 22 +++ .../arch/x86/clearwaterforest/other.json | 22 +++ .../arch/x86/clearwaterforest/pipeline.json | 113 ++++++++++++++ .../x86/clearwaterforest/virtual-memory.json | 29 ++++ tools/perf/pmu-events/arch/x86/mapfile.csv | 1 + 8 files changed, 356 insertions(+) create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/cache.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/counter.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/frontend.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/memory.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/other.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/pipeline.json create mode 100644 tools/perf/pmu-events/arch/x86/clearwaterforest/virtual-memory.json diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/cache.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/cache.json new file mode 100644 index 000000000000..875361b30f1d --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/cache.json @@ -0,0 +1,144 @@ +[ + { + "BriefDescription": "Counts the number of cacheable memory requests that miss in the LLC. Counts on a per core basis.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x2e", + "EventName": "LONGEST_LAT_CACHE.MISS", + "PublicDescription": "Counts the number of cacheable memory requests that miss in the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the core has access to an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. Counts on a per core basis.", + "SampleAfterValue": "1000003", + "UMask": "0x41" + }, + { + "BriefDescription": "Counts the number of cacheable memory requests that access the LLC. Counts on a per core basis.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x2e", + "EventName": "LONGEST_LAT_CACHE.REFERENCE", + "PublicDescription": "Counts the number of cacheable memory requests that access the Last Level Cache (LLC). Requests include demand loads, reads for ownership (RFO), instruction fetches and L1 HW prefetches. If the core has access to an L3 cache, the LLC is the L3 cache, otherwise it is the L2 cache. 
Counts on a per core basis.", + "SampleAfterValue": "1000003", + "UMask": "0x4f" + }, + { + "BriefDescription": "Counts the number of load ops retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.ALL_LOADS", + "SampleAfterValue": "1000003", + "UMask": "0x81" + }, + { + "BriefDescription": "Counts the number of store ops retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.ALL_STORES", + "SampleAfterValue": "1000003", + "UMask": "0x82" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024", + "MSRIndex": "0x3F6", + "MSRValue": "0x400", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128", + "MSRIndex": "0x3F6", + "MSRValue": "0x80", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16", + "MSRIndex": "0x3F6", + "MSRValue": "0x10", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048", + "MSRIndex": "0x3F6", + "MSRValue": "0x800", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256", + "MSRIndex": "0x3F6", + "MSRValue": "0x100", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32", + "MSRIndex": "0x3F6", + "MSRValue": "0x20", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4", + "MSRIndex": "0x3F6", + "MSRValue": "0x4", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512", + "MSRIndex": "0x3F6", + "MSRValue": "0x200", + "SampleAfterValue": "1000003", 
+ "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64", + "MSRIndex": "0x3F6", + "MSRValue": "0x40", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", + "Counter": "0,1", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8", + "MSRIndex": "0x3F6", + "MSRValue": "0x8", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY", + "SampleAfterValue": "1000003", + "UMask": "0x6" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/counter.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/counter.json new file mode 100644 index 000000000000..a0eaf5b6f2bc --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/counter.json @@ -0,0 +1,7 @@ +[ + { + "Unit": "core", + "CountersNumFixed": "3", + "CountersNumGeneric": "39" + } +] \ No newline at end of file diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/frontend.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/frontend.json new file mode 100644 index 000000000000..7a9250e5c8f2 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/frontend.json @@ -0,0 +1,18 @@ +[ + { + "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x80", + "EventName": "ICACHE.ACCESSES", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { + "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump and the instruction cache registers bytes are not present. 
-", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x80", + "EventName": "ICACHE.MISSES", + "SampleAfterValue": "1000003", + "UMask": "0x2" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/memory.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/memory.json new file mode 100644 index 000000000000..f5007e56f39b --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/memory.json @@ -0,0 +1,22 @@ +[ + { + "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x33FBFC00001", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.L3_MISS", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x33FBFC00002", + "SampleAfterValue": "100003", + "UMask": "0x1" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/other.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/other.json new file mode 100644 index 000000000000..80454e497f83 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/other.json @@ -0,0 +1,22 @@ +[ + { + "BriefDescription": "Counts demand data reads that have any type of response.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10001", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xB7", + "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10002", + "SampleAfterValue": "100003", + "UMask": "0x1" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/pipeline.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/pipeline.json new file mode 100644 index 000000000000..6a5faa704b85 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/pipeline.json @@ -0,0 +1,113 @@ +[ + { + "BriefDescription": "Counts the total number of branch instructions retired for all branch types.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.ALL_BRANCHES", + "PublicDescription": "Counts the total number of instructions in which the instruction pointer (IP) of the processor is resteered due to a branch instruction and the branch instruction successfully retires. All branch type instructions are accounted for.", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Counts the total number of mispredicted branch instructions retired for all branch types.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.ALL_BRANCHES", + "PublicDescription": "Counts the total number of mispredicted branch instructions retired. All branch type instructions are accounted for. Prediction of the branch target address enables the processor to begin executing instructions before the non-speculative execution path is known. 
The branch prediction unit (BPU) predicts the target address based on the instruction pointer (IP) of the branch and on the execution path through which execution reached this IP. A branch misprediction occurs when the prediction is wrong, and results in discarding all instructions executed in the speculative path and re-fetching from the correct path.", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.", + "Counter": "Fixed counter 1", + "EventName": "CPU_CLK_UNHALTED.CORE", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD_P]", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.CORE_P", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles.", + "Counter": "Fixed counter 2", + "EventName": "CPU_CLK_UNHALTED.REF_TSC", + "SampleAfterValue": "1000003", + "UMask": "0x3" + }, + { + "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", + "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses a programmable general purpose performance counter.", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.", + "Counter": "Fixed counter 1", + "EventName": "CPU_CLK_UNHALTED.THREAD", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.CORE_P]", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x3c", + "EventName": "CPU_CLK_UNHALTED.THREAD_P", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of instructions retired.", + "Counter": "Fixed counter 0", + "EventName": "INST_RETIRED.ANY", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts the number of instructions retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc0", + "EventName": "INST_RETIRED.ANY_P", + "SampleAfterValue": "1000003" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", + "Counter": "36", + "EventName": "TOPDOWN_BAD_SPECULATION.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x5" + }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls. [This event is alias to TOPDOWN_BE_BOUND.ALL_P]", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xa4", + "EventName": "TOPDOWN_BE_BOUND.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of retirement slots not consumed due to backend stalls. 
[This event is alias to TOPDOWN_BE_BOUND.ALL]", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xa4", + "EventName": "TOPDOWN_BE_BOUND.ALL_P", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of retirement slots not consumed due to front end stalls.", + "Counter": "37", + "EventName": "TOPDOWN_FE_BOUND.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x6" + }, + { + "BriefDescription": "Fixed Counter: Counts the number of consumed retirement slots.", + "Counter": "38", + "EventName": "TOPDOWN_RETIRING.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x7" + } +] diff --git a/tools/perf/pmu-events/arch/x86/clearwaterforest/virtual-memory.json b/tools/perf/pmu-events/arch/x86/clearwaterforest/virtual-memory.json new file mode 100644 index 000000000000..78f2b835c1fa --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/clearwaterforest/virtual-memory.json @@ -0,0 +1,29 @@ +[ + { + "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to any page size.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED", + "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.", + "SampleAfterValue": "1000003", + "UMask": "0xe" + }, + { + "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to any page size.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED", + "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. Includes page walks that page fault.", + "SampleAfterValue": "1000003", + "UMask": "0xe" + }, + { + "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_COMPLETED", + "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to any page size. 
Includes page walks that page fault.", + "SampleAfterValue": "1000003", + "UMask": "0xe" + } +] diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 33767dd28122..da3cd1763d8e 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -6,6 +6,7 @@ GenuineIntel-6-(3D|47),v28,broadwell,core GenuineIntel-6-56,v10,broadwellde,core GenuineIntel-6-4F,v21,broadwellx,core GenuineIntel-6-55-[56789ABCDEF],v1.19,cascadelakex,core +GenuineIntel-6-DD,v1.00,clearwaterforest,core GenuineIntel-6-9[6C],v1.04,elkhartlake,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core -- Gitee From a976d8de7ab3ef2747e0cd72049c515a620246fc Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 8 May 2025 16:44:52 +0300 Subject: [PATCH 38/52] perf/x86/intel: Fix segfault with PEBS-via-PT with sample_freq mainline inclusion from mainline-v6.15-rc7 commit 99bcd91fabada0dbb1d5f0de44532d8008db93c6 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=99bcd91fabada0dbb1d5f0de44532d8008db93c6 --------------------------- Currently, using PEBS-via-PT with a sample frequency instead of a sample period, causes a segfault. For example: BUG: kernel NULL pointer dereference, address: 0000000000000195 ? __die_body.cold+0x19/0x27 ? page_fault_oops+0xca/0x290 ? exc_page_fault+0x7e/0x1b0 ? asm_exc_page_fault+0x26/0x30 ? intel_pmu_pebs_event_update_no_drain+0x40/0x60 ? intel_pmu_pebs_event_update_no_drain+0x32/0x60 intel_pmu_drain_pebs_icl+0x333/0x350 handle_pmi_common+0x272/0x3c0 intel_pmu_handle_irq+0x10a/0x2e0 perf_event_nmi_handler+0x2a/0x50 That happens because intel_pmu_pebs_event_update_no_drain() assumes all the pebs_enabled bits represent counter indexes, which is not always the case. In this particular case, bits 60 and 61 are set for PEBS-via-PT purposes. The behaviour of PEBS-via-PT with sample frequency is questionable because although a PMI is generated (PEBS_PMI_AFTER_EACH_RECORD), the period is not adjusted anyway. Putting that aside, fix intel_pmu_pebs_event_update_no_drain() by passing the mask of counter bits instead of 'size'. Note, prior to the Fixes commit, 'size' would be limited to the maximum counter index, so the issue was not hit. 
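For illustration only, a small user-space sketch (not kernel code) of the masking idea: walking the raw enable bits treats the PEBS-via-PT bits (60/61 here) as if they were counters, while masking with the counter bits first confines the walk to real counters. The bit positions and the 0xff counter mask are example values.

  /*
   * Not kernel code - standalone demo.  pebs_enabled has two real counter
   * bits set plus two high bits standing in for the PEBS-via-PT bits.
   */
  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          uint64_t pebs_enabled = (1ULL << 0) | (1ULL << 3) |
                                  (1ULL << 60) | (1ULL << 61);
          uint64_t counter_bits = 0xffULL;        /* assumed: counters 0-7 */
          uint64_t masked = pebs_enabled & counter_bits;
          int bit;

          printf("unmasked walk:");
          for (bit = 0; bit < 64; bit++)
                  if (pebs_enabled & (1ULL << bit))
                          printf(" %d", bit);     /* also visits 60 and 61 */

          printf("\nmasked walk:  ");
          for (bit = 0; bit < 64; bit++)
                  if (masked & (1ULL << bit))
                          printf(" %d", bit);     /* only 0 and 3 */
          printf("\n");
          return 0;
  }
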
Intel-SIG: commit 99bcd91fabad perf/x86/intel: Fix segfault with PEBS-via-PT with sample_freq Fixes: 722e42e45c2f1 ("perf/x86: Support counter mask") Signed-off-by: Adrian Hunter Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Ian Rogers Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20250508134452.73960-1-adrian.hunter@intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a78556527447..1f465d74f122 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2426,8 +2426,9 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_ setup_pebs_fixed_sample_data); } -static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) +static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, u64 mask) { + u64 pebs_enabled = cpuc->pebs_enabled & mask; struct perf_event *event; int bit; @@ -2438,7 +2439,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int * It needs to call intel_pmu_save_and_restart_reload() to * update the event->count for this case. */ - for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) { + for_each_set_bit(bit, (unsigned long *)&pebs_enabled, X86_PMC_IDX_MAX) { event = cpuc->events[bit]; if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_save_and_restart_reload(event, 0); @@ -2473,7 +2474,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d } if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, size); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } @@ -2587,7 +2588,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d (hybrid(cpuc->pmu, fixed_cntr_mask64) << INTEL_PMC_IDX_FIXED); if (unlikely(base >= top)) { - intel_pmu_pebs_event_update_no_drain(cpuc, X86_PMC_IDX_MAX); + intel_pmu_pebs_event_update_no_drain(cpuc, mask); return; } -- Gitee From c8e484276aa657635b90ce41dfd384a1e8118baa Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Apr 2025 06:47:18 -0700 Subject: [PATCH 39/52] perf/x86/intel/ds: Fix counter backwards of non-precise events counters-snapshotting mainline inclusion from mainline-v6.15-rc5 commit 7da9960b59fb7e590eb8538c9428db55a4ea2d23 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7da9960b59fb7e590eb8538c9428db55a4ea2d23 --------------------------- The counter backwards may be observed in the PMI handler when counters-snapshotting some non-precise events in the freq mode. For the non-precise events, it's possible the counters-snapshotting records a positive value for an overflowed PEBS event. Then the HW auto-reload mechanism reset the counter to 0 immediately. Because the pebs_event_reset is cleared in the freq mode, which doesn't set the PERF_X86_EVENT_AUTO_RELOAD. In the PMI handler, 0 will be read rather than the positive value recorded in the counters-snapshotting record. The counters-snapshotting case has to be specially handled. Since the event value has been updated when processing the counters-snapshotting record, only needs to set the new period for the counter via x86_pmu_set_period(). 
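For illustration only, a user-space sketch (not kernel code) with made-up numbers showing what "counter backwards" looks like from the PMI handler's point of view: the counters-snapshotting record already carried a positive value, but auto-reload reset the live counter to 0 before the handler reads it.

  /* Not kernel code - standalone demo with made-up numbers. */
  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          int64_t snapshot_value = 5000; /* positive value in the snapshot record */
          int64_t live_counter   = 0;    /* auto-reload already reset the counter */

          /* a naive re-read in the PMI handler computes a negative delta */
          printf("delta = %lld\n", (long long)(live_counter - snapshot_value));

          /*
           * The fix: the event count was already updated from the record,
           * so the handler only has to program a new period.
           */
          return 0;
  }
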
Intel-SIG: commit 7da9960b59fb perf/x86/intel/ds: Fix counter backwards of non-precise events counters-snapshotting Fixes: e02e9b0374c3 ("perf/x86/intel: Support PEBS counters snapshotting") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250424134718.311934-6-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/ds.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 1f465d74f122..fb012f262bae 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2357,8 +2357,25 @@ __intel_pmu_pebs_last_event(struct perf_event *event, */ intel_pmu_save_and_restart_reload(event, count); } - } else - intel_pmu_save_and_restart(event); + } else { + /* + * For a non-precise event, it's possible the + * counters-snapshotting records a positive value for the + * overflowed event. Then the HW auto-reload mechanism + * reset the counter to 0 immediately, because the + * pebs_event_reset is cleared if the PERF_X86_EVENT_AUTO_RELOAD + * is not set. The counter backwards may be observed in a + * PMI handler. + * + * Since the event value has been updated when processing the + * counters-snapshotting record, only needs to set the new + * period for the counter. + */ + if (is_pebs_counter_event_group(event)) + static_call(x86_pmu_set_period)(event); + else + intel_pmu_save_and_restart(event); + } } static __always_inline void -- Gitee From 0a2f242ef8cf1ebcad502a5353f86862523bc152 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 19 Feb 2025 06:10:05 -0800 Subject: [PATCH 40/52] perf/x86/intel: Fix event constraints for LNC mainline inclusion from mainline-v6.14-rc4 commit 782cffeec9ad96daa64ffb2d527b2a052fb02552 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=782cffeec9ad96daa64ffb2d527b2a052fb02552 --------------------------- According to the latest event list, update the event constraint tables for Lion Cove core. The general rule (the event codes < 0x90 are restricted to counters 0-3.) has been removed. There is no restriction for most of the performance monitoring events. Intel-SIG: commit 782cffeec9ad perf/x86/intel: Fix event constraints for LNC Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Reported-by: Amiri Khalil Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250219141005.2446823-1-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 20 +++++++------------- arch/x86/events/intel/ds.c | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 53da2672ce4b..78bbc648e1ee 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -387,34 +387,28 @@ static struct event_constraint intel_lnc_event_constraints[] = { METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + INTEL_EVENT_CONSTRAINT(0x20, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x012a, 0xf), + INTEL_UEVENT_CONSTRAINT(0x012b, 0xf), INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), INTEL_UEVENT_CONSTRAINT(0x0175, 0x4), INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff), INTEL_EVENT_CONSTRAINT(0x3c, 0x3ff), - /* - * Generally event codes < 0x90 are restricted to counters 0-3. 
- * The 0x2E and 0x3C are exception, which has no restriction. - */ - INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), - INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), - INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x10a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8), + INTEL_UEVENT_CONSTRAINT(0x01cd, 0x3fc), INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3), - INTEL_EVENT_CONSTRAINT(0xce, 0x1), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), - /* - * Generally event codes >= 0x90 are likely to have no restrictions. - * The exception are defined as above. - */ - INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0x3ff), + + INTEL_UEVENT_CONSTRAINT(0x00e0, 0xf), EVENT_CONSTRAINT_END }; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index fb012f262bae..23387996222d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1177,7 +1177,7 @@ struct event_constraint intel_lnc_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), - INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3ff), + INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3fc), INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ -- Gitee From 0cf23d2bb256bec1498aaf470fb95f0dafe0447b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 16 Dec 2024 08:02:52 -0800 Subject: [PATCH 41/52] perf/x86/intel: Fix bitmask of OCR and FRONTEND events for LNC mainline inclusion from mainline-v6.13-rc5 commit aa5d2ca7c179c40669edb5e96d931bf9828dea3d category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aa5d2ca7c179c40669edb5e96d931bf9828dea3d --------------------------- The released OCR and FRONTEND events utilized more bits on Lunar Lake p-core. The corresponding mask in the extra_regs has to be extended to unblock the extra bits. Add a dedicated intel_lnc_extra_regs. 
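For illustration only, a user-space sketch (not kernel code) of why the wider mask matters: an extra_reg entry only accepts a raw OCR value whose bits all fall inside its valid-bits mask. The 0xfffffffffff value is the new LNC mask from the hunk below; the narrower legacy mask and the sample OCR value are assumptions for the demo.

  /* Not kernel code - standalone demo of the valid-bits check. */
  #include <stdio.h>
  #include <stdint.h>

  static int ocr_value_allowed(uint64_t requested, uint64_t valid_mask)
  {
          return (requested & ~valid_mask) == 0;  /* reject bits outside the mask */
  }

  int main(void)
  {
          uint64_t lnc_mask = 0xfffffffffffULL;   /* from intel_lnc_extra_regs below */
          uint64_t old_mask = 0x3fffffffffULL;    /* assumed narrower legacy mask */
          uint64_t request  = 0x33FBFC00001ULL;   /* sample OCR MSR value */

          printf("legacy mask: %s\n",
                 ocr_value_allowed(request, old_mask) ? "accepted" : "rejected");
          printf("LNC mask:    %s\n",
                 ocr_value_allowed(request, lnc_mask) ? "accepted" : "rejected");
          return 0;
  }
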
Intel-SIG: commit aa5d2ca7c179 perf/x86/intel: Fix bitmask of OCR and FRONTEND events for LNC Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Reported-by: Andi Kleen Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20241216160252.430858-1-kan.liang@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 78bbc648e1ee..0a54896b6242 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -413,6 +413,16 @@ static struct event_constraint intel_lnc_event_constraints[] = { EVENT_CONSTRAINT_END }; +static struct extra_reg intel_lnc_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0xfffffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0xfffffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE), + INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0xf, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); @@ -6748,7 +6758,7 @@ static __always_inline void intel_pmu_init_lnc(struct pmu *pmu) intel_pmu_init_glc(pmu); hybrid(pmu, event_constraints) = intel_lnc_event_constraints; hybrid(pmu, pebs_constraints) = intel_lnc_pebs_event_constraints; - hybrid(pmu, extra_regs) = intel_rwc_extra_regs; + hybrid(pmu, extra_regs) = intel_lnc_extra_regs; } static __always_inline void intel_pmu_init_skt(struct pmu *pmu) -- Gitee From bc2d96f5fc9d2ec7aa383821fac32d789b5c2f8d Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Thu, 29 May 2025 08:02:36 +0000 Subject: [PATCH 42/52] perf/x86/intel: Fix incorrect MSR index calculations in intel_pmu_config_acr() mainline inclusion from mainline-v6.16-rc1 commit 86aa94cd50b138be0dd872b0779fa3036e641881 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=86aa94cd50b138be0dd872b0779fa3036e641881 --------------------------- The MSR offset calculations in intel_pmu_config_acr() are buggy. To calculate fixed counter MSR addresses in intel_pmu_config_acr(), the HW counter index "idx" is subtracted by INTEL_PMC_IDX_FIXED. This leads to the ACR mask value of fixed counters to be incorrectly saved to the positions of GP counters in acr_cfg_b[], e.g. For fixed counter 0, its ACR counter mask should be saved to acr_cfg_b[32], but it's saved to acr_cfg_b[0] incorrectly. Fix this issue. [ mingo: Clarified & improved the changelog. 
] Intel-SIG: commit 86aa94cd50b1 perf/x86/intel: Fix incorrect MSR index calculations in intel_pmu_config_acr() Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Signed-off-by: Dapeng Mi Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250529080236.2552247-2-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0a54896b6242..8d4dd547de90 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2889,6 +2889,7 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int msr_b, msr_c; + int msr_offset; if (!mask && !cpuc->acr_cfg_b[idx]) return; @@ -2896,19 +2897,20 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) if (idx < INTEL_PMC_IDX_FIXED) { msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + msr_offset = x86_pmu.addr_offset(idx, false); } else { msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; - idx -= INTEL_PMC_IDX_FIXED; + msr_offset = x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false); } if (cpuc->acr_cfg_b[idx] != mask) { - wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + wrmsrl(msr_b + msr_offset, mask); cpuc->acr_cfg_b[idx] = mask; } /* Only need to update the reload value when there is a valid config value. */ if (mask && cpuc->acr_cfg_c[idx] != reload) { - wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + wrmsrl(msr_c + msr_offset, reload); cpuc->acr_cfg_c[idx] = reload; } } -- Gitee From c8b9e74e4186fbc19ff18def1898c414d484f42c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 6 Aug 2024 12:07:50 -0700 Subject: [PATCH 43/52] tools/include: Sync x86 headers with the kernel sources mainline inclusion from mainline-v6.11-rc4 commit f6d9883f8e680460be4714d4d35c7acac1dffeaf category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f6d9883f8e680460be4714d4d35c7acac1dffeaf --------------------------- To pick up changes from: 149fd4712bcd perf/x86/intel: Support Perfmon MSRs aliasing 21b362cc762a x86/resctrl: Enable shared RMID mode on Sub-NUMA Cluster (SNC) systems 4f460bff7b6a cpufreq: acpi: move MSR_K7_HWCR_CPB_DIS_BIT into msr-index.h 7ea81936b853 x86/cpufeatures: Add HWP highest perf change feature flag 78ce84b9e0a5 x86/cpufeatures: Flip the /proc/cpuinfo appearance logic 1beb348d5c7f x86/sev: Provide SVSM discovery support This should be used to beautify x86 syscall arguments and it addresses these tools/perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Intel-SIG: commit f6d9883f8e68 tools/include: Sync x86 headers with the kernel sources Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: x86@kernel.org Signed-off-by: Namhyung Kim Conflicts: tools/arch/x86/include/asm/cpufeatures.h tools/arch/x86/include/asm/msr-index.h [jz: only changes in following 2 commits are adapted, since other commits are not in OLK-6.6: 149fd4712bcd perf/x86/intel: Support Perfmon MSRs aliasing 7ea81936b853 x86/cpufeatures: Add HWP highest perf change feature flag] Signed-off-by: Jason Zeng --- tools/arch/x86/include/asm/cpufeatures.h | 1 + tools/arch/x86/include/asm/msr-index.h | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 3c3af4eb5877..9b70b46ce955 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -352,6 +352,7 @@ #define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ #define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ #define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ +#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */ #define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 474fd815148e..7feec99a952d 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -546,6 +546,12 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* V6 PMON MSR range */ +#define MSR_IA32_PMC_V6_GP0_CTR 0x1900 +#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_STEP 4 + /* * AMD64 MSRs. Not complete. See the architecture manual for a more * complete list. -- Gitee From f22d107f25b0e2b09c5c0a0af2b0ee1a2130366c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 17:58:05 -0300 Subject: [PATCH 44/52] tools arch x86: Sync the msr-index.h copy with the kernel sources mainline inclusion from mainline-v6.16-rc3 commit 6143374c6dc874d301dc16612aed144bf4544bae category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6143374c6dc874d301dc16612aed144bf4544bae --------------------------- To pick up the changes from these csets: 159013a7ca18c271 ("x86/its: Enumerate Indirect Target Selection (ITS) bug") f4138de5e41fae1a ("x86/msr: Standardize on u64 in ") ec980e4facef8110 ("perf/x86/intel: Support auto counter reload") That cause no changes to tooling as it doesn't include a new MSR to be captured by the tools/perf/trace/beauty/tracepoints/x86_msr.sh script. 
Just silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Intel-SIG: commit 6143374c6dc8 tools arch x86: Sync the msr-index.h copy with the kernel sources Cc: Adrian Hunter Cc: Dave Hansen Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Pawan Gupta Cc: Peter Zijlstra Link: https://lore.kernel.org/r/aEtAUg83OQGx8Kay@x1 Signed-off-by: Arnaldo Carvalho de Melo [jz: only adapt changes from following commit, since the other commits are not in OLK-6.6: ec980e4facef8110 ("perf/x86/intel: Support auto counter reload")] Signed-off-by: Jason Zeng --- tools/arch/x86/include/asm/msr-index.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 7feec99a952d..d0c16822c29c 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -549,7 +549,11 @@ /* V6 PMON MSR range */ #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 #define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 #define MSR_IA32_PMC_V6_STEP 4 /* -- Gitee From 8b22b950220573e8db38e0a423c085ca7003d01b Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:27 +0800 Subject: [PATCH 45/52] perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error mainline inclusion from mainline-v6.18-rc1 commit 43796f30507802d93ead2dc44fc9637f34671a89 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=43796f30507802d93ead2dc44fc9637f34671a89 --------------------------- When running perf_fuzzer on PTL, sometimes the below "unchecked MSR access error" is seen when accessing IA32_PMC_x_CFG_B MSRs. [ 55.611268] unchecked MSR access error: WRMSR to 0x1986 (tried to write 0x0000000200000001) at rIP: 0xffffffffac564b28 (native_write_msr+0x8/0x30) [ 55.611280] Call Trace: [ 55.611282] [ 55.611284] ? intel_pmu_config_acr+0x87/0x160 [ 55.611289] intel_pmu_enable_acr+0x6d/0x80 [ 55.611291] intel_pmu_enable_event+0xce/0x460 [ 55.611293] x86_pmu_start+0x78/0xb0 [ 55.611297] x86_pmu_enable+0x218/0x3a0 [ 55.611300] ? x86_pmu_enable+0x121/0x3a0 [ 55.611302] perf_pmu_enable+0x40/0x50 [ 55.611307] ctx_resched+0x19d/0x220 [ 55.611309] __perf_install_in_context+0x284/0x2f0 [ 55.611311] ? __pfx_remote_function+0x10/0x10 [ 55.611314] remote_function+0x52/0x70 [ 55.611317] ? __pfx_remote_function+0x10/0x10 [ 55.611319] generic_exec_single+0x84/0x150 [ 55.611323] smp_call_function_single+0xc5/0x1a0 [ 55.611326] ? __pfx_remote_function+0x10/0x10 [ 55.611329] perf_install_in_context+0xd1/0x1e0 [ 55.611331] ? __pfx___perf_install_in_context+0x10/0x10 [ 55.611333] __do_sys_perf_event_open+0xa76/0x1040 [ 55.611336] __x64_sys_perf_event_open+0x26/0x30 [ 55.611337] x64_sys_call+0x1d8e/0x20c0 [ 55.611339] do_syscall_64+0x4f/0x120 [ 55.611343] entry_SYSCALL_64_after_hwframe+0x76/0x7e On PTL, GP counter 0 and 1 doesn't support auto counter reload feature, thus it would trigger a #GP when trying to write 1 on bit 0 of CFG_B MSR which requires to enable auto counter reload on GP counter 0. 
The root cause of causing this issue is the check for auto counter reload (ACR) counter mask from user space is incorrect in intel_pmu_acr_late_setup() helper. It leads to an invalid ACR counter mask from user space could be set into hw.config1 and then written into CFG_B MSRs and trigger the MSR access warning. e.g., User may create a perf event with ACR counter mask (config2=0xcb), and there is only 1 event created, so "cpuc->n_events" is 1. The correct check condition should be "i + idx >= cpuc->n_events" instead of "i + idx > cpuc->n_events" (it looks a typo). Otherwise, the counter mask would traverse twice and an invalid "cpuc->assign[1]" bit (bit 0) is set into hw.config1 and cause MSR accessing error. Besides, also check if the ACR counter mask corresponding events are ACR events. If not, filter out these counter mask. If a event is not a ACR event, it could be scheduled to an HW counter which doesn't support ACR. It's invalid to add their counter index in ACR counter mask. Furthermore, remove the WARN_ON_ONCE() since it's easily triggered as user could set any invalid ACR counter mask and the warning message could mislead users. Intel-SIG: commit 43796f305078 perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250820023032.17128-3-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8d4dd547de90..a1c960038a58 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2986,7 +2986,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) if (event->group_leader != leader->group_leader) break; for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { - if (WARN_ON_ONCE(i + idx > cpuc->n_events)) + if (i + idx >= cpuc->n_events || + !is_acr_event_group(cpuc->event_list[i + idx])) return; __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); } -- Gitee From d0035b4a77880b903d223ed9be0d3ea48a58bbc7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 9 Nov 2023 18:28:49 -0800 Subject: [PATCH 46/52] KVM: x86/pmu: Add common define to capture fixed counters offset mainline inclusion from mainline-v6.9-rc1 commit be6b067dae1573cf4d53c8b08175d8872d82f030 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be6b067dae1573cf4d53c8b08175d8872d82f030 --------------------------- Add a common define to "officially" solidify KVM's split of counters, i.e. to commit to using bits 31:0 to track general purpose counters and bits 63:32 to track fixed counters (which only Intel supports). KVM already bleeds this behavior all over common PMU code, and adding a KVM- defined macro allows clarifying that the value is a _base_, as oppposed to the _flag_ that is used to access fixed PMCs via RDPMC (which perf confusingly calls INTEL_PMC_FIXED_RDPMC_BASE). No functional change intended. 
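For illustration only, a user-space sketch (not kernel code) of the index split this define captures: indices below the fixed-counter base address GP counters, indices at or above it address fixed counters relative to that base. The base value 32 mirrors INTEL_PMC_IDX_FIXED; everything else is simplified.

  /* Not kernel code - standalone demo of the GP/fixed index split. */
  #include <stdio.h>

  #define FIXED_PMC_BASE_IDX 32   /* mirrors INTEL_PMC_IDX_FIXED */

  static void describe(int pmc_idx)
  {
          if (pmc_idx < FIXED_PMC_BASE_IDX)
                  printf("pmc_idx %2d -> GP counter %d\n", pmc_idx, pmc_idx);
          else
                  printf("pmc_idx %2d -> fixed counter %d\n", pmc_idx,
                         pmc_idx - FIXED_PMC_BASE_IDX);
  }

  int main(void)
  {
          describe(0);    /* GP counter 0    */
          describe(3);    /* GP counter 3    */
          describe(32);   /* fixed counter 0 */
          describe(34);   /* fixed counter 2 */
          return 0;
  }
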
Intel-SIG: commit be6b067dae15 KVM: x86/pmu: Add common define to capture fixed counters offset Link: https://lore.kernel.org/r/20231110022857.1273836-3-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Jason Zeng --- arch/x86/kvm/pmu.c | 8 ++++---- arch/x86/kvm/pmu.h | 4 +++- arch/x86/kvm/vmx/pmu_intel.c | 12 ++++++------ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f4a4a33e6daa..4f97c6f02885 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -67,7 +67,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { * all perf counters (both gp and fixed). The mapping relationship * between pmc and perf counters is as the following: * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters - * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed + * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters */ @@ -362,7 +362,7 @@ static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f, static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, int idx) { - int fixed_idx = idx - INTEL_PMC_IDX_FIXED; + int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX; if (filter->action == KVM_PMU_EVENT_DENY && test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) @@ -416,7 +416,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) { fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED); + pmc->idx - KVM_FIXED_PMC_BASE_IDX); if (fixed_ctr_ctrl & 0x1) eventsel |= ARCH_PERFMON_EVENTSEL_OS; if (fixed_ctr_ctrl & 0x2) @@ -798,7 +798,7 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user = config & ARCH_PERFMON_EVENTSEL_USR; } else { config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED); + pmc->idx - KVM_FIXED_PMC_BASE_IDX); select_os = config & 0x1; select_user = config & 0x2; } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index a46aa9b25150..9da30eb41330 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -18,6 +18,8 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 +#define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED + struct kvm_pmu_ops { bool (*hw_event_available)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); @@ -153,7 +155,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3; return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5976b67b7268..f6578a7389aa 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -84,18 +84,18 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); - __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); + __set_bit(KVM_FIXED_PMC_BASE_IDX + i, pmu->pmc_in_use); kvm_pmu_request_counter_reprogram(pmc); } } static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { - if (pmc_idx < INTEL_PMC_IDX_FIXED) { + if (pmc_idx < KVM_FIXED_PMC_BASE_IDX) { return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, MSR_P6_EVNTSEL0); } else { - u32 idx = pmc_idx - 
INTEL_PMC_IDX_FIXED; + u32 idx = pmc_idx - KVM_FIXED_PMC_BASE_IDX; return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); } @@ -539,7 +539,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) for (i = 0; i < pmu->nr_arch_fixed_counters; i++) pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); pmu->global_ctrl_mask = counter_mask; /* @@ -583,7 +583,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { pmu->fixed_ctr_ctrl_mask &= - ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); } pmu->pebs_data_cfg_mask = ~0xff00000full; } else { @@ -609,7 +609,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; pmu->fixed_counters[i].current_config = 0; } -- Gitee From 8ff4de2e55989ec6fa3ff0b137d4c1d650260532 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 30 Apr 2024 08:52:38 +0800 Subject: [PATCH 47/52] KVM: x86/pmu: Change ambiguous _mask suffix to _rsvd in kvm_pmu mainline inclusion from mainline-v6.11-rc1 commit 0e102ce3d4133194a26060fe987315133736c37b category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0e102ce3d4133194a26060fe987315133736c37b --------------------------- Several '_mask' suffixed variables such as, global_ctrl_mask, are defined in kvm_pmu structure. However the _mask suffix is ambiguous and misleading since it's not a real mask with positive logic. On the contrary it represents the reserved bits of corresponding MSRs and these bits should not be accessed. 
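For illustration only, a user-space sketch (not kernel code) of the negative logic the _rsvd suffix is meant to convey: a write is rejected (or sanitized) when it touches any reserved bit. The reserved-bit computation mirrors the counter_rsvd expression in the hunk below; the counter counts are example values.

  /* Not kernel code - standalone demo of reserved-bit (negative logic) checks. */
  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          int nr_gp = 8, nr_fixed = 4;            /* example counter counts */
          /* usable counter bits are 0, everything else is reserved */
          uint64_t global_ctrl_rsvd = ~(((1ULL << nr_gp) - 1) |
                                        (((1ULL << nr_fixed) - 1) << 32));

          uint64_t ok  = 0x100000003ULL;          /* GP0, GP1 and fixed counter 0 */
          uint64_t bad = 1ULL << 20;              /* a counter that does not exist */

          printf("ok  write: %s\n", (ok  & global_ctrl_rsvd) ? "rejected" : "accepted");
          printf("bad write: %s\n", (bad & global_ctrl_rsvd) ? "rejected" : "accepted");
          printf("sanitized: 0x%llx\n",
                 (unsigned long long)(bad & ~global_ctrl_rsvd));
          return 0;
  }
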
Intel-SIG: commit 0e102ce3d413 KVM: x86/pmu: Change ambiguous _mask suffix to _rsvd in kvm_pmu Suggested-by: Sean Christopherson Signed-off-by: Dapeng Mi Link: https://lore.kernel.org/r/20240430005239.13527-2-dapeng1.mi@linux.intel.com Signed-off-by: Sean Christopherson Signed-off-by: Jason Zeng --- arch/x86/include/asm/kvm_host.h | 10 +++++----- arch/x86/kvm/pmu.c | 16 ++++++++-------- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/svm/pmu.c | 4 ++-- arch/x86/kvm/vmx/pmu_intel.c | 26 +++++++++++++------------- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d27a280daf5c..3614f830dc93 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -519,12 +519,12 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; - u64 fixed_ctr_ctrl_mask; + u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; - u64 global_ctrl_mask; - u64 global_status_mask; + u64 global_ctrl_rsvd; + u64 global_status_rsvd; u64 reserved_bits; u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; @@ -544,9 +544,9 @@ struct kvm_pmu { u64 ds_area; u64 pebs_enable; - u64 pebs_enable_mask; + u64 pebs_enable_rsvd; u64 pebs_data_cfg; - u64 pebs_data_cfg_mask; + u64 pebs_data_cfg_rsvd; /* * If a guest counter is cross-mapped to host counter with different diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 4f97c6f02885..3797c9ef9b4b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -619,13 +619,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) break; - if (data & pmu->global_status_mask) + if (data & pmu->global_status_rsvd) return 1; pmu->global_status = data; break; case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: - data &= ~pmu->global_ctrl_mask; + data &= ~pmu->global_ctrl_rsvd; fallthrough; case MSR_CORE_PERF_GLOBAL_CTRL: if (!kvm_valid_perf_global_ctrl(pmu, data)) @@ -642,7 +642,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in * GLOBAL_STATUS, and so the set of reserved bits is the same. */ - if (data & pmu->global_status_mask) + if (data & pmu->global_status_rsvd) return 1; fallthrough; case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: @@ -709,11 +709,11 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; - pmu->global_ctrl_mask = ~0ull; - pmu->global_status_mask = ~0ull; - pmu->fixed_ctr_ctrl_mask = ~0ull; - pmu->pebs_enable_mask = ~0ull; - pmu->pebs_data_cfg_mask = ~0ull; + pmu->global_ctrl_rsvd = ~0ull; + pmu->global_status_rsvd = ~0ull; + pmu->fixed_ctr_ctrl_rsvd = ~0ull; + pmu->pebs_enable_rsvd = ~0ull; + pmu->pebs_data_cfg_rsvd = ~0ull; bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); if (!vcpu->kvm->arch.enable_pmu) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 9da30eb41330..65d659ddea88 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -95,7 +95,7 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc) static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, u64 data) { - return !(pmu->global_ctrl_mask & data); + return !(pmu->global_ctrl_rsvd & data); } /* returns general purpose PMC with the specified MSR. 
Note that it can be diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 3fd47de14b38..0ec0e95dabe7 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -204,8 +204,8 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); if (pmu->version > 1) { - pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); - pmu->global_status_mask = pmu->global_ctrl_mask; + pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_rsvd = pmu->global_ctrl_rsvd; } pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index f6578a7389aa..5387edbbbeba 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -398,14 +398,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (data & pmu->fixed_ctr_ctrl_mask) + if (data & pmu->fixed_ctr_ctrl_rsvd) return 1; if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); break; case MSR_IA32_PEBS_ENABLE: - if (data & pmu->pebs_enable_mask) + if (data & pmu->pebs_enable_rsvd) return 1; if (pmu->pebs_enable != data) { @@ -421,7 +421,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmu->ds_area = data; break; case MSR_PEBS_DATA_CFG: - if (data & pmu->pebs_data_cfg_mask) + if (data & pmu->pebs_data_cfg_rsvd) return 1; pmu->pebs_data_cfg = data; @@ -490,7 +490,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) union cpuid10_eax eax; union cpuid10_edx edx; u64 perf_capabilities; - u64 counter_mask; + u64 counter_rsvd; int i; memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); @@ -537,21 +537,21 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) - pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); - counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + pmu->fixed_ctr_ctrl_rsvd &= ~(0xbull << (i * 4)); + counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); - pmu->global_ctrl_mask = counter_mask; + pmu->global_ctrl_rsvd = counter_rsvd; /* * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) * share reserved bit definitions. The kernel just happens to use * OVF_CTRL for the names. 
*/ - pmu->global_status_mask = pmu->global_ctrl_mask + pmu->global_status_rsvd = pmu->global_ctrl_rsvd & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_status_mask &= + pmu->global_status_rsvd &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); @@ -579,15 +579,15 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { - pmu->pebs_enable_mask = counter_mask; + pmu->pebs_enable_rsvd = counter_rsvd; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - pmu->fixed_ctr_ctrl_mask &= + pmu->fixed_ctr_ctrl_rsvd &= ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); } - pmu->pebs_data_cfg_mask = ~0xff00000full; + pmu->pebs_data_cfg_rsvd = ~0xff00000full; } else { - pmu->pebs_enable_mask = + pmu->pebs_enable_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); } } -- Gitee From ccbb78aea5da1e5bd416fbeedba769b1ed066395 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Tue, 30 Apr 2024 08:52:39 +0800 Subject: [PATCH 48/52] KVM: x86/pmu: Manipulate FIXED_CTR_CTRL MSR with macros mainline inclusion from mainline-v6.11-rc1 commit 75430c412a3139c29404459ab1216a07d1280428 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=75430c412a3139c29404459ab1216a07d1280428 --------------------------- Magic numbers are used to manipulate the bit fields of FIXED_CTR_CTRL MSR. This makes reading code become difficult, so use pre-defined macros to replace these magic numbers. Intel-SIG: commit 75430c412a31 KVM: x86/pmu: Manipulate FIXED_CTR_CTRL MSR with macros Signed-off-by: Dapeng Mi Link: https://lore.kernel.org/r/20240430005239.13527-3-dapeng1.mi@linux.intel.com [sean: drop unnecessary curly braces] Signed-off-by: Sean Christopherson Signed-off-by: Jason Zeng --- arch/x86/kvm/pmu.c | 10 +++++----- arch/x86/kvm/pmu.h | 6 ++++-- arch/x86/kvm/vmx/pmu_intel.c | 12 ++++++++---- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3797c9ef9b4b..c589fa44604c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -417,11 +417,11 @@ static void reprogram_counter(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) { fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, pmc->idx - KVM_FIXED_PMC_BASE_IDX); - if (fixed_ctr_ctrl & 0x1) + if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL) eventsel |= ARCH_PERFMON_EVENTSEL_OS; - if (fixed_ctr_ctrl & 0x2) + if (fixed_ctr_ctrl & INTEL_FIXED_0_USER) eventsel |= ARCH_PERFMON_EVENTSEL_USR; - if (fixed_ctr_ctrl & 0x8) + if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI) eventsel |= ARCH_PERFMON_EVENTSEL_INT; new_config = (u64)fixed_ctr_ctrl; } @@ -799,8 +799,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) } else { config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, pmc->idx - KVM_FIXED_PMC_BASE_IDX); - select_os = config & 0x1; - select_user = config & 0x2; + select_os = config & INTEL_FIXED_0_KERNEL; + select_user = config & INTEL_FIXED_0_USER; } return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? 
select_os : select_user; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 65d659ddea88..aca521bd9f49 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -12,7 +12,8 @@ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) +#define fixed_ctrl_field(ctrl_reg, idx) \ + (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) #define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000 #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 @@ -155,7 +156,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3; + pmc->idx - KVM_FIXED_PMC_BASE_IDX) & + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER); return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5387edbbbeba..7180fd4aa5db 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -537,7 +537,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) - pmu->fixed_ctr_ctrl_rsvd &= ~(0xbull << (i * 4)); + pmu->fixed_ctr_ctrl_rsvd &= + ~intel_fixed_bits_by_idx(i, + INTEL_FIXED_0_KERNEL | + INTEL_FIXED_0_USER | + INTEL_FIXED_0_ENABLE_PMI); + counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); pmu->global_ctrl_rsvd = counter_rsvd; @@ -581,10 +586,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { pmu->pebs_enable_rsvd = counter_rsvd; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) pmu->fixed_ctr_ctrl_rsvd &= - ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); - } + ~intel_fixed_bits_by_idx(i, ICL_FIXED_0_ADAPTIVE); pmu->pebs_data_cfg_rsvd = ~0xff00000full; } else { pmu->pebs_enable_rsvd = -- Gitee From e7c8a25f8a2c91804b4ce3607c947dd2c65d8ceb Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:31 +0800 Subject: [PATCH 49/52] perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK mainline inclusion from mainline-v6.18-rc1 commit 2676dbf9f4fb7f6739d1207c0f1deaf63124642a category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2676dbf9f4fb7f6739d1207c0f1deaf63124642a --------------------------- ICL_FIXED_0_ADAPTIVE is missed to be added into INTEL_FIXED_BITS_MASK, add it. With help of this new INTEL_FIXED_BITS_MASK, intel_pmu_enable_fixed() can be optimized. The old fixed counter control bits can be unconditionally cleared with INTEL_FIXED_BITS_MASK and then set new control bits base on new configuration. 
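
The clear-then-set sequence described above can be seen in isolation with a small userspace sketch. This is not kernel code: it only copies the macro definitions added by this patch and the previous one, and the counter index and event configuration in main() are made-up values for illustration.

/* Minimal userspace sketch (not kernel code): shows how the widened
 * INTEL_FIXED_BITS_MASK lets the old per-counter control bits be cleared
 * unconditionally before the new bits are OR-ed in.  Macro values mirror
 * the patch; the demo values below are invented for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define INTEL_FIXED_BITS_STRIDE   4
#define INTEL_FIXED_0_KERNEL      (1ULL << 0)
#define INTEL_FIXED_0_USER        (1ULL << 1)
#define INTEL_FIXED_0_ANYTHREAD   (1ULL << 2)
#define INTEL_FIXED_0_ENABLE_PMI  (1ULL << 3)
#define ICL_FIXED_0_ADAPTIVE      (1ULL << 32)

/* After the patch the mask covers the adaptive bit as well. */
#define INTEL_FIXED_BITS_MASK \
        (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \
         INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \
         ICL_FIXED_0_ADAPTIVE)

#define intel_fixed_bits_by_idx(_idx, _bits) \
        ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))

int main(void)
{
        uint64_t fixed_ctrl_val = 0;
        int idx = 1;                      /* fixed counter 1, for example */

        /* Pretend counter 1 was previously counting kernel-only, adaptive. */
        fixed_ctrl_val |= intel_fixed_bits_by_idx(idx, INTEL_FIXED_0_KERNEL |
                                                       ICL_FIXED_0_ADAPTIVE);

        /* New configuration: user+kernel, PMI on overflow, no adaptive. */
        uint64_t bits = INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER |
                        INTEL_FIXED_0_ENABLE_PMI;

        /* One unconditional clear of the whole per-counter field ... */
        fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
        /* ... then set the new bits, as the reworked helper does. */
        fixed_ctrl_val |= intel_fixed_bits_by_idx(idx, bits);

        printf("FIXED_CTR_CTRL = %#llx\n",
               (unsigned long long)fixed_ctrl_val);
        return 0;
}

Because the mask now also covers ICL_FIXED_0_ADAPTIVE, the stale adaptive bit from the old configuration is cleared by the same single AND, which is what removes the separate mask bookkeeping in the diff below.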
Intel-SIG: commit 2676dbf9f4fb perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Tested-by: Yi Lai Link: https://lore.kernel.org/r/20250820023032.17128-7-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/intel/core.c | 10 +++------- arch/x86/include/asm/perf_event.h | 6 +++++- arch/x86/kvm/pmu.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a1c960038a58..17bb6087766a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2834,8 +2834,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 mask, bits = 0; int idx = hwc->idx; + u64 bits = 0; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2874,14 +2874,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event) idx -= INTEL_PMC_IDX_FIXED; bits = intel_fixed_bits_by_idx(idx, bits); - mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); - - if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE); - } - cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK); cpuc->fixed_ctrl_val |= bits; } diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 08d2853395cb..0335e7263419 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -35,7 +35,6 @@ #define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) #define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) -#define INTEL_FIXED_BITS_MASK 0xFULL #define INTEL_FIXED_BITS_STRIDE 4 #define INTEL_FIXED_0_KERNEL (1ULL << 0) #define INTEL_FIXED_0_USER (1ULL << 1) @@ -48,6 +47,11 @@ #define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) +#define INTEL_FIXED_BITS_MASK \ + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \ + INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \ + ICL_FIXED_0_ADAPTIVE) + #define intel_fixed_bits_by_idx(_idx, _bits) \ ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index aca521bd9f49..004121dfddd6 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -11,7 +11,7 @@ #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) -/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) \ (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) -- Gitee From 79b578be12f77f78e4465c85c87e8070a1d40f46 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Wed, 20 Aug 2025 10:30:32 +0800 Subject: [PATCH 50/52] perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap() mainline inclusion from mainline-v6.18-rc1 commit f49e1be19542487921e82b29004908966cb99d7c category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f49e1be19542487921e82b29004908966cb99d7c --------------------------- Along with the 
introduction Perfmon v6, pmu counters could be incontinuous, like fixed counters on CWF, only fixed counters 0-3 and 5-7 are supported, there is no fixed counter 4 on CWF. To accommodate this change, archPerfmonExt CPUID (0x23) leaves are introduced to enumerate the true-view of counters bitmap. Current perf code already supports archPerfmonExt CPUID and uses counters-bitmap to enumerate HW really supported counters, but x86_pmu_show_pmu_cap() still only dumps the absolute counter number instead of true-view bitmap, it's out-dated and may mislead readers. So dump counters true-view bitmap in x86_pmu_show_pmu_cap() and opportunistically change the dump sequence and words. Intel-SIG: commit f49e1be19542 perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap() Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250820023032.17128-8-dapeng1.mi@linux.intel.com Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 504d799683f8..e0255f3c7288 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2060,13 +2060,15 @@ static void _x86_pmu_read(struct perf_event *event) void x86_pmu_show_pmu_cap(struct pmu *pmu) { - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); - pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu)); + pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64)); + pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu)); + pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64)); + pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016llx\n", x86_pmu.max_period); + pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) -- Gitee From ea887b5f530e308db56b44bf5ac7dadd36632f80 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Thu, 26 Sep 2024 09:40:40 -0500 Subject: [PATCH 51/52] perf mem: Fix printing PERF_MEM_LVLNUM_{L2_MHB|MSC} mainline inclusion from mainline-v6.13-rc1 commit 4f23fc34cc68812c68c3a3dec15e26e87565f430 category: feature bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4f23fc34cc68812c68c3a3dec15e26e87565f430 --------------------------- With commit 8ec9497d3ef34 ("tools/include: Sync uapi/linux/perf.h with the kernel sources"), 'perf mem report' gives an incorrect memory access string. ... 0.02% 1 3644 L5 hit [.] 0x0000000000009b0e mlc [.] 0x00007fce43f59480 ... This occurs because, if no entry exists in mem_lvlnum, perf_mem__lvl_scnprintf will default to 'L%d, lvl', which in this case for PERF_MEM_LVLNUM_L2_MHB is 0x05. Add entries for PERF_MEM_LVLNUM_L2_MHB and PERF_MEM_LVLNUM_MSC to mem_lvlnum, so that the correct strings are printed. ... 0.02% 1 3644 L2 MHB hit [.] 0x0000000000009b0e mlc [.] 
0x00007fce43f59480 ... Intel-SIG: commit 4f23fc34cc68 perf mem: Fix printing PERF_MEM_LVLNUM_{L2_MHB|MSC} Fixes: 8ec9497d3ef34 ("tools/include: Sync uapi/linux/perf.h with the kernel sources") Suggested-by: Kan Liang Signed-off-by: Thomas Falcon Reviewed-by: Leo Yan Link: https://lore.kernel.org/r/20240926144040.77897-1-thomas.falcon@intel.com Signed-off-by: Namhyung Kim Signed-off-by: Jason Zeng --- tools/perf/util/mem-events.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index a6996d3e32c0..16e276402df8 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -297,6 +297,12 @@ static const char * const mem_lvl[] = { }; static const char * const mem_lvlnum[] = { + [PERF_MEM_LVLNUM_L1] = "L1", + [PERF_MEM_LVLNUM_L2] = "L2", + [PERF_MEM_LVLNUM_L3] = "L3", + [PERF_MEM_LVLNUM_L4] = "L4", + [PERF_MEM_LVLNUM_L2_MHB] = "L2 MHB", + [PERF_MEM_LVLNUM_MSC] = "Memory-side Cache", [PERF_MEM_LVLNUM_UNC] = "Uncached", [PERF_MEM_LVLNUM_CXL] = "CXL", [PERF_MEM_LVLNUM_IO] = "I/O", @@ -379,7 +385,7 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) if (mem_lvlnum[lvl]) l += scnprintf(out + l, sz - l, mem_lvlnum[lvl]); else - l += scnprintf(out + l, sz - l, "L%d", lvl); + l += scnprintf(out + l, sz - l, "Unknown level %d", lvl); l += scnprintf(out + l, sz - l, " %s", hit_miss); return l; -- Gitee From 0705e54e1470b540c0f30bc9ee5558ebd68e73a8 Mon Sep 17 00:00:00 2001 From: Jason Zeng Date: Mon, 13 Oct 2025 16:58:34 +0800 Subject: [PATCH 52/52] Fix kabi for CWF PMU support Intel inclusion category: bugfix bugzilla: https://gitee.com/openeuler/intel-kernel/issues/ICZHEB CVE: NA -------------------------------- Following upstream commits introduced 2 fields (config1 and dyn_constraint) in struct hw_perf_event, which breaks kABI. ec980e4facef ("perf/x86/intel: Support auto counter reload") 4dfe3232cc04 ("perf/x86: Add dynamic constraint") To fix this kABI breakage, we introduce struct hw_perf_event_ext, and use one KABI_RESERVE field in struct perf_event as pointer to this struct hw_perf_event_ext. This is viable because hw_perf_event is always embedded in struct perf_event, so we can always access hw_perf_event_ext from perf_event when needed. The allocation size of the perf_event slab cache is also enlarged so that an element of struct hw_perf_event_ext will always be allocated together with each struct perf_event allocation. Another kABI changes are caused by the following commit: 0e102ce3d413 ("KVM: x86/pmu: Change ambiguous _mask suffix to _rsvd in kvm_pmu") But the fix is trivial. 
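
The allocation trick described above can be sketched in userspace as a sanity check. The struct names and fields below are stand-ins rather than the kernel definitions, and calloc() stands in for the enlarged perf_event slab cache; the sketch only shows how the extension struct sits behind the base struct and is reached through the reused reserved pointer.

/* Userspace sketch of the kABI workaround: the extension struct is carved
 * out of the same allocation, immediately after the unchanged base struct,
 * and reached through a pointer stored in a formerly reserved slot.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct hw_ext {                 /* stands in for struct hw_perf_event_ext */
        uint64_t config1;
        uint64_t dyn_constraint;
};

struct event {                  /* stands in for struct perf_event */
        uint64_t config;        /* original fields keep their old offsets */
        struct hw_ext *hw_ext;  /* occupies what used to be KABI_RESERVE(1) */
};

static struct event *event_alloc(void)
{
        /* One allocation sized for both structs, mirroring the enlarged
         * slab cache set up at init time. */
        struct event *e = calloc(1, sizeof(*e) + sizeof(struct hw_ext));

        if (!e)
                return NULL;
        /* The extension lives right behind the base struct. */
        e->hw_ext = (struct hw_ext *)(e + 1);
        e->hw_ext->dyn_constraint = ~0ULL;   /* mark unused, as in the patch */
        return e;
}

int main(void)
{
        struct event *e = event_alloc();

        if (!e)
                return 1;
        printf("base %p ext %p (offset %zu bytes)\n",
               (void *)e, (void *)e->hw_ext, sizeof(*e));
        free(e);
        return 0;
}

Since the base struct's size and field offsets are untouched, existing kABI consumers still see the layout they were built against, while new code dereferences hw_ext for the added fields.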
Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Fixes: 4dfe3232cc04 ("perf/x86: Add dynamic constraint") Fixes: 0e102ce3d413 ("KVM: x86/pmu: Change ambiguous _mask suffix to _rsvd in kvm_pmu") Signed-off-by: Jason Zeng --- arch/x86/events/core.c | 2 +- arch/x86/events/intel/core.c | 17 +++++++++-------- arch/x86/include/asm/kvm_host.h | 10 +++++----- include/linux/perf_event.h | 15 ++++++++++++--- kernel/events/core.c | 13 ++++++++++++- 5 files changed, 39 insertions(+), 18 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e0255f3c7288..dc7a254aa426 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -675,7 +675,7 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.idx = -1; event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; - event->hw.dyn_constraint = ~0ULL; + event->hw_ext->dyn_constraint = ~0ULL; /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 17bb6087766a..977f4f7442f2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2914,6 +2914,7 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) static void intel_pmu_enable_acr(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_ext *hw_ext = event->hw_ext; if (!is_acr_event_group(event) || !event->attr.config2) { /* @@ -2924,7 +2925,7 @@ static void intel_pmu_enable_acr(struct perf_event *event) return; } - intel_pmu_config_acr(hwc->idx, hwc->config1, -hwc->sample_period); + intel_pmu_config_acr(hwc->idx, hw_ext->config1, -hwc->sample_period); } DEFINE_STATIC_CALL_NULL(intel_pmu_enable_acr_event, intel_pmu_enable_acr); @@ -2985,7 +2986,7 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) if (i + idx >= cpuc->n_events || !is_acr_event_group(cpuc->event_list[i + idx])) return; - __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); + __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw_ext->config1); } } i = j - 1; @@ -3831,9 +3832,9 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (cpuc->excl_cntrs) return intel_get_excl_constraints(cpuc, event, idx, c2); - if (event->hw.dyn_constraint != ~0ULL) { + if (event->hw_ext->dyn_constraint != ~0ULL) { c2 = dyn_constraint(cpuc, c2, idx); - c2->idxmsk64 &= event->hw.dyn_constraint; + c2->idxmsk64 &= event->hw_ext->dyn_constraint; c2->weight = hweight64(c2->idxmsk64); } @@ -4195,7 +4196,7 @@ static bool intel_pmu_is_acr_group(struct perf_event *event) static inline void intel_pmu_set_acr_cntr_constr(struct perf_event *event, u64 *cause_mask, int *num) { - event->hw.dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64); + event->hw_ext->dyn_constraint &= hybrid(event->pmu, acr_cntr_mask64); *cause_mask |= event->attr.config2; *num += 1; } @@ -4204,7 +4205,7 @@ static inline void intel_pmu_set_acr_caused_constr(struct perf_event *event, int idx, u64 cause_mask) { if (test_bit(idx, (unsigned long *)&cause_mask)) - event->hw.dyn_constraint &= hybrid(event->pmu, acr_cause_mask64); + event->hw_ext->dyn_constraint &= hybrid(event->pmu, acr_cause_mask64); } static int intel_pmu_hw_config(struct perf_event *event) @@ -4266,7 +4267,7 @@ static int intel_pmu_hw_config(struct perf_event *event) return -EINVAL; if (branch_sample_counters(leader)) { num++; - leader->hw.dyn_constraint &= x86_pmu.lbr_counters; + leader->hw_ext->dyn_constraint &= x86_pmu.lbr_counters; } leader->hw.flags |= 
PERF_X86_EVENT_BRANCH_COUNTERS; @@ -4275,7 +4276,7 @@ static int intel_pmu_hw_config(struct perf_event *event) return -EINVAL; if (branch_sample_counters(sibling)) { num++; - sibling->hw.dyn_constraint &= x86_pmu.lbr_counters; + sibling->hw_ext->dyn_constraint &= x86_pmu.lbr_counters; } } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3614f830dc93..40c7d9e27b76 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -519,12 +519,12 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; - u64 fixed_ctr_ctrl_rsvd; + KABI_REPLACE(u64 fixed_ctr_ctrl_mask, u64 fixed_ctr_ctrl_rsvd) u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; - u64 global_ctrl_rsvd; - u64 global_status_rsvd; + KABI_REPLACE(u64 global_ctrl_mask, u64 global_ctrl_rsvd) + KABI_REPLACE(u64 global_status_mask, u64 global_status_rsvd) u64 reserved_bits; u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; @@ -544,9 +544,9 @@ struct kvm_pmu { u64 ds_area; u64 pebs_enable; - u64 pebs_enable_rsvd; + KABI_REPLACE(u64 pebs_enable_mask, u64 pebs_enable_rsvd) u64 pebs_data_cfg; - u64 pebs_data_cfg_rsvd; + KABI_REPLACE(u64 pebs_data_cfg_mask, u64 pebs_data_cfg_rsvd) /* * If a guest counter is cross-mapped to host counter with different diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 717ed8691f5a..593fa001b092 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -150,6 +150,17 @@ struct hw_perf_event_extra { static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); +struct hw_perf_event_ext { +#ifdef CONFIG_PERF_EVENTS + union { + struct { + u64 config1; + u64 dyn_constraint; + }; + }; +#endif +}; + /** * struct hw_perf_event - performance event hardware details: */ @@ -158,9 +169,7 @@ struct hw_perf_event { union { struct { /* hardware */ u64 config; - u64 config1; u64 last_tag; - u64 dyn_constraint; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; @@ -853,7 +862,7 @@ struct perf_event { */ __u32 orig_type; - KABI_RESERVE(1) + KABI_USE(1, struct hw_perf_event_ext *hw_ext) KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/kernel/events/core.c b/kernel/events/core.c index 922389a83c67..ca67d151061a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11981,6 +11981,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!event) return ERR_PTR(-ENOMEM); + /* + * To fix kabi change, each allocated event is changed to be of size: + * sizeof(struct perf_event) + sizeof(struct hw_perf_event_ext) + * And event->hw_ext will point to the struct hw_perf_event_ext part. + * See perf_event_init(). + */ + event->hw_ext = (struct hw_perf_event_ext *)((void *)event + + sizeof(struct perf_event)); + /* * Single events are their own group leaders, with an * empty sibling list: @@ -13843,7 +13852,9 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); + perf_event_cache = kmem_cache_create("perf_event", + sizeof(struct perf_event) + sizeof(struct hw_perf_event_ext), + __alignof__(struct perf_event), SLAB_PANIC, NULL); /* * Build time assertion that we keep the data_head at the intended -- Gitee
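
A note on the _rsvd naming adopted in the KVM patches above: a reserved-bits mask carries a 1 in every bit the guest must not set, so validating an MSR write is a single AND. The sketch below mirrors the counter_rsvd computation from intel_pmu_refresh(); it is not kernel code, and the counter counts and written values are invented for illustration.

/* Userspace sketch of the reserved-bits ("_rsvd") convention. */
#include <stdio.h>
#include <stdint.h>

#define KVM_FIXED_PMC_BASE_IDX  32      /* fixed counters start at bit 32 */

static uint64_t counter_rsvd(unsigned nr_gp, unsigned nr_fixed)
{
        /* Allowed bits: one per GP counter, plus one per fixed counter
         * starting at bit 32; everything else is reserved. */
        return ~(((1ULL << nr_gp) - 1) |
                 (((1ULL << nr_fixed) - 1) << KVM_FIXED_PMC_BASE_IDX));
}

static int set_global_ctrl(uint64_t rsvd, uint64_t data)
{
        if (data & rsvd)        /* any reserved bit set: reject the write */
                return 1;
        return 0;               /* accepted */
}

int main(void)
{
        uint64_t rsvd = counter_rsvd(8, 4);  /* e.g. 8 GP + 4 fixed counters */

        printf("global_ctrl_rsvd = %#018llx\n", (unsigned long long)rsvd);
        printf("write 0x0000000f000000ff -> %s\n",
               set_global_ctrl(rsvd, 0x0000000f000000ffULL) ? "reject" : "ok");
        printf("write 0x0000001000000000 -> %s\n",
               set_global_ctrl(rsvd, 0x0000001000000000ULL) ? "reject" : "ok");
        return 0;
}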
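
Similarly, for the x86_pmu_show_pmu_cap() change: once counters can be discontiguous, the bitmap rather than a bare count is the authoritative description of the hardware. The sketch below prints an invented CWF-like layout (fixed counters 0-3 and 5-7, no counter 4) in the same style as the new pr_info() lines; the bitmap values are assumptions, not read from hardware.

/* Userspace sketch: counter count derived from a true-view bitmap. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t cntr_mask64       = 0x00000000000000ffULL; /* GP counters 0-7 */
        uint64_t fixed_cntr_mask64 = 0x00000000000000efULL; /* fixed 0-3, 5-7  */

        printf("... generic counters:       %d\n",
               __builtin_popcountll(cntr_mask64));
        printf("... generic bitmap:         %016llx\n",
               (unsigned long long)cntr_mask64);
        printf("... fixed-purpose counters: %d\n",
               __builtin_popcountll(fixed_cntr_mask64));
        printf("... fixed-purpose bitmap:   %016llx\n",
               (unsigned long long)fixed_cntr_mask64);
        return 0;
}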