diff --git a/arch/Kconfig b/arch/Kconfig index 80682c6f6d12a3b39007dff0d8d26e9750cad0a1..7adde6e58dce60c72d8a7d7d9feeba68728c9c8f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -973,6 +973,9 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG address translations. Page table walkers that clear the accessed bit may use this capability to reduce their search space. +config HAVE_STATIC_CALL + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6f013e418834987c5b40218ce974e3d344357eb0..58de6cec907b0794b8be7480c229f314fc7e165f 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -529,6 +529,7 @@ static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) } } cpuhw->bhrb_stack.nr = u_index; + cpuhw->bhrb_stack.hw_idx = -1ULL; return; } diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 4a809c6cbd2f5d8d1b19b187b2a8a0fbfd95092d..03b8674b192c27cf8f8eeb288c77bda7dadade5b 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -34,4 +34,14 @@ config PERF_EVENTS_AMD_POWER (CPUID Fn8000_0007_EDX[12]) interface to calculate the average power consumption on Family 15h processors. +config PERF_EVENTS_AMD_UNCORE + tristate "AMD Uncore performance events" + depends on PERF_EVENTS && CPU_SUP_AMD + default y + help + Include support for AMD uncore performance events for use with + e.g., perf stat -e amd_l3/.../,amd_df/.../. + + To compile this driver as a module, choose M here: the + module will be called 'amd-uncore'. endmenu diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index fe8795a67385a5deda3a5ee145b2778607fe1046..cf323ffab5cdb98eef89c7044b59a892be9895f4 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o +obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o +amd-uncore-objs := uncore.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += iommu.o endif - diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 0000000000000000000000000000000000000000..3c13c484c637be8d0f209663c49b4649835be98a --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian + */ +#include +#include +#include + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!boot_cpu_has(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. Software filtering is not supported because it would not correlate well + * with a sampling period. + */ +int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + /* can only capture at all priv levels due to the way BRS works */ + if ((type & PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_PLM_ALL) + return -EINVAL; + + return 0; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if (--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. + * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + rdmsrl(brs_from(brs_idx), from); + + /* + * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. + * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. + */ + if (sched_in) + amd_brs_poison_buffer(); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index f6d8808b7b3c285b57bd841b73a9f510eccaea8a..d3c344fecaeb2f69b3385a950f801e2b72fce25e 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "../perf_event.h" @@ -18,6 +19,9 @@ static unsigned long perf_nmi_window; #define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) #define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) +/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */ +static u64 amd_pmu_global_cntr_mask __read_mostly; + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -325,8 +329,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + static int amd_core_hw_config(struct perf_event *event) { + int ret = 0; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -343,7 +355,66 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - return 0; + /* + * if branch stack is requested + */ + if (has_branch_stack(event)) { + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + } + return ret; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -366,7 +437,7 @@ static int amd_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && get_ibs_caps()) return -ENOENT; - if (has_branch_stack(event)) + if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; ret = x86_pmu_hw_config(event); @@ -510,6 +581,22 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +static void amd_pmu_cpu_reset(int cpu) +{ + if (x86_pmu.version < 2) + return; + + /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + + /* + * Clear freeze and overflow bits i.e. PerfCntrGLobalStatus.LbrFreeze + * and PerfCntrGLobalStatus.PerfCntrOvfl + */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + GLOBAL_STATUS_LBRS_FROZEN | amd_pmu_global_cntr_mask); +} + static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -534,6 +621,7 @@ static void amd_pmu_cpu_starting(int cpu) int i, nb_id; cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + amd_pmu_cpu_reset(cpu); if (!x86_pmu.amd_nb_constraints) return; @@ -555,6 +643,8 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_brs_reset(); } static void amd_pmu_cpu_dead(int cpu) @@ -576,6 +666,48 @@ static void amd_pmu_cpu_dead(int cpu) } } +static inline void amd_pmu_set_global_ctl(u64 ctl) +{ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); +} + +static inline u64 amd_pmu_get_global_status(void) +{ + u64 status; + + /* PerfCntrGlobalStatus is read-only */ + rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + + return status & amd_pmu_global_cntr_mask; +} + +static inline void amd_pmu_ack_global_status(u64 status) +{ + /* + * PerfCntrGlobalStatus is read-only but an overflow acknowledgment + * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr + * clears the same bit in PerfCntrGlobalStatus + */ + + /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ + status &= amd_pmu_global_cntr_mask; + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); +} + +static bool amd_pmu_test_overflow_topbit(int idx) +{ + u64 counter; + + rdmsrl(x86_pmu_event_addr(idx), counter); + + return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1)); +} + +static bool amd_pmu_test_overflow_status(int idx) +{ + return amd_pmu_get_global_status() & BIT_ULL(idx); +} + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -588,7 +720,6 @@ static void amd_pmu_cpu_dead(int cpu) static void amd_pmu_wait_on_overflow(int idx) { unsigned int i; - u64 counter; /* * Wait for the counter to be reset if it has overflowed. This loop @@ -596,22 +727,24 @@ static void amd_pmu_wait_on_overflow(int idx) * forever... */ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { - rdmsrl(x86_pmu_event_addr(idx), counter); - if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) - break; + if ( x86_pmu.version >= 2 ) { + if ( !amd_pmu_test_overflow_status(idx) ) + break; + } else { + if ( !amd_pmu_test_overflow_topbit(idx) ) + break; + } /* Might be in IRQ context, so can't sleep */ udelay(1); } } -static void amd_pmu_disable_all(void) +static void amd_pmu_check_overflow(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - x86_pmu_disable_all(); - /* * This shouldn't be called from NMI context, but add a safeguard here * to return, since if we're in NMI context we can't wait for an NMI @@ -634,6 +767,50 @@ static void amd_pmu_disable_all(void) } } +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc; + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + hwc = &cpuc->events[idx]->hw; + + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + +static void amd_pmu_v2_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * Testing cpu_hw_events.enabled should be skipped in this case unlike + * in x86_pmu_enable_event(). + * + * Since cpu_hw_events.enabled is set only after returning from + * x86_pmu_start(), the PMCs must be programmed and kept ready. + * Counting starts only after x86_pmu_enable_all() is called. + */ + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -651,6 +828,32 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_disable_all(void) +{ + amd_brs_disable_all(); + x86_pmu_disable_all(); + amd_pmu_check_overflow(); +} + +static void amd_pmu_v2_disable_all(void) +{ + /* Disable all PMCs */ + amd_pmu_set_global_ctl(0); + amd_pmu_check_overflow(); +} + +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_add(event); +} + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_del(event); +} + /* * Because of NMI latency, if multiple PMC counters are active or other sources * of NMIs are received, the perf NMI handler can handle one or more overflowed @@ -669,36 +872,128 @@ static void amd_pmu_disable_event(struct perf_event *event) * handled a counter. When an un-handled NMI is received, it will be claimed * only if arriving within that window. */ +static inline int amd_pmu_adjust_nmi_window(int handled) +{ + /* + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. + */ + if (handled) { + this_cpu_write(perf_nmi_tstamp, jiffies + perf_nmi_window); + + return handled; + } + + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) + return NMI_DONE; + + return NMI_HANDLED; +} + static int amd_pmu_handle_irq(struct pt_regs *regs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int active, handled; + int handled; + int pmu_enabled; /* - * Obtain the active count before calling x86_pmu_handle_irq() since - * it is possible that x86_pmu_handle_irq() may make a counter - * inactive (through x86_pmu_stop). + * Save the PMU state. + * It needs to be restored when leaving the handler. */ - active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX); + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* stop everything (includes BRS) */ + amd_pmu_disable_all(); + + /* Drain BRS is in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); /* Process any counter overflows */ handled = x86_pmu_handle_irq(regs); + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_pmu_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); +} + +static int amd_pmu_v2_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_sample_data data; + struct hw_perf_event *hwc; + struct perf_event *event; + int handled = 0, idx; + u64 status, mask; + bool pmu_enabled; + /* - * If a counter was handled, record a timestamp such that un-handled - * NMIs will be claimed if arriving within that window. + * Save the PMU state as it needs to be restored when leaving the + * handler */ - if (handled) { - this_cpu_write(perf_nmi_tstamp, - jiffies + perf_nmi_window); + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; - return handled; + /* Stop counting */ + amd_pmu_v2_disable_all(); + + status = amd_pmu_get_global_status(); + + /* Check if any overflows are pending */ + if (!status) + goto done; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + x86_perf_event_update(event); + mask = BIT_ULL(idx); + + if (!(status & mask)) + continue; + + /* Event overflow */ + handled++; + status &= ~mask; + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); } - if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) - return NMI_DONE; + /* + * It should never be the case that some overflows are not handled as + * the corresponding PMCs are expected to be inactive according to the + * active_mask + */ + WARN_ON(status > 0); - return NMI_HANDLED; + /* Clear overflow bits */ + amd_pmu_ack_global_status(~status); + + /* + * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT + * PMI entry is not set by the local APIC when a PMC overflow occurs + */ + inc_irq_stat(apic_perf_irqs); + +done: + cpuc->enabled = pmu_enabled; + + /* Resume counting only if PMU is active */ + if (pmu_enabled) + amd_pmu_v2_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); } static struct event_constraint * @@ -906,6 +1201,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, --cpuc->n_pair; } +/* + * Because of the way BRS operates with an inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by chosing only one counter for events using BRS. + * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. + * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it but only on counter 0 & 1 to enforce + * multiplexing requiring to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? &amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -914,12 +1254,19 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } +static void amd_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (sched_in && x86_pmu.lbr_nr) + amd_pmu_brs_sched_task(ctx, sched_in); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -929,6 +1276,8 @@ static __initconst const struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -947,8 +1296,40 @@ static __initconst const struct x86_pmu amd_pmu = { .amd_nb_constraints = 1, }; +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_brs_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + +static struct attribute_group group_caps_amd_brs = { + .name = "caps", + .attrs = amd_pmu_brs_attrs, + .is_visible = amd_brs_is_visible, +}; + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_brs, + NULL, +}; + static int __init amd_core_pmu_init(void) { + union cpuid_0x80000022_ebx ebx; u64 even_ctr_mask = 0ULL; int i; @@ -966,6 +1347,26 @@ static int __init amd_core_pmu_init(void) x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + /* Check for Performance Monitoring v2 support */ + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + + /* Update PMU version for later usage */ + x86_pmu.version = 2; + + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + + /* Update PMC handling functions */ + x86_pmu.enable_all = amd_pmu_v2_enable_all; + x86_pmu.disable_all = amd_pmu_v2_disable_all; + x86_pmu.enable = amd_pmu_v2_enable_event; + x86_pmu.handle_irq = amd_pmu_v2_handle_irq; + } + /* * AMD Core perfctr has separate MSRs for the NB events, see * the amd/uncore.c driver. @@ -998,6 +1399,19 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } + /* + * BRS requires special event constraints and flushing on ctxsw. + */ + if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_sched_task; + /* + * put_event_constraints callback same as Fam17h, set above + */ + } + + x86_pmu.attr_update = amd_attr_update; + pr_cont("core perfctr, "); return 0; } @@ -1032,6 +1446,24 @@ __init int amd_pmu_init(void) return 0; } +static inline void amd_pmu_reload_virt(void) +{ + if (x86_pmu.version >= 2) { + /* + * Clear global enable bits, reprogram the PERF_CTL + * registers with updated perf_ctr_virt_mask and then + * set global enable bits once again + */ + amd_pmu_v2_disable_all(); + amd_pmu_enable_all(0); + amd_pmu_v2_enable_all(0); + return; + } + + amd_pmu_disable_all(); + amd_pmu_enable_all(0); +} + void amd_pmu_enable_virt(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1039,8 +1471,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -1057,7 +1488,6 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 2e930d8c04d955fea000435b608c9a611ab66a90..11e8b493e0153ac1f52e098e0f74049a8a4eb2d9 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -26,6 +26,7 @@ static u32 ibs_caps; #include #include +#include #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT @@ -100,15 +101,6 @@ struct perf_ibs { u64 (*get_count)(u64 config); }; -struct perf_ibs_data { - u32 size; - union { - u32 data[0]; /* data buffer starts here */ - u32 caps; - }; - u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; -}; - static int perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) { @@ -339,11 +331,14 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs, static u64 get_ibs_fetch_count(u64 config) { - return (config & IBS_FETCH_CNT) >> 12; + union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config; + + return fetch_ctl.fetch_cnt << 4; } static u64 get_ibs_op_count(u64 config) { + union ibs_op_ctl op_ctl = (union ibs_op_ctl)config; u64 count = 0; /* @@ -351,10 +346,13 @@ static u64 get_ibs_op_count(u64 config) * and the lower 7 bits of CurCnt are randomized. * Otherwise CurCnt has the full 27-bit current counter value. */ - if (config & IBS_OP_VAL) - count = (config & IBS_OP_MAX_CNT) << 4; - else if (ibs_caps & IBS_CAPS_RDWROPCNT) - count = (config & IBS_OP_CUR_CNT) >> 32; + if (op_ctl.op_val) { + count = op_ctl.opmaxcnt << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + count += op_ctl.opmaxcnt_ext << 20; + } else if (ibs_caps & IBS_CAPS_RDWROPCNT) { + count = op_ctl.opcurcnt; + } return count; } @@ -415,7 +413,7 @@ static void perf_ibs_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 period; + u64 period, config = 0; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -424,13 +422,19 @@ static void perf_ibs_start(struct perf_event *event, int flags) hwc->state = 0; perf_ibs_set_period(perf_ibs, hwc, &period); + if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { + config |= period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + config |= period >> 4; + /* * Set STARTED before enabling the hardware, such that a subsequent NMI * must observe it. */ set_bit(IBS_STARTED, pcpu->state); clear_bit(IBS_STOPPING, pcpu->state); - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + perf_ibs_enable_event(perf_ibs, hwc, config); perf_event_update_userpage(event); } @@ -599,7 +603,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_ibs_data ibs_data; int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; - u64 *buf, *config, period; + u64 *buf, *config, period, new_config = 0; if (!test_bit(IBS_STARTED, pcpu->state)) { fail: @@ -706,13 +710,17 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (throttle) { perf_ibs_stop(event, 0); } else { - period >>= 4; - - if ((ibs_caps & IBS_CAPS_RDWROPCNT) && - (*config & IBS_OP_CNT_CTL)) - period |= *config & IBS_OP_CUR_CNT_RAND; + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + new_config = period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL)) + new_config |= *config & IBS_OP_CUR_CNT_RAND; + } + new_config |= period >> 4; - perf_ibs_enable_event(perf_ibs, hwc, period); + perf_ibs_enable_event(perf_ibs, hwc, new_config); } perf_event_update_userpage(event); @@ -789,6 +797,13 @@ static __init void perf_event_ibs_init(void) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; *attr++ = &format_attr_cnt_ctl.attr; } + + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + } + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index af14e78a1697a4ae9080e99396bb272797eeb79b..3c13f0968ccb5e18539e9a063b8a2e5da5553d12 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -402,6 +402,7 @@ static struct pmu amd_nb_pmu = { .stop = amd_uncore_stop, .read = amd_uncore_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu amd_llc_pmu = { @@ -415,6 +416,7 @@ static struct pmu amd_llc_pmu = { .stop = amd_uncore_stop, .read = amd_uncore_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) @@ -663,9 +665,10 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - pr_info("%s NB counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + pr_info("%d %s %s counters detected\n", num_counters_nb, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_nb_pmu.name); + ret = 0; } @@ -706,9 +709,9 @@ static int __init amd_uncore_init(void) if (ret) goto fail_llc; - pr_info("%s LLC counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + pr_info("%d %s %s counters detected\n", num_counters_llc, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_llc_pmu.name); ret = 0; } @@ -746,4 +749,28 @@ static int __init amd_uncore_init(void) return ret; } -device_initcall(amd_uncore_init); + +static void __exit amd_uncore_exit(void) +{ + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE); + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); + cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); + + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { + perf_pmu_unregister(&amd_llc_pmu); + free_percpu(amd_uncore_llc); + amd_uncore_llc = NULL; + } + + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + perf_pmu_unregister(&amd_nb_pmu); + free_percpu(amd_uncore_nb); + amd_uncore_nb = NULL; + } +} + +module_init(amd_uncore_init); +module_exit(amd_uncore_exit); + +MODULE_DESCRIPTION("AMD Uncore Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6f32db1a251d17c6c5ed79524d8f1d145d2107db..aba34228231dab81d757df39f6a6714d242d4df3 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1259,6 +1259,10 @@ static void x86_pmu_enable(struct pmu *pmu) if (hwc->state & PERF_HES_ARCH) continue; + /* + * if cpuc->enabled = 0, then no wrmsr as + * per x86_pmu_enable_event() + */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1624,11 +1628,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } @@ -2219,7 +2227,7 @@ static int x86_pmu_event_init(struct perf_event *event) if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) - event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } @@ -2231,7 +2239,7 @@ static void refresh_pce(void *ignored) static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* @@ -2253,7 +2261,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) @@ -2264,7 +2272,7 @@ static int x86_pmu_event_idx(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; if (is_metric_idx(hwc->idx)) @@ -2427,7 +2435,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = - !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); + !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bf891caafa3c068bcf67130c1151ddce47ccdb54..091420463569326a706be9afeb2722549fc60561 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4277,6 +4277,11 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dead = intel_pmu_cpu_dead, .check_period = intel_pmu_check_period, + + .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __initconst const struct x86_pmu intel_pmu = { @@ -4321,6 +4326,11 @@ static __initconst const struct x86_pmu intel_pmu = { .check_period = intel_pmu_check_period, .aux_output_match = intel_pmu_aux_output_match, + + .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __init void intel_clovertown_quirk(void) @@ -4960,6 +4970,14 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) { + x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + x86_pmu.lbr_read = intel_pmu_lbr_read_32; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) + intel_pmu_arch_lbr_init(); + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 6da90c6f33903a92a216a4a32a830ed6d1dd71b0..7c7319154e1b1028dcbc767264ab5130b363463a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -956,7 +956,7 @@ static void adaptive_pebs_record_size_update(void) if (pebs_data_cfg & PEBS_DATACFG_XMMS) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) - sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); cpuc->pebs_record_size = sz; } @@ -1597,10 +1597,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_LBRS) { - struct pebs_lbr *lbr = next_record; + struct lbr_entry *lbr = next_record; int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; - next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + next_record = next_record + num_lbr * sizeof(struct lbr_entry); if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 73dbd30f82d33ffb2cf398f1852c8dca930f421f..e01c48eaaf6b44787a965f11aaf3413655ebf4a9 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -8,17 +8,6 @@ #include "../perf_event.h" -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, -}; - static const enum { LBR_EIP_FLAGS = 1, LBR_TSX = 2, @@ -143,8 +132,54 @@ enum { X86_BR_IRQ |\ X86_BR_INT) +/* + * Intel LBR_CTL bits + * + * Hardware branch filter for Arch LBR + */ +#define ARCH_LBR_KERNEL_BIT 1 /* capture at ring0 */ +#define ARCH_LBR_USER_BIT 2 /* capture at ring > 0 */ +#define ARCH_LBR_CALL_STACK_BIT 3 /* enable call stack */ +#define ARCH_LBR_JCC_BIT 16 /* capture conditional branches */ +#define ARCH_LBR_REL_JMP_BIT 17 /* capture relative jumps */ +#define ARCH_LBR_IND_JMP_BIT 18 /* capture indirect jumps */ +#define ARCH_LBR_REL_CALL_BIT 19 /* capture relative calls */ +#define ARCH_LBR_IND_CALL_BIT 20 /* capture indirect calls */ +#define ARCH_LBR_RETURN_BIT 21 /* capture near returns */ +#define ARCH_LBR_OTHER_BRANCH_BIT 22 /* capture other branches */ + +#define ARCH_LBR_KERNEL (1ULL << ARCH_LBR_KERNEL_BIT) +#define ARCH_LBR_USER (1ULL << ARCH_LBR_USER_BIT) +#define ARCH_LBR_CALL_STACK (1ULL << ARCH_LBR_CALL_STACK_BIT) +#define ARCH_LBR_JCC (1ULL << ARCH_LBR_JCC_BIT) +#define ARCH_LBR_REL_JMP (1ULL << ARCH_LBR_REL_JMP_BIT) +#define ARCH_LBR_IND_JMP (1ULL << ARCH_LBR_IND_JMP_BIT) +#define ARCH_LBR_REL_CALL (1ULL << ARCH_LBR_REL_CALL_BIT) +#define ARCH_LBR_IND_CALL (1ULL << ARCH_LBR_IND_CALL_BIT) +#define ARCH_LBR_RETURN (1ULL << ARCH_LBR_RETURN_BIT) +#define ARCH_LBR_OTHER_BRANCH (1ULL << ARCH_LBR_OTHER_BRANCH_BIT) + +#define ARCH_LBR_ANY \ + (ARCH_LBR_JCC |\ + ARCH_LBR_REL_JMP |\ + ARCH_LBR_IND_JMP |\ + ARCH_LBR_REL_CALL |\ + ARCH_LBR_IND_CALL |\ + ARCH_LBR_RETURN |\ + ARCH_LBR_OTHER_BRANCH) + +#define ARCH_LBR_CTL_MASK 0x7f000e + static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); +static __always_inline bool is_lbr_call_stack_bit_set(u64 config) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return !!(config & ARCH_LBR_CALL_STACK); + + return !!(config & LBR_CALL_STACK); +} + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -168,33 +203,46 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + debugctl |= DEBUGCTLMSR_LBR; /* * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions * may cause superfluous increase/decrease of LBR_TOS. */ - if (!(lbr_select & LBR_CALL_STACK)) + if (is_lbr_call_stack_bit_set(lbr_select)) + debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + else debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); } static void __intel_pmu_lbr_disable(void) { u64 debugctl; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + wrmsrl(MSR_ARCH_LBR_CTL, 0); + return; + } + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } -static void intel_pmu_lbr_reset_32(void) +void intel_pmu_lbr_reset_32(void) { int i; @@ -202,7 +250,7 @@ static void intel_pmu_lbr_reset_32(void) wrmsrl(x86_pmu.lbr_from + i, 0); } -static void intel_pmu_lbr_reset_64(void) +void intel_pmu_lbr_reset_64(void) { int i; @@ -210,10 +258,16 @@ static void intel_pmu_lbr_reset_64(void) wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); + wrmsrl(x86_pmu.lbr_info + i, 0); } } +static void intel_pmu_arch_lbr_reset(void) +{ + /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */ + wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); +} + void intel_pmu_lbr_reset(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -221,10 +275,7 @@ void intel_pmu_lbr_reset(void) if (!x86_pmu.lbr_nr) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); + x86_pmu.lbr_reset(); cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; @@ -308,69 +359,97 @@ static u64 lbr_from_signext_quirk_rd(u64 val) return val; } -static inline void wrlbr_from(unsigned int idx, u64 val) +static __always_inline void wrlbr_from(unsigned int idx, u64 val) { val = lbr_from_signext_quirk_wr(val); wrmsrl(x86_pmu.lbr_from + idx, val); } -static inline void wrlbr_to(unsigned int idx, u64 val) +static __always_inline void wrlbr_to(unsigned int idx, u64 val) { wrmsrl(x86_pmu.lbr_to + idx, val); } -static inline u64 rdlbr_from(unsigned int idx) +static __always_inline void wrlbr_info(unsigned int idx, u64 val) +{ + wrmsrl(x86_pmu.lbr_info + idx, val); +} + +static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->from; + rdmsrl(x86_pmu.lbr_from + idx, val); return lbr_from_signext_quirk_rd(val); } -static inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->to; + rdmsrl(x86_pmu.lbr_to + idx, val); return val; } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) +{ + u64 val; + + if (lbr) + return lbr->info; + + rdmsrl(x86_pmu.lbr_info + idx, val); + + return val; +} + +static inline void +wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + wrlbr_from(idx, lbr->from); + wrlbr_to(idx, lbr->to); + if (need_info) + wrlbr_info(idx, lbr->info); +} + +static inline bool +rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + u64 from = rdlbr_from(idx, NULL); + + /* Don't read invalid entry */ + if (!from) + return false; + + lbr->from = from; + lbr->to = rdlbr_to(idx, NULL); + if (need_info) + lbr->info = rdlbr_info(idx, NULL); + + return true; +} + +void intel_pmu_lbr_restore(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; int i; unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } - - tos = task_ctx->tos; - /* - * Does not restore the LBR registers, if - * - No one else touched them, and - * - Did not enter C6 - */ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } + u64 tos = task_ctx->tos; mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -378,55 +457,150 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + wrlbr_info(lbr_idx, 0); } wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; if (cpuc->lbr_select) wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void intel_pmu_arch_lbr_restore(void *ctx) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned lbr_idx, mask; - u64 tos, from; + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; int i; - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; + /* Fast reset the LBRs before restore if the call stack is not full. */ + if (!entries[x86_pmu.lbr_nr - 1].from) + intel_pmu_arch_lbr_reset(); + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!entries[i].from) + break; + wrlbr_all(&entries[i], i, true); + } +} + +static __always_inline bool lbr_is_reset_in_cstate(void *ctx) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL); + + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); +} + +static void __intel_pmu_lbr_restore(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_context_opt(ctx)->lbr_callstack_users == 0 || + task_context_opt(ctx)->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + /* + * Does not restore the LBR registers, if + * - No one else touched them, and + * - Was not cleared in Cstate + */ + if ((ctx == cpuc->last_task_ctx) && + (task_context_opt(ctx)->log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(ctx)) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } + x86_pmu.lbr_restore(ctx); + + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; +} + +void intel_pmu_lbr_save(void *ctx) +{ + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; + unsigned lbr_idx, mask; + u64 tos; + int i; + mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - from = rdlbr_from(lbr_idx); - if (!from) + if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info)) break; - task_ctx->lbr_from[i] = from; - task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; - task_ctx->lbr_stack_state = LBR_VALID; - - cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->log_id; if (cpuc->lbr_select) - rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); + rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); +} + +static void intel_pmu_arch_lbr_save(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!rdlbr_all(&entries[i], i, true)) + break; + } + + /* LBR call stack is not full. Reset is required in restore. */ + if (i < x86_pmu.lbr_nr) + entries[x86_pmu.lbr_nr - 1].from = 0; +} + +static void __intel_pmu_lbr_save(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_context_opt(ctx)->lbr_callstack_users == 0) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; + return; + } + + x86_pmu.lbr_save(ctx); + + task_context_opt(ctx)->lbr_stack_state = LBR_VALID; + + cpuc->last_task_ctx = ctx; + cpuc->last_log_id = ++task_context_opt(ctx)->log_id; +} + +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next) +{ + void *prev_ctx_data, *next_ctx_data; + + swap(prev->task_ctx_data, next->task_ctx_data); + + /* + * Architecture specific synchronization makes sense in + * case both prev->task_ctx_data and next->task_ctx_data + * pointers are allocated. + */ + + prev_ctx_data = next->task_ctx_data; + next_ctx_data = prev->task_ctx_data; + + if (!prev_ctx_data || !next_ctx_data) + return; + + swap(task_context_opt(prev_ctx_data)->lbr_callstack_users, + task_context_opt(next_ctx_data)->lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; + void *task_ctx; if (!cpuc->lbr_users) return; @@ -463,7 +637,6 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; @@ -473,10 +646,8 @@ void intel_pmu_lbr_add(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; - } + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -507,16 +678,13 @@ void intel_pmu_lbr_add(struct perf_event *event) void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; - } + event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) cpuc->lbr_select = 0; @@ -553,9 +721,10 @@ void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; @@ -571,17 +740,14 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].in_tx = 0; - cpuc->lbr_entries[i].abort = 0; - cpuc->lbr_entries[i].cycles = 0; - cpuc->lbr_entries[i].type = 0; - cpuc->lbr_entries[i].reserved = 0; + perf_clear_branch_entry_bitfields(br); + + br->from = msr_lastbranch.from; + br->to = msr_lastbranch.to; + br++; } cpuc->lbr_stack.nr = i; + cpuc->lbr_stack.hw_idx = tos; } /* @@ -589,11 +755,12 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) * is the same as the linear address, allowing us to merge the LIP and EIP * LBR formats. */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; int lbr_format = x86_pmu.intel_cap.lbr_format; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -612,8 +779,8 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; - from = rdlbr_from(lbr_idx); - to = rdlbr_to(lbr_idx); + from = rdlbr_from(lbr_idx, NULL); + to = rdlbr_to(lbr_idx, NULL); /* * Read LBR call stack entries @@ -625,7 +792,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + info = rdlbr_info(lbr_idx, NULL); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -665,18 +832,93 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (abort && x86_pmu.lbr_double_abort && out > 0) out--; - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].type = 0; - cpuc->lbr_entries[out].reserved = 0; + perf_clear_branch_entry_bitfields(br+out); + br[out].from = from; + br[out].to = to; + br[out].mispred = mis; + br[out].predicted = pred; + br[out].in_tx = in_tx; + br[out].abort = abort; + br[out].cycles = cycles; out++; } cpuc->lbr_stack.nr = out; + cpuc->lbr_stack.hw_idx = tos; +} + +static __always_inline int get_lbr_br_type(u64 info) +{ + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type) + return 0; + + return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; +} + +static __always_inline bool get_lbr_mispred(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !!(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_predicted(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_cycles(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) + return 0; + + return info & LBR_INFO_CYCLES; +} + +static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, + struct lbr_entry *entries) +{ + struct perf_branch_entry *e; + struct lbr_entry *lbr; + u64 from, to, info; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr = entries ? &entries[i] : NULL; + e = &cpuc->lbr_entries[i]; + + from = rdlbr_from(i, lbr); + /* + * Read LBR entries until invalid entry (0s) is detected. + */ + if (!from) + break; + + to = rdlbr_to(i, lbr); + info = rdlbr_info(i, lbr); + + perf_clear_branch_entry_bitfields(e); + + e->from = from; + e->to = to; + e->mispred = get_lbr_mispred(info); + e->predicted = get_lbr_predicted(info); + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = get_lbr_cycles(info); + e->type = get_lbr_br_type(info); + } + + cpuc->lbr_stack.nr = i; +} + +static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) +{ + intel_pmu_store_lbr(cpuc, NULL); } void intel_pmu_lbr_read(void) @@ -693,10 +935,7 @@ void intel_pmu_lbr_read(void) cpuc->lbr_users == cpuc->lbr_pebs_users) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); + x86_pmu.lbr_read(cpuc); intel_pmu_lbr_filter(cpuc); } @@ -796,6 +1035,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + reg->config = mask; + return 0; + } + /* * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate * in suppress mode. So LBR_SELECT should be set to @@ -1052,6 +1296,27 @@ common_branch_type(int type) return PERF_BR_UNKNOWN; } +enum { + ARCH_LBR_BR_TYPE_JCC = 0, + ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, + ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2, + ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3, + ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4, + ARCH_LBR_BR_TYPE_NEAR_RET = 5, + ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET, + + ARCH_LBR_BR_TYPE_MAP_MAX = 16, +}; + +static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = { + [ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC, + [ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP, + [ARCH_LBR_BR_TYPE_NEAR_REL_JMP] = X86_BR_JMP, + [ARCH_LBR_BR_TYPE_NEAR_IND_CALL] = X86_BR_IND_CALL, + [ARCH_LBR_BR_TYPE_NEAR_REL_CALL] = X86_BR_CALL, + [ARCH_LBR_BR_TYPE_NEAR_RET] = X86_BR_RET, +}; + /* * implement actual branch filter based on user demand. * Hardware may not exactly satisfy that request, thus @@ -1064,7 +1329,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) { u64 from, to; int br_sel = cpuc->br_sel; - int i, j, type; + int i, j, type, to_plm; bool compress = false; /* if sampling all branches, then nothing to filter */ @@ -1076,8 +1341,19 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; + type = cpuc->lbr_entries[i].type; - type = branch_type(from, to, cpuc->lbr_entries[i].abort); + /* + * Parse the branch type recorded in LBR_x_INFO MSR. + * Doesn't support OTHER_BRANCH decoding for now. + * OTHER_BRANCH branch type still rely on software decoding. + */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) { + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + type = arch_lbr_br_type_map[type] | to_plm; + } else + type = branch_type(from, to, cpuc->lbr_entries[i].abort); if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { if (cpuc->lbr_entries[i].in_tx) type |= X86_BR_IN_TX; @@ -1112,25 +1388,18 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - cpuc->lbr_stack.nr = x86_pmu.lbr_nr; - for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr->lbr[i].info; - struct perf_branch_entry *e = &cpuc->lbr_entries[i]; + /* Cannot get TOS for large PEBS and Arch LBR */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) || + (cpuc->n_pebs == cpuc->n_large_pebs)) + cpuc->lbr_stack.hw_idx = -1ULL; + else + cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); - e->from = lbr->lbr[i].from; - e->to = lbr->lbr[i].to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); - e->in_tx = !!(info & LBR_INFO_IN_TX); - e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->reserved = 0; - } + intel_pmu_store_lbr(cpuc, lbr); intel_pmu_lbr_filter(cpuc); } @@ -1187,6 +1456,26 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; +static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = ARCH_LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = ARCH_LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = ARCH_LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = ARCH_LBR_RETURN | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = ARCH_LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = ARCH_LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_RETURN | + ARCH_LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = ARCH_LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = ARCH_LBR_REL_CALL, +}; + /* core */ void __init intel_pmu_lbr_init_core(void) { @@ -1262,6 +1551,7 @@ __init void intel_pmu_lbr_init_skl(void) x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; x86_pmu.lbr_to = MSR_LBR_NHM_TO; + x86_pmu.lbr_info = MSR_LBR_INFO_0; x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; @@ -1333,6 +1623,81 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +void __init intel_pmu_arch_lbr_init(void) +{ + union cpuid28_eax eax; + union cpuid28_ebx ebx; + union cpuid28_ecx ecx; + unsigned int unused_edx; + u64 lbr_nr; + + /* Arch LBR Capabilities */ + cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx); + + lbr_nr = fls(eax.split.lbr_depth_mask) * 8; + if (!lbr_nr) + goto clear_arch_lbr; + + /* Apply the max depth of Arch LBR */ + if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) + goto clear_arch_lbr; + + x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask; + x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset; + x86_pmu.lbr_lip = eax.split.lbr_lip; + x86_pmu.lbr_cpl = ebx.split.lbr_cpl; + x86_pmu.lbr_filter = ebx.split.lbr_filter; + x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack; + x86_pmu.lbr_mispred = ecx.split.lbr_mispred; + x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; + x86_pmu.lbr_br_type = ecx.split.lbr_br_type; + x86_pmu.lbr_nr = lbr_nr; + + x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + + x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; + x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; + x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0; + + /* LBR callstack requires both CPL and Branch Filtering support */ + if (!x86_pmu.lbr_cpl || + !x86_pmu.lbr_filter || + !x86_pmu.lbr_call_stack) + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP; + + if (!x86_pmu.lbr_cpl) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP; + } else if (!x86_pmu.lbr_filter) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP; + } + + x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK; + x86_pmu.lbr_ctl_map = arch_lbr_ctl_map; + + if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter) + x86_pmu.lbr_ctl_map = NULL; + + x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + + pr_cont("Architectural LBR, "); + + return; + +clear_arch_lbr: + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); +} + /** * x86_perf_get_lbr - get the LBR records information * @@ -1347,7 +1712,7 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) lbr->nr = x86_pmu.lbr_nr; lbr->from = x86_pmu.lbr_from; lbr->to = x86_pmu.lbr_to; - lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? MSR_LBR_INFO_0 : 0; + lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; return 0; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 39db37ee8ffa7d79b4c767a9ed66c4b1fe9d260c..232db8465cfe04203fbc78ffae0b410551867b23 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -65,21 +65,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ + +#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ +#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -204,6 +206,17 @@ struct intel_excl_cntrs { struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, +}; + enum { X86_PERF_KFREE_SHARED = 0, X86_PERF_KFREE_EXCL = 1, @@ -261,9 +274,12 @@ struct cpu_hw_events { int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; + union { + struct er_account *lbr_sel; + struct er_account *lbr_ctl; + }; u64 br_sel; - struct x86_perf_task_context *last_task_ctx; + void *last_task_ctx; int last_log_id; int lbr_select; @@ -306,6 +322,8 @@ struct cpu_hw_events { * AMD specific bits */ struct amd_nb *amd_nb; + int brs_active; /* BRS is enabled */ + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ @@ -721,12 +739,36 @@ struct x86_pmu { * Intel LBR */ unsigned int lbr_tos, lbr_from, lbr_to, - lbr_nr; /* LBR base regs and size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - const int *lbr_sel_map; /* lbr_select mappings */ + lbr_info, lbr_nr; /* LBR base regs and size */ + union { + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + u64 lbr_ctl_mask; /* LBR_CTL valid bits */ + }; + union { + const int *lbr_sel_map; /* lbr_select mappings */ + int *lbr_ctl_map; /* LBR_CTL mappings */ + }; bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + /* + * Intel Architectural LBR CPUID Enumeration + */ + unsigned int lbr_depth_mask:8; + unsigned int lbr_deep_c_reset:1; + unsigned int lbr_lip:1; + unsigned int lbr_cpl:1; + unsigned int lbr_filter:1; + unsigned int lbr_call_stack:1; + unsigned int lbr_mispred:1; + unsigned int lbr_timed_lbr:1; + unsigned int lbr_br_type:1; + + void (*lbr_reset)(void); + void (*lbr_read)(struct cpu_hw_events *cpuc); + void (*lbr_save)(void *ctx); + void (*lbr_restore)(void *ctx); + /* * Intel PT/LBR/BTS are exclusive */ @@ -763,16 +805,23 @@ struct x86_pmu { int (*aux_output_match) (struct perf_event *event); }; +struct x86_perf_task_context_opt { + int lbr_callstack_users; + int lbr_stack_state; + int log_id; +}; + struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; u64 lbr_sel; int tos; int valid_lbrs; - int lbr_callstack_users; - int lbr_stack_state; - int log_id; + struct x86_perf_task_context_opt opt; + struct lbr_entry lbr[MAX_LBR_ENTRIES]; +}; + +struct x86_perf_task_context_arch_lbr { + struct x86_perf_task_context_opt opt; + struct lbr_entry entries[]; }; #define x86_add_quirk(func_) \ @@ -823,6 +872,14 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ struct pmu *x86_get_pmu(void); extern struct x86_pmu x86_pmu __read_mostly; +static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt; + + return &((struct x86_perf_task_context *)ctx)->opt; +} + static inline bool x86_pmu_has_lbr_callstack(void) { return x86_pmu.lbr_sel_map && @@ -889,6 +946,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool has_amd_brs(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_AMD_BRS; +} + static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; @@ -979,6 +1041,50 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); +int amd_brs_init(void); +void amd_brs_disable(void); +void amd_brs_enable(void); +void amd_brs_enable_all(void); +void amd_brs_disable_all(void); +void amd_brs_drain(void); +void amd_brs_disable_all(void); +int amd_brs_setup_filter(struct perf_event *event); +void amd_brs_reset(void); + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + perf_sched_cb_inc(event->ctx->pmu); + cpuc->lbr_users++; + /* + * No need to reset BRS because it is reset + * on brs_enable() and it is saturating + */ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); + +/* + * check if BRS is activated on the CPU + * active defined as it has non-zero users and DBG_EXT_CFG.BRSEN=1 + */ +static inline bool amd_brs_active(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + return cpuc->brs_active; +} #else /* CONFIG_CPU_SUP_AMD */ @@ -987,6 +1093,23 @@ static inline int amd_pmu_init(void) return 0; } +static inline int amd_brs_init(void) +{ + return -EOPNOTSUPP; +} + +static inline void amd_brs_drain(void) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +static inline void amd_brs_disable_all(void) +{ +} + #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) @@ -1089,16 +1212,23 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); +void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, + struct perf_event_context *next); + void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); +void intel_pmu_lbr_reset_32(void); + +void intel_pmu_lbr_reset_64(void); + void intel_pmu_lbr_add(struct perf_event *event); void intel_pmu_lbr_del(struct perf_event *event); @@ -1109,6 +1239,14 @@ void intel_pmu_lbr_disable_all(void); void intel_pmu_lbr_read(void); +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_save(void *ctx); + +void intel_pmu_lbr_restore(void *ctx); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); @@ -1125,6 +1263,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_arch_lbr_init(void); + void intel_pmu_pebs_data_source_nhm(void); void intel_pmu_pebs_data_source_skl(bool pmem); diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h new file mode 100644 index 0000000000000000000000000000000000000000..46e1df45efc0462774de91891209c50c2acef0b5 --- /dev/null +++ b/arch/x86/include/asm/amd-ibs.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * From PPR Vol 1 for AMD Family 19h Model 01h B1 + * 55898 Rev 0.35 - Feb 5, 2021 + */ + +#include + +/* + * IBS Hardware MSRs + */ + +/* MSR 0xc0011030: IBS Fetch Control */ +union ibs_fetch_ctl { + __u64 val; + struct { + __u64 fetch_maxcnt:16,/* 0-15: instruction fetch max. count */ + fetch_cnt:16, /* 16-31: instruction fetch count */ + fetch_lat:16, /* 32-47: instruction fetch latency */ + fetch_en:1, /* 48: instruction fetch enable */ + fetch_val:1, /* 49: instruction fetch valid */ + fetch_comp:1, /* 50: instruction fetch complete */ + ic_miss:1, /* 51: i-cache miss */ + phy_addr_valid:1,/* 52: physical address valid */ + l1tlb_pgsz:2, /* 53-54: i-cache L1TLB page size + * (needs IbsPhyAddrValid) */ + l1tlb_miss:1, /* 55: i-cache fetch missed in L1TLB */ + l2tlb_miss:1, /* 56: i-cache fetch missed in L2TLB */ + rand_en:1, /* 57: random tagging enable */ + fetch_l2_miss:1,/* 58: L2 miss for sampled fetch + * (needs IbsFetchComp) */ + reserved:5; /* 59-63: reserved */ + }; +}; + +/* MSR 0xc0011033: IBS Execution Control */ +union ibs_op_ctl { + __u64 val; + struct { + __u64 opmaxcnt:16, /* 0-15: periodic op max. count */ + reserved0:1, /* 16: reserved */ + op_en:1, /* 17: op sampling enable */ + op_val:1, /* 18: op sample valid */ + cnt_ctl:1, /* 19: periodic op counter control */ + opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ + reserved1:5, /* 27-31: reserved */ + opcurcnt:27, /* 32-58: periodic op counter current count */ + reserved2:5; /* 59-63: reserved */ + }; +}; + +/* MSR 0xc0011035: IBS Op Data 2 */ +union ibs_op_data { + __u64 val; + struct { + __u64 comp_to_ret_ctr:16, /* 0-15: op completion to retire count */ + tag_to_ret_ctr:16, /* 15-31: op tag to retire count */ + reserved1:2, /* 32-33: reserved */ + op_return:1, /* 34: return op */ + op_brn_taken:1, /* 35: taken branch op */ + op_brn_misp:1, /* 36: mispredicted branch op */ + op_brn_ret:1, /* 37: branch op retired */ + op_rip_invalid:1, /* 38: RIP is invalid */ + op_brn_fuse:1, /* 39: fused branch op */ + op_microcode:1, /* 40: microcode op */ + reserved2:23; /* 41-63: reserved */ + }; +}; + +/* MSR 0xc0011036: IBS Op Data 2 */ +union ibs_op_data2 { + __u64 val; + struct { + __u64 data_src:3, /* 0-2: data source */ + reserved0:1, /* 3: reserved */ + rmt_node:1, /* 4: destination node */ + cache_hit_st:1, /* 5: cache hit state */ + reserved1:57; /* 5-63: reserved */ + }; +}; + +/* MSR 0xc0011037: IBS Op Data 3 */ +union ibs_op_data3 { + __u64 val; + struct { + __u64 ld_op:1, /* 0: load op */ + st_op:1, /* 1: store op */ + dc_l1tlb_miss:1, /* 2: data cache L1TLB miss */ + dc_l2tlb_miss:1, /* 3: data cache L2TLB hit in 2M page */ + dc_l1tlb_hit_2m:1, /* 4: data cache L1TLB hit in 2M page */ + dc_l1tlb_hit_1g:1, /* 5: data cache L1TLB hit in 1G page */ + dc_l2tlb_hit_2m:1, /* 6: data cache L2TLB hit in 2M page */ + dc_miss:1, /* 7: data cache miss */ + dc_mis_acc:1, /* 8: misaligned access */ + reserved:4, /* 9-12: reserved */ + dc_wc_mem_acc:1, /* 13: write combining memory access */ + dc_uc_mem_acc:1, /* 14: uncacheable memory access */ + dc_locked_op:1, /* 15: locked operation */ + dc_miss_no_mab_alloc:1, /* 16: DC miss with no MAB allocated */ + dc_lin_addr_valid:1, /* 17: data cache linear address valid */ + dc_phy_addr_valid:1, /* 18: data cache physical address valid */ + dc_l2_tlb_hit_1g:1, /* 19: data cache L2 hit in 1GB page */ + l2_miss:1, /* 20: L2 cache miss */ + sw_pf:1, /* 21: software prefetch */ + op_mem_width:4, /* 22-25: load/store size in bytes */ + op_dc_miss_open_mem_reqs:6, /* 26-31: outstanding mem reqs on DC fill */ + dc_miss_lat:16, /* 32-47: data cache miss latency */ + tlb_refill_lat:16; /* 48-63: L1 TLB refill latency */ + }; +}; + +/* MSR 0xc001103c: IBS Fetch Control Extended */ +union ic_ibs_extd_ctl { + __u64 val; + struct { + __u64 itlb_refill_lat:16, /* 0-15: ITLB Refill latency for sampled fetch */ + reserved:48; /* 16-63: reserved */ + }; +}; + +/* + * IBS driver related + */ + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 619c1f80a2abe018fafcff27470631496240c782..00d329a990e56d8735216f482aa82225a79dc637 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -30,6 +30,7 @@ enum cpuid_leafs CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, + CPUID_8000_001F_EAX, }; #ifdef CONFIG_X86_FEATURE_NAMES @@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 19)) + BUILD_BUG_ON_ZERO(NCAPINTS != 20)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 34ff851bd2fa8dc9a9a9594b8bbb254cb2c29836..5f7b887e50ba9dee0e758b11b65762d860ee8eb9 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 19 /* N 32-bit words worth of info */ +#define NCAPINTS 20 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* @@ -94,7 +94,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */ +/* FREE! ( 3*32+17) */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -220,7 +220,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +/* FREE! ( 7*32+10) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ #define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ @@ -230,7 +230,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ @@ -332,6 +332,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -395,12 +396,20 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ +#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ + /* * BUG word(s) */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 210bc1350e87a2b065c2736f8d5ed2cddb467864..251a84d87ec2cd308d0a51756ef70efbfe28479f 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -90,6 +90,7 @@ #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f779f80bb7c5738b9dca4841412198258be351d6..220448e729b5545fcc27af62d6b6408d1e9fde43 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -192,7 +192,23 @@ #define LBR_INFO_MISPRED BIT_ULL(63) #define LBR_INFO_IN_TX BIT_ULL(62) #define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYC_CNT_VALID BIT_ULL(60) #define LBR_INFO_CYCLES 0xffff +#define LBR_INFO_BR_TYPE_OFFSET 56 +#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) + +#define MSR_ARCH_LBR_CTL 0x000014ce +#define ARCH_LBR_CTL_LBREN BIT(0) +#define ARCH_LBR_CTL_CPL_OFFSET 1 +#define ARCH_LBR_CTL_CPL (0x3ull << ARCH_LBR_CTL_CPL_OFFSET) +#define ARCH_LBR_CTL_STACK_OFFSET 3 +#define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET) +#define ARCH_LBR_CTL_FILTER_OFFSET 16 +#define ARCH_LBR_CTL_FILTER (0x7full << ARCH_LBR_CTL_FILTER_OFFSET) +#define MSR_ARCH_LBR_DEPTH 0x000014cf +#define MSR_ARCH_LBR_FROM_0 0x00001500 +#define MSR_ARCH_LBR_TO_0 0x00001600 +#define MSR_ARCH_LBR_INFO_0 0x00001200 #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_PEBS_DATA_CFG 0x000003f2 @@ -497,6 +513,11 @@ #define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 #define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) +/* AMD Performance Counter Global Status and Control MSRs */ +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 +#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -661,6 +682,10 @@ #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff +/* AMD Branch Sampling configuration */ +#define MSR_AMD_DBG_EXTN_CFG 0xc000010f +#define MSR_AMD_SAMP_BR_FROM 0xc0010300 + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 9c0200ede2427afb40f2f1904ac00d1c08f380bc..3ceae047311523cc4ae01c379e310897a3e70613 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -166,6 +166,58 @@ union cpuid10_edx { unsigned int full; }; +/* + * Intel Architectural LBR CPUID detection/enumeration details: + */ +union cpuid28_eax { + struct { + /* Supported LBR depth values */ + unsigned int lbr_depth_mask:8; + unsigned int reserved:22; + /* Deep C-state Reset */ + unsigned int lbr_deep_c_reset:1; + /* IP values contain LIP */ + unsigned int lbr_lip:1; + } split; + unsigned int full; +}; + +union cpuid28_ebx { + struct { + /* CPL Filtering Supported */ + unsigned int lbr_cpl:1; + /* Branch Filtering Supported */ + unsigned int lbr_filter:1; + /* Call-stack Mode Supported */ + unsigned int lbr_call_stack:1; + } split; + unsigned int full; +}; + +union cpuid28_ecx { + struct { + /* Mispredict Bit Supported */ + unsigned int lbr_mispred:1; + /* Timed LBRs Supported */ + unsigned int lbr_timed_lbr:1; + /* Branch Type Field Supported */ + unsigned int lbr_br_type:1; + } split; + unsigned int full; +}; + +/* + * AMD "Extended Performance Monitoring and Debug" CPUID + * detection/enumeration details: + */ +union cpuid_0x80000022_ebx { + struct { + /* Number of Core Performance Counters */ + unsigned int num_core_pmc:4; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -337,13 +389,14 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; -struct pebs_lbr_entry { +struct lbr_entry { u64 from, to, info; }; -struct pebs_lbr { - struct pebs_lbr_entry lbr[0]; /* Variable length */ -}; +/* + * AMD Extended Performance Monitoring and Debug cpuid feature detection + */ +#define EXT_PERFMON_DEBUG_FEATURES 0x80000022 /* * IBS cpuid feature detection @@ -397,6 +450,7 @@ struct pebs_lbr { #define IBS_OP_ENABLE (1ULL<<17) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +#define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ #define IBS_RIP_INVALID (1ULL<<38) #ifdef CONFIG_X86_LOCAL_APIC @@ -455,17 +509,10 @@ struct x86_pmu_lbr { unsigned int info; }; -extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); extern void perf_check_microcode(void); extern int x86_perf_rdpmc_index(struct perf_event *event); #else -static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ - *nr = 0; - return NULL; -} - static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { memset(cap, 0, sizeof(*cap)); @@ -477,15 +524,26 @@ static inline void perf_check_microcode(void) { } #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); #else static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { return -1; } +static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + *nr = 0; + return NULL; +} #endif #ifdef CONFIG_CPU_SUP_INTEL extern void intel_pt_handle_vmx(int on); +#else +static inline void intel_pt_handle_vmx(int on) +{ + +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 6847d85400a8b738ca2adf00ea95fdf8f349fafc..fa5700097f6474dcecef8da81b625d30b648ee60 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e667cc3ffeec092765e2345e9444c9f2afd941ef..c7641b1ca2f24500f028ae6a3a4d0138334a0c56 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -959,6 +959,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + init_scattered_cpuid_features(c); init_speculation_control(c); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index a03e309a0ac5f401efc1c291691d26435c720cd8..a5872551f3f6a1934cb068d6a0643826f88a8ccb 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -40,9 +40,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, - { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 07fc4b3e3ce49e3bcb5bc898b20489d4972edac0..2d85c2bfedc888e12fc682a6e9883410719d7200 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -92,14 +92,26 @@ struct perf_raw_record { /* * branch stack layout: * nr: number of taken branches stored in entries[] + * hw_idx: The low level index of raw branch records + * for the most recent branch. + * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. + * The entries[] is an abstraction of raw branch records, + * which may not be stored in age order in HW, e.g. Intel LBR. + * The hw_idx is to expose the low level index of raw + * branch record for the most recent branch aka entries[0]. + * The hw_idx index is between -1 (unknown) and max depth, + * which can be retrieved in /sys/devices/cpu/caps/branches. + * For the architectures whose raw branch records are + * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; + __u64 hw_idx; struct perf_branch_entry entries[0]; }; @@ -115,6 +127,15 @@ struct hw_perf_event_extra { int idx; /* index in shared_regs->regs[] */ }; +/** + * hw_perf_event::flag values + * + * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific + * usage. + */ +#define PERF_EVENT_FLAG_ARCH 0x0000ffff +#define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 + /** * struct hw_perf_event - performance event hardware details: */ @@ -1012,6 +1033,22 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->txn = 0; } +/* + * Clear all bitfields in the perf_branch_entry. + * The to and from fields are not cleared because they are + * systematically modified by caller. + */ +static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) +{ + br->mispred = 0; + br->predicted = 0; + br->in_tx = 0; + br->abort = 0; + br->cycles = 0; + br->type = 0; + br->reserved = 0; +} + extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, diff --git a/include/linux/static_call.h b/include/linux/static_call.h new file mode 100644 index 0000000000000000000000000000000000000000..d8892dff2e91f80c4aa83933742f6ea6b32c45cf --- /dev/null +++ b/include/linux/static_call.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_STATIC_CALL_H +#define _LINUX_STATIC_CALL_H + +/* + * Static call support + * + * Static calls use code patching to hard-code function pointers into direct + * branch instructions. They give the flexibility of function pointers, but + * with improved performance. This is especially important for cases where + * retpolines would otherwise be used, as retpolines can significantly impact + * performance. + * + * + * API overview: + * + * DECLARE_STATIC_CALL(name, func); + * DEFINE_STATIC_CALL(name, func); + * static_call(name)(args...); + * static_call_update(name, func); + * + * Usage example: + * + * # Start with the following functions (with identical prototypes): + * int func_a(int arg1, int arg2); + * int func_b(int arg1, int arg2); + * + * # Define a 'my_name' reference, associated with func_a() by default + * DEFINE_STATIC_CALL(my_name, func_a); + * + * # Call func_a() + * static_call(my_name)(arg1, arg2); + * + * # Update 'my_name' to point to func_b() + * static_call_update(my_name, &func_b); + * + * # Call func_b() + * static_call(my_name)(arg1, arg2); + * + * + * Implementation details: + * + * This requires some arch-specific code (CONFIG_HAVE_STATIC_CALL). + * Otherwise basic indirect calls are used (with function pointers). + * + * Each static_call() site calls into a trampoline associated with the name. + * The trampoline has a direct branch to the default function. Updates to a + * name will modify the trampoline's branch destination. + * + * If the arch has CONFIG_HAVE_STATIC_CALL_INLINE, then the call sites + * themselves will be patched at runtime to call the functions directly, + * rather than calling through the trampoline. This requires objtool or a + * compiler plugin to detect all the static_call() sites and annotate them + * in the .static_call_sites section. + */ + +#include +#include +#include + +#ifdef CONFIG_HAVE_STATIC_CALL +#include + +/* + * Either @site or @tramp can be NULL. + */ +extern void arch_static_call_transform(void *site, void *tramp, void *func); + +#define STATIC_CALL_TRAMP_ADDR(name) &STATIC_CALL_TRAMP(name) + +/* + * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from + * the symbol table so that objtool can reference it when it generates the + * .static_call_sites section. + */ +#define __static_call(name) \ +({ \ + __ADDRESSABLE(STATIC_CALL_KEY(name)); \ + &STATIC_CALL_TRAMP(name); \ +}) + +#else +#define STATIC_CALL_TRAMP_ADDR(name) NULL +#endif + + +#define DECLARE_STATIC_CALL(name, func) \ + extern struct static_call_key STATIC_CALL_KEY(name); \ + extern typeof(func) STATIC_CALL_TRAMP(name); + +#define static_call_update(name, func) \ +({ \ + BUILD_BUG_ON(!__same_type(*(func), STATIC_CALL_TRAMP(name))); \ + __static_call_update(&STATIC_CALL_KEY(name), \ + STATIC_CALL_TRAMP_ADDR(name), func); \ +}) + +#if defined(CONFIG_HAVE_STATIC_CALL) + +struct static_call_key { + void *func; +}; + +#define DEFINE_STATIC_CALL(name, _func) \ + DECLARE_STATIC_CALL(name, _func); \ + struct static_call_key STATIC_CALL_KEY(name) = { \ + .func = _func, \ + }; \ + ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func) + +#define static_call(name) __static_call(name) + +static inline +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + cpus_read_lock(); + WRITE_ONCE(key->func, func); + arch_static_call_transform(NULL, tramp, func); + cpus_read_unlock(); +} + +#define EXPORT_STATIC_CALL(name) \ + EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \ + EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)) + +#define EXPORT_STATIC_CALL_GPL(name) \ + EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)); \ + EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)) + +#else /* Generic implementation */ + +struct static_call_key { + void *func; +}; + +#define DEFINE_STATIC_CALL(name, _func) \ + DECLARE_STATIC_CALL(name, _func); \ + struct static_call_key STATIC_CALL_KEY(name) = { \ + .func = _func, \ + } + +#define static_call(name) \ + ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func)) + +static inline +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + WRITE_ONCE(key->func, func); +} + +#define EXPORT_STATIC_CALL(name) EXPORT_SYMBOL(STATIC_CALL_KEY(name)) +#define EXPORT_STATIC_CALL_GPL(name) EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)) + +#endif /* CONFIG_HAVE_STATIC_CALL */ + +#endif /* _LINUX_STATIC_CALL_H */ diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h new file mode 100644 index 0000000000000000000000000000000000000000..5ed249dc47d304001c8ec705175bdd723a02002b --- /dev/null +++ b/include/linux/static_call_types.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _STATIC_CALL_TYPES_H +#define _STATIC_CALL_TYPES_H + +#include + +#define STATIC_CALL_KEY_PREFIX __SCK__ +#define STATIC_CALL_KEY(name) __PASTE(STATIC_CALL_KEY_PREFIX, name) + +#define STATIC_CALL_TRAMP_PREFIX __SCT__ +#define STATIC_CALL_TRAMP_PREFIX_STR __stringify(STATIC_CALL_TRAMP_PREFIX) +#define STATIC_CALL_TRAMP(name) __PASTE(STATIC_CALL_TRAMP_PREFIX, name) +#define STATIC_CALL_TRAMP_STR(name) __stringify(STATIC_CALL_TRAMP(name)) + +#endif /* _STATIC_CALL_TYPES_H */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ceccd980ffcfe44c5204920f4ad73216bd20ae3c..507eddef87156c05564e7660e027510569687e0c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -180,6 +180,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -207,6 +209,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_TYPE_SAVE = 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -849,7 +853,9 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * * { u64 nr; - * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * } && PERF_SAMPLE_BRANCH_STACK * * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER diff --git a/kernel/events/core.c b/kernel/events/core.c index 2043b0f729a2ee34c4b764803a888047eaac5d46..d478c9789ff6e2ba7e37226efbb308db8d252c73 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6403,6 +6403,11 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } +static inline bool perf_sample_save_hw_index(struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -6491,6 +6496,8 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); + if (perf_sample_save_hw_index(event)) + perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { /* @@ -6680,6 +6687,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->br_stack) { + if (perf_sample_save_hw_index(event)) + size += sizeof(u64); + size += data->br_stack->nr * sizeof(struct perf_branch_entry); } diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 8e1d0bb463611026f80589660c7ea16c850fecf1..f6d8002d11e0e976348c7dc67a64969668d1b2f2 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -84,6 +84,7 @@ #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP) #define DISABLED_MASK17 0 #define DISABLED_MASK18 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define DISABLED_MASK19 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h index 6847d85400a8b738ca2adf00ea95fdf8f349fafc..fa5700097f6474dcecef8da81b625d30b648ee60 100644 --- a/tools/arch/x86/include/asm/required-features.h +++ b/tools/arch/x86/include/asm/required-features.h @@ -101,6 +101,7 @@ #define REQUIRED_MASK16 0 #define REQUIRED_MASK17 0 #define REQUIRED_MASK18 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19) +#define REQUIRED_MASK19 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index ddc5ee6f6592bed23532aff20f75c7a76db53361..88882e55294ecd6f59e17658c9432b1967545ff9 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -100,6 +100,14 @@ struct evsel { perf_evsel__sb_cb_t *cb; void *data; } side_band; + /* + * For reporting purposes, an evsel sample can have a callchain + * synthesized from AUX area data. Keep track of synthesized sample + * types here. Note, the recorded sample_type cannot be changed because + * it is needed to continue to parse events. + * See also evsel__has_callchain(). + */ + __u64 synth_sample_type; }; struct perf_missing_features { @@ -384,7 +392,22 @@ static inline bool perf_evsel__has_branch_callstack(const struct evsel *evsel) static inline bool evsel__has_callchain(const struct evsel *evsel) { - return (evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN) != 0; + /* + * For reporting purposes, an evsel sample can have a recorded callchain + * or a callchain synthesized from AUX area data. + */ + return evsel->core.attr.sample_type & PERF_SAMPLE_CALLCHAIN || + evsel->synth_sample_type & PERF_SAMPLE_CALLCHAIN; +} + +static inline bool evsel__has_br_stack(const struct evsel *evsel) +{ + /* + * For reporting purposes, an evsel sample can have a recorded branch + * stack or a branch stack synthesized from AUX area data. + */ + return evsel->core.attr.sample_type & PERF_SAMPLE_BRANCH_STACK || + evsel->synth_sample_type & PERF_SAMPLE_BRANCH_STACK; } struct perf_env *perf_evsel__env(struct evsel *evsel); diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 5de955d60a04452179102be968606866cbc8a0d5..17caadb33e6c8615fdb01513debaf8debf1cdb72 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1236,7 +1236,7 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, if (evsel__has_callchain(evsel)) callchain__printf(evsel, sample); - if (sample_type & PERF_SAMPLE_BRANCH_STACK) + if (evsel__has_br_stack(evsel)) branch_stack__printf(sample, perf_evsel__has_branch_callstack(evsel)); if (sample_type & PERF_SAMPLE_REGS_USER)