Re: [PATCH 04/14] perf, x86: Basic Haswell LBR call stack support
From: Stephane Eranian
Date: Wed Feb 05 2014 - 10:40:41 EST
On Fri, Jan 3, 2014 at 6:48 AM, Yan, Zheng <zheng.z.yan@xxxxxxxxx> wrote:
> When the call stack feature is enabled, the LBR stack will capture
> unfiltered call data normally, but as return instructions are executed,
> the last captured branch record is flushed from the on-chip registers
> in a last-in first-out (LIFO) manner. Thus, branch information relative
> to leaf functions will not be captured, while preserving the call stack
> information of the main line execution path.
>
This is a generic description of the LBR call stack feature. It does not
describe what the patch actually does, which is to implement the basic
internal infrastructure for CALL_STACK mode using the LBR call stack.
> Signed-off-by: Yan, Zheng <zheng.z.yan@xxxxxxxxx>
> ---
> arch/x86/kernel/cpu/perf_event.h | 7 ++-
> arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 98 +++++++++++++++++++++++-------
> 3 files changed, 82 insertions(+), 25 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
> index 80b8e83..3ef4b79 100644
> --- a/arch/x86/kernel/cpu/perf_event.h
> +++ b/arch/x86/kernel/cpu/perf_event.h
> @@ -460,7 +460,10 @@ struct x86_pmu {
> };
>
> enum {
> - PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
> + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
> + PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
> +
> + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
> };
>
> #define x86_add_quirk(func_) \
> @@ -697,6 +700,8 @@ void intel_pmu_lbr_init_atom(void);
>
> void intel_pmu_lbr_init_snb(void);
>
> +void intel_pmu_lbr_init_hsw(void);
> +
> int intel_pmu_setup_lbr_filter(struct perf_event *event);
>
> int p4_pmu_init(void);
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
> index 4325bae..84a1c09 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -2494,7 +2494,7 @@ __init int intel_pmu_init(void)
> memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
> memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
>
> - intel_pmu_lbr_init_snb();
> + intel_pmu_lbr_init_hsw();
>
> x86_pmu.event_constraints = intel_hsw_event_constraints;
> x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> index 7ff2a99..bdd8758 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
> @@ -39,6 +39,7 @@ static enum {
> #define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
> #define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
> #define LBR_FAR_BIT 8 /* do not capture far branches */
> +#define LBR_CALL_STACK_BIT 9 /* enable call stack */
>
> #define LBR_KERNEL (1 << LBR_KERNEL_BIT)
> #define LBR_USER (1 << LBR_USER_BIT)
> @@ -49,6 +50,7 @@ static enum {
> #define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
> #define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
> #define LBR_FAR (1 << LBR_FAR_BIT)
> +#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
>
> #define LBR_PLM (LBR_KERNEL | LBR_USER)
>
> @@ -74,24 +76,25 @@ static enum {
> * x86control flow changes include branches, interrupts, traps, faults
> */
> enum {
> - X86_BR_NONE = 0, /* unknown */
> -
> - X86_BR_USER = 1 << 0, /* branch target is user */
> - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
> -
> - X86_BR_CALL = 1 << 2, /* call */
> - X86_BR_RET = 1 << 3, /* return */
> - X86_BR_SYSCALL = 1 << 4, /* syscall */
> - X86_BR_SYSRET = 1 << 5, /* syscall return */
> - X86_BR_INT = 1 << 6, /* sw interrupt */
> - X86_BR_IRET = 1 << 7, /* return from interrupt */
> - X86_BR_JCC = 1 << 8, /* conditional */
> - X86_BR_JMP = 1 << 9, /* jump */
> - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
> - X86_BR_IND_CALL = 1 << 11,/* indirect calls */
> - X86_BR_ABORT = 1 << 12,/* transaction abort */
> - X86_BR_IN_TX = 1 << 13,/* in transaction */
> - X86_BR_NO_TX = 1 << 14,/* not in transaction */
> + X86_BR_NONE = 0, /* unknown */
> +
> + X86_BR_USER = 1 << 0, /* branch target is user */
> + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
> +
> + X86_BR_CALL = 1 << 2, /* call */
> + X86_BR_RET = 1 << 3, /* return */
> + X86_BR_SYSCALL = 1 << 4, /* syscall */
> + X86_BR_SYSRET = 1 << 5, /* syscall return */
> + X86_BR_INT = 1 << 6, /* sw interrupt */
> + X86_BR_IRET = 1 << 7, /* return from interrupt */
> + X86_BR_JCC = 1 << 8, /* conditional */
> + X86_BR_JMP = 1 << 9, /* jump */
> + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
> + X86_BR_IND_CALL = 1 << 11,/* indirect calls */
> + X86_BR_ABORT = 1 << 12,/* transaction abort */
> + X86_BR_IN_TX = 1 << 13,/* in transaction */
> + X86_BR_NO_TX = 1 << 14,/* not in transaction */
> + X86_BR_CALL_STACK = 1 << 15,/* call stack */
> };
>
> #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
> @@ -135,7 +138,14 @@ static void __intel_pmu_lbr_enable(void)
> wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
>
> rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> - debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
> + debugctl |= DEBUGCTLMSR_LBR;
> + /*
> + * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
> + * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
> + * may cause superfluous increase/decrease of LBR_TOS.
> + */
Is that a bug or a feature?
That prevents any use of the call-stack mode in the kernel, because by the
time you get to the perf_events code, the stack will have been overwritten. You
can get by if you are only interested in user-level execution: the LBR priv
level filtering will cause a freeze, though with some skid. I assume you are
limiting this feature to user priv level by enforcing that users pass the
PERF_SAMPLE_BRANCH_USER flag.
> + if (!cpuc->lbr_sel || !(cpuc->lbr_sel->config & LBR_CALL_STACK))
> + debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
> wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
> }
>
> @@ -354,7 +364,7 @@ void intel_pmu_lbr_read(void)
> * - in case there is no HW filter
> * - in case the HW filter has errata or limitations
> */
> -static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
> +static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
> {
> u64 br_type = event->attr.branch_sample_type;
> int mask = 0;
> @@ -388,11 +398,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
> if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
> mask |= X86_BR_NO_TX;
>
> + if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
> + if (!x86_pmu.lbr_sel_map)
> + return -EOPNOTSUPP;
I am not sure checking lbr_sel_map here is enough. You need to
check if the CALL_STACK entry is populated, meaning the HW feature
exists.
> + if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
> + return -EINVAL;
> + mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
> + X86_BR_CALL_STACK;
Why have BR_RET here?
> + }
> +
> /*
> * stash actual user request into reg, it may
> * be used by fixup code for some CPU
> */
> event->hw.branch_reg.reg = mask;
> + return 0;
> }
>
> /*
> @@ -421,8 +441,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
> reg = &event->hw.branch_reg;
> reg->idx = EXTRA_REG_LBR;
>
> - /* LBR_SELECT operates in suppress mode so invert mask */
> - reg->config = ~mask & x86_pmu.lbr_sel_mask;
> + /*
> + * the first 8 bits (LBR_SEL_MASK) in LBR_SELECT operates
> + * in suppress mode so invert mask
> + */
> + reg->config = mask ^ x86_pmu.lbr_sel_mask;
>
> return 0;
> }
> @@ -440,7 +463,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
> /*
> * setup SW LBR filter
> */
> - intel_pmu_setup_sw_lbr_filter(event);
> + ret = intel_pmu_setup_sw_lbr_filter(event);
> + if (ret)
> + return ret;
>
> /*
> * setup HW LBR filter, if any
> @@ -695,6 +720,19 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
> [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
> };
>
> +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
> + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
> + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
> + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
> + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
> + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
> + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
> + | LBR_FAR,
> + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
> + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
> + | LBR_RETURN | LBR_CALL_STACK,
> +};
> +
> /* core */
> void intel_pmu_lbr_init_core(void)
> {
> @@ -751,6 +789,20 @@ void intel_pmu_lbr_init_snb(void)
> pr_cont("16-deep LBR, ");
> }
>
> +/* haswell */
> +void intel_pmu_lbr_init_hsw(void)
> +{
> + x86_pmu.lbr_nr = 16;
> + x86_pmu.lbr_tos = MSR_LBR_TOS;
> + x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
> + x86_pmu.lbr_to = MSR_LBR_NHM_TO;
> +
> + x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
> + x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
> +
> + pr_cont("16-deep LBR, ");
> +}
> +
> /* atom */
> void intel_pmu_lbr_init_atom(void)
> {
> --
> 1.8.4.2
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/