Re: [PATCH V1 2/3] perf/x86/intel/bm.c: Add Intel Branch Monitoring support
From: Peter Zijlstra
Date: Mon Nov 13 2017 - 04:00:54 EST
On Sat, Nov 11, 2017 at 01:20:05PM -0800, Megha Dey wrote:
> Currently, the cannonlake family of Intel processors support the
> branch monitoring feature. Intel's Branch monitoring feature is trying
> to utilize heuristics to detect the occurrence of an ROP (Return
> Oriented Programming) attack.
>
> A perf-based kernel driver has been used to monitor the occurrence of
> one of the 6 branch monitoring events. There are 2 counters that each
> can select between one of these events for evaluation over a specified
> instruction window size (0 to 1023). For each counter, a threshold value
> (0 to 127) can be configured to set a point at which ROP detection event
> action is taken (determined by user-space). Each task can monitor
> a maximum of 2 events at any given time.
>
> Apart from window_size(global) and threshold(per-counter), various sysfs
> entries are provided for the user to configure: guest_disable, lbr_freeze,
> window_cnt_sel, cnt_and_mode (all global) and mispred_evt_cnt(per-counter).
> For all events belonging to the same task, the global parameters are
> shared.
Is there any sensible documentation on this except the MSR listings?
>
> Everytime a task is scheduled out, we save current window and count
> associated with the event being monitored. When the task is scheduled
> next, we start counting from previous count associated with this event.
> Thus, a full context switch in this case is not necessary.
What? That doesn't make any sense. The fact that we scheduled out and
then in again _is_ a full context switch, no?
>
> Signed-off-by: Megha Dey <megha.dey@xxxxxxxxxxxxxxx>
> Signed-off-by: Yu-Cheng Yu <yu-cheng.yu@xxxxxxxxx>
That SoB chain is buggered.
> +static int intel_bm_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
> +{
> + struct perf_event *event;
> + union bm_detect_status stat;
> + int i;
> + unsigned long x;
> +
> + rdmsrl(BR_DETECT_STATUS_MSR, stat.raw);
if (!stat.event)
return NMI_DONE;
saves you a whole bunch of indentation, no?
> +
> + if (stat.event) {
> + wrmsrl(BR_DETECT_STATUS_MSR, 0);
> + apic_write(APIC_LVTPC, APIC_DM_NMI);
> + /*
> + * Issue wake-up to corresponding polling event
> + */
> + x = stat.ctrl_hit;
> + for_each_set_bit(i, &x, BM_MAX_COUNTERS) {
> + event = current->thread.bm_counter_owner[i];
> + local64_inc(&event->count);
> + atomic_set(&event->hw.bm_poll, POLLIN);
> + event->pending_wakeup = 1;
> + irq_work_queue(&event->pending);
> + }
> + return NMI_HANDLED;
> + }
> + return NMI_DONE;
> +}
> +
> +/*
> + * Unmask the NMI bit of the local APIC the first time task is scheduled
> + * on a particular CPU.
> + */
> +static void intel_bm_unmask_nmi(void)
> +{
> + this_cpu_write(bm_unmask_apic, 0);
> +
> + if (!(this_cpu_read(bm_unmask_apic))) {
> + apic_write(APIC_LVTPC, APIC_DM_NMI);
> + this_cpu_inc(bm_unmask_apic);
> + }
> +}
What? Why?
> +static int intel_bm_event_add(struct perf_event *event, int mode)
> +{
> + union bm_detect_status cur_stat, prev_stat;
> +
> + WARN_ON(event->hw.id >= BM_MAX_COUNTERS);
> +
> + prev_stat.raw = local64_read(&event->hw.prev_count);
> +
> + /*
> + * Start counting from previous count associated with this event
> + */
> + rdmsrl(BR_DETECT_STATUS_MSR, cur_stat.raw);
> +
> + cur_stat.count[event->hw.id] = prev_stat.count[event->hw.id];
> + cur_stat.count_window = prev_stat.count_window;
> + wrmsrl(BR_DETECT_STATUS_MSR, cur_stat.raw);
Why are you writing back the value you read? Just to waste cycles?
> + wrmsrl(BR_DETECT_CONTROL_MSR, event->hw.bm_ctrl);
> +
> + intel_bm_unmask_nmi();
> +
> + wrmsrl(BR_DETECT_COUNTER_CONFIG_BASE + event->hw.id,
> + (event->hw.bm_counter_conf | 1));
Please use a named construct for that enable bit.
> +
> + return 0;
> +}
> +
> +static void intel_bm_event_update(struct perf_event *event)
> +{
> + union bm_detect_status cur_stat;
> +
> + rdmsrl(BR_DETECT_STATUS_MSR, cur_stat.raw);
> + local64_set(&event->hw.prev_count, (uint64_t)cur_stat.raw);
> +}
That looks wrong... the general point of update functions is to update
the count, the above does not in fact do that.
> +
> +static void intel_bm_event_del(struct perf_event *event, int flags)
> +{
> + WARN_ON(event->hw.id >= BM_MAX_COUNTERS);
> +
> + wrmsrl(BR_DETECT_COUNTER_CONFIG_BASE + event->hw.id,
> + (event->hw.bm_counter_conf & ~1));
Either that EN bit is part of the bm_counter_conf, in which case you
didn't need to add it in _add(), or it's not and you don't need to clear
it here. Make up your mind.
> +
> + intel_bm_event_update(event);
Except of course, that does not in fact update...
> +}
> +
> +static void intel_bm_event_destroy(struct perf_event *event)
> +{
> + bm_counter_owner[event->hw.id] = NULL;
> +}
> +
> +static DEFINE_MUTEX(bm_counter_mutex);
> +
> +static int intel_bm_event_init(struct perf_event *event)
> +{
> + u64 cfg;
> + int counter_to_use = -1, i;
> +
> + local64_set(&event->hw.prev_count, 0);
> +
> + /*
> + * Find a hardware counter for the target task
> + */
> + bm_counter_owner = event->hw.target->thread.bm_counter_owner;
> +
> + mutex_lock(&bm_counter_mutex);
> + for (i = 0; i < BM_MAX_COUNTERS; i++) {
> + if (bm_counter_owner[i] == NULL) {
> + counter_to_use = i;
> + bm_counter_owner[i] = event;
> + break;
> + }
> + }
> + mutex_unlock(&bm_counter_mutex);
> +
> + if (counter_to_use == -1)
> + return -EBUSY;
> +
> + event->hw.bm_ctrl = (bm_window_size << BM_WINDOW_SIZE_SHIFT) |
> + (bm_guest_disable << BM_GUEST_DISABLE_SHIFT) |
> + (bm_lbr_freeze << BM_LBR_FREEZE_SHIFT) |
> + (bm_window_cnt_sel << BM_WINDOW_CNT_SEL_SHIFT) |
> + (bm_cnt_and_mode << BM_CNT_AND_MODE_SHIFT) |
> + BM_ENABLE;
> + event->hw.bm_counter_conf = (bm_threshold << BM_THRESHOLD_SHIFT) |
> + (bm_mispred_evt_cnt << BM_MISPRED_EVT_CNT_SHIFT) |
> + (cfg << BM_EVENT_TYPE_SHIFT);
> +
> + event->hw.id = counter_to_use;
> + local64_set(&event->count, 0);
That is just a really ugly hack to work around:
> +static struct pmu intel_bm_pmu = {
> + .task_ctx_nr = perf_sw_context,
this. And you didn't bother to mention that atrocity in your Changelog.
NAK.