Re: [PATCH V6] perf: Reset the dirty counter to prevent the leak for an RDPMC task

From: Liang, Kan
Date: Wed May 12 2021 - 12:27:40 EST




On 5/12/2021 10:54 AM, Rob Herring wrote:
How about this one?
Would you mind splitting this into core and x86 parts?


Sure, I will split the patch.

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index c6fedd2..9052578 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1636,6 +1636,8 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
 		goto do_del;

+	__set_bit(event->hw.idx, cpuc->dirty);
+
 	/*
 	 * Not a TXN, therefore cleanup properly.
 	 */
@@ -2484,12 +2486,43 @@ static int x86_pmu_event_init(struct perf_event *event)
 	return err;
 }

+static void x86_pmu_clear_dirty_counters(void)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int i;
+
+	/* Don't need to clear the assigned counter. */
+	for (i = 0; i < cpuc->n_events; i++)
+		__clear_bit(cpuc->assign[i], cpuc->dirty);
+
+	if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
+		return;
+
+	for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
+		/* Metrics and fake events don't have corresponding HW counters. */
+		if (is_metric_idx(i) || (i == INTEL_PMC_IDX_FIXED_VLBR))
+			continue;
+		else if (i >= INTEL_PMC_IDX_FIXED)
+			wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
+		else
+			wrmsrl(x86_pmu_event_addr(i), 0);
+	}
+
+	bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
+}
+
 static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 {
 	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
 		return;

 	/*
+	 * Enable sched_task() for the RDPMC task.
+	 */
+	if (x86_pmu.sched_task && event->hw.target)
+		atomic_inc(&event->pmu->sched_cb_usages);
+
+	/*
 	 * This function relies on not being called concurrently in two
 	 * tasks in the same mm. Otherwise one task could observe
 	 * perf_rdpmc_allowed > 1 and return all the way back to
@@ -2507,10 +2540,12 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)

 static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
 {
-
 	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
 		return;

+	if (x86_pmu.sched_task && event->hw.target)
+		atomic_dec(&event->pmu->sched_cb_usages);
+
 	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
 		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
 }
@@ -2616,6 +2651,14 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
 static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
 	static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
+
+	/*
+	 * If a new task has the RDPMC enabled, clear the dirty counters
+	 * to prevent the potential leak.
+	 */
+	if (sched_in && ctx && READ_ONCE(x86_pmu.attr_rdpmc) &&
+	    current->mm && atomic_read(&current->mm->context.perf_rdpmc_allowed))
+		x86_pmu_clear_dirty_counters();
 }

 static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 10c8171..55bd891 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -229,6 +229,7 @@ struct cpu_hw_events {
 	 */
 	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		dirty[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	int			enabled;

 	int			n_events; /* the # of events in the below arrays */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c8a3388..3a85dbe 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -301,6 +301,9 @@ struct pmu {
 	/* number of address filters this PMU can do */
 	unsigned int			nr_addr_filters;

+	/* Track the per PMU sched_task() callback users */
+	atomic_t			sched_cb_usages;
To align with the per cpu one: s/usages/usage/


OK

I think we should be able to use refcount_t here instead?

I think they are the same for this case. Is there a particular reason for the change? Do they behave differently on ARM?
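
For reference, the counter here is just a plain up/down count that legitimately returns to zero and is bumped again from zero on the next mmap. A minimal sketch of the intended pattern (illustrative only, not part of the patch; the helper names are made up):

/*
 * Illustrative only: the per-PMU count goes 0 -> 1 -> 0 -> 1 as
 * RDPMC-capable events are mmapped and unmapped, i.e. it is a usage
 * count rather than an object lifetime.
 */
static atomic_t sched_cb_usages = ATOMIC_INIT(0);

static void rdpmc_event_mmapped(void)
{
	atomic_inc(&sched_cb_usages);	/* 0 -> 1: sched_task() needed */
}

static void rdpmc_event_unmapped(void)
{
	atomic_dec(&sched_cb_usages);	/* 1 -> 0: sched_task() not needed */
}

static bool rdpmc_sched_cb_needed(void)
{
	return atomic_read(&sched_cb_usages) != 0;
}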


+
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
 	 * as well as for lazy/batch writing of the MSRs.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1574b70..8216acc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3851,7 +3851,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 		cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	perf_event_sched_in(cpuctx, ctx, task);

-	if (cpuctx->sched_cb_usage && pmu->sched_task)
+	if (pmu->sched_task && (cpuctx->sched_cb_usage || atomic_read(&pmu->sched_cb_usages)))
For completeness, shouldn't this condition be added everywhere ->sched_task() can be called, perhaps with the exception of __perf_pmu_sched_task(), which is only called when the task context doesn't change?

In theory, it's harmless to add the check in the other places as well, because we also check it in the x86-specific code. But the extra checks would add some overhead, and I'd like to avoid any overhead on the context-switch path.

Since x86 is the only user of sched_task() for now, I prefer to add the check only here. I will add some comments to explain the reason.

If ARM needs it in the other places later, please feel free to add it.
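
For context, this is roughly how an RDPMC task reads a counter directly from user space; whatever value is left in a dirty counter is directly visible to such a task, which is the leak the patch closes. A minimal, hypothetical user-space sketch (no seqlock/offset handling, error checking omitted):

#include <stdint.h>
#include <linux/perf_event.h>

/*
 * Hypothetical reader: after mmap()ing a perf event fd, the first page
 * (struct perf_event_mmap_page) exposes the hardware counter index, so
 * the task can read the counter with RDPMC without entering the kernel.
 */
static inline uint64_t read_counter(volatile struct perf_event_mmap_page *pc)
{
	uint32_t idx = pc->index;	/* 0 means RDPMC is not usable */
	uint32_t lo, hi;

	if (!idx)
		return 0;

	/* RDPMC takes the counter index (index - 1) in ECX. */
	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (idx - 1));
	return ((uint64_t)hi << 32) | lo;
}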

Thanks,
Kan