[PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode

From: Stephane Eranian
Date: Tue Oct 27 2015 - 15:25:35 EST

Next message: Rob Herring: "Re: [PATCH v8 6/8] scsi: ufs: make the UFS variant a platform device"
Previous message: Javier Martinez Canillas: "Re: [PATCH] get_maintainer: Don't fallback to git by default"
Next in thread: Eric Dumazet: "Re: [PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

This patch eliminates all known RCU violations detected
by the RCU checker (PROVE_RCU). The impacted code paths
were all related to cgroup mode monitoring and involved
accessing a task's cgrp.

V2 is updated to include suggestions from PeterZ to eliminate
some of the warnings without taking rcu_read_lock(), because
we know we are already holding the ctx->lock, which prevents
the cgroup from disappearing while we are accessing it.
The trick, as suggested by Peter, is to modify
perf_cgroup_from_task() to take an extra boolean parameter
to allow bypassing the lockdep test in the task_subsys_cstate()
macros. This patch uses this approach to update all calls to
perf_cgroup_from_task().
V2 Patch relative to:
8b3c8e6 Revert "rculist: Make list_entry_rcu() use lockless_dereference()"

Signed-off-by: Stephane Eranian <eranian@xxxxxxxxxx>
---
arch/x86/kernel/cpu/perf_event_intel_cqm.c | 2 +-
include/linux/perf_event.h | 4 ++--
kernel/events/core.c | 27 +++++++++++++++++----------
3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 377e8f8..d96bbf1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
if (event->attach_state & PERF_ATTACH_TASK)
- return perf_cgroup_from_task(event->hw.target);
+ return perf_cgroup_from_task(event->hw.target, false);

return event->cgrp;
}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d841d33..24f3539 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -697,9 +697,9 @@ struct perf_cgroup {
* if there is no cgroup event for the current CPU context.
*/
static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
+perf_cgroup_from_task(struct task_struct *task, bool safe)
{
- return container_of(task_css(task, perf_event_cgrp_id),
+ return container_of(task_css_check(task, perf_event_cgrp_id, safe),
struct perf_cgroup, css);
}
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ea02109..2003240 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -435,7 +435,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
if (!is_cgroup_event(event))
return;

- cgrp = perf_cgroup_from_task(current);
+ /* holding ctx->lock, so cgroup access is safe */
+ cgrp = perf_cgroup_from_task(current, true);
/*
* Do not update time when cgroup is not active
*/
@@ -458,7 +459,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
if (!task || !ctx->nr_cgroups)
return;

- cgrp = perf_cgroup_from_task(task);
+ /* holding ctx->lock, so cgroup access is safe */
+ cgrp = perf_cgroup_from_task(task, true);
info = this_cpu_ptr(cgrp->info);
info->timestamp = ctx->timestamp;
}
@@ -489,7 +491,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
* we reschedule only in the presence of cgroup
* constrained events.
*/
- rcu_read_lock();

list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -523,7 +524,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
* event_filter_match() to not have to pass
* task around
*/
- cpuctx->cgrp = perf_cgroup_from_task(task);
+ cpuctx->cgrp = perf_cgroup_from_task(task, false);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
perf_pmu_enable(cpuctx->ctx.pmu);
@@ -531,8 +532,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
}
}

- rcu_read_unlock();
-
local_irq_restore(flags);
}

@@ -542,17 +541,18 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
struct perf_cgroup *cgrp1;
struct perf_cgroup *cgrp2 = NULL;

+ rcu_read_lock();
/*
* we come here when we know perf_cgroup_events > 0
*/
- cgrp1 = perf_cgroup_from_task(task);
+ cgrp1 = perf_cgroup_from_task(task, false);

/*
* next is NULL when called from perf_event_enable_on_exec()
* that will systematically cause a cgroup_switch()
*/
if (next)
- cgrp2 = perf_cgroup_from_task(next);
+ cgrp2 = perf_cgroup_from_task(next, false);

/*
* only schedule out current cgroup events if we know
@@ -561,6 +561,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
*/
if (cgrp1 != cgrp2)
perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+
+ rcu_read_unlock();
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
@@ -569,13 +571,14 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
struct perf_cgroup *cgrp1;
struct perf_cgroup *cgrp2 = NULL;

+ rcu_read_lock();
/*
* we come here when we know perf_cgroup_events > 0
*/
- cgrp1 = perf_cgroup_from_task(task);
+ cgrp1 = perf_cgroup_from_task(task, false);

/* prev can never be NULL */
- cgrp2 = perf_cgroup_from_task(prev);
+ cgrp2 = perf_cgroup_from_task(prev, false);

/*
* only need to schedule in cgroup events if we are changing
@@ -584,6 +587,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
*/
if (cgrp1 != cgrp2)
perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+
+ rcu_read_unlock();
}

static inline int perf_cgroup_connect(int fd, struct perf_event *event,
@@ -9442,7 +9447,9 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
+ rcu_read_lock();
perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+ rcu_read_unlock();
return 0;
}

--
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Rob Herring: "Re: [PATCH v8 6/8] scsi: ufs: make the UFS variant a platform device"
Previous message: Javier Martinez Canillas: "Re: [PATCH] get_maintainer: Don't fallback to git by default"
Next in thread: Eric Dumazet: "Re: [PATCH v2] perf/core: fix RCU issues with cgroup monitoring mode"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]