Re: [PATCH 3/3] introduce task_rcu_dereference()

From: Peter Zijlstra
Date: Wed May 18 2016 - 13:02:29 EST



So I keep coming back to this, and every time I look at it my brain
explodes.

On Mon, Oct 27, 2014 at 08:54:46PM +0100, Oleg Nesterov wrote:
> +struct task_struct *task_rcu_dereference(struct task_struct **ptask)
> +{
> + struct task_struct *task;
> + struct sighand_struct *sighand;

I think that needs: ' = NULL', because if

> +
> + /*
> + * We need to verify that release_task() was not called and thus
> + * delayed_put_task_struct() can't run and drop the last reference
> + * before rcu_read_unlock(). We check task->sighand != NULL, but
> + * we can read the already freed and reused memory.
> + */
> + retry:
> + task = rcu_dereference(*ptask);
> + if (!task)
> + return NULL;
> +
> + probe_slab_address(&task->sighand, sighand);

this fails because of DEBUG_PAGE_ALLOC, we might not write to sighand at
all, and

> + /*
> + * Pairs with atomic_dec_and_test(usage) in put_task_struct(task).
> + * If this task was already freed we can not miss the preceding
> + * update of this pointer.
> + */
> + smp_rmb();
> + if (unlikely(task != ACCESS_ONCE(*ptask)))
> + goto retry;
> +
> + /*
> + * Either this is the same task and we can trust sighand != NULL, or
> + * its memory was re-instantiated as another instance of task_struct.
> + * In the latter case the new task can not go away until another rcu
> + * gp pass, so the only problem is that sighand == NULL can be false
> + * positive but we can pretend we got this NULL before it was freed.
> + */
> + if (sighand)

this will be inspecting random values on the stack.

> + return task;
> +
> + /*
> + * We could even eliminate the false positive mentioned above:
> + *
> + * probe_slab_address(&task->sighand, sighand);
> + * if (sighand)
> + * goto retry;
> + *
> + * if sighand != NULL because we read the freed memory we should
> + * see the new pointer, otherwise we will likely return this task.
> + */
> + return NULL;
> +}

This thing does more than rcu_dereference; because it also guarantees
that task->usage > 0 for the entire RCU section you do this under.
Because delayed_put_task_struct() will be delayed until at least the
matching rcu_read_unlock().


Should we instead create a primitive like try_get_task_struct() which
returns true if it got a reference? Because that is typically what
people end up doing..

A little like so?

---
include/linux/sched.h | 2 ++
kernel/exit.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 29 +++++++----------------------
3 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 38526b67e787..b2baa4af04e1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2132,6 +2132,8 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t);
}

+struct task_struct *try_get_task_struct(struct task_struct **ptask);
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void task_cputime(struct task_struct *t,
cputime_t *utime, cputime_t *stime);
diff --git a/kernel/exit.c b/kernel/exit.c
index fd90195667e1..a1670dfd587b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -210,6 +210,52 @@ void release_task(struct task_struct *p)
goto repeat;
}

+struct task_struct *try_get_task_struct(struct task_struct **ptask)
+{
+ struct sighand_struct *sighand = NULL;
+ struct task_struct *task = NULL;
+
+ rcu_read_lock();
+ /*
+ * We need to verify that release_task() was not called and thus
+ * delayed_put_task_struct() can't run and drop the last reference
+ * before rcu_read_unlock(). We check task->sighand != NULL,
+ * but we can read the already freed and reused memory.
+ */
+retry:
+ task = rcu_dereference(*ptask);
+ if (!task)
+ goto unlock;
+
+ probe_kernel_address(&task->sighand, sighand);
+
+ /*
+ * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
+ * was already freed we can not miss the preceding update of this
+ * pointer.
+ */
+ if (unlikely(task != READ_ONCE(*ptask)))
+ goto retry;
+
+ /*
+ * Either this is the same task and we can trust sighand != NULL, or
+ * its memory was re-instantiated as another instrance of task_struct.
+ * In the latter case the new task can not go away until another RCU
+ * GP pass, so the only problem is that sighand == NULL can be a false
+ * positive, but we can pretend we got this NULL before it was freed.
+ */
+ if (!sighand) {
+ task = NULL;
+ goto unlock;
+ }
+
+ get_task_struct(task);
+
+unlock:
+ rcu_read_unlock();
+ return task;
+}
+
/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..1d3a410c481b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1374,30 +1374,15 @@ static void task_numa_compare(struct task_numa_env *env,
int dist = env->dist;
bool assigned = false;

- rcu_read_lock();
-
- raw_spin_lock_irq(&dst_rq->lock);
- cur = dst_rq->curr;
- /*
- * No need to move the exiting task or idle task.
- */
- if ((cur->flags & PF_EXITING) || is_idle_task(cur))
- cur = NULL;
- else {
- /*
- * The task_struct must be protected here to protect the
- * p->numa_faults access in the task_weight since the
- * numa_faults could already be freed in the following path:
- * finish_task_switch()
- * --> put_task_struct()
- * --> __put_task_struct()
- * --> task_numa_free()
- */
- get_task_struct(cur);
+ cur = try_get_task_struct(&dst_rq->curr);
+ if (cur) {
+ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) {
+ put_task_struct(cur);
+ cur = NULL;
+ }
}

- raw_spin_unlock_irq(&dst_rq->lock);
-
+ rcu_read_lock();
/*
* Because we have preemption enabled we can get migrated around and
* end try selecting ourselves (current == env->p) as a swap candidate.