Re: [PATCH v3] sched/numa: fix unsafe get_task_struct() in task_numa_assign()
From: Oleg Nesterov
Date: Mon Oct 20 2014 - 10:51:40 EST
Kirill,
I leave this to you and Peter, but...
On 10/20, Kirill Tkhai wrote:
>
> @@ -259,10 +259,15 @@ void __init fork_init(unsigned long mempages)
> #ifndef ARCH_MIN_TASKALIGN
> #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES
> #endif
> - /* create a slab on which task_structs can be allocated */
> + /*
> + * Create a slab on which task_structs can be allocated.
> + * Note, we need SLAB_DESTROY_BY_RCU flag, when we access
> + * rq::curr under RCU read lock. See scheduler code.
> + */
> task_struct_cachep =
> kmem_cache_create("task_struct", sizeof(struct task_struct),
> - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
> + ARCH_MIN_TASKALIGN,
> + SLAB_PANIC | SLAB_NOTRACK | SLAB_DESTROY_BY_RCU, NULL);
to me this change needs more justification.
Again, perhaps we will need to change the lifetime rules for task_struct
anyway, if we have more problems like this. But until then this looks like
an overkill to me. Plus rq_curr_if_not_put() looks too subtle, and it is
not generic.
May be we should start with something simple and stupid?
(it seems we can remove rcu_read_lock() with this patch, but I am not
sure).
Oleg.
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1087,9 +1087,6 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
-
env->best_task = p;
env->best_imp = imp;
env->best_cpu = env->dst_cpu;
@@ -1139,6 +1136,18 @@ static bool load_too_imbalanced(long src_load, long dst_load,
return (imb > old_imb);
}
+struct task_struct *get_rq_curr(struct rq *rq)
+{
+ struct task_struct *curr;
+
+ raw_spin_lock_irq(&rq->lock);
+ curr = rq->curr;
+ get_task_struct(curr);
+ raw_spin_unlock_irq(&rq->lock);
+
+ return curr;
+}
+
/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1156,11 +1165,9 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
- rcu_read_lock();
- cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
- cur = NULL;
+ cur = get_rq_curr(dst_rq);
+ rcu_read_lock();
/*
* "imp" is the fault differential for the source task between the
* source and destination node. Calculate the total differential for
@@ -1235,6 +1242,7 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
+ put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1254,8 +1262,12 @@ balance:
assign:
task_numa_assign(env, cur, imp);
+ cur = NULL;
unlock:
rcu_read_unlock();
+
+ if (cur)
+ put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/