Re: [PATCH v1] kthread/smpboot: Serialize kthread parking against wakeup

From: Peter Zijlstra
Date: Tue Jun 05 2018 - 16:13:39 EST


On Tue, Jun 05, 2018 at 06:35:16PM +0200, Oleg Nesterov wrote:
> Yes, but this won't fix the race decribed by Kohli...

Clearly I'm not going strong today... and yes, looking at that again I
didn't address that.

> Plus this complicates the schedule() paths for the very special case

I checked, we already spilled @preempt onto the stack, context_switch()
is inlined so the extra argument is a no-op, if finish_task_switch()
also gets inlined (possible, could force it) the only additional cost is
the @preempt load from stack in the slow path. If it doesn't get inlined
we get the additional function call overhead, which should be minimal.

But yes, I wasn't a fan either.

> and to me it seems that all this kthread_park/unpark logic needs some
> serious cleanups...

I really hated how even with TASK_PARKED special the smpboot thread
could still get migrated, which is why I moved the completion. Do you
have any saner suggestions?

Humm, I suppose we could do wait_task_inactive() after
wait_for_completion(). I absolutely abhor wait_task_inactive(), but I
remember us both failing to fix that at least twice :/

Also, I think we still need TASK_PARKED as a special state for that.

How's the below?

---
include/linux/kthread.h | 1 -
include/linux/sched.h | 2 +-
kernel/kthread.c | 32 +++++++++++++++++++++++++-------
kernel/sched/core.c | 31 +++++++++++--------------------
4 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2803264c512f..c1961761311d 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -62,7 +62,6 @@ void *kthread_probe_data(struct task_struct *k);
int kthread_park(struct task_struct *k);
void kthread_unpark(struct task_struct *k);
void kthread_parkme(void);
-void kthread_park_complete(struct task_struct *k);

int kthreadd(void *unused);
extern struct task_struct *kthreadd_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 14e4f9c12337..4e32c1cc7794 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -117,7 +117,7 @@ struct task_group;
* the comment with set_special_state().
*/
#define is_special_task_state(state) \
- ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
+ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))

#define __set_current_state(state_value) \
do { \
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 481951bf091d..8f66a3dc767a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -177,12 +177,24 @@ void *kthread_probe_data(struct task_struct *task)
static void __kthread_parkme(struct kthread *self)
{
for (;;) {
- set_current_state(TASK_PARKED);
+ /*
+ * TASK_PARKED is a special state; we must serialize against
+ * possible pending wakeups to avoid store-store collisions on
+ * task->state.
+ *
+ * Such a collision might possibly result in the task state
+ * changin from TASK_PARKED and us failing the
+ * wait_task_inactive() in kthread_park().
+ */
+ set_special_state(TASK_PARKED);
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
break;
+
+ complete_all(&self->parked);
schedule();
}
__set_current_state(TASK_RUNNING);
+ reinit_completion(&self->parked);
}

void kthread_parkme(void)
@@ -191,11 +203,6 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);

-void kthread_park_complete(struct task_struct *k)
-{
- complete_all(&to_kthread(k)->parked);
-}
-
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@@ -459,8 +466,10 @@ void kthread_unpark(struct task_struct *k)
if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
__kthread_bind(k, kthread->cpu, TASK_PARKED);

- reinit_completion(&kthread->parked);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
+ */
wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k)
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);
+ /*
+ * Wait for __kthread_parkme() to complete(), this means we
+ * _will_ have TASK_PARKED and are about to call schedule().
+ */
wait_for_completion(&kthread->parked);
+ /*
+ * Now wait for that schedule() to complete and the task to
+ * get scheduled out.
+ */
+ WARN_ON_ONCE(!wait_task_inactive(p, TASK_PARKED));
}

return 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8d59b259af4a..cf72c4eed7da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,7 +7,6 @@
*/
#include "sched.h"

-#include <linux/kthread.h>
#include <linux/nospec.h>

#include <asm/switch_to.h>
@@ -2701,28 +2700,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
}
- if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
- switch (prev_state) {
- case TASK_DEAD:
- if (prev->sched_class->task_dead)
- prev->sched_class->task_dead(prev);
+ if (unlikely(prev_state == TASK_DEAD)) {
+ if (prev->sched_class->task_dead)
+ prev->sched_class->task_dead(prev);

- /*
- * Remove function-return probe instances associated with this
- * task and put them back on the free list.
- */
- kprobe_flush_task(prev);
-
- /* Task is done with its stack. */
- put_task_stack(prev);
+ /*
+ * Remove function-return probe instances associated with this
+ * task and put them back on the free list.
+ */
+ kprobe_flush_task(prev);

- put_task_struct(prev);
- break;
+ /* Task is done with its stack. */
+ put_task_stack(prev);

- case TASK_PARKED:
- kthread_park_complete(prev);
- break;
- }
+ put_task_struct(prev);
}

tick_nohz_task_switch();