Re: [RFC][PATCH v2 5/5] sched: User Mode Concurrency Groups

From: Peter Zijlstra
Date: Mon Jan 24 2022 - 09:00:21 EST


On Fri, Jan 21, 2022 at 12:47:58PM +0100, Peter Zijlstra wrote:
> On Thu, Jan 20, 2022 at 04:55:22PM +0100, Peter Zijlstra wrote:
>
> > +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, timo)
> > +{
> > + struct task_struct *tsk = current;
> > + struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
> > + bool worker = tsk->flags & PF_UMCG_WORKER;
> > + int ret;
> > +
> > + if (!self || flags)
> > + return -EINVAL;
> > +
> > + if (worker) {
> > + tsk->flags &= ~PF_UMCG_WORKER;
> > + if (timo)
> > + return -ERANGE;
> > + }
> > +
> > + /* see umcg_sys_{enter,exit}() syscall exceptions */
> > + ret = umcg_pin_pages();
> > + if (ret)
> > + goto unblock;
> > +
> > + /*
> > + * Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE.
> > + */
> > + ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE);
> > + if (ret)
> > + goto unpin;
> > +
> > + ret = umcg_wake_next(tsk, self);
> > + if (ret)
> > + goto unpin;
> > +
> > + if (worker) {
> > + /*
> > + * If this fails it is possible ::next_tid is already running
> > + * while this task is not going to block. This violates our
> > + * constraints.
> > + *
> > + * That said, pretty much the only way to make this fail is by
> > + * force munmap()'ing things. In which case one is most welcome
> > + * to the pieces.
> > + */
> > + ret = umcg_enqueue_and_wake(tsk);
> > + if (ret)
> > + goto unpin;
> > + }
> > +
> > + umcg_unpin_pages();
> > +
> > + ret = umcg_wait(timo);
> > + switch (ret) {
> > + case 0: /* all done */
> > + case -EINTR: /* umcg_notify_resume() will continue the wait */
>
> So I was playing with the whole worker timeout thing last night and
> realized this is broken. If we get a signal while we have a timeout, the
> timeout gets lost.
>
> I think the easiest solution is to have umcg_notify_resume() also resume
> the timeout, but the first pass of that was yuck, so I need to try
> again.
>
> Related, by moving the whole enqueue-and-wake thing into the timeout
> path, we get more 'fun' failure cases :-(

This is the best I can come up with... but it's a hot mess :-(

Still, let me go try this.
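
For the record, the worker side of a timed wait then ends up looking
roughly like the below -- sketch only, helper name made up, the preceding
transition to RUNNABLE + UMCG_TF_COND_WAIT is not shown, and the syscall
number is whatever the headers in this series provide:

#include <unistd.h>
#include <errno.h>
#include <sys/syscall.h>

/* timed wait for a worker; @timo is the absolute timeout */
static long worker_timed_wait(unsigned long long timo)
{
	long ret = syscall(__NR_umcg_wait, 0, timo);

	if (!ret)
		return 0;		/* a server made us RUNNING again */

	if (errno == ETIMEDOUT) {
		/*
		 * Per the patch below the kernel already enqueued us on
		 * ::runnable_workers_ptr and woke the server, so a server
		 * will eventually pick us back up.
		 */
		return -ETIMEDOUT;
	}

	/*
	 * Signals do not show up here; umcg_notify_resume() continues
	 * the wait, now including the timeout.
	 */
	return -errno;
}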

---

--- a/include/uapi/linux/umcg.h
+++ b/include/uapi/linux/umcg.h
@@ -127,6 +127,14 @@ struct umcg_task {
} __attribute__((packed, aligned(UMCG_TASK_ALIGN)));

/**
+ * enum umcg_wait_flag - flags to pass to sys_umcg_wait
+ * @UMCG_WAIT_ENQUEUE: Enqueue the task on runnable_workers_ptr before waiting
+ */
+enum umcg_wait_flag {
+ UMCG_WAIT_ENQUEUE = 0x0001,
+};
+
+/**
* enum umcg_ctl_flag - flags to pass to sys_umcg_ctl
* @UMCG_CTL_REGISTER: register the current task as a UMCG task
* @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -227,7 +227,6 @@ static int umcg_update_state(struct task

#define UMCG_DIE(reason) __UMCG_DIE(,reason)
#define UMCG_DIE_PF(reason) __UMCG_DIE(pagefault_enable(), reason)
-#define UMCG_DIE_UNPIN(reason) __UMCG_DIE(umcg_unpin_pages(), reason)

/* Called from syscall enter path and exceptions that can schedule */
void umcg_sys_enter(struct pt_regs *regs, long syscall)
@@ -371,15 +370,23 @@ static int umcg_enqueue_runnable(struct

static int umcg_enqueue_and_wake(struct task_struct *tsk)
{
- int ret;
-
- ret = umcg_enqueue_runnable(tsk);
+ int ret = umcg_enqueue_runnable(tsk);
if (!ret)
ret = umcg_wake_server(tsk);

return ret;
}

+static int umcg_pin_enqueue_and_wake(struct task_struct *tsk)
+{
+ int ret = umcg_pin_pages();
+ if (!ret) {
+ ret = umcg_enqueue_and_wake(tsk);
+ umcg_unpin_pages();
+ }
+ return ret;
+}
+
/*
* umcg_wait: Wait for ->state to become RUNNING
*
@@ -469,16 +476,11 @@ static void umcg_unblock_and_wait(void)
/* avoid recursion vs schedule() */
tsk->flags &= ~PF_UMCG_WORKER;

- if (umcg_pin_pages())
- UMCG_DIE("pin");
-
if (umcg_update_state(tsk, self, UMCG_TASK_BLOCKED, UMCG_TASK_RUNNABLE))
- UMCG_DIE_UNPIN("state");
+ UMCG_DIE("state");

- if (umcg_enqueue_and_wake(tsk))
- UMCG_DIE_UNPIN("enqueue-wake");
-
- umcg_unpin_pages();
+ if (umcg_pin_enqueue_and_wake(tsk))
+ UMCG_DIE("pin-enqueue-wake");

switch (umcg_wait(0)) {
case 0:
@@ -544,18 +546,13 @@ void umcg_notify_resume(struct pt_regs *
goto done;

if (state & UMCG_TF_PREEMPT) {
- if (umcg_pin_pages())
- UMCG_DIE("pin");
-
if (umcg_update_state(tsk, self,
UMCG_TASK_RUNNING,
UMCG_TASK_RUNNABLE))
- UMCG_DIE_UNPIN("state");
+ UMCG_DIE("state");

- if (umcg_enqueue_and_wake(tsk))
- UMCG_DIE_UNPIN("enqueue-wake");
-
- umcg_unpin_pages();
+ if (umcg_pin_enqueue_and_wake(tsk))
+ UMCG_DIE("pin-enqueue-wake");
}

if (WARN_ON_ONCE(timeout && syscall_get_nr(tsk, regs) != __NR_umcg_wait))
@@ -570,6 +567,13 @@ void umcg_notify_resume(struct pt_regs *

case -ETIMEDOUT:
regs_set_return_value(regs, ret);
+ if (worker) {
+ ret = umcg_pin_enqueue_and_wake(tsk);
+ if (ret) {
+ umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING);
+ regs_set_return_value(regs, ret);
+ }
+ }
break;

default:
@@ -710,7 +714,6 @@ static int umcg_wake_next(struct task_st
* Returns:
* 0 - OK;
* -ETIMEDOUT - the timeout expired;
- * -ERANGE - the timeout is out of range (worker);
* -EAGAIN - ::state wasn't RUNNABLE, concurrent wakeup;
* -EFAULT - failed accessing struct umcg_task __user of the current
* task, the server or next;
@@ -725,48 +728,40 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u
bool worker = tsk->flags & PF_UMCG_WORKER;
int ret;

- if (!self || flags)
+ if (!self || (flags & ~(UMCG_WAIT_ENQUEUE)))
return -EINVAL;

- if (worker) {
- tsk->flags &= ~PF_UMCG_WORKER;
- if (timo)
- return -ERANGE;
- }
+ if ((flags & UMCG_WAIT_ENQUEUE) && (timo || !worker))
+ return -EINVAL;

- /* see umcg_sys_{enter,exit}() syscall exceptions */
- ret = umcg_pin_pages();
- if (ret)
- goto unblock;
+ if (worker)
+ tsk->flags &= ~PF_UMCG_WORKER;

/*
* Clear UMCG_TF_COND_WAIT *and* check state == RUNNABLE.
*/
ret = umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNABLE);
if (ret)
- goto unpin;
+ goto unblock;

ret = umcg_wake_next(tsk, self);
if (ret)
- goto unpin;
+ goto unblock;

- if (worker) {
+ if (flags & UMCG_WAIT_ENQUEUE) {
/*
* If this fails it is possible ::next_tid is already running
* while this task is not going to block. This violates our
* constraints.
*
- * That said, pretty much the only way to make this fail is by
- * force munmap()'ing things. In which case one is most welcome
- * to the pieces.
+ * Userspace can detect this case by looking at: ::next_tid &
+ * TID_RUNNING.
*/
- ret = umcg_enqueue_and_wake(tsk);
+ ret = umcg_pin_enqueue_and_wake(tsk);
if (ret)
- goto unpin;
+ goto unblock;
}

- umcg_unpin_pages();
-
ret = umcg_wait(timo);
switch (ret) {
case 0: /* all done */
@@ -775,6 +770,26 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u
ret = 0;
break;

+ case -ETIMEDOUT:
+ if (worker) {
+ /*
+ * See the UMCG_WAIT_ENQUEUE case above; except this is
+ * even more complicated because we *did* wait and
+ * things might have progressed lots.
+ *
+ * Still, abort the wait because otherwise nobody would
+ * ever find us again. Hopefully userspace can make
+ * sense of things.
+ */
+ ret = umcg_pin_enqueue_and_wake(tsk);
+ if (ret)
+ goto unblock;
+
+ ret = -ETIMEDOUT;
+ break;
+ }
+ goto unblock;
+
default:
goto unblock;
}
@@ -783,8 +798,6 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u
tsk->flags |= PF_UMCG_WORKER;
return ret;

-unpin:
- umcg_unpin_pages();
unblock:
umcg_update_state(tsk, self, UMCG_TASK_RUNNABLE, UMCG_TASK_RUNNING);
goto out;
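
With that, what a worker currently gets implicitly on every sys_umcg_wait()
becomes an explicit UMCG_WAIT_ENQUEUE request. A rough sketch of the
resulting call site (again, made-up helper name, same caveats as the
sketch above; the flag must not be combined with a timeout):

#include <unistd.h>
#include <errno.h>
#include <sys/syscall.h>
#include <linux/umcg.h>

/* cooperative yield: enqueue self, kick the server, wait for RUNNING */
static long worker_yield(void)
{
	long ret = syscall(__NR_umcg_wait, UMCG_WAIT_ENQUEUE, 0);

	/*
	 * 0       : a server made us RUNNING again
	 * -EAGAIN : concurrent wakeup, ::state wasn't RUNNABLE
	 * -EFAULT : enqueue-and-wake failed; ::next_tid & TID_RUNNING
	 *           tells whether the next task was already woken
	 */
	return ret ? -errno : 0;
}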