Re: [RFC][PATCH v2 5/5] sched: User Mode Concurrency Groups

From: Peter Zijlstra
Date: Tue Jan 25 2022 - 11:20:22 EST


On Mon, Jan 24, 2022 at 03:29:56PM +0100, Peter Zijlstra wrote:

> Oh how I hate signals... this can get scribbled by a syscall/fault from
> sigcontext :/

OK, the below seems to work. I'll see if I can clean it up some.

--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -94,28 +94,44 @@ static inline int syscall_get_arch(struc

#else /* CONFIG_X86_64 */

-static inline void syscall_get_arguments(struct task_struct *task,
- struct pt_regs *regs,
- unsigned long *args)
+static inline unsigned long
+syscall_get_argument(struct task_struct *task, struct pt_regs *regs, int nr)
{
-# ifdef CONFIG_IA32_EMULATION
+#ifdef CONFIG_IA32_EMULATION
if (task->thread_info.status & TS_COMPAT) {
- *args++ = regs->bx;
- *args++ = regs->cx;
- *args++ = regs->dx;
- *args++ = regs->si;
- *args++ = regs->di;
- *args = regs->bp;
+ switch (nr) {
+ case 0: return regs->bx;
+ case 1: return regs->cx;
+ case 2: return regs->dx;
+ case 3: return regs->si;
+ case 4: return regs->di;
+ case 5: return regs->bp;
+ }
} else
-# endif
+#endif
{
- *args++ = regs->di;
- *args++ = regs->si;
- *args++ = regs->dx;
- *args++ = regs->r10;
- *args++ = regs->r8;
- *args = regs->r9;
+ switch (nr) {
+ case 0: return regs->di;
+ case 1: return regs->si;
+ case 2: return regs->dx;
+ case 3: return regs->r10;
+ case 4: return regs->r8;
+ case 5: return regs->r9;
+ }
}
+
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+static inline void syscall_get_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned long *args)
+{
+ int i;
+
+ for (i = 0; i < 6; i++)
+ *args++ = syscall_get_argument(task, regs, i);
}

static inline int syscall_get_arch(struct task_struct *task)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1307,6 +1307,9 @@ struct task_struct {
struct task_struct *umcg_server;
struct umcg_task __user *umcg_server_task;
struct page *umcg_server_page;
+
+ unsigned long umcg_stack_pointer;
+ unsigned int umcg_worker;
#endif

struct tlbflush_unmap_batch tlb_ubc;
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -459,7 +459,7 @@ static int umcg_wait(u64 timo)
/*
* Blocked case for umcg_sys_exit(), shared with sys_umcg_ctl().
*/
-static void umcg_unblock_and_wait(void)
+static void umcg_unblock(void)
{
struct task_struct *tsk = current;
struct umcg_task __user *self = READ_ONCE(tsk->umcg_task);
@@ -478,15 +478,7 @@ static void umcg_unblock_and_wait(void)

umcg_unpin_pages();

- switch (umcg_wait(0)) {
- case 0:
- case -EINTR:
- /* notify_resume will continue the wait after the signal */
- break;
-
- default:
- UMCG_DIE("wait");
- }
+ /* notify-resume will wait */

tsk->flags |= PF_UMCG_WORKER;
}
@@ -509,7 +501,7 @@ void umcg_sys_exit(struct pt_regs *regs)
return;
}

- umcg_unblock_and_wait();
+ umcg_unblock();
}

/* return-to-user path */
@@ -518,11 +510,47 @@ void umcg_notify_resume(struct pt_regs *
struct task_struct *tsk = current;
struct umcg_task __user *self = tsk->umcg_task;
bool worker = tsk->flags & PF_UMCG_WORKER;
+ u64 timeout = 0;
u32 state;
+ int ret;
+
+ /*
+ * Unix signals are horrible, but we have to handle them somehow.
+ *
+ * - simply discarding a signal breaks userspace so is not an option.
+ *
+ * - returning -EINTR and have userspace deal with it is not an option
+ * since we can be blocked here due to !syscall reasons (page-faults
+ * for example). But it's also not permissible to have random
+ * syscalls return -EINTR that didn't before.
+ *
+ * - subjecting signal handlers to UMCG would render existing signal
+ * handler code subject to the whims and latencies of UMCG; given that
+ * most signal handler code is short and time sensitive, this seems
+ * undesirable (consider ^C not working because it got delivered to a
+ * blocked task).
+ *
+ * Therefore the chosen path is to exclude signal context from UMCG
+ * entirely and treat it as unmanaged time.
+ */
+ if (tsk->umcg_stack_pointer) {
+ if (tsk->umcg_stack_pointer != user_stack_pointer(regs))
+ return;
+
+ tsk->umcg_stack_pointer = 0;
+ worker = tsk->umcg_worker;
+ tsk->umcg_worker = 0;
+
+ if (worker) {
+ set_syscall_work(SYSCALL_UMCG);
+ /* and PF_UMCG_SYSCALL at done */
+ }
+ goto resume;
+ }

/* avoid recursion vs schedule() */
if (worker)
- current->flags &= ~PF_UMCG_WORKER;
+ tsk->flags &= ~PF_UMCG_WORKER;

if (get_user(state, &self->state))
UMCG_DIE("get-state");
@@ -554,10 +582,31 @@ void umcg_notify_resume(struct pt_regs *
umcg_unpin_pages();
}

- switch (umcg_wait(0)) {
+resume:
+ /*
+ * Hack alert! Since the return-to-user path must resume waiting it
+ * needs access to the timeout argument and set the return value.
+ */
+ if (syscall_get_nr(tsk, regs) == __NR_umcg_wait)
+ timeout = syscall_get_argument(tsk, regs, 1);
+
+ ret = umcg_wait(timeout);
+ switch (ret) {
case 0:
+ break;
+
case -EINTR:
/* we will resume the wait after the signal */
+ WARN_ON_ONCE(tsk->umcg_stack_pointer);
+ tsk->umcg_stack_pointer = user_stack_pointer(regs);
+ tsk->umcg_worker = worker;
+ clear_task_syscall_work(tsk, SYSCALL_UMCG);
+ /* implicitly clears PF_UMCG_WORKER with the early exit */
+ return;
+
+ case -ETIMEDOUT:
+ /* must be __NR_umcg_wait */
+ regs_set_return_value(regs, ret);
break;

default:
@@ -566,7 +615,7 @@ void umcg_notify_resume(struct pt_regs *

done:
if (worker)
- current->flags |= PF_UMCG_WORKER;
+ tsk->flags |= PF_UMCG_WORKER;
}

/**
@@ -755,16 +804,7 @@ SYSCALL_DEFINE2(umcg_wait, u32, flags, u

umcg_unpin_pages();

- ret = umcg_wait(timo);
- switch (ret) {
- case 0: /* all done */
- case -EINTR: /* umcg_notify_resume() will continue the wait */
- ret = 0;
- break;
-
- default:
- goto unblock;
- }
+ /* notify-resume will wait */
out:
if (worker)
tsk->flags |= PF_UMCG_WORKER;
@@ -831,7 +871,7 @@ static int umcg_register(struct umcg_tas
set_syscall_work(SYSCALL_UMCG); /* hook syscall */
set_thread_flag(TIF_UMCG); /* hook return-to-user */

- umcg_unblock_and_wait();
+ umcg_unblock();

} else {
if ((ut.state & (UMCG_TASK_MASK | UMCG_TF_MASK)) != UMCG_TASK_RUNNING)