Re: [RFC PATCH v2 4/5] sched: UMCG: add a blocked worker list

From: Peter Zijlstra
Date: Thu Jan 27 2022 - 10:38:20 EST


On Thu, Jan 13, 2022 at 03:39:39PM -0800, Peter Oskolkov wrote:

> This change introduces the following benefits:
> - block detection how behaves similarly to wake detection;
> without this patch worker wakeups added wakees to the list
> and woke the server, while worker blocks only woke the server
> without adding blocked workers to a list, forcing servers
> to explicitly check worker's state;

> - if the blocked worker woke sufficiently quickly, the server
> woken on the block event would observe its worker now as
> RUNNABLE, so the block event had to be inferred rather than
> explicitly signalled by the worker being added to the blocked
> worker list;

This I think is missing the point, there is no race if the server checks
curr->state == RUNNING.

> - it is now possible for a single server to control several
> RUNNING workers, which makes writing userspace schedulers
> simpler for smaller processes that do not need to scale beyond
> one "server";

How about something like so on top?

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1298,6 +1298,7 @@ struct task_struct {

#ifdef CONFIG_UMCG
/* setup by sys_umcg_ctrl() */
+ u32 umcg_flags;
clockid_t umcg_clock;
struct umcg_task __user *umcg_task;

--- a/include/uapi/linux/umcg.h
+++ b/include/uapi/linux/umcg.h
@@ -119,6 +119,8 @@ struct umcg_task {
*
* Readable/writable by both the kernel and the userspace: the
* kernel adds items to the list, userspace removes them.
+ *
+ * Only used with UMCG_CTL_MULTI.
*/
__u64 blocked_workers_ptr; /* r/w */

@@ -147,11 +149,13 @@ enum umcg_wait_flag {
* @UMCG_CTL_REGISTER: register the current task as a UMCG task
* @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task
* @UMCG_CTL_WORKER: register the current task as a UMCG worker
+ * @UMCG_CTL_MULTI: allow 1:n worker relations, enables blocked_workers_ptr
*/
enum umcg_ctl_flag {
UMCG_CTL_REGISTER = 0x00001,
UMCG_CTL_UNREGISTER = 0x00002,
UMCG_CTL_WORKER = 0x10000,
+ UMCG_CTL_MULTI = 0x20000,
};

#endif /* _UAPI_LINUX_UMCG_H */
--- a/kernel/sched/umcg.c
+++ b/kernel/sched/umcg.c
@@ -335,7 +335,7 @@ static inline int umcg_enqueue_runnable(
}

/*
- * Enqueue @tsk on it's server's blocked list
+ * Enqueue @tsk on it's server's blocked list OR ensure @tsk == server::next_tid
*
* Must be called in umcg_pin_pages() context, relies on tsk->umcg_server.
*
@@ -346,10 +346,34 @@ static inline int umcg_enqueue_runnable(
* Returns:
* 0: success
* -EFAULT
+ * -ESRCH server::next_tid is not a valid UMCG task
+ * -EINVAL server::next_tid doesn't match @tsk
*/
static inline int umcg_enqueue_blocked(struct task_struct *tsk)
{
- return umcg_enqueue(tsk, true /* blocked */);
+ struct task_struct *next;
+ u32 next_tid;
+ int ret;
+
+ if (tsk->umcg_server->umcg_flags & UMCG_CTL_MULTI)
+ return umcg_enqueue(tsk, true /* blocked */);
+
+ /*
+ * When !MULTI, ensure this worker is the current worker,
+ * ensuring the 1:1 relation.
+ */
+ if (get_user(next_tid, &tsk->umcg_server_task->next_tid))
+ return -EFAULT;
+
+ next = umcg_get_task(next_tid);
+ if (!next)
+ return -ESRCH;
+
+ ret = (next == tsk) ? 0 : -EINVAL;
+
+ put_task_struct(next);
+
+ return ret;
}

/* pre-schedule() */
@@ -911,6 +934,8 @@ static int umcg_register(struct umcg_tas
return -EINVAL;
}

+ current->umcg_flags = flags;
+
if (current->umcg_task || !self)
return -EINVAL;

@@ -1061,7 +1086,7 @@ SYSCALL_DEFINE3(umcg_ctl, u32, flags, st

flags &= ~UMCG_CTL_CMD;

- if (flags & ~(UMCG_CTL_WORKER))
+ if (flags & ~(UMCG_CTL_WORKER|UMCG_CTL_MULTI))
return -EINVAL;

switch (cmd) {