Re: [PATCH 3/5 v0.6] sched/umcg: RFC: implement UMCG syscalls

From: Tao Zhou
Date: Sun Sep 19 2021 - 14:26:58 EST


Hi Peter,

On Fri, Sep 17, 2021 at 11:03:21AM -0700, Peter Oskolkov wrote:
> Define struct umcg_task and two syscalls: sys_umcg_ctl sys_umcg_wait.
>
> All key operations, such as wait/wake/context-switch, as well as
> timeouts and block/wake detection, are working quite reliably.
>
> In addition, the userspace can now force the kernel to preempt
> a running worker by changing its state from RUNNING to
> RUNNING | PREEMPTED and sending a signal to it. This new functionality
> is less well tested than the key operations above, but is working
> well in the common case of the worker busy in the userspace.
>
> These big things remain to be addressed (in no particular order):
> - tracing/debugging
> - faster context switches (see umcg_do_context_switch in umcg.c)
> - other architectures (we will need at least arm64 in addition to amd64)
> - tools/lib/umcg for userspace
> - kselftests
>
> I'm working on finalizing libumcg and kselftests.
>
> See Documentation/userspace-api/umcg.[txt|rst] for API usage and
> other details.
>
> v0.5->v0.6 changes:
> - umcg_task pages are now pinned for RUNNING workers;
> - waking workers now wait for the userspace to schedule them
> in exit_to_user_mode_loop() instead of in sched_update_worker();
> - added umcg_clear_child to fork and execve;
> - changed current->umcg_task assignments to WRITE_ONCE;
> - server/worker interactions are restricted to tasks in the same mm;
>
> v0.4->v0.5 changes:
> - handling idle workers and servers is now much simpler on the kernel
> side, thanks to Thierry Delisle's suggestion:
> https://lore.kernel.org/lkml/3530714d-125b-e0f5-45b2-72695e2fc4ee@xxxxxxxxxxxx/
> - minor tweaks to improve preemption handling;
>
> v0.3->v0.4 changes:
> - removed server_tid and api_version fields from struct umcg_task;
> - added timeout handling to sys_umcg_wait();
> - implemented worker preemption via signals;
> - handling idle workers and servers is changed again (see umcg.rst).
>
> v0.2->v0.3 changes:
> - the overall approach is now based on peterz@'s suggestion in
> https://lore.kernel.org/patchwork/cover/1433967/
> (should I add Suggested-by?)
> - new protocol for working with idle workers and servers is used, to avoid
> spinning in the kernel;
> - waking a UMCG task now does not require spinning.
>
> Signed-off-by: Peter Oskolkov <posk@xxxxxxxxxx>
> ---
> arch/x86/entry/syscalls/syscall_64.tbl | 2 +
> fs/exec.c | 1 +
> include/linux/sched.h | 56 ++
> include/linux/syscalls.h | 4 +
> include/uapi/asm-generic/unistd.h | 8 +-
> include/uapi/linux/umcg.h | 117 ++++
> init/Kconfig | 10 +
> kernel/entry/common.c | 1 +
> kernel/exit.c | 2 +
> kernel/sched/Makefile | 1 +
> kernel/sched/core.c | 15 +-
> kernel/sched/umcg.c | 745 +++++++++++++++++++++++++
> kernel/sys_ni.c | 4 +
> 13 files changed, 963 insertions(+), 3 deletions(-)
> create mode 100644 include/uapi/linux/umcg.h
> create mode 100644 kernel/sched/umcg.c
>
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index ce18119ea0d0..0c6c7fd72b0b 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -368,6 +368,8 @@
> 444 common landlock_create_ruleset sys_landlock_create_ruleset
> 445 common landlock_add_rule sys_landlock_add_rule
> 446 common landlock_restrict_self sys_landlock_restrict_self
> +447 common umcg_ctl sys_umcg_ctl
> +448 common umcg_wait sys_umcg_wait
>
> #
> # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/fs/exec.c b/fs/exec.c
> index 18594f11c31f..d652ef8017b2 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1835,6 +1835,7 @@ static int bprm_execve(struct linux_binprm *bprm,
> current->fs->in_exec = 0;
> current->in_execve = 0;
> rseq_execve(current);
> + umcg_execve(current);
> acct_update_integrals(current);
> task_numa_free(current, false);
> return retval;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 549018e46801..4cf9070d1361 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -66,6 +66,7 @@ struct sighand_struct;
> struct signal_struct;
> struct task_delay_info;
> struct task_group;
> +struct umcg_task;
>
> /*
> * Task state bitmask. NOTE! These bits are also
> @@ -1230,6 +1231,12 @@ struct task_struct {
> unsigned long rseq_event_mask;
> #endif
>
> +#ifdef CONFIG_UMCG
> + struct umcg_task __user *umcg_task;
> + struct page *pinned_umcg_worker_page; /* self */
> + struct page *pinned_umcg_server_page;
> +#endif
> +
> struct tlbflush_unmap_batch tlb_ubc;
>
> union {
> @@ -1606,6 +1613,7 @@ extern struct pid *cad_pid;
> #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
> #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
> #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
> +#define PF_UMCG_WORKER 0x01000000 /* UMCG worker */
> #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
> #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
> #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
> @@ -2191,6 +2199,54 @@ static inline void rseq_execve(struct task_struct *t)
>
> #endif
>
> +#ifdef CONFIG_UMCG
> +
> +void umcg_handle_resuming_worker(void);
> +void umcg_handle_exiting_worker(void);
> +void umcg_clear_child(struct task_struct *tsk);
> +
> +/* Called by bprm_execve() in fs/exec.c. */
> +static inline void umcg_execve(struct task_struct *tsk)
> +{
> + if (tsk->umcg_task)
> + umcg_clear_child(tsk);
> +}
> +
> +/* Called by exit_to_user_mode_loop() in kernel/entry/common.c.*/
> +static inline void umcg_handle_notify_resume(void)
> +{
> + if (current->flags & PF_UMCG_WORKER)
> + umcg_handle_resuming_worker();
> +}
> +
> +/* Called by do_exit() in kernel/exit.c. */
> +static inline void umcg_handle_exit(void)
> +{
> + if (current->flags & PF_UMCG_WORKER)
> + umcg_handle_exiting_worker();
> +}
> +
> +/*
> + * umcg_wq_worker_[sleeping|running] are called in core.c by
> + * sched_submit_work() and sched_update_worker().
> + */
> +void umcg_wq_worker_sleeping(struct task_struct *tsk);
> +void umcg_wq_worker_running(struct task_struct *tsk);
> +
> +#else
> +
> +static inline void umcg_execve(struct task_struct *tsk)
> +{
> +}
> +static inline void umcg_handle_notify_resume(void)
> +{
> +}
> +static inline void umcg_handle_exit(void)
> +{
> +}
> +
> +#endif
> +
> #ifdef CONFIG_DEBUG_RSEQ
>
> void rseq_syscall(struct pt_regs *regs);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 050511e8f1f8..f3e1ef8d842f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -71,6 +71,7 @@ struct open_how;
> struct mount_attr;
> struct landlock_ruleset_attr;
> enum landlock_rule_type;
> +struct umcg_task;
>
> #include <linux/types.h>
> #include <linux/aio_abi.h>
> @@ -1050,6 +1051,9 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _
> asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type,
> const void __user *rule_attr, __u32 flags);
> asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
> +asmlinkage long sys_umcg_ctl(u32 flags, struct umcg_task __user *self);
> +asmlinkage long sys_umcg_wait(u32 flags, u64 abs_timeout);
> +
>
> /*
> * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 6de5a7fc066b..1a4c9ac0e296 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -873,8 +873,14 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
> #define __NR_landlock_restrict_self 446
> __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
>
> +#define __NR_umcg_ctl 447
> +__SYSCALL(__NR_umcg_ctl, sys_umcg_ctl)
> +#define __NR_umcg_wait 448
> +__SYSCALL(__NR_umcg_wait, sys_umcg_wait)
> +
> +
> #undef __NR_syscalls
> -#define __NR_syscalls 447
> +#define __NR_syscalls 449
>
> /*
> * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/umcg.h b/include/uapi/linux/umcg.h
> new file mode 100644
> index 000000000000..edce804781f9
> --- /dev/null
> +++ b/include/uapi/linux/umcg.h
> @@ -0,0 +1,117 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_UMCG_H
> +#define _UAPI_LINUX_UMCG_H
> +
> +#include <linux/limits.h>
> +#include <linux/types.h>
> +
> +/*
> + * UMCG: User Managed Concurrency Groups.
> + *
> + * Syscalls (see kernel/sched/umcg.c):
> + * sys_umcg_ctl() - register/unregister UMCG tasks;
> + * sys_umcg_wait() - wait/wake/context-switch.
> + *
> + * struct umcg_task (below): controls the state of UMCG tasks.
> + *
> + * See Documentation/userspace-api/umcg.[txt|rst] for details.
> + */
> +
> +/*
> + * UMCG task states, the first 8 bits. The states represent the user space
> + * point of view.
> + */
> +#define UMCG_TASK_NONE 0
> +#define UMCG_TASK_RUNNING 1
> +#define UMCG_TASK_IDLE 2
> +#define UMCG_TASK_BLOCKED 3
> +
> +/* The first byte: RUNNING, IDLE, or BLOCKED. */
> +#define UMCG_TASK_STATE_MASK 0xff
> +
> +/* UMCG task state flags, bits 8-15 */
> +
> +/*
> + * UMCG_TF_LOCKED: locked by the userspace in preparation to calling umcg_wait.
> + */
> +#define UMCG_TF_LOCKED (1 << 8)
> +
> +/*
> + * UMCG_TF_PREEMPTED: the userspace indicates the worker should be preempted.
> + */
> +#define UMCG_TF_PREEMPTED (1 << 9)
> +
> +/**
> + * struct umcg_task - controls the state of UMCG tasks.
> + *
> + * The struct is aligned at 64 bytes to ensure that it fits into
> + * a single cache line.
> + */
> +struct umcg_task {
> + /**
> + * @state: the current state of the UMCG task described by this struct.
> + *
> + * Readable/writable by both the kernel and the userspace.
> + *
> + * UMCG task state:
> + * bits 0 - 7: task state;
> + * bits 8 - 15: state flags;
> + * bits 16 - 23: reserved; must be zeroes;
> + * bits 24 - 31: for userspace use.
> + */
> + uint32_t state; /* r/w */
> +
> + /**
> + * @next_tid: the TID of the UMCG task that should be context-switched
> + * into in sys_umcg_wait(). Can be zero.
> + *
> + * Running UMCG workers must have next_tid set to point to IDLE
> + * UMCG servers.
> + *
> + * Read-only for the kernel, read/write for the userspace.
> + */
> + uint32_t next_tid; /* r */
> +
> + /**
> + * @idle_workers_ptr: a single-linked list of idle workers. Can be NULL.
> + *
> + * Readable/writable by both the kernel and the userspace: the
> + * kernel adds items to the list, the userspace removes them.
> + */
> + uint64_t idle_workers_ptr; /* r/w */
> +
> + /**
> + * @idle_server_tid_ptr: a pointer pointing to a single idle server.
> + * Readonly.
> + */
> + uint64_t idle_server_tid_ptr; /* r */
> +} __attribute__((packed, aligned(8 * sizeof(__u64))));
> +
> +/**
> + * enum umcg_ctl_flag - flags to pass to sys_umcg_ctl
> + * @UMCG_CTL_REGISTER: register the current task as a UMCG task
> + * @UMCG_CTL_UNREGISTER: unregister the current task as a UMCG task
> + * @UMCG_CTL_WORKER: register the current task as a UMCG worker
> + */
> +enum umcg_ctl_flag {
> + UMCG_CTL_REGISTER = 0x00001,
> + UMCG_CTL_UNREGISTER = 0x00002,
> + UMCG_CTL_WORKER = 0x10000,
> +};
> +
> +/**
> + * enum umcg_wait_flag - flags to pass to sys_umcg_wait
> + * @UMCG_WAIT_WAKE_ONLY: wake @self->next_tid, don't put @self to sleep;
> + * @UMCG_WAIT_WF_CURRENT_CPU: wake @self->next_tid on the current CPU
> + * (use WF_CURRENT_CPU); @UMCG_WAIT_WAKE_ONLY
> + * must be set.
> + */
> +enum umcg_wait_flag {
> + UMCG_WAIT_WAKE_ONLY = 1,
> + UMCG_WAIT_WF_CURRENT_CPU = 2,
> +};
> +
> +/* See Documentation/userspace-api/umcg.[txt|rst].*/
> +#define UMCG_IDLE_NODE_PENDING (1ULL)
> +
> +#endif /* _UAPI_LINUX_UMCG_H */
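
As a side note, to check my understanding of the state word layout above:
here is how I imagine the userspace requests preemption of a RUNNING worker,
per the cover letter (a sketch of mine, not from this patch; the use of
SYS_tgkill and the choice of SIGURG are my assumptions):

#include <signal.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/umcg.h>

static bool umcg_request_preempt(struct umcg_task *worker, pid_t worker_tid)
{
        uint32_t expected = UMCG_TASK_RUNNING;

        /* RUNNING -> RUNNING | PREEMPTED, as the cover letter describes. */
        if (!atomic_compare_exchange_strong(
                        (_Atomic uint32_t *)&worker->state, &expected,
                        UMCG_TASK_RUNNING | UMCG_TF_PREEMPTED))
                return false;   /* not RUNNING: nothing to preempt */

        /* Any signal kicks the worker into the kernel, where the
         * PREEMPTED flag is acted upon; SIGURG is an arbitrary choice. */
        return syscall(SYS_tgkill, getpid(), worker_tid, SIGURG) == 0;
}
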
> diff --git a/init/Kconfig b/init/Kconfig
> index a61c92066c2e..c15a50a61ba6 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1662,6 +1662,16 @@ config MEMBARRIER
>
> If unsure, say Y.
>
> +config UMCG
> + bool "Enable User Managed Concurrency Groups API"
> + depends on X86_64
> + default n
> + help
> + Enable User Managed Concurrency Groups API, which form the basis
> + for an in-process M:N userspace scheduling framework.
> + At the moment this is an experimental/RFC feature that is not
> + guaranteed to be backward-compatible.
> +
> config KALLSYMS
> bool "Load all symbols for debugging/ksymoops" if EXPERT
> default y
> diff --git a/kernel/entry/common.c b/kernel/entry/common.c
> index bf16395b9e13..f3cd335ab513 100644
> --- a/kernel/entry/common.c
> +++ b/kernel/entry/common.c
> @@ -173,6 +173,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
>
> if (ti_work & _TIF_NOTIFY_RESUME) {
> tracehook_notify_resume(regs);
> + umcg_handle_notify_resume(); /* might sleep */
> rseq_handle_notify_resume(NULL, regs);
> }
>
> diff --git a/kernel/exit.c b/kernel/exit.c
> index fd1c04193e18..fdd4e923cca9 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -744,6 +744,8 @@ void __noreturn do_exit(long code)
> if (unlikely(!tsk->pid))
> panic("Attempted to kill the idle task!");
>
> + umcg_handle_exit();
> +
> /*
> * If do_exit is called because this processes oopsed, it's possible
> * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
> index 978fcfca5871..e4e481eee1b7 100644
> --- a/kernel/sched/Makefile
> +++ b/kernel/sched/Makefile
> @@ -37,3 +37,4 @@ obj-$(CONFIG_MEMBARRIER) += membarrier.o
> obj-$(CONFIG_CPU_ISOLATION) += isolation.o
> obj-$(CONFIG_PSI) += psi.o
> obj-$(CONFIG_SCHED_CORE) += core_sched.o
> +obj-$(CONFIG_UMCG) += umcg.o
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 12a9d053e724..c9133cf153b9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4159,6 +4159,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
> p->wake_entry.u_flags = CSD_TYPE_TTWU;
> p->migration_pending = NULL;
> #endif
> +#ifdef CONFIG_UMCG
> + umcg_clear_child(p);
> +#endif
> }
>
> DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
> @@ -6105,10 +6108,14 @@ static inline void sched_submit_work(struct task_struct *tsk)
> * in the possible wakeup of a kworker and because wq_worker_sleeping()
> * requires it.
> */
> - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
> + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
> preempt_disable();
> if (task_flags & PF_WQ_WORKER)
> wq_worker_sleeping(tsk);
> +#ifdef CONFIG_UMCG
> + else if (task_flags & PF_UMCG_WORKER)
> + umcg_wq_worker_sleeping(tsk);
> +#endif
> else
> io_wq_worker_sleeping(tsk);
> preempt_enable_no_resched();
> @@ -6127,9 +6134,13 @@ static inline void sched_submit_work(struct task_struct *tsk)
>
> static void sched_update_worker(struct task_struct *tsk)
> {
> - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
> + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_UMCG_WORKER)) {
> if (tsk->flags & PF_WQ_WORKER)
> wq_worker_running(tsk);
> +#ifdef CONFIG_UMCG
> + else if (tsk->flags & PF_UMCG_WORKER)
> + umcg_wq_worker_running(tsk);
> +#endif
> else
> io_wq_worker_running(tsk);
> }
> diff --git a/kernel/sched/umcg.c b/kernel/sched/umcg.c
> new file mode 100644
> index 000000000000..aa4dbb31c425
> --- /dev/null
> +++ b/kernel/sched/umcg.c
> @@ -0,0 +1,745 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +/*
> + * User Managed Concurrency Groups (UMCG).
> + *
> + * See Documentation/userspace-api/umcg.[txt|rst] for details.
> + */
> +
> +#include <linux/syscalls.h>
> +#include <linux/types.h>
> +#include <linux/uaccess.h>
> +#include <linux/umcg.h>
> +
> +#include "sched.h"
> +#include "umcg_uaccess.h"
> +
> +/**
> + * umcg_pin_pages: pin pages containing struct umcg_task of this worker
> + * and its server.
> + *
> + * The pages are pinned when the worker exits to the userspace and unpinned
> + * when the worker is in sched_submit_work(), i.e. when the worker is
> + * about to be removed from its runqueue. Thus at most NR_CPUS UMCG pages
> + * are pinned at any one time across the whole system.
> + */
> +static int umcg_pin_pages(u32 server_tid)
> +{
> + struct umcg_task __user *worker_ut = current->umcg_task;
> + struct umcg_task __user *server_ut = NULL;
> + struct task_struct *tsk;
> +
> + rcu_read_lock();
> + tsk = find_task_by_vpid(server_tid);
> + if (tsk)
> + server_ut = READ_ONCE(tsk->umcg_task);
> + rcu_read_unlock();
> +
> + if (!server_ut)
> + return -EINVAL;
> +
> + if (READ_ONCE(current->mm) != READ_ONCE(tsk->mm))
> + return -EINVAL;
> +
> + tsk = current;
> +
> + /* worker_ut is stable, don't need to repin */
> + if (!tsk->pinned_umcg_worker_page)
> + if (1 != pin_user_pages_fast((unsigned long)worker_ut, 1, 0,
> + &tsk->pinned_umcg_worker_page))
> + return -EFAULT;
> +
> + /* server_ut may change, need to repin */
> + if (tsk->pinned_umcg_server_page) {
> + unpin_user_page(tsk->pinned_umcg_server_page);
> + tsk->pinned_umcg_server_page = NULL;
> + }
> +
> + if (1 != pin_user_pages_fast((unsigned long)server_ut, 1, 0,
> + &tsk->pinned_umcg_server_page))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static void umcg_unpin_pages(void)
> +{
> + struct task_struct *tsk = current;
> +
> + if (tsk->pinned_umcg_worker_page)
> + unpin_user_page(tsk->pinned_umcg_worker_page);
> + if (tsk->pinned_umcg_server_page)
> + unpin_user_page(tsk->pinned_umcg_server_page);
> +
> + tsk->pinned_umcg_worker_page = NULL;
> + tsk->pinned_umcg_server_page = NULL;
> +}
> +
> +static void umcg_clear_task(struct task_struct *tsk)
> +{
> + /*
> + * This is either called for the current task, or for a newly forked
> + * task that is not yet running, so we don't need strict atomicity
> + * below.
> + */
> + if (tsk->umcg_task) {
> + WRITE_ONCE(tsk->umcg_task, NULL);
> +
> + /* These can be simple writes - see the comment above. */
> + tsk->pinned_umcg_worker_page = NULL;
> + tsk->pinned_umcg_server_page = NULL;
> + tsk->flags &= ~PF_UMCG_WORKER;
> + }
> +}
> +
> +/* Called for a forked or execve-ed child. */
> +void umcg_clear_child(struct task_struct *tsk)
> +{
> + umcg_clear_task(tsk);
> +}
> +
> +/* Called both by normally (unregister) and abnormally exiting workers. */
> +void umcg_handle_exiting_worker(void)
> +{
> + umcg_unpin_pages();
> + umcg_clear_task(current);
> +}
> +
> +/**
> + * sys_umcg_ctl: (un)register the current task as a UMCG task.
> + * @flags: ORed values from enum umcg_ctl_flag; see below;
> + * @self: a pointer to struct umcg_task that describes this
> + * task and governs the behavior of sys_umcg_wait if
> + * registering; must be NULL if unregistering.
> + *
> + * @flags & UMCG_CTL_REGISTER: register a UMCG task:
> + * UMCG workers:
> + * - self->state must be UMCG_TASK_IDLE

The code below checks UMCG_TASK_BLOCKED, but the comment says UMCG_TASK_IDLE.

The doc says: '+When registering a worker, self->state must be BLOCKED;'

So the comment here needs to be updated to match.

> + * - @flags & UMCG_CTL_WORKER
> + * UMCG servers:
> + * - self->state must be UMCG_TASK_RUNNING
> + * - !(@flags & UMCG_CTL_WORKER)
> + *
> + * All tasks:
> + * - self->next_tid must be zero
> + *
> + * If the conditions above are met, sys_umcg_ctl() immediately returns
> + * if the registered task is a server; a worker will be added to
> + * idle_workers_ptr, and the worker put to sleep; an idle server
> + * from idle_server_tid_ptr will be woken, if present.
> + *
> + * @flags == UMCG_CTL_UNREGISTER: unregister a UMCG task. If the current task
> + * is a UMCG worker, the userspace is responsible for waking its
> + * server (before or after calling sys_umcg_ctl).
> + *
> + * Return:
> + * 0 - success
> + * -EFAULT - failed to read @self
> + * -EINVAL - some other error occurred
> + */
> +SYSCALL_DEFINE2(umcg_ctl, u32, flags, struct umcg_task __user *, self)
> +{
> + struct umcg_task ut;
> +
> + if (flags == UMCG_CTL_UNREGISTER) {
> + if (self || !current->umcg_task)
> + return -EINVAL;
> +
> + if (current->flags & PF_UMCG_WORKER)
> + umcg_handle_exiting_worker();
> + else
> + umcg_clear_task(current);
> +
> + return 0;
> + }
> +
> + /* Register the current task as a UMCG task. */
> + if (!(flags & UMCG_CTL_REGISTER))
> + return -EINVAL;
> +
> + flags &= ~UMCG_CTL_REGISTER;
> + if (flags && flags != UMCG_CTL_WORKER)
> + return -EINVAL;
> +
> + if (current->umcg_task || !self)
> + return -EINVAL;
> +
> + if (copy_from_user(&ut, self, sizeof(ut)))
> + return -EFAULT;
> +
> + if (ut.next_tid)
> + return -EINVAL;
> +
> + if (flags == UMCG_CTL_WORKER) {
> + if (ut.state != UMCG_TASK_BLOCKED)
> + return -EINVAL;

Here is where the code checks UMCG_TASK_BLOCKED.

> +
> + WRITE_ONCE(current->umcg_task, self);
> + current->flags |= PF_UMCG_WORKER;
> +
> + set_tsk_need_resched(current);
> + return 0;
> + }
> +
> + /* This is a server task. */
> + if (ut.state != UMCG_TASK_RUNNING)
> + return -EINVAL;
> +
> + WRITE_ONCE(current->umcg_task, self);
> + return 0;
> +}
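
For what it's worth, here is the registration sequence as I read it from
the code above (my own sketch; I assume __NR_umcg_ctl is 447 per the
syscall table, and omit the idle_workers_ptr/idle_server_tid_ptr wiring):

#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/umcg.h>

#ifndef __NR_umcg_ctl
#define __NR_umcg_ctl 447
#endif

static struct umcg_task server_ut; /* the struct forces 64-byte alignment */
static struct umcg_task worker_ut;

static long register_server(void)
{
        memset(&server_ut, 0, sizeof(server_ut));
        server_ut.state = UMCG_TASK_RUNNING;  /* servers register as RUNNING */
        return syscall(__NR_umcg_ctl, UMCG_CTL_REGISTER, &server_ut);
}

static long register_worker(void)
{
        memset(&worker_ut, 0, sizeof(worker_ut));
        worker_ut.state = UMCG_TASK_BLOCKED;  /* as checked here, not IDLE */
        /* worker_ut.idle_workers_ptr must point at the list head before
         * this call, or enqueue_idle_worker() below fails; omitted here. */
        return syscall(__NR_umcg_ctl, UMCG_CTL_REGISTER | UMCG_CTL_WORKER,
                       &worker_ut);
}
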
> +
> +/**
> + * handle_timedout_worker - make sure the worker is added to idle_workers
> + * upon a "clean" timeout.
> + */
> +static int handle_timedout_worker(struct umcg_task __user *self)
> +{
> + u32 prev_state, next_state;
> + int ret;
> +
> + if (get_user(prev_state, &self->state))
> + return -EFAULT;
> +
> + if ((prev_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_IDLE) {
> + /* TODO: should we care here about TF_LOCKED or TF_PREEMPTED? */
> +
> + next_state = prev_state & ~UMCG_TASK_STATE_MASK;
> + next_state |= UMCG_TASK_BLOCKED;
> +
> + ret = cmpxchg_user_32(&self->state, &prev_state, next_state);
> + if (ret)
> + return ret;
> +
> + return -ETIMEDOUT;
> + }
> +
> + return 0; /* Not really timed out. */
> +}
> +
> +/**
> + * umcg_idle_loop - sleep until the current task becomes RUNNING or a timeout
> + * @abs_timeout - absolute timeout in nanoseconds; zero => no timeout
> + *
> + * The function marks the current task as INTERRUPTIBLE and calls
> + * schedule(). It returns when either the timeout expires or
> + * the UMCG state of the task becomes RUNNING.
> + *
> + * Note: because UMCG workers should not be running WITHOUT attached servers,
> + * and because servers should not be running WITH attached workers,
> + * the function returns only on fatal signal pending and ignores/flushes
> + * all other signals.
> + */
> +static int umcg_idle_loop(u64 abs_timeout)
> +{
> + int ret;
> + struct hrtimer_sleeper timeout;
> + struct umcg_task __user *self = current->umcg_task;
> +
> + if (abs_timeout) {
> + hrtimer_init_sleeper_on_stack(&timeout, CLOCK_REALTIME,
> + HRTIMER_MODE_ABS);
> +
> + hrtimer_set_expires_range_ns(&timeout.timer, (s64)abs_timeout,
> + current->timer_slack_ns);
> + }
> +
> + while (true) {
> + u32 umcg_state;
> +
> + set_current_state(TASK_INTERRUPTIBLE);
> +
> + smp_mb(); /* Order with set_current_state() above. */

set_current_state() already implies an smp_mb(), so this explicit barrier
looks redundant. I have not thought it through fully, but if a barrier is
needed at all, I feel it should be placed above set_current_state() to
order with the __set_current_state() below.
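
For reference, the canonical wait-side pattern in
Documentation/memory-barriers.txt relies solely on the barrier implied by
set_current_state() and adds no explicit smp_mb():

        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE); /* implies smp_mb() */
                if (READ_ONCE(cond))    /* 'cond' is a stand-in */
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);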

> + ret = -EFAULT;
> + if (get_user(umcg_state, &self->state)) {
> + set_current_state(TASK_RUNNING);
> + goto out;
> + }
> +
> + ret = 0;
> + if ((umcg_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_RUNNING) {
> + set_current_state(TASK_RUNNING);
> + goto out;
> + }
> +
> + if (abs_timeout)
> + hrtimer_sleeper_start_expires(&timeout, HRTIMER_MODE_ABS);
> +
> + if (!abs_timeout || timeout.task) {
> + /*
> + * Clear PF_UMCG_WORKER to elide workqueue handlers.
> + */
> + const bool worker = current->flags & PF_UMCG_WORKER;
> +
> + if (worker)
> + current->flags &= ~PF_UMCG_WORKER;
> +
> + /*
> + * Note: freezable_schedule() here is not appropriate
> + * as umcg_idle_loop can be called from rwsem locking
> + * context (via workqueue handlers), which may
> + * trigger a lockdep warning for mmap_lock.
> + */
> + schedule();
> +
> + if (worker)
> + current->flags |= PF_UMCG_WORKER;
> + }
> + __set_current_state(TASK_RUNNING);
> +
> + /*
> + * Check for timeout before checking the state, as workers
> + * are not going to return from schedule() unless
> + * they are RUNNING.
> + */
> + ret = -ETIMEDOUT;
> + if (abs_timeout && !timeout.task)
> + goto out;
> +
> + ret = -EFAULT;
> + if (get_user(umcg_state, &self->state))
> + goto out;
> +
> + ret = 0;
> + if ((umcg_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_RUNNING)
> + goto out;
> +
> + ret = -EINTR;
> + if (fatal_signal_pending(current))
> + goto out;
> +
> + if (signal_pending(current))
> + flush_signals(current);
> + }
> +
> +out:
> + if (abs_timeout) {
> + hrtimer_cancel(&timeout.timer);
> + destroy_hrtimer_on_stack(&timeout.timer);
> + }
> +
> + /* Workers must go through workqueue handlers upon wakeup. */
> + if (current->flags & PF_UMCG_WORKER) {
> + if (ret == -ETIMEDOUT)
> + ret = handle_timedout_worker(self);
> +
> + set_tsk_need_resched(current);
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * Try to wake up. May be called with preempt_disable set. May be called
> + * cross-process.
> + *
> + * Note: umcg_ttwu succeeds even if ttwu fails: see wait/wake state
> + * ordering logic.
> + */
> +static int umcg_ttwu(u32 next_tid, int wake_flags)
> +{
> + struct task_struct *next;
> +
> + rcu_read_lock();
> + next = find_task_by_vpid(next_tid);
> + if (!next || !(READ_ONCE(next->umcg_task))) {
> + rcu_read_unlock();
> + return -ESRCH;
> + }
> +
> + /* Note: next does not necessarily share mm with current. */
> +
> + try_to_wake_up(next, TASK_NORMAL, wake_flags); /* Result ignored. */
> + rcu_read_unlock();
> +
> + return 0;
> +}
> +
> +/*
> + * At the moment, umcg_do_context_switch simply wakes up @next with
> + * WF_CURRENT_CPU and puts the current task to sleep. May be called cross-mm.
> + *
> + * In the future an optimization will be added to adjust runtime accounting
> + * so that from the kernel scheduling perspective the two tasks are
> + * essentially treated as one. In addition, the context switch may be performed
> + * right here on the fast path, instead of going through the wake/wait pair.
> + */
> +static int umcg_do_context_switch(u32 next_tid, u64 abs_timeout)
> +{
> + struct task_struct *next;
> +
> + rcu_read_lock();
> + next = find_task_by_vpid(next_tid);
> + if (!next) {
> + rcu_read_unlock();
> + return -ESRCH;
> + }
> +
> + /* Note: next does not necessarily share mm with current. */
> +
> + /* TODO: instead of wake + sleep, do a context switch. */
> + try_to_wake_up(next, TASK_NORMAL, WF_CURRENT_CPU); /* Result ignored. */
> + rcu_read_unlock();
> +
> + return umcg_idle_loop(abs_timeout);
> +}
> +
> +/**
> + * sys_umcg_wait: put the current task to sleep and/or wake another task.
> + * @flags: zero or a value from enum umcg_wait_flag.
> + * @abs_timeout: when to wake the task, in nanoseconds; zero for no timeout.
> + *
> + * @self->state must be UMCG_TASK_IDLE (where @self is current->umcg_task)
> + * if !(@flags & UMCG_WAIT_WAKE_ONLY).
> + *
> + * If @self->next_tid is not zero, it must point to an IDLE UMCG task.
> + * The userspace must have changed its state from IDLE to RUNNING
> + * before calling sys_umcg_wait() in the current task. This "next"
> + * task will be woken (context-switched-to on the fast path) when the
> + * current task is put to sleep.
> + *
> + * See Documentation/userspace-api/umcg.[txt|rst] for details.
> + *
> + * Return:
> + * 0 - OK;
> + * -ETIMEDOUT - the timeout expired;
> + * -EFAULT - failed accessing struct umcg_task __user of the current
> + * task;
> + * -ESRCH - the task to wake not found or not a UMCG task;
> + * -EINVAL - another error happened (e.g. bad @flags, or the current
> + * task is not a UMCG task, etc.)
> + */
> +SYSCALL_DEFINE2(umcg_wait, u32, flags, u64, abs_timeout)
> +{
> + struct umcg_task __user *self = current->umcg_task;
> + u32 next_tid;
> +
> + if (!self)
> + return -EINVAL;
> +
> + if (get_user(next_tid, &self->next_tid))
> + return -EFAULT;
> +
> + if (flags & UMCG_WAIT_WAKE_ONLY) {
> + if (!next_tid || abs_timeout)
> + return -EINVAL;
> +
> + flags &= ~UMCG_WAIT_WAKE_ONLY;
> + if (flags & ~UMCG_WAIT_WF_CURRENT_CPU)
> + return -EINVAL;
> +
> + return umcg_ttwu(next_tid, flags & UMCG_WAIT_WF_CURRENT_CPU ?
> + WF_CURRENT_CPU : 0);
> + }
> +
> + /* Unlock the worker, if locked. */
> + if (current->flags & PF_UMCG_WORKER) {
> + u32 umcg_state;
> +
> + if (get_user(umcg_state, &self->state))
> + return -EFAULT;
> +
> + if ((umcg_state & UMCG_TF_LOCKED) && cmpxchg_user_32(
> + &self->state, &umcg_state,
> + umcg_state & ~UMCG_TF_LOCKED))
> + return -EFAULT;
> + }
> +
> + if (next_tid)
> + return umcg_do_context_switch(next_tid, abs_timeout);
> +
> + return umcg_idle_loop(abs_timeout);
> +}
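
To make sure I follow the wait path: a worker yielding back to its server
would look roughly like this (my sketch, ignoring the TF_LOCKED protocol;
__NR_umcg_wait is 448 per the table, and the userspace must already have
flipped the server IDLE -> RUNNING as required above):

#include <stdatomic.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/umcg.h>

#ifndef __NR_umcg_wait
#define __NR_umcg_wait 448
#endif

static long worker_yield(struct umcg_task *self, uint32_t server_tid)
{
        uint32_t expected = UMCG_TASK_RUNNING;

        self->next_tid = server_tid;    /* context-switch target */

        /* Not UMCG_WAIT_WAKE_ONLY, so @self->state must be IDLE. */
        if (!atomic_compare_exchange_strong(
                        (_Atomic uint32_t *)&self->state, &expected,
                        UMCG_TASK_IDLE))
                return -1;

        return syscall(__NR_umcg_wait, 0, 0ULL); /* no flags, no timeout */
}
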
> +
> +/*
> + * NOTE: all code below is called from workqueue submit/update, or
> + * syscall exit to usermode loop, so all errors result in the
> + * termination of the current task (via SIGKILL).
> + */
> +
> +/* Returns true on success, false on _any_ error. */
> +static bool mark_server_running(u32 server_tid, bool may_sleep)
> +{
> + struct umcg_task __user *ut_server = NULL;
> + u32 state = UMCG_TASK_IDLE;
> + struct task_struct *tsk;
> +
> + rcu_read_lock();
> + tsk = find_task_by_vpid(server_tid);
> + if (tsk)
> + ut_server = READ_ONCE(tsk->umcg_task);
> + rcu_read_unlock();
> +
> + if (!ut_server)
> + return false;
> +
> + if (READ_ONCE(current->mm) != READ_ONCE(tsk->mm))
> + return false;
> +
> + if (may_sleep)
> + return !cmpxchg_user_32(&ut_server->state, &state, UMCG_TASK_RUNNING);
> +
> + return !cmpxchg_user_32_nosleep(&ut_server->state, &state, UMCG_TASK_RUNNING);
> +}
> +
> +/*
> + * Called by sched_submit_work() for UMCG workers from within preempt_disable()
> + * context. In the common case, the worker's state changes RUNNING => BLOCKED,
> + * and its server's state changes IDLE => RUNNING, and the server is ttwu-ed.
> + *
> + * Under some conditions (e.g. the worker is "locked", see
> + * /Documentation/userspace-api/umcg.[txt|rst] for more details), the
> + * function does nothing.
> + */
> +static void __umcg_wq_worker_sleeping(struct task_struct *tsk)
> +{
> + struct umcg_task __user *ut_worker = tsk->umcg_task;
> + u32 prev_state, next_state, server_tid;
> + bool preempted = false;
> + int ret;
> +
> + if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker"))
> + return;
> +
> + /* Sometimes "locked" workers run without servers. */
> + if (unlikely(!tsk->pinned_umcg_server_page))
> + return;
> +
> + smp_mb(); /* The userspace may change the state concurrently. */
> + if (get_user_nosleep(prev_state, &ut_worker->state))
> + goto die; /* EFAULT */
> +
> + if (prev_state & UMCG_TF_LOCKED)
> + return;
> +
> + if ((prev_state & UMCG_TASK_STATE_MASK) != UMCG_TASK_RUNNING)
> + return; /* the worker is in umcg_wait */
> +
> +retry_once:
> + next_state = prev_state & ~UMCG_TASK_STATE_MASK;
> + next_state |= UMCG_TASK_BLOCKED;
> + preempted = prev_state & UMCG_TF_PREEMPTED;
> +
> + ret = cmpxchg_user_32_nosleep(&ut_worker->state, &prev_state, next_state);
> + if (ret == -EAGAIN) {
> + if (preempted)
> + goto die; /* Preemption can only happen once. */
> +
> + if (prev_state != (UMCG_TASK_RUNNING | UMCG_TF_PREEMPTED))
> + goto die; /* Only preemption can happen. */
> +
> + preempted = true;
> + goto retry_once;
> + }
> + if (ret)
> + goto die; /* EFAULT */
> +
> + if (get_user_nosleep(server_tid, &ut_worker->next_tid))
> + goto die; /* EFAULT */
> +
> + if (!server_tid)
> + return; /* Waking a waiting worker leads here. */
> +
> + /* The idle server's wait may timeout. */
> + /* TODO: make a smarter context switch below when available. */
> + if (mark_server_running(server_tid, false))
> + umcg_ttwu(server_tid, WF_CURRENT_CPU);
> +
> + return;
> +
> +die:
> + pr_warn("umcg_wq_worker_sleeping: killing task %d\n", current->pid);
> + force_sig(SIGKILL);
> +}
> +
> +/* Called from sched_submit_work() with preempt_disable. */
> +void umcg_wq_worker_sleeping(struct task_struct *tsk)
> +{
> + __umcg_wq_worker_sleeping(tsk);
> + umcg_unpin_pages();
> +}
> +
> +/**
> + * enqueue_idle_worker - push an idle worker onto idle_workers_ptr list/stack.

This function is interesting enough to attract even an idiot like me.

> + * Returns true on success, false on a fatal failure.
> + *
> + * See Documentation/userspace-api/umcg.[txt|rst] for details.
> + */
> +static bool enqueue_idle_worker(struct umcg_task __user *ut_worker)
> +{
> + u64 __user *node = &ut_worker->idle_workers_ptr;
> + u64 __user *head_ptr;
> + u64 first = (u64)node;
> + u64 head;
> +
> + if (get_user(head, node) || !head)
> + return false;
> +
> + head_ptr = (u64 __user *)head;
> +
> + if (put_user(UMCG_IDLE_NODE_PENDING, node))
> + return false;
> +
> + if (xchg_user_64(head_ptr, &first))
> + return false;
> +
> + if (put_user(first, node))
> + return false;
> +
> + return true;
> +}
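
And here is how I picture the userspace consumer of this list (my reading
of the protocol only; 'idle_workers_head' and 'on_idle_worker' are made-up
names, the head is assumed to start at 0, and each worker's
idle_workers_ptr must be re-pointed at the head before it runs again):

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <linux/umcg.h>

static _Atomic uint64_t idle_workers_head; /* what idle_workers_ptr points at */

extern void on_idle_worker(struct umcg_task *ut); /* made-up callback */

static void drain_idle_workers(void)
{
        /* Detach the whole stack in one shot. */
        uint64_t node = atomic_exchange(&idle_workers_head, 0ULL);

        while (node) {
                uint64_t next;

                /* Spin while the kernel is still linking this node in. */
                while ((next = atomic_load(
                                (_Atomic uint64_t *)(uintptr_t)node)) ==
                                UMCG_IDLE_NODE_PENDING)
                        ;

                on_idle_worker((struct umcg_task *)(uintptr_t)
                               (node - offsetof(struct umcg_task,
                                                idle_workers_ptr)));
                node = next;
        }
}
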
> +
> +/**
> + * get_idle_server - retrieve an idle server, if present.
> + *
> + * Returns true on success, false on a fatal failure.
> + */
> +static bool get_idle_server(struct umcg_task __user *ut_worker, u32 *server_tid)
> +{
> + u64 server_tid_ptr;
> + u32 tid;
> + int ret;
> +
> + *server_tid = 0; /* Empty result is OK. */
> +
> + if (get_user(server_tid_ptr, &ut_worker->idle_server_tid_ptr))
> + return false;
> +
> + if (!server_tid_ptr)
> + return false;
> +
> + tid = 0;
> + ret = xchg_user_32((u32 __user *)server_tid_ptr, &tid);
> +
> + if (ret)
> + return false;
> +
> + if (tid && mark_server_running(tid, true))
> + *server_tid = tid;
> +
> + return true;
> +}
> +
> +/*
> + * Returns true to wait for the userspace to schedule this worker, false
> + * to return to the userspace. In the common case, enqueues the worker
> + * to idle_workers_ptr list and wakes the idle server (if present).
> + */
> +static bool process_waking_worker(struct task_struct *tsk, u32 *server_tid)
> +{
> + struct umcg_task __user *ut_worker = tsk->umcg_task;
> + u32 prev_state, next_state;
> + int ret = 0;
> +
> + *server_tid = 0;
> +
> + if (WARN_ONCE((tsk != current) || !ut_worker, "Invalid umcg worker"))
> + return false;
> +
> + if (fatal_signal_pending(tsk))
> + return false;
> +
> + smp_mb(); /* The userspace may concurrently modify the worker's state. */
> + if (get_user(prev_state, &ut_worker->state))
> + goto die;
> +
> + if ((prev_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_RUNNING) {
> + u32 tid;
> +
> + if (prev_state & UMCG_TF_LOCKED)
> + return true; /* Wakeup: wait but don't enqueue. */
> +
> + smp_mb(); /* Order getting state and getting server_tid */
> +
> + if (get_user(tid, &ut_worker->next_tid))
> + goto die;
> +
> + *server_tid = tid;
> +
> + if (prev_state & UMCG_TF_PREEMPTED) {
> + if (!tid)
> + goto die; /* PREEMPTED workers must have a server. */
> +
> + /* Always enqueue preempted workers. */
> + if (!mark_server_running(tid, true))
> + goto die;
> + } else if (tid)
> + return false; /* pass-through: RUNNING with a server. */
> +
> + /* If !PREEMPTED, the worker gets here via UMCG_WAIT_WAKE_ONLY */
> + } else if (unlikely((prev_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_IDLE &&
> + (prev_state & UMCG_TF_LOCKED)))
> + return false; /* The worker prepares to sleep or to unregister. */
> +
> + if ((prev_state & UMCG_TASK_STATE_MASK) == UMCG_TASK_IDLE)
> + return true; /* the worker called umcg_wait(); don't enqueue */
> +
> + next_state = prev_state & ~UMCG_TASK_STATE_MASK;
> + next_state |= UMCG_TASK_IDLE;
> +
> + if (prev_state != next_state)
> + ret = cmpxchg_user_32(&ut_worker->state, &prev_state, next_state);
> + if (ret)
> + goto die;
> +
> + if (!enqueue_idle_worker(ut_worker))
> + goto die;
> +
> + smp_mb(); /* Order enqueuing the worker with getting the server. */
> + if (!(*server_tid) && !get_idle_server(ut_worker, server_tid))
> + goto die;
> +
> + return true;
> +
> +die:
> + pr_warn("umcg_process_waking_worker: killing task %d\n", current->pid);
> + force_sig(SIGKILL);
> + return false;
> +}
> +
> +/*
> + * Called from sched_update_worker(): defer all work until later, as
> + * sched_update_worker() may be called with in-kernel locks held.
> + */
> +void umcg_wq_worker_running(struct task_struct *tsk)
> +{
> + set_tsk_thread_flag(tsk, TIF_NOTIFY_RESUME);
> +}
> +
> +/* Called via TIF_NOTIFY_RESUME flag from exit_to_user_mode_loop. */
> +void umcg_handle_resuming_worker(void)
> +{
> + u32 server_tid;
> +
> + /* Avoid recursion by removing PF_UMCG_WORKER */
> + current->flags &= ~PF_UMCG_WORKER;
> +
> + do {
> + bool should_wait;
> +
> + should_wait = process_waking_worker(current, &server_tid);
> +
> + if (!should_wait)
> + break;
> +
> + if (server_tid)
> + umcg_do_context_switch(server_tid, 0);
> + else
> + umcg_idle_loop(0);
> + } while (true);
> +
> + if (server_tid && umcg_pin_pages(server_tid))
> + goto die;
> +
> + if (!server_tid) /* No server => no reason to pin pages. */
> + umcg_unpin_pages();
> +
> + goto out;
> +
> +die:
> + pr_warn("%s: killing task %d\n", __func__, current->pid);
> + force_sig(SIGKILL);
> +out:
> + current->flags |= PF_UMCG_WORKER;
> +}
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 0ea8128468c3..cd1be6356e42 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -272,6 +272,10 @@ COND_SYSCALL(landlock_create_ruleset);
> COND_SYSCALL(landlock_add_rule);
> COND_SYSCALL(landlock_restrict_self);
>
> +/* kernel/sched/umcg.c */
> +COND_SYSCALL(umcg_ctl);
> +COND_SYSCALL(umcg_wait);
> +
> /* arch/example/kernel/sys_example.c */
>
> /* mm/fadvise.c */
> --
> 2.25.1
>



Thanks,
Tao