[RFC PATCH v2 2/5] sched: Note schedule() invocations at return-to-user with SM_USER

From: Valentin Schneider
Date: Fri Feb 02 2024 - 03:16:21 EST


task_struct.in_return_to_user is currently updated via atomic operations in
schedule_usermode().

However, one can note:
o .in_return_to_user is only updated for the current task
o There are no remote (smp_processor_id() != task_cpu(p)) accesses to
.in_return_to_user

Add schedule_with_mode() to factor out the common schedule() path so that
callers can pass different flags down to __schedule_loop().

Add SM_USER to denote schedule() calls from return-to-userspace points.

Update .in_return_to_user from within the preemption-disabled, rq_lock-held
part of __schedule().
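
To illustrate the intended flow (sketch only: the return-to-user call site
shown here is an assumption, schedule_usermode()'s actual callers are
introduced elsewhere in the series, while the __schedule() side mirrors the
hunk below):

    /* hypothetical resched point on the way back to userspace */
    if (ti_work & _TIF_NEED_RESCHED)
            schedule_usermode();            /* schedule_with_mode(SM_USER) */

    /* ...which __schedule() then consumes with the rq lock held */
    if (sched_mode & SM_USER) {
            prev->in_return_to_user = true; /* plain store, rq lock held */
            sched_mode &= ~SM_USER;         /* strip before the SM_MASK_PREEMPT check */
    }

finish_lock_switch() clears the flag again for whichever task is switched in,
so .in_return_to_user only reads true for a task whose last pass through
__schedule() came from a return-to-userspace point.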

Suggested-by: Benjamin Segall <bsegall@xxxxxxxxxx>
Signed-off-by: Valentin Schneider <vschneid@xxxxxxxxxx>
---
include/linux/sched.h | 2 +-
kernel/sched/core.c | 43 ++++++++++++++++++++++++++++++++-----------
kernel/sched/fair.c | 17 ++++++++++++++++-
3 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a0105d1eaa21..1b6f17b2150a6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1544,7 +1544,7 @@ struct task_struct {
#endif

#ifdef CONFIG_CFS_BANDWIDTH
- atomic_t in_return_to_user;
+ int in_return_to_user;
#endif
/*
* New fields for task_struct should be added above here, so that
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7c028fad5a89..54e6690626b13 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4531,7 +4531,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
#ifdef CONFIG_CFS_BANDWIDTH
INIT_LIST_HEAD(&p->se.kernel_node);
- atomic_set(&p->in_return_to_user, 0);
+ p->in_return_to_user = false;
#endif

#ifdef CONFIG_SCHEDSTATS
@@ -5147,6 +5147,9 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf

static inline void finish_lock_switch(struct rq *rq)
{
+#ifdef CONFIG_CFS_BANDWIDTH
+ current->in_return_to_user = false;
+#endif
/*
* If we are tracking spinlock dependencies then we have to
* fix up the runqueue lock - which gets 'carried over' from
@@ -6562,6 +6565,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#define SM_PREEMPT 0x1
#define SM_RTLOCK_WAIT 0x2

+/*
+ * Special case for CFS_BANDWIDTH where we need to know if the call to
+ * __schedule() directly precedes an entry into userspace.
+ * It is removed from the mode argument as soon as it has been consumed, so
+ * as not to defeat the SM_MASK_PREEMPT optimisation below.
+ */
+#ifdef CONFIG_CFS_BANDWIDTH
+# define SM_USER 0x4
+#else
+# define SM_USER SM_NONE
+#endif
+
#ifndef CONFIG_PREEMPT_RT
# define SM_MASK_PREEMPT (~0U)
#else
@@ -6646,6 +6661,14 @@ static void __sched notrace __schedule(unsigned int sched_mode)
rq_lock(rq, &rf);
smp_mb__after_spinlock();

+#ifdef CONFIG_CFS_BANDWIDTH
+ if (sched_mode & SM_USER) {
+ prev->in_return_to_user = true;
+ sched_mode &= ~SM_USER;
+ }
+#endif
+ SCHED_WARN_ON(sched_mode & SM_USER);
+
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
@@ -6807,7 +6830,7 @@ static __always_inline void __schedule_loop(unsigned int sched_mode)
} while (need_resched());
}

-asmlinkage __visible void __sched schedule(void)
+static __always_inline void schedule_with_mode(unsigned int sched_mode)
{
struct task_struct *tsk = current;

@@ -6817,22 +6840,20 @@ asmlinkage __visible void __sched schedule(void)

if (!task_is_running(tsk))
sched_submit_work(tsk);
- __schedule_loop(SM_NONE);
+ __schedule_loop(sched_mode);
sched_update_worker(tsk);
}
+
+asmlinkage __visible void __sched schedule(void)
+{
+ schedule_with_mode(SM_NONE);
+}
EXPORT_SYMBOL(schedule);

asmlinkage __visible void __sched schedule_usermode(void)
{
#ifdef CONFIG_CFS_BANDWIDTH
- /*
- * This is only atomic because of this simple implementation. We could
- * do something with an SM_USER to avoid other-cpu scheduler operations
- * racing against these writes.
- */
- atomic_set(&current->in_return_to_user, true);
- schedule();
- atomic_set(&current->in_return_to_user, false);
+ schedule_with_mode(SM_USER);
#else
schedule();
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a1808459a5acc..96504be6ee14a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6631,7 +6631,22 @@ static void enqueue_kernel(struct cfs_rq *cfs_rq, struct sched_entity *se, int c

static bool is_kernel_task(struct task_struct *p)
{
- return sysctl_sched_cfs_bandwidth_kernel_bypass && !atomic_read(&p->in_return_to_user);
+ /*
+ * The flag is updated within __schedule() with preemption disabled,
+ * under the rq lock, and only when the task is current.
+ *
+ * Holding the rq lock for that task's CPU is thus sufficient for the
+ * value to be stable, if the task is enqueued.
+ *
+ * If the task is dequeued, task_cpu(p) *can* change, but so far the flag of
+ * a dequeued task is only read from enqueue_task_fair(), which means either:
+ * - the task is being activated, its CPU has been set previously in ttwu()
+ * - the task is going through a "change" cycle (e.g. sched_move_task()),
+ * the pi_lock is also held so the CPU is stable.
+ */
+ lockdep_assert_rq_held(cpu_rq(task_cpu(p)));
+
+ return sysctl_sched_cfs_bandwidth_kernel_bypass && !p->in_return_to_user;
}

/*
--
2.43.0