[RFC PATCH 03/22] [PoC] kernel/entry/common: Mark syscall as a kernel critical section
From: K Prateek Nayak
Date: Thu Feb 20 2025 - 04:35:19 EST
Mark the syscall boundary as a kernel critical section. Use a per-task
"kernel_cs_count" to track the task's entry from, and exit to, userspace.
When "kernel_cs_count" is non-zero, the task is executing in kernel mode.
For this Proof-of-Concept, "kernel_cs_count" can only be 0 or 1 for a
task, and the implementation runs with that assumption. The critical
section is tracked as an integer count to allow fine-grained control in
the future, where certain boundaries within the kernel can be marked as
resource-holding critical sections.
For the sake of simplicity, the whole of kernel mode is marked as a
critical section in this PoC. For future extensibility, the
sched_notify_critical_section_{entry,exit}() helpers are defined to mark
the boundaries of a kernel critical section, similar to the
preempt_count() mechanism.
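As an illustration only (finer-grained sections are not part of this
series), a future resource-holding region could be bracketed with the
helpers added by this patch, much like a preempt-disabled region:

	sched_notify_critical_section_entry();	/* current->se.kernel_cs_count++ */
	/* ... hold the resource; throttling is deferred while the count is non-zero ... */
	sched_notify_critical_section_exit();	/* current->se.kernel_cs_count-- */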
Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
include/linux/sched.h | 19 ++++++++++++++++++-
kernel/entry/common.c | 7 +++++++
kernel/entry/common.h | 4 ++++
kernel/sched/fair.c | 20 ++++++++++++++++++++
4 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 34862d904ea3..63f3f235a5c1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -577,7 +577,24 @@ struct sched_entity {
/* cached value of my_q->h_nr_running */
unsigned int runnable_weight;
int depth;
-#endif
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ /*
+ * Keep track of tasks, and cfs_rq(s) that contains tasks
+ * running in kernel mode. Any throttling event for the
+ * cfs_rq will be deferred until this count hits 0.
+ *
+ * Semantics:
+ *
+ * - For task: It represents if the task is currently
+ * running in kernel mode. It is always 0 or 1.
+ *
+ * TODO: Describe for sched_entity when implementing.
+ */
+ int kernel_cs_count;
+ /* hole */
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
/*
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index cc93cdcc36d0..b132b96e2b96 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -83,6 +83,8 @@ __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, lon
{
unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
+ sched_notify_critical_section_entry();
+
if (work & SYSCALL_WORK_ENTER)
syscall = syscall_trace_enter(regs, syscall, work);
@@ -214,6 +216,11 @@ static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *reg
{
syscall_exit_to_user_mode_prepare(regs);
local_irq_disable_exit_to_user();
+ /*
+ * Notify scheduler that the task is exiting to userspace after a
+ * syscall. Must be called before checking for NEED_RESCHED work.
+ */
+ sched_notify_critical_section_exit();
exit_to_user_mode_prepare(regs);
}
diff --git a/kernel/entry/common.h b/kernel/entry/common.h
index f6e6d02f07fe..73e699a4c3e9 100644
--- a/kernel/entry/common.h
+++ b/kernel/entry/common.h
@@ -4,4 +4,8 @@
bool syscall_user_dispatch(struct pt_regs *regs);
+/* sched notifiers for CFS bandwidth deferral */
+extern void sched_notify_critical_section_entry(void);
+extern void sched_notify_critical_section_exit(void);
+
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 857808da23d8..becf2d35f35a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -58,6 +58,8 @@
#include "stats.h"
#include "autogroup.h"
+#include "../entry/common.h" /* critical section entry / exit notifiers */
+
/*
* The initial- and re-scaling of tunables is configurable
*
@@ -6704,6 +6706,20 @@ bool cfs_task_bw_constrained(struct task_struct *p)
return false;
}
+__always_inline void sched_notify_critical_section_entry(void)
+{
+ current->se.kernel_cs_count++;
+ /*
+ * Post this point, the task is considered to be in a kernel
+ * critical section and will defer bandwidth throttling.
+ */
+}
+
+__always_inline void sched_notify_critical_section_exit(void)
+{
+ current->se.kernel_cs_count--;
+}
+
#ifdef CONFIG_NO_HZ_FULL
/* called from pick_next_task_fair() */
static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
@@ -6772,6 +6788,10 @@ bool cfs_task_bw_constrained(struct task_struct *p)
return false;
}
#endif
+
+__always_inline void sched_notify_critical_section_entry(void) {}
+__always_inline void sched_notify_critical_section_exit(void) {}
+
#endif /* CONFIG_CFS_BANDWIDTH */
#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
--
2.43.0