[PATCH] sched/core.c: Add min_ratio for cfs bandwidth_control
From: Chuyi Zhou
Date: Tue Oct 11 2022 - 08:08:51 EST
Tasks may be throttled when holding locks for a long time by current
cfs bandwidth control mechanism once users set a too small quota/period
ratio, which can result whole system get stuck[1].
In order to prevent the above situation from happening, this patch adds
sysctl_sched_cfs_bandwidth_min_ratio in /proc/sys/kernel, which indicates
the minimum percentage of quota/period users can set. The default value is
zero and users can set quota and period without triggering this constraint.
Link[1]:https://lore.kernel.org/lkml/5987be34-b527-4ff5-a17d-5f6f0dc94d6d@xxxxxxxxxx/T/
Signed-off-by: Chuyi Zhou <zhouchuyi@xxxxxxxxxxxxx>
Suggested-by: Abel Wu <wuyun.abel@xxxxxxxxxxxxx>
---
include/linux/sched/sysctl.h | 4 ++++
kernel/sched/core.c | 23 +++++++++++++++++++++++
kernel/sysctl.c | 10 ++++++++++
3 files changed, 37 insertions(+)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index e650946816d0..375c14f8fca9 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -21,6 +21,10 @@ enum sched_tunable_scaling {
SCHED_TUNABLESCALING_END,
};
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_min_ratio;
+#endif
+
#define NUMA_BALANCING_DISABLED 0x0
#define NUMA_BALANCING_NORMAL 0x1
#define NUMA_BALANCING_MEMORY_TIERING 0x2
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 60fdc0faf1c9..fc5b6671154d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10541,6 +10541,12 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
}
#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * The minimum of quota/period ratio users can set, default is zero and users can set
+ * quota and period without triggering this constraint.
+ */
+unsigned int sysctl_sched_cfs_bandwidth_min_ratio;
+
static DEFINE_MUTEX(cfs_constraints_mutex);
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
@@ -10550,6 +10556,20 @@ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+static int check_cfs_bandwidth_min_ratio(u64 period, u64 quota)
+{
+ u64 ratio;
+
+ if (!sysctl_sched_cfs_bandwidth_min_ratio)
+ return 0;
+
+ ratio = div64_u64(quota * 100, period);
+ if (ratio < sysctl_sched_cfs_bandwidth_min_ratio)
+ return -1;
+
+ return 0;
+}
+
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
u64 burst)
{
@@ -10585,6 +10605,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
burst + quota > max_cfs_runtime))
return -EINVAL;
+ if (quota != RUNTIME_INF && check_cfs_bandwidth_min_ratio(period, quota))
+ return -EINVAL;
+
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 205d605cacc5..26f05ec7d9aa 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1642,6 +1642,16 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_FOUR,
},
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .procname = "sched_cfs_bandwidth_min_ratio",
+ .data = &sysctl_sched_cfs_bandwidth_min_ratio,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif /* CONFIG_CFS_BANDWIDTH */
{
.procname = "panic",
.data = &panic_timeout,
--
2.20.1