[PATCH v3 3/3] sched: Introduce RLIMIT_UCLAMP

From: Quentin Perret
Date: Wed Jun 23 2021 - 08:34:57 EST


There is currently nothing preventing tasks from changing their per-task
clamp values in anyway that they like. The rationale is probably that
system administrators are still able to limit those clamps thanks to the
cgroup interface. While this is probably fine in many systems where
userspace apps are expected to drive their own power-performance, this
causes pain in a system where both per-task and per-cgroup clamp values
are expected to be under the control of core system components (as is
the case for Android).

To fix this, let's introduce a new rlimit to control the uclamp
behaviour. This allows unprivileged tasks to lower their uclamp
requests, but not increase them unless they have been allowed to do so
via rlimit. This is consistent with the existing behaviour for nice
values or RT priorities.

The default RLIMIT_UCLAMP is set to RLIMIT_INFINITY to keep the existing
behaviour.

Signed-off-by: Quentin Perret <qperret@xxxxxxxxxx>
---
fs/proc/base.c | 1 +
include/asm-generic/resource.h | 1 +
include/uapi/asm-generic/resource.h | 3 +-
kernel/sched/core.c | 48 ++++++++++++++++++++++++-----
4 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9cbd915025ad..91a78cf1fe79 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -586,6 +586,7 @@ static const struct limit_names lnames[RLIM_NLIMITS] = {
[RLIMIT_NICE] = {"Max nice priority", NULL},
[RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
[RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
+ [RLIMIT_UCLAMP] = {"Max utilization clamp", NULL},
};

/* Display limits for a process */
diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h
index 8874f681b056..53483b7cd4d7 100644
--- a/include/asm-generic/resource.h
+++ b/include/asm-generic/resource.h
@@ -26,6 +26,7 @@
[RLIMIT_NICE] = { 0, 0 }, \
[RLIMIT_RTPRIO] = { 0, 0 }, \
[RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \
+ [RLIMIT_UCLAMP] = { RLIM_INFINITY, RLIM_INFINITY }, \
}

#endif
diff --git a/include/uapi/asm-generic/resource.h b/include/uapi/asm-generic/resource.h
index f12db7a0da64..4d0fe4d564bf 100644
--- a/include/uapi/asm-generic/resource.h
+++ b/include/uapi/asm-generic/resource.h
@@ -46,7 +46,8 @@
0-39 for nice level 19 .. -20 */
#define RLIMIT_RTPRIO 14 /* maximum realtime priority */
#define RLIMIT_RTTIME 15 /* timeout for RT tasks in us */
-#define RLIM_NLIMITS 16
+#define RLIMIT_UCLAMP 16 /* maximum utilization clamp */
+#define RLIM_NLIMITS 17

/*
* SuS says limits have to be unsigned.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad055fb9ed2d..b094da4c5fea 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1430,6 +1430,11 @@ static int uclamp_validate(struct task_struct *p,
if (util_min != -1 && util_max != -1 && util_min > util_max)
return -EINVAL;

+ return 0;
+}
+
+static void uclamp_enable(void)
+{
/*
* We have valid uclamp attributes; make sure uclamp is enabled.
*
@@ -1438,8 +1443,20 @@ static int uclamp_validate(struct task_struct *p,
* scheduler locks.
*/
static_branch_enable(&sched_uclamp_used);
+}

- return 0;
+static bool can_uclamp(struct task_struct *p, int value, enum uclamp_id clamp_id)
+{
+ unsigned long uc_rlimit = task_rlimit(p, RLIMIT_UCLAMP);
+
+ if (value == -1) {
+ if (rt_task(p) && clamp_id == UCLAMP_MIN)
+ value = sysctl_sched_uclamp_util_min_rt_default;
+ else
+ value = uclamp_none(clamp_id);
+ }
+
+ return value <= p->uclamp_req[clamp_id].value || value <= uc_rlimit;
}

static bool uclamp_reset(const struct sched_attr *attr,
@@ -1580,6 +1597,11 @@ static inline int uclamp_validate(struct task_struct *p,
{
return -EOPNOTSUPP;
}
+static inline void uclamp_enable(void) { }
+static bool can_uclamp(struct task_struct *p, int value, enum uclamp_id clamp_id)
+{
+ return true;
+}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
@@ -6116,6 +6138,13 @@ static int __sched_setscheduler(struct task_struct *p,
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;

+ /* Update task specific "requested" clamps */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+ retval = uclamp_validate(p, attr);
+ if (retval)
+ return retval;
+ }
+
/*
* Allow unprivileged RT tasks to decrease priority:
*/
@@ -6165,6 +6194,15 @@ static int __sched_setscheduler(struct task_struct *p,
/* Normal users shall not reset the sched_reset_on_fork flag: */
if (p->sched_reset_on_fork && !reset_on_fork)
return -EPERM;
+
+ /* Can't increase util-clamps */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ !can_uclamp(p, attr->sched_util_min, UCLAMP_MIN))
+ return -EPERM;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ !can_uclamp(p, attr->sched_util_max, UCLAMP_MAX))
+ return -EPERM;
}

if (user) {
@@ -6176,12 +6214,8 @@ static int __sched_setscheduler(struct task_struct *p,
return retval;
}

- /* Update task specific "requested" clamps */
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
- retval = uclamp_validate(p, attr);
- if (retval)
- return retval;
- }
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+ uclamp_enable();

if (pi)
cpuset_read_lock();
--
2.32.0.288.g62a8d224e6-goog