Re: [PATCH v6 1/2] sched/uclamp: Add a new sysctl to control RT default boost value

From: Peter Zijlstra
Date: Mon Jul 13 2020 - 07:21:55 EST


On Mon, Jul 06, 2020 at 03:28:38PM +0100, Qais Yousef wrote:

> +static void __uclamp_sync_util_min_rt_default(struct task_struct *p)
> +{
> + unsigned int default_util_min;
> + struct uclamp_se *uc_se;
> +
> + WARN_ON_ONCE(!rcu_read_lock_held());
> +
> + if (!rt_task(p))
> + return;
> +
> + uc_se = &p->uclamp_req[UCLAMP_MIN];
> +
> + /* Only sync if user didn't override the default */
> + if (uc_se->user_defined)
> + return;
> +
> + /* Sync with smp_wmb() in uclamp_sync_util_min_rt_default() */
> + smp_rmb();
> + default_util_min = sysctl_sched_uclamp_util_min_rt_default;
> + uclamp_se_set(uc_se, default_util_min, false);
> +}
> +
> +static void uclamp_sync_util_min_rt_default(void)
> +{
> + struct task_struct *g, *p;
> +
> + /*
> + * Make sure the updated sysctl_sched_uclamp_util_min_rt_default which
> + * was just written is synchronized against any future read on another
> + * cpu.
> + */
> + smp_wmb();
> +
> + /*
> + * Wait for all updaters to observe the new change.
> + *
> + * There are 2 races to deal with here:
> + *
> + * 1. fork()->copy_process()
> + *
> + * If a task was concurrently forking, for_each_process_thread()
> + * will not see it, hence it could have copied the old value and
> + * we missed the opportunity to update it.
> + *
> + * This should be handled by sched_post_fork() where it'll ensure
> + * it performs the sync after the fork.
> + *
> + * 2. fork()->sched_post_fork()
> + * __setscheduler_uclamp()
> + *
> + * Both of these functions could read the old value but then get
> + * preempted, during which a user might write new value to
> + * sysctl_sched_uclamp_util_min_rt_default.
> + *
> + * // read sysctl_sched_uclamp_util_min_rt_default;
> + * // PREEMPT-OUT
> + * .
> + * . <-- sync happens here
> + * .
> + * // PREEMPT-IN
> + * // write p->uclamp_req[UCLAMP_MIN]
> + *
> + * That section is protected with rcu_read_lock(), so
> + * synchronize_rcu() will guarantee it has finished before we
> + * perform the update. Hence ensure that this sync happens after
> + * any concurrent sync which should guarantee correctness.
> + */
> + synchronize_rcu();
> +
> + rcu_read_lock();
> + for_each_process_thread(g, p)
> + __uclamp_sync_util_min_rt_default(p);
> + rcu_read_unlock();
> +}

It's monday, and I cannot get my brain working.. I cannot decipher the
comments you have with the smp_[rw]mb(), what actual ordering do they
enforce?

Also, your synchronize_rcu() relies on write_lock() beeing
non-preemptible, which isn't true on PREEMPT_RT.

The below seems simpler...

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1013,8 +1013,6 @@ static void __uclamp_sync_util_min_rt_de
unsigned int default_util_min;
struct uclamp_se *uc_se;

- WARN_ON_ONCE(!rcu_read_lock_held());
-
if (!rt_task(p))
return;

@@ -1024,8 +1022,6 @@ static void __uclamp_sync_util_min_rt_de
if (uc_se->user_defined)
return;

- /* Sync with smp_wmb() in uclamp_sync_util_min_rt_default() */
- smp_rmb();
default_util_min = sysctl_sched_uclamp_util_min_rt_default;
uclamp_se_set(uc_se, default_util_min, false);
}
@@ -1035,47 +1031,21 @@ static void uclamp_sync_util_min_rt_defa
struct task_struct *g, *p;

/*
- * Make sure the updated sysctl_sched_uclamp_util_min_rt_default which
- * was just written is synchronized against any future read on another
- * cpu.
- */
- smp_wmb();
-
- /*
- * Wait for all updaters to observe the new change.
- *
- * There are 2 races to deal with here:
- *
- * 1. fork()->copy_process()
- *
- * If a task was concurrently forking, for_each_process_thread()
- * will not see it, hence it could have copied the old value and
- * we missed the opportunity to update it.
- *
- * This should be handled by sched_post_fork() where it'll ensure
- * it performs the sync after the fork.
- *
- * 2. fork()->sched_post_fork()
- * __setscheduler_uclamp()
- *
- * Both of these functions could read the old value but then get
- * preempted, during which a user might write new value to
- * sysctl_sched_uclamp_util_min_rt_default.
- *
- * // read sysctl_sched_uclamp_util_min_rt_default;
- * // PREEMPT-OUT
- * .
- * . <-- sync happens here
- * .
- * // PREEMPT-IN
- * // write p->uclamp_req[UCLAMP_MIN]
- *
- * That section is protected with rcu_read_lock(), so
- * synchronize_rcu() will guarantee it has finished before we
- * perform the update. Hence ensure that this sync happens after
- * any concurrent sync which should guarantee correctness.
- */
- synchronize_rcu();
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);

rcu_read_lock();
for_each_process_thread(g, p)
@@ -1408,6 +1378,9 @@ int sysctl_sched_uclamp_handler(struct c
uclamp_update_root_tg();
}

+ if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default)
+ uclamp_sync_util_min_rt_default();
+
/*
* We update all RUNNABLE tasks only when task groups are in use.
* Otherwise, keep it simple and do just a lazy update at each next
@@ -1466,9 +1439,7 @@ static void __setscheduler_uclamp(struct
* at runtime.
*/
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) {
- rcu_read_lock();
__uclamp_sync_util_min_rt_default(p);
- rcu_read_unlock();
} else {
uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
}
@@ -1521,6 +1492,11 @@ static void __init init_uclamp_rq(struct
rq->uclamp_flags = 0;
}

+static void uclamp_post_fork(struct task_struct *p)
+{
+ __uclamp_sync_util_min_rt_default(p);
+}
+
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};