[PATCH 05/19] sched: Core-wide rq->lock
From: Peter Zijlstra
Date: Thu Apr 22 2021 - 08:38:25 EST
Introduce the basic infrastructure to have a core-wide rq->lock.

This relies on rq->__lock being taken in increasing CPU-number order
within a core. It is also constrained to SMT8 per lockdep (and SMT256
per preempt_count).

Luckily SMT8 is the max supported SMT count for Linux (Mips, Sparc and
Power are known to have this).
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
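Note for review (not part of the change): the "Magic required" comment
below relies on the rq lock wrappers introduced earlier in this series
re-checking rq_lockp() after the acquire, because the lock a given rq
resolves to can change while we spin on it. The SMT8 limit in the
changelog comes from lockdep's 8 lock subclasses, SMT256 from the 8-bit
preempt count, since every sibling lock held at once bumps it. A minimal
sketch of the expected locking pattern (illustrative only, not the exact
wrapper body):

	/* Illustrative sketch, not the exact wrapper from earlier patches. */
	void raw_spin_rq_lock(struct rq *rq)
	{
		raw_spinlock_t *lock;

		for (;;) {
			lock = rq_lockp(rq);
			raw_spin_lock(lock);
			/* rq_lockp() may have changed while spinning; retry. */
			if (likely(lock == rq_lockp(rq)))
				return;
			raw_spin_unlock(lock);
		}
	}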
kernel/Kconfig.preempt | 6 ++
kernel/sched/core.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 37 +++++++++++++
3 files changed, 182 insertions(+)
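Also for review: sched_core_get()/sched_core_put() below are a plain
reference count; the first get enables the static key and rewires every
rq to its core's lock via __sched_core_flip(), the last put undoes it.
Once enabled, all SMT siblings resolve rq_lockp() to the same
&rq->core->__lock, so locking any one sibling's rq serializes the whole
core. A hypothetical helper (not in the patch) to make that invariant
concrete:

	/* Hypothetical illustration only: after sched_core_get(), SMT
	 * siblings of one core must agree on a single rq lock. */
	static bool rqs_share_lock(int cpu1, int cpu2)
	{
		return rq_lockp(cpu_rq(cpu1)) == rq_lockp(cpu_rq(cpu2));
	}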
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -99,3 +99,9 @@ config PREEMPT_DYNAMIC
Interesting if you want the same pre-built kernel should be used for
both Server and Desktop workloads.
+
+config SCHED_CORE
+ bool "Core Scheduling for SMT"
+ default y
+ depends on SCHED_SMT
+
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,6 +84,103 @@ unsigned int sysctl_sched_rt_period = 10
__read_mostly int scheduler_running;
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/*
+ * Magic required such that:
+ *
+ *	raw_spin_rq_lock(rq);
+ *	...
+ *	raw_spin_rq_unlock(rq);
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+
+static DEFINE_MUTEX(sched_core_mutex);
+static int sched_core_count;
+static struct cpumask sched_core_mask;
+
+static void __sched_core_flip(bool enabled)
+{
+	int cpu, t, i;
+
+	cpus_read_lock();
+
+	/*
+	 * Toggle the online cores, one by one.
+	 */
+	cpumask_copy(&sched_core_mask, cpu_online_mask);
+	for_each_cpu(cpu, &sched_core_mask) {
+		const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+		i = 0;
+		local_irq_disable();
+		for_each_cpu(t, smt_mask) {
+			/* supports up to SMT8 */
+			raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+		}
+
+		for_each_cpu(t, smt_mask)
+			cpu_rq(t)->core_enabled = enabled;
+
+		for_each_cpu(t, smt_mask)
+			raw_spin_unlock(&cpu_rq(t)->__lock);
+		local_irq_enable();
+
+		cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
+	}
+
+	/*
+	 * Toggle the offline CPUs.
+	 */
+	cpumask_copy(&sched_core_mask, cpu_possible_mask);
+	cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
+
+	for_each_cpu(cpu, &sched_core_mask)
+		cpu_rq(cpu)->core_enabled = enabled;
+
+	cpus_read_unlock();
+}
+
+static void __sched_core_enable(void)
+{
+	// XXX verify there are no cookie tasks (yet)
+
+	static_branch_enable(&__sched_core_enabled);
+	__sched_core_flip(true);
+}
+
+static void __sched_core_disable(void)
+{
+	// XXX verify there are no cookie tasks (left)
+
+	__sched_core_flip(false);
+	static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+	mutex_lock(&sched_core_mutex);
+	if (!sched_core_count++)
+		__sched_core_enable();
+	mutex_unlock(&sched_core_mutex);
+}
+
+void sched_core_put(void)
+{
+	mutex_lock(&sched_core_mutex);
+	if (!--sched_core_count)
+		__sched_core_disable();
+	mutex_unlock(&sched_core_mutex);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* part of the period that we allow rt tasks to run in us.
* default: 0.95s
@@ -5042,6 +5139,40 @@ pick_next_task(struct rq *rq, struct tas
BUG();
}
+#ifdef CONFIG_SCHED_CORE
+
+static inline void sched_core_cpu_starting(unsigned int cpu)
+{
+	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+	struct rq *rq, *core_rq = NULL;
+	int i;
+
+	core_rq = cpu_rq(cpu)->core;
+
+	if (!core_rq) {
+		for_each_cpu(i, smt_mask) {
+			rq = cpu_rq(i);
+			if (rq->core && rq->core == rq)
+				core_rq = rq;
+		}
+
+		if (!core_rq)
+			core_rq = cpu_rq(cpu);
+
+		for_each_cpu(i, smt_mask) {
+			rq = cpu_rq(i);
+
+			WARN_ON_ONCE(rq->core && rq->core != core_rq);
+			rq->core = core_rq;
+		}
+	}
+}
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_cpu_starting(unsigned int cpu) {}
+
+#endif /* CONFIG_SCHED_CORE */
+
/*
* __schedule() is the main scheduler function.
*
@@ -8006,6 +8137,7 @@ static void sched_rq_cpu_starting(unsign
int sched_cpu_starting(unsigned int cpu)
{
+ sched_core_cpu_starting(cpu);
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@@ -8290,6 +8424,11 @@ void __init sched_init(void)
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+
+#ifdef CONFIG_SCHED_CORE
+		rq->core = NULL;
+		rq->core_enabled = 0;
+#endif
}
set_load_weight(&init_task, false);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1075,6 +1075,12 @@ struct rq {
#endif
unsigned int push_busy;
struct cpu_stop_work push_work;
+
+#ifdef CONFIG_SCHED_CORE
+	/* per rq */
+	struct rq		*core;
+	unsigned int		core_enabled;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1113,6 +1119,35 @@ static inline bool is_migration_disabled
#endif
}
+#ifdef CONFIG_SCHED_CORE
+
+DECLARE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+	return static_branch_unlikely(&__sched_core_enabled) && rq->core_enabled;
+}
+
+static inline bool sched_core_disabled(void)
+{
+	return !static_branch_unlikely(&__sched_core_enabled);
+}
+
+static inline raw_spinlock_t *rq_lockp(struct rq *rq)
+{
+	if (sched_core_enabled(rq))
+		return &rq->core->__lock;
+
+	return &rq->__lock;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline bool sched_core_enabled(struct rq *rq)
+{
+	return false;
+}
+
static inline bool sched_core_disabled(void)
{
return true;
@@ -1123,6 +1158,8 @@ static inline raw_spinlock_t *rq_lockp(s
return &rq->__lock;
}
+#endif /* CONFIG_SCHED_CORE */
+
static inline void lockdep_assert_rq_held(struct rq *rq)
{
lockdep_assert_held(rq_lockp(rq));