[PATCH 5/7] RT: CPU priority management

From: Gregory Haskins
Date: Thu Oct 11 2007 - 18:14:52 EST

Next message: Gregory Haskins: "[PATCH 6/7] RT: Convert Steve's rt-push infrastructure to cpupri"
Previous message: Gregory Haskins: "[PATCH 4/7] RT: Add a per-cpu rt_overload indication"
In reply to: Gregory Haskins: "[PATCH 4/7] RT: Add a per-cpu rt_overload indication"
Next in thread: Gregory Haskins: "[PATCH 6/7] RT: Convert Steve's rt-push infrastructure to cpupri"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

This code tracks the priority of each CPU so that global migration
decisions are easy to calculate. Each CPU can be in a state as follows:

(INVALID), IDLE, NORMAL, RT1, ... RT99

going from the lowest priority to the highest. CPUs in the INVALID state
are not eligible for routing. The system maintains this state with
a 2 dimensional bitmap (the first for priority class, the second for cpus
in that class). Therefore a typical application without affinity
restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
searches). For tasks with affinity restrictions, the algorithm has a
worst case complexity of O(min(102, NR_CPUS)), though the scenario that
yields the worst case search is fairly contrived.

Because this type of data structure is going to be cache/lock hot,
certain design considerations were made to mitigate this overhead, such
as: seqlocks, per_cpu data to avoid cacheline contention, avoiding locks
in the update code when possible, etc.

Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
---

include/linux/cpupri.h | 25 +++++
kernel/Kconfig.preempt | 11 ++
kernel/Makefile | 1
kernel/cpupri.c | 220 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched.c | 36 ++++++++
5 files changed, 293 insertions(+), 0 deletions(-)

diff --git a/include/linux/cpupri.h b/include/linux/cpupri.h
new file mode 100644
index 0000000..1ca5007
--- /dev/null
+++ b/include/linux/cpupri.h
@@ -0,0 +1,25 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO
+
+#define CPUPRI_INVALID -2
+#define CPUPRI_IDLE -1
+#define CPUPRI_NORMAL 0
+/* values 1-99 are RT priorities */
+
+#ifdef CONFIG_CPU_PRIORITIES
+int cpupri_find_best(int cpu, int pri, struct task_struct *p);
+void cpupri_set(int cpu, int pri);
+int cpupri_get(int cpu);
+void cpupri_init(void);
+#else
+#define cpupri_find_best(cpu, pri, tsk) cpu
+#define cpupri_set(cpu, pri) do { } while(0)
+#define cpupri_get(cpu) CPUPRI_INVALID
+#define cpupri_init() do { } while(0)
+#endif /* CONFIG_CPU_PRIORITIES */
+
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 2316f28..5397e59 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -197,3 +197,14 @@ config RCU_TRACE
Say Y here if you want to enable RCU tracing
Say N if you are unsure.

+config CPU_PRIORITIES
+ bool "Enable CPU priority management"
+ default n
+ help
+ This option allows the scheduler to efficiently track the absolute
+ priority of the current task on each CPU. This helps it to make
+ global decisions for real-time tasks before a overload conflict
+ actually occurs.
+
+ Say Y here if you want to enable priority management
+ Say N if you are unsure.
\ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index e4e2acf..63aaaf5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -66,6 +66,7 @@ obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_CPU_PRIORITIES) += cpupri.o

ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@xxxxxxxxxxxxxxxx>, the -fno-omit-frame-pointer is
diff --git a/kernel/cpupri.c b/kernel/cpupri.c
new file mode 100644
index 0000000..46a8758
--- /dev/null
+++ b/kernel/cpupri.c
@@ -0,0 +1,220 @@
+/*
+ * kernel/cpupri.c
+ *
+ * CPU priority management
+ *
+ * Copyright (C) 2007 Novell
+ *
+ * Author: Gregory Haskins <ghaskins@xxxxxxxxxx>
+ *
+ * This code tracks the priority of each CPU so that global migration
+ * decisions are easy to calculate. Each CPU can be in a state as follows:
+ *
+ * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ * going from the lowest priority to the highest. CPUs in the INVALID state
+ * are not eligible for routing. The system maintains this state with
+ * a 2 dimensional bitmap (the first for priority class, the second for cpus
+ * in that class). Therefore a typical application without affinity
+ * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ * searches). For tasks with affinity restrictions, the algorithm has a
+ * worst case complexity of O(min(102, NR_CPUS)), though the scenario that
+ * yields the worst case search is fairly contrived.
+ *
+ * Because this type of data structure is going to be cache/lock hot,
+ * certain design considerations were made to mitigate this overhead, such
+ * as: seqlocks, per_cpu data to avoid cacheline contention, avoiding locks
+ * in the update code when possible, etc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/cpupri.h>
+#include <asm/idle.h>
+
+struct cpu_priority {
+ raw_seqlock_t lock;
+ cpumask_t pri_to_cpu[CPUPRI_NR_PRIORITIES];
+ long pri_active[CPUPRI_NR_PRIORITIES/BITS_PER_LONG];
+};
+
+static DEFINE_PER_CPU(int, cpu_to_pri);
+
+static __cacheline_aligned_in_smp struct cpu_priority cpu_priority;
+
+#define for_each_cpupri_active(array, idx) \
+ for( idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
+ idx < CPUPRI_NR_PRIORITIES; \
+ idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
+
+/**
+ * cpupri_find_best - find the best (lowest-pri) CPU in the system
+ * @cpu: The recommended/default CPU
+ * @task_pri: The priority of the task being scheduled (IDLE-RT99)
+ * @p: The task being scheduled
+ *
+ * Note: This function returns the recommeded CPU as calculated during the
+ * current invokation. By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times. While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)cpu - The recommended cpu to accept the task
+ */
+int cpupri_find_best(int def_cpu, int task_pri, struct task_struct *p)
+{
+ int idx = 0;
+ struct cpu_priority *cp = &cpu_priority;
+ int this_cpu = smp_processor_id();
+ int cpu = def_cpu;
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&cp->lock);
+
+ for_each_cpupri_active(cp->pri_active, idx) {
+ cpumask_t mask;
+ int lowest_pri = idx-1;
+
+ if (lowest_pri > task_pri)
+ break;
+
+ cpus_and(mask, p->cpus_allowed, cp->pri_to_cpu[idx]);
+
+ if (!cpus_empty(mask)) {
+ /*
+ * We select a CPU in the following priority:
+ *
+ * def_cpu, this_cpu, first_cpu
+ *
+ * for efficiency. def_cpu preserves cache
+ * affinity, and this_cpu is cheaper to preempt
+ * (note that sometimes they are the same).
+ * Finally, we will take whatever is available
+ * if the first two don't pan out.
+ */
+ if (cpu_isset(def_cpu, mask))
+ break;
+
+ if (cpu_isset(this_cpu, mask)) {
+ cpu = this_cpu;
+ break;
+ }
+
+ cpu = first_cpu(mask);
+ break;
+ }
+ }
+ } while (unlikely(read_seqretry(&cp->lock, seq)));
+
+ return cpu;
+}
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cpu: The target cpu
+ * @pri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(int cpu, int pri)
+{
+ struct cpu_priority *cp = &cpu_priority;
+ int *cpri = &per_cpu(cpu_to_pri, cpu);
+
+ /*
+ * Its safe to check the CPU priority outside the seqlock because
+ * it can only be modified while holding the rq->lock for this cpu
+ */
+ if (*cpri != pri) {
+ int oldpri = *cpri;
+ unsigned long flags;
+
+ write_seqlock_irqsave(&cp->lock, flags);
+
+ /*
+ * If the cpu was currently mapped to a different value, we
+ * first need to unmap the old value
+ */
+ if (likely(oldpri != CPUPRI_INVALID)) {
+ int idx = oldpri+1;
+ cpumask_t *mask = &cp->pri_to_cpu[idx];
+
+ cpu_clear(cpu, *mask);
+ if (cpus_empty(*mask))
+ __clear_bit(idx, cp->pri_active);
+ }
+
+ if (likely(pri != CPUPRI_INVALID)) {
+ int idx = pri+1;
+ cpumask_t *mask = &cp->pri_to_cpu[idx];
+
+ cpu_set(cpu, *mask);
+ __set_bit(idx, cp->pri_active);
+ }
+
+ write_sequnlock_irqrestore(&cp->lock, flags);
+
+ *cpri = pri;
+ }
+}
+
+/**
+ * cpupri_get - read the current cpu priority setting
+ * @cpu: The target cpu
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (int)pri - The current priority of this cpu
+ */
+int cpupri_get(int cpu)
+{
+ return per_cpu(cpu_to_pri, cpu);
+}
+
+#if 0
+static int cpupri_idle(struct notifier_block *b, unsigned long event, void *v)
+{
+ // FIXME: We now need to hold the rq->lock before calling!
+ if (event == IDLE_START)
+ cpupri_set(raw_smp_processor_id(), CPUPRI_IDLE);
+
+ return 0;
+}
+
+static struct notifier_block cpupri_idle_notifier = {
+ .notifier_call = cpupri_idle
+};
+#endif
+
+/**
+ * cpupri_init - initialize the cpupri subsystem
+ *
+ * This must be called during the scheduler initialization before the
+ * other methods may be used.
+ *
+ * Returns: (void)
+ */
+void cpupri_init(void)
+{
+ struct cpu_priority *cp = &cpu_priority;
+ int i;
+
+y memset(cp, 0, sizeof(*cp));
+
+ raw_seqlock_init(&cp->lock);
+
+ for_each_possible_cpu(i) {
+ per_cpu(cpu_to_pri, i) = CPUPRI_INVALID;
+ }
+
+ // idle_notifier_register(&cpupri_idle_notifier);
+}
+
+
diff --git a/kernel/sched.c b/kernel/sched.c
index 41b0e9c..0065551 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,6 +64,7 @@
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>
+#include <linux/cpupri.h>

#include <asm/tlb.h>

@@ -474,6 +475,37 @@ static inline void set_task_cfs_rq(struct task_struct *p)
}
#endif

+static int calc_task_cpupri(struct rq *rq, struct task_struct *p)
+{
+ int pri;
+
+ if (rt_task(p))
+ pri = p->rt_priority;
+ else if (p == rq->idle)
+ pri = CPUPRI_IDLE;
+ else
+ pri = CPUPRI_NORMAL;
+
+ return pri;
+}
+
+#ifdef CONFIG_CPU_PRIORITIES
+static void set_cpupri(struct rq *rq, struct task_struct *p)
+{
+ int pri = calc_task_cpupri(rq, p);
+ cpupri_set(rq->cpu, pri);
+}
+
+static int get_cpupri(struct rq *rq)
+{
+ return cpupri_get(rq->cpu);
+}
+
+#else
+#define set_cpupri(rq, task) do { } while (0)
+#define get_cpupri(rq) CPUPRI_INVALID
+#endif
+
/*
* We really dont want to do anything complex within switch_to()
* on PREEMPT_RT - this check enforces this.
@@ -2259,6 +2291,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
+ set_cpupri(rq, next);
}

/**
@@ -4603,6 +4636,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* this runqueue and our priority is higher than the current's
*/
if (task_running(rq, p)) {
+ set_cpupri(rq, p);
if (p->prio > oldprio)
resched_task(rq->curr);
} else {
@@ -7290,6 +7324,8 @@ void __init sched_init(void)
int highest_cpu = 0;
int i, j;

+ cpupri_init();
+
/*
* Link up the scheduling class hierarchy:
*/

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Gregory Haskins: "[PATCH 6/7] RT: Convert Steve's rt-push infrastructure to cpupri"
Previous message: Gregory Haskins: "[PATCH 4/7] RT: Add a per-cpu rt_overload indication"
In reply to: Gregory Haskins: "[PATCH 4/7] RT: Add a per-cpu rt_overload indication"
Next in thread: Gregory Haskins: "[PATCH 6/7] RT: Convert Steve's rt-push infrastructure to cpupri"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]