[RFC v2 7/8] sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value
From: Patrick Bellasi
Date: Thu Oct 27 2016 - 13:42:10 EST
When per-task boosting is enabled, every time a task enters/exits a CPU,
its boost value can impact the currently selected OPP for that CPU.
Thus, the "aggregated" boost value for that CPU potentially needs to
be updated to match the current maximum boost value among all the tasks
currently RUNNABLE on that CPU.
This patch introduces the required support to keep track of which boost
groups are impacting a CPU. Each time a task is enqueued to or dequeued
from the RQ of a CPU, its boost group's per-CPU counter of RUNNABLE
tasks is incremented or decremented accordingly.
Only when the number of RUNNABLE tasks for a specific boost group
becomes 1 or 0 does the corresponding boost group change its effect on
that CPU, specifically:
a) boost_group::tasks == 1: this boost group starts impacting the CPU
b) boost_group::tasks == 0: this boost group stops impacting the CPU
In each of these two conditions the aggregation function:
schedtune_cpu_update(cpu)
is executed to identify the new maximum boost value required for the
CPU.
The proposed patch minimizes the number of times the aggregation
function is executed, while still ensuring that a CPU is always boosted
to the maximum boost value required by its currently RUNNABLE tasks.
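For reference, the aggregation boils down to a max() scan over the
per-CPU boost groups which currently have RUNNABLE tasks. The following
is only an illustrative sketch of that step: schedtune_cpu_update() and
the per-group boost field are introduced by an earlier patch in this
series, so the exact names and layout here are assumptions, not part of
this patch.

static void
schedtune_cpu_update(int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	int boost_max;
	int idx;

	/* The root boost group is always active */
	boost_max = bg->group[0].boost;
	for (idx = 1; idx < boostgroups_max; ++idx) {
		/* A boost group affects a CPU only if it has RUNNABLE tasks */
		if (bg->group[idx].tasks == 0)
			continue;
		boost_max = max(boost_max, bg->group[idx].boost);
	}

	bg->boost_max = boost_max;
}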
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Patrick Bellasi <patrick.bellasi@xxxxxxx>
---
kernel/exit.c | 5 ++
kernel/sched/fair.c | 28 +++++++
kernel/sched/tune.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/tune.h | 13 ++++
4 files changed, 262 insertions(+)
diff --git a/kernel/exit.c b/kernel/exit.c
index 9d68c45..541e4e1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,6 +55,8 @@
#include <linux/shm.h>
#include <linux/kcov.h>
+#include "sched/tune.h"
+
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
@@ -775,6 +777,9 @@ void __noreturn do_exit(long code)
}
exit_signals(tsk); /* sets PF_EXITING */
+
+ schedtune_exit_task(tsk);
+
/*
* Ensure that all new tsk->pi_lock acquisitions must observe
* PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 313a815..f56953b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4570,6 +4570,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}
+ /*
+ * Update SchedTune accounting.
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting considering throttled
+ * tasks would significantly increase the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
if (!se)
add_nr_running(rq, 1);
@@ -4629,6 +4648,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}
+ /*
+ * Update SchedTune accounting
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
if (!se)
sub_nr_running(rq, 1);
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 6a51a4d..965a3e1 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -7,6 +7,7 @@
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/percpu.h>
+#include <linux/rcupdate.h>
#include <linux/slab.h>
#include "sched.h"
@@ -16,6 +17,8 @@ unsigned int sysctl_sched_cfs_boost __read_mostly;
#ifdef CONFIG_CGROUP_SCHED_TUNE
+static bool schedtune_initialized;
+
/*
* CFS Scheduler Tunables for Task Groups.
*/
@@ -99,6 +102,8 @@ struct boost_groups {
/* Count of RUNNABLE tasks on that boost group */
unsigned int tasks;
} group[boostgroups_max];
+ /* CPU's boost group locking */
+ raw_spinlock_t lock;
};
/* Boost groups affecting each CPU in the system */
@@ -171,6 +176,213 @@ int schedtune_cpu_boost(int cpu)
return bg->boost_max;
}
+#define ENQUEUE_TASK 1
+#define DEQUEUE_TASK -1
+
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ int tasks = bg->group[idx].tasks + task_count;
+
+ /* Update the boosted task count, preventing it from going negative */
+ bg->group[idx].tasks = max(0, tasks);
+
+ /* Boost group activation or deactivation on that RQ */
+ if (tasks == 1 || tasks == 0)
+ schedtune_cpu_update(cpu);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
+ struct schedtune *st;
+ int idx;
+
+ lockdep_assert_held(&cpu_rq(cpu)->lock);
+
+ if (unlikely(!schedtune_initialized))
+ return;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accounting is protected by a per-cpu lock and requires
+ * interrupts to be disabled to avoid race conditions, for example with
+ * do_exit()::cgroup_exit() and task migration.
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
+ rcu_read_lock();
+
+ st = task_schedtune(p);
+ idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+ rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+static int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *dst_css;
+ struct rq_flags rq_flags;
+ struct task_struct *task;
+ struct boost_groups *bg;
+ unsigned int cpu;
+ struct rq *rq;
+ int src_bg; /* Source boost group index */
+ int dst_bg; /* Destination boost group index */
+ int tasks;
+
+ if (unlikely(!schedtune_initialized))
+ return 0;
+
+ cgroup_taskset_for_each(task, dst_css, tset) {
+
+ /*
+ * Lock the RQ of the CPU the task is enqueued on, to avoid
+ * races with the migration code while the task is being
+ * accounted.
+ */
+ rq = task_rq_lock(task, &rq_flags);
+
+ if (!task->on_rq) {
+ task_rq_unlock(rq, task, &rq_flags);
+ continue;
+ }
+
+ /*
+ * Boost group accounting is protected by a per-cpu lock and
+ * requires interrupts to be disabled to avoid race conditions
+ * on task migrations.
+ */
+ cpu = cpu_of(rq);
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ raw_spin_lock(&bg->lock);
+
+ dst_bg = css_st(dst_css)->idx;
+ src_bg = task_schedtune(task)->idx;
+
+ /*
+ * The current task is not changing boost group, which can
+ * happen when the new hierarchy is in use.
+ */
+ if (unlikely(dst_bg == src_bg)) {
+ raw_spin_unlock(&bg->lock);
+ task_rq_unlock(rq, task, &rq_flags);
+ continue;
+ }
+
+ /*
+ * This is the case of a RUNNABLE task which is switching its
+ * current boost group.
+ */
+
+ /* Move task from src to dst boost group */
+ tasks = bg->group[src_bg].tasks - 1;
+ bg->group[src_bg].tasks = max(0, tasks);
+ bg->group[dst_bg].tasks += 1;
+
+ /*
+ * Update the CPU boost value while still holding the RQ and
+ * boost group locks, so that concurrent enqueues/dequeues
+ * cannot change the counters before the new maximum is
+ * computed.
+ */
+ if (bg->group[src_bg].tasks == 0 ||
+ bg->group[dst_bg].tasks == 1)
+ schedtune_cpu_update(cpu);
+
+ raw_spin_unlock(&bg->lock);
+ task_rq_unlock(rq, task, &rq_flags);
+ }
+
+ return 0;
+}
+
+static void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+ /*
+ * This can happen only if the SchedTune controller is co-mounted
+ * with other controllers and one of them fails. Since SchedTune is
+ * usually mounted on its own hierarchy, for the time being we do not
+ * implement a proper rollback mechanism.
+ */
+ WARN(1, "SchedTune cancel attach not implemented");
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
+ struct schedtune *st;
+ int idx;
+
+ lockdep_assert_held(&cpu_rq(cpu)->lock);
+
+ if (unlikely(!schedtune_initialized))
+ return;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ * The last dequeue is already enforced by the do_exit() code path
+ * via schedtune_exit_task().
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accounting is protected by a per-cpu lock and requires
+ * interrupts to be disabled to avoid race conditions on...
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
+ rcu_read_lock();
+
+ st = task_schedtune(p);
+ idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+void schedtune_exit_task(struct task_struct *tsk)
+{
+ struct rq_flags rq_flags;
+ struct schedtune *st;
+ unsigned int cpu;
+ struct rq *rq;
+ int idx;
+
+ if (unlikely(!schedtune_initialized))
+ return;
+
+ rq = task_rq_lock(tsk, &rq_flags);
+ rcu_read_lock();
+
+ cpu = cpu_of(rq);
+ st = task_schedtune(tsk);
+ idx = st->idx;
+ schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ task_rq_unlock(rq, tsk, &rq_flags);
+}
+
static u64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
@@ -288,6 +500,8 @@ schedtune_css_free(struct cgroup_subsys_state *css)
struct cgroup_subsys schedtune_cgrp_subsys = {
.css_alloc = schedtune_css_alloc,
.css_free = schedtune_css_free,
+ .can_attach = schedtune_can_attach,
+ .cancel_attach = schedtune_cancel_attach,
.legacy_cftypes = files,
.early_init = 1,
};
@@ -306,6 +520,8 @@ schedtune_init_cgroups(void)
pr_info("schedtune: configured to support %d boost groups\n",
boostgroups_max);
+
+ schedtune_initialized = true;
}
#endif /* CONFIG_CGROUP_SCHED_TUNE */
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
index e936b91..ae7dccf 100644
--- a/kernel/sched/tune.h
+++ b/kernel/sched/tune.h
@@ -14,14 +14,27 @@ extern struct reciprocal_value schedtune_spc_rdiv;
int schedtune_cpu_boost(int cpu);
+void schedtune_exit_task(struct task_struct *tsk);
+
+void schedtune_enqueue_task(struct task_struct *p, int cpu);
+void schedtune_dequeue_task(struct task_struct *p, int cpu);
+
#else /* CONFIG_CGROUP_SCHED_TUNE */
#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+#define schedtune_exit_task(task) do { } while (0)
+
#endif /* CONFIG_CGROUP_SCHED_TUNE */
#else /* CONFIG_SCHED_TUNE */
#define schedtune_cpu_boost(cpu) 0
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+#define schedtune_exit_task(task) do { } while (0)
+
#endif /* CONFIG_SCHED_TUNE */
--
2.10.1