Re: [RFC PATCH 1/2] sched/fair: Only throttle CFS tasks on return to userspace

From: Benjamin Segall
Date: Thu Dec 07 2023 - 18:47:17 EST


Alright, this took longer than I intended, but here's the basic version
with the duplicate "runqueue" just being a list rather than actual EEVDF
or anything sensible.

I also wasn't actually able to adapt it to work via task_work like I
thought I could; flagging the entry to the relevant schedule() calls is
too critical.

--------

Subject: [RFC PATCH] sched/fair: only throttle CFS tasks in userspace

The basic idea of this implementation is to maintain duplicate runqueues
in each cfs_rq that contain duplicate pointers to sched_entitys which
should bypass throttling. Then we can skip throttling cfs_rqs that have
any such children, and when we pick inside any not-actually-throttled
cfs_rq, we only look at this duplicated list.

"Which tasks should bypass throttling" here is "all schedule() calls
that don't set a special flag", but could instead involve the lockdep
markers (except for the problem of percpu-rwsem and similar) or explicit
flags around syscalls and faults, or something else.

This approach avoids any O(tasks) loops, but leaves partially-throttled
cfs_rqs still contributing their full h_nr_running to their parents,
which might result in worse balancing. Also it adds more (generally
still small) overhead to the common enqueue/dequeue/pick paths.

The very basic debug test added is to run a cpusoaker and "cat
/sys/kernel/debug/sched_locked_spin" pinned to the same cpu in the same
cgroup with a quota < 1 cpu.
---
include/linux/sched.h | 7 ++
kernel/entry/common.c | 2 +-
kernel/entry/kvm.c | 2 +-
kernel/sched/core.c | 20 ++++
kernel/sched/debug.c | 28 +++++
kernel/sched/fair.c | 232 ++++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 3 +
7 files changed, 281 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d258162deb0a..3e9e0e308d66d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -301,10 +301,11 @@ extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
asmlinkage void schedule(void);
+asmlinkage void schedule_usermode(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
#ifdef CONFIG_PREEMPT_RT
extern void schedule_rtlock(void);
#endif
@@ -576,10 +577,13 @@ struct sched_entity {
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
/* cached value of my_q->h_nr_running */
unsigned long runnable_weight;
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct list_head kernel_node;
+#endif
#endif

#ifdef CONFIG_SMP
/*
* Per entity load average tracking.
@@ -1562,10 +1566,13 @@ struct task_struct {

#ifdef CONFIG_USER_EVENTS
struct user_event_mm *user_event_mm;
#endif

+#ifdef CONFIG_CFS_BANDWIDTH
+ atomic_t in_return_to_user;
+#endif
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
*/
randomized_struct_fields_end
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index d7ee4bc3f2ba3..008c4cf7c09a1 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -154,11 +154,11 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
while (ti_work & EXIT_TO_USER_MODE_WORK) {

local_irq_enable_exit_to_user(ti_work);

if (ti_work & _TIF_NEED_RESCHED)
- schedule();
+ schedule_usermode(); /* TODO: also all of the arch/* loops that don't use this yet */

if (ti_work & _TIF_UPROBE)
uprobe_notify_resume(regs);

if (ti_work & _TIF_PATCH_PENDING)
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 2e0f75bcb7fd1..fc4b73de07539 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -12,11 +12,11 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
kvm_handle_signal_exit(vcpu);
return -EINTR;
}

if (ti_work & _TIF_NEED_RESCHED)
- schedule();
+ schedule_usermode();

if (ti_work & _TIF_NOTIFY_RESUME)
resume_user_mode_work(NULL);

ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index db4be4921e7f0..a7c028fad5a89 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4527,10 +4527,14 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_LIST_HEAD(&p->se.kernel_node);
+ atomic_set(&p->in_return_to_user, 0);
+#endif

#ifdef CONFIG_SCHEDSTATS
/* Even if schedstat is disabled, there should not be garbage */
memset(&p->stats, 0, sizeof(p->stats));
#endif
@@ -6816,10 +6820,26 @@ asmlinkage __visible void __sched schedule(void)
__schedule_loop(SM_NONE);
sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);

+asmlinkage __visible void __sched schedule_usermode(void)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ /*
+ * This is only atomic because of this simple implementation. We could
+ * do something with an SM_USER to avoid other-cpu scheduler operations
+ * racing against these writes.
+ */
+ atomic_set(&current->in_return_to_user, true);
+ schedule();
+ atomic_set(&current->in_return_to_user, false);
+#else
+ schedule();
+#endif
+}
+
/*
* synchronize_rcu_tasks() makes sure that no task is stuck in preempted
* state (have scheduled out non-voluntarily) by making sure that all
* tasks have either left the run queue or have gone into user space.
* As idle tasks do not do either, they must not ever be preempted
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 168eecc209b49..5d9d011a0e6fb 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -317,10 +317,36 @@ static const struct file_operations sched_verbose_fops = {
.write = sched_verbose_write,
.open = simple_open,
.llseek = default_llseek,
};

+static DEFINE_MUTEX(sched_debug_spin_mutex);
+static int sched_debug_spin_show(struct seq_file *m, void *v) {
+ int count;
+ mutex_lock(&sched_debug_spin_mutex);
+ for (count = 0; count < 1000; count++) {
+ u64 start2;
+ start2 = jiffies;
+ while (jiffies == start2)
+ cpu_relax();
+ schedule();
+ }
+ mutex_unlock(&sched_debug_spin_mutex);
+ return 0;
+}
+static int sched_debug_spin_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_debug_spin_show, NULL);
+}
+
+static const struct file_operations sched_debug_spin_fops = {
+ .open = sched_debug_spin_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static const struct seq_operations sched_debug_sops;

static int sched_debug_open(struct inode *inode, struct file *filp)
{
return seq_open(filp, &sched_debug_sops);
@@ -372,10 +398,12 @@ static __init int sched_init_debug(void)
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
#endif

debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);

+ debugfs_create_file("sched_locked_spin", 0444, NULL, NULL,
+ &sched_debug_spin_fops);
return 0;
}
late_initcall(sched_init_debug);

#ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcea3d55d95d3..15e5e358c91e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -126,10 +126,11 @@ int __weak arch_asym_cpu_priority(int cpu)
* we will always only issue the remaining available time.
*
* (default: 5 msec, units: microseconds)
*/
static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+static unsigned int sysctl_sched_cfs_bandwidth_kernel_bypass = 1;
#endif

#ifdef CONFIG_NUMA_BALANCING
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
@@ -144,10 +145,19 @@ static struct ctl_table sched_fair_sysctls[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
+ {
+ .procname = "sched_cfs_bandwidth_kernel_bypass",
+ .data = &sysctl_sched_cfs_bandwidth_kernel_bypass,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
#endif
#ifdef CONFIG_NUMA_BALANCING
{
.procname = "numa_balancing_promote_rate_limit_MBps",
.data = &sysctl_numa_balancing_promote_rate_limit,
@@ -5416,18 +5426,38 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

/*
* Pick the next process, keeping these things in mind, in this order:
- * 1) keep things fair between processes/task groups
- * 2) pick the "next" process, since someone really wants that to run
- * 3) pick the "last" process, for cache locality
- * 4) do not run the "skip" process, if something else is available
+ * 1) If we're inside a throttled cfs_rq, only pick threads in the kernel
+ * 2) keep things fair between processes/task groups
+ * 3) pick the "next" process, since someone really wants that to run
+ * 4) pick the "last" process, for cache locality
+ * 5) do not run the "skip" process, if something else is available
*/
static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq)
+pick_next_entity(struct cfs_rq *cfs_rq, bool throttled)
{
+#ifdef CONFIG_CFS_BANDWIDTH
+ /*
+ * TODO: This might trigger, I'm not sure/don't remember. Regardless,
+ * while we do not explicitly handle the case where h_kernel_running
+ * goes to 0, we will call account/check_cfs_rq_runtime at worst in
+ * entity_tick and notice that we can now properly do the full
+ * throttle_cfs_rq.
+ */
+ WARN_ON_ONCE(list_empty(&cfs_rq->kernel_children));
+ if (throttled && !list_empty(&cfs_rq->kernel_children)) {
+ /*
+ * TODO: you'd want to factor out pick_eevdf to just take
+ * tasks_timeline, and replace this list with a second rbtree
+ * and a call to pick_eevdf.
+ */
+ return list_first_entry(&cfs_rq->kernel_children,
+ struct sched_entity, kernel_node);
+ }
+#endif
/*
* Enabling NEXT_BUDDY will affect latency but not fairness.
*/
if (sched_feat(NEXT_BUDDY) &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
@@ -5622,12 +5652,18 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
if (cfs_rq->throttled)
return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
+ *
+ * Don't resched_curr() if curr is in the kernel. We won't throttle the
+ * cfs_rq if any task is in the kernel, and if curr in particular is we
+ * don't need to preempt it in favor of whatever other task is in the
+ * kernel.
*/
- if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr) &&
+ list_empty(&rq_of(cfs_rq)->curr->se.kernel_node))
resched_curr(rq_of(cfs_rq));
}

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
@@ -5712,16 +5748,26 @@ static int tg_throttle_down(struct task_group *tg, void *data)
cfs_rq->throttle_count++;

return 0;
}

+static void enqueue_kernel(struct cfs_rq *cfs_rq, struct sched_entity *se, int count);
+static void dequeue_kernel(struct cfs_rq *cfs_rq, struct sched_entity *se, int count);
+
static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, kernel_delta, dequeue = 1;
+
+ /*
+ * We don't actually throttle, though account() will have made sure to
+ * resched us so that we pick into a kernel task.
+ */
+ if (cfs_rq->h_kernel_running)
+ return false;

raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
/*
@@ -5749,10 +5795,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();

task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
+ kernel_delta = cfs_rq->h_kernel_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
goto done;
@@ -5762,10 +5809,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;

qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ dequeue_kernel(qcfs_rq, se, kernel_delta);

if (qcfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se);
break;
@@ -5784,10 +5832,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;

qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ dequeue_kernel(qcfs_rq, se, kernel_delta);
}

/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, task_delta);

@@ -5806,11 +5855,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta;
+ long task_delta, idle_task_delta, kernel_delta;

se = cfs_rq->tg->se[cpu_of(rq)];

cfs_rq->throttled = 0;

@@ -5841,10 +5890,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
goto unthrottle_throttle;
}

task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
+ kernel_delta = cfs_rq->h_kernel_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);

if (se->on_rq)
break;
@@ -5853,10 +5903,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;

qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
+ enqueue_kernel(qcfs_rq, se, kernel_delta);

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
}
@@ -5870,10 +5921,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (cfs_rq_is_idle(group_cfs_rq(se)))
idle_task_delta = cfs_rq->h_nr_running;

qcfs_rq->h_nr_running += task_delta;
qcfs_rq->idle_h_nr_running += idle_task_delta;
+ enqueue_kernel(qcfs_rq, se, kernel_delta);

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
}
@@ -6528,10 +6580,90 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
if (cfs_task_bw_constrained(p))
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
#endif

+/*
+ * We keep track of all children that are runnable in the kernel with a count of
+ * all descendants. The state is checked on enqueue and put_prev (and hard
+ * cleared on dequeue), and is stored just as the filled/empty state of the
+ * kernel_node list entry.
+ *
+ * These are simple helpers that do both parts, and should be called bottom-up
+ * until hitting a throttled cfs_rq whenever a task changes state (or a cfs_rq
+ * is (un)throttled).
+ */
+static void enqueue_kernel(struct cfs_rq *cfs_rq, struct sched_entity *se, int count)
+{
+ if (count == 0)
+ return;
+
+ if (list_empty(&se->kernel_node))
+ list_add(&se->kernel_node, &cfs_rq->kernel_children);
+ cfs_rq->h_kernel_running += count;
+}
+
+static bool is_kernel_task(struct task_struct *p)
+{
+ return sysctl_sched_cfs_bandwidth_kernel_bypass && !atomic_read(&p->in_return_to_user);
+}
+
+/*
+ * When called on a task this always transitions it to a !kernel state.
+ *
+ * When called on a group it is just synchronizing the state with the new
+ * h_kernel_waiters, unless this it has been throttled and is !on_rq
+ */
+static void dequeue_kernel(struct cfs_rq *cfs_rq, struct sched_entity *se, int count)
+{
+ if (count == 0)
+ return;
+
+ if (!se->on_rq || entity_is_task(se) ||
+ !group_cfs_rq(se)->h_kernel_running)
+ list_del_init(&se->kernel_node);
+ cfs_rq->h_kernel_running -= count;
+}
+
+/*
+ * Returns if the cfs_rq "should" be throttled but might not be because of
+ * kernel threads bypassing throttle.
+ */
+static bool cfs_rq_throttled_loose(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_bandwidth_used())
+ return false;
+
+ if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+ return false;
+ return true;
+}
+
+static void unthrottle_on_enqueue(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ if (!cfs_bandwidth_used() || !sysctl_sched_cfs_bandwidth_kernel_bypass)
+ return;
+ if (!cfs_rq_of(&p->se)->throttle_count)
+ return;
+
+ /*
+ * MAYBE TODO: doing it this simple way is O(throttle_count *
+ * cgroup_depth). We could optimize that into a single pass, but making
+ * a mostly-copy of unthrottle_cfs_rq that does that is a pain and easy
+ * to get wrong. (And even without unthrottle_on_enqueue it's O(nm),
+ * just not while holding rq->lock the whole time)
+ */
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->throttled)
+ unthrottle_cfs_rq(cfs_rq);
+ }
+}
+
#else /* CONFIG_CFS_BANDWIDTH */

static inline bool cfs_bandwidth_used(void)
{
return false;
@@ -6575,10 +6707,20 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
bool cfs_task_bw_constrained(struct task_struct *p)
{
return false;
}
#endif
+static void enqueue_kernel(struct cfs_rq *cfs_rq, struct sched_entity *se, int count) {}
+static void dequeue_kernel(struct cfs_rq *cfs_rq, struct sched_entity *se, int count) {}
+static inline bool is_kernel_task(struct task_struct *p)
+{
+ return false;
+}
+static bool cfs_rq_throttled_loose(struct cfs_rq *cfs_rq)
+{
+ return false;
+}
#endif /* CONFIG_CFS_BANDWIDTH */

#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
#endif
@@ -6678,10 +6820,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);
+ bool kernel_task = is_kernel_task(p);

/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
* Let's add the task's estimated utilization to the cfs_rq's
@@ -6706,10 +6849,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;

if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
+ if (kernel_task)
+ enqueue_kernel(cfs_rq, se, 1);

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;

@@ -6726,10 +6871,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;

if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
+ if (kernel_task)
+ enqueue_kernel(cfs_rq, se, 1);

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
}
@@ -6756,10 +6903,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)

enqueue_throttle:
assert_list_leaf_cfs_rq(rq);

hrtick_update(rq);
+
+ if (kernel_task)
+ unthrottle_on_enqueue(p);
}

static void set_next_buddy(struct sched_entity *se);

/*
@@ -6772,10 +6922,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
+ bool kernel_task = !list_empty(&p->se.kernel_node);

util_est_dequeue(&rq->cfs, p);

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -6784,10 +6935,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;

if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
+ if (kernel_task)
+ dequeue_kernel(cfs_rq, se, 1);

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;

@@ -6816,10 +6969,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;

if (cfs_rq_is_idle(cfs_rq))
idle_h_nr_running = 1;
+ if (kernel_task)
+ dequeue_kernel(cfs_rq, se, 1);

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;

@@ -8316,15 +8471,44 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int

preempt:
resched_curr(rq);
}

+static void handle_kernel_task_prev(struct task_struct *prev)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct sched_entity *se = &prev->se;
+ bool new_kernel = is_kernel_task(prev);
+ bool prev_kernel = !list_empty(&se->kernel_node);
+ /*
+ * These extra loops are bad and against the whole point of the merged
+ * PNT, but it's a pain to merge, particularly since we want it to occur
+ * before check_cfs_runtime.
+ */
+ if (prev_kernel && !new_kernel) {
+ WARN_ON_ONCE(!se->on_rq); /* dequeue should have removed us */
+ for_each_sched_entity(se) {
+ dequeue_kernel(cfs_rq_of(se), se, 1);
+ if (cfs_rq_throttled(cfs_rq_of(se)))
+ break;
+ }
+ } else if (!prev_kernel && new_kernel && se->on_rq) {
+ for_each_sched_entity(se) {
+ enqueue_kernel(cfs_rq_of(se), se, 1);
+ if (cfs_rq_throttled(cfs_rq_of(se)))
+ break;
+ }
+ }
+#endif
+}
+
#ifdef CONFIG_SMP
static struct task_struct *pick_task_fair(struct rq *rq)
{
struct sched_entity *se;
struct cfs_rq *cfs_rq;
+ bool throttled = false;

again:
cfs_rq = &rq->cfs;
if (!cfs_rq->nr_running)
return NULL;
@@ -8341,11 +8525,14 @@ static struct task_struct *pick_task_fair(struct rq *rq)

if (unlikely(check_cfs_rq_runtime(cfs_rq)))
goto again;
}

- se = pick_next_entity(cfs_rq);
+ if (cfs_rq_throttled_loose(cfs_rq))
+ throttled = true;
+
+ se = pick_next_entity(cfs_rq, throttled);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

return task_of(se);
}
@@ -8356,10 +8543,18 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
int new_tasks;
+ bool throttled;
+
+ /*
+ * We want to handle this before check_cfs_runtime(prev). We'll
+ * duplicate a little work in the goto simple case, but that's fine
+ */
+ if (prev)
+ handle_kernel_task_prev(prev);

again:
if (!sched_fair_runnable(rq))
goto idle;

@@ -8373,10 +8568,11 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
*
* Therefore attempt to avoid putting and setting the entire cgroup
* hierarchy, only change the part that actually changes.
*/

+ throttled = false;
do {
struct sched_entity *curr = cfs_rq->curr;

/*
* Since we got here without doing put_prev_entity() we also
@@ -8404,11 +8600,14 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf

goto simple;
}
}

- se = pick_next_entity(cfs_rq);
+ if (cfs_rq_throttled_loose(cfs_rq))
+ throttled = true;
+
+ se = pick_next_entity(cfs_rq, throttled);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

p = task_of(se);

@@ -8442,12 +8641,15 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
simple:
#endif
if (prev)
put_prev_task(rq, prev);

+ throttled = false;
do {
- se = pick_next_entity(cfs_rq);
+ if (cfs_rq_throttled_loose(cfs_rq))
+ throttled = true;
+ se = pick_next_entity(cfs_rq, throttled);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

p = task_of(se);
@@ -8507,10 +8709,12 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq)
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;

+ handle_kernel_task_prev(prev);
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
put_prev_entity(cfs_rq, se);
}
}
@@ -12788,10 +12992,13 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_LIST_HEAD(&cfs_rq->kernel_children);
+#endif
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_change_group_fair(struct task_struct *p)
{
@@ -12940,10 +13147,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

se->my_q = cfs_rq;
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
+#ifdef CONFIG_CFS_BANDWIDTH
+ INIT_LIST_HEAD(&se->kernel_node);
+#endif
}

static DEFINE_MUTEX(shares_mutex);

static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58a54bda77de..5fee806cc4538 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -578,10 +578,11 @@ struct cfs_rq {
u64 min_vruntime_copy;
#endif

struct rb_root_cached tasks_timeline;

+
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
@@ -656,12 +657,14 @@ struct cfs_rq {
u64 throttled_clock_pelt_time;
u64 throttled_clock_self;
u64 throttled_clock_self_time;
int throttled;
int throttle_count;
+ int h_kernel_running;
struct list_head throttled_list;
struct list_head throttled_csd_list;
+ struct list_head kernel_children;;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

static inline int rt_bandwidth_enabled(void)
--
2.43.0.472.g3155946c3a-goog