[RFC][PATCH 4/4] sched, numa: Ignore pinned tasks

From: Peter Zijlstra
Date: Fri May 15 2015 - 11:51:11 EST


Per-CPU (kernel) threads can currently trick the load balancer into
thinking there is work to be moved, which can result in perfectly
placed tasks being migrated away.

By virtue of the new do_set_cpus_allowed() we can easily add nr_pinned
accounting, which we then use to extend the fbq classification so that
such well-placed tasks are less likely to be picked for migration.

Note that it is still possible for these well-placed tasks to be
migrated away; further patches could reduce this even more.
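
To make the intended effect concrete, here is a small standalone C
sketch (illustrative only, not part of the patch; the classify() helper
and the example counts are made up) that mimics the new
fbq_classify_rq() logic. On the load-balancing path the resulting class
is compared against the group's class and more constrained classes are
skipped, so a runqueue holding only pinned kthreads plus a well-placed
numa task drops from 'regular' to 'movable' and stops looking like an
attractive migration source:

#include <stdio.h>

enum fbq_type { regular, remote, movable, all };

/* Mimics the patched fbq_classify_rq() with plain counters. */
static enum fbq_type classify(unsigned int h_nr_running,
			      unsigned int nr_pinned_running,
			      unsigned int nr_numa_running,
			      unsigned int nr_preferred_running)
{
	unsigned int nr_migratable = h_nr_running - nr_pinned_running;

	if (nr_migratable > nr_numa_running)
		return regular;	/* there are migratable !numa tasks */
	if (nr_migratable > nr_preferred_running)
		return remote;	/* migratable numa tasks on the wrong node */
	if (nr_migratable)
		return movable;	/* only well-placed (but movable) tasks */
	return all;		/* nothing we can usefully migrate */
}

int main(void)
{
	static const char * const name[] = { "regular", "remote", "movable", "all" };

	/* 2 pinned kthreads + 1 numa task on its preferred node:
	 * previously classified regular (3 > 1), now movable. */
	printf("%s\n", name[classify(3, 2, 1, 1)]);

	/* only pinned kthreads: nothing migratable at all. */
	printf("%s\n", name[classify(2, 2, 0, 0)]);

	/* an extra unpinned !numa task makes it regular again. */
	printf("%s\n", name[classify(3, 1, 1, 1)]);

	return 0;
}

The enum ordering is what makes 'movable' and 'all' queues sort behind
'regular' and 'remote' ones when find_busiest_queue() compares a
runqueue's class against the group's class.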

Suggested-by: Rik van Riel <riel@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++--------
kernel/sched/sched.h | 1 +
2 files changed, 35 insertions(+), 8 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -858,12 +858,20 @@ static unsigned int task_scan_max(struct

 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
+	if (p->nr_cpus_allowed == 1) {
+		p->numa_preferred_nid = -1;
+		rq->nr_pinned_running++;
+	}
 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
 }

 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
+	if (p->nr_cpus_allowed == 1) {
+		rq->nr_pinned_running--;
+		WARN_ON_ONCE(p->numa_preferred_nid != -1);
+	}
 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
 }
@@ -2266,6 +2274,12 @@ void task_tick_numa(struct rq *rq, struc
 		return;

 	/*
+	 * We don't care about NUMA placement if we can't migrate the task.
+	 */
+	if (curr->nr_cpus_allowed == 1)
+		return;
+
+	/*
 	 * Using runtime rather than walltime has the dual advantage that
 	 * we (mostly) drive the selection from busy threads and that the
 	 * task needs to have done some actual work before we bother with
@@ -5567,7 +5581,7 @@ static bool yield_to_task_fair(struct rq

 static unsigned long __read_mostly max_load_balance_interval = HZ/10;

-enum fbq_type { regular, remote, all };
+enum fbq_type { regular, remote, movable, all };

 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
@@ -6112,6 +6126,7 @@ struct sg_lb_stats {
 	enum group_type group_type;
 	int group_no_capacity;
 #ifdef CONFIG_NUMA_BALANCING
+	unsigned int nr_pinned_running;
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
 #endif
@@ -6449,6 +6464,7 @@ static inline void update_sg_lb_stats(st
 			*overload = true;

 #ifdef CONFIG_NUMA_BALANCING
+		sgs->nr_pinned_running += rq->nr_pinned_running;
 		sgs->nr_numa_running += rq->nr_numa_running;
 		sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
@@ -6522,19 +6538,27 @@ static bool update_sd_pick_busiest(struc
 #ifdef CONFIG_NUMA_BALANCING
 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
 {
-	if (sgs->sum_nr_running > sgs->nr_numa_running)
+	unsigned int nr_migratable = sgs->sum_nr_running - sgs->nr_pinned_running;
+
+	if (nr_migratable > sgs->nr_numa_running)
 		return regular;
-	if (sgs->sum_nr_running > sgs->nr_preferred_running)
+	if (nr_migratable > sgs->nr_preferred_running)
 		return remote;
+	if (nr_migratable)
+		return movable;
 	return all;
 }

 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 {
-	if (rq->nr_running > rq->nr_numa_running)
+	unsigned int nr_migratable = rq->cfs.h_nr_running - rq->nr_pinned_running;
+
+	if (nr_migratable > rq->nr_numa_running)
 		return regular;
-	if (rq->nr_running > rq->nr_preferred_running)
+	if (nr_migratable > rq->nr_preferred_running)
 		return remote;
+	if (nr_migratable)
+		return movable;
 	return all;
 }
 #else
@@ -6938,9 +6962,11 @@ static struct rq *find_busiest_queue(str

 		/*
 		 * We classify groups/runqueues into three groups:
-		 *  - regular: there are !numa tasks
-		 *  - remote:  there are numa tasks that run on the 'wrong' node
-		 *  - all:     there is no distinction
+		 *  - regular: there are (migratable) !numa tasks
+		 *  - remote:  there are (migratable) numa tasks that
+		 *             run on the 'wrong' node
+		 *  - movable: there are (migratable) tasks
+		 *  - all:     there are tasks
 		 *
 		 * In order to avoid migrating ideally placed numa tasks,
 		 * ignore those when there's better options.
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -571,6 +571,7 @@ struct rq {
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
+	unsigned int nr_pinned_running;
 #endif
 #define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];

