[PATCH 29/31] sched: numa: CPU follows memory

From: Mel Gorman
Date: Tue Nov 13 2012 - 06:14:13 EST


NOTE: This is heavily based on "autonuma: CPU follows memory algorithm"
and "autonuma: mm_autonuma and task_autonuma data structures"
with bits taken but worked within the scheduler hooks and home
node mechanism as defined by schednuma.

This patch adds per-mm and per-task data structures to track the number
of faults in total and on a per-nid basis. On each NUMA fault it
checks if the system would benefit if the current task was migrated
to another node. If the task should be migrated, its home node is
updated and the task is requeued.

Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
---
include/linux/mm_types.h | 26 +++++
include/linux/sched.h | 19 +++-
kernel/fork.c | 18 +++
kernel/sched/core.c | 3 +
kernel/sched/fair.c | 275 ++++++++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 14 +++
6 files changed, 344 insertions(+), 11 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index b40f4ef..66172d6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -308,6 +308,29 @@ struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};

+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-mm structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults.
+ */
+struct mm_balancenuma {
+ /*
+ * Number of pages that will trigger NUMA faults for this mm. The total
+ * decays each time the home node choice is re-evaluated so that only
+ * recent events are tracked
+ */
+ unsigned long mm_numa_fault_tot;
+
+ /*
+ * Number of pages that will trigger NUMA faults for each [nid].
+ * Also decays.
+ */
+ unsigned long mm_numa_fault[0];
+
+ /* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
@@ -411,6 +434,9 @@ struct mm_struct {

/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
+
+ /* this is used by the scheduler and the page allocator */
+ struct mm_balancenuma *mm_balancenuma;
#endif
struct uprobes_state uprobes_state;
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7ebf32e..336ec68 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1188,6 +1188,23 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};

+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-task structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults. This structure is dynamically allocated
+ * when the first pte_numa fault is handled.
+ */
+struct task_balancenuma {
+ /* Total number of eligible pages that triggered NUMA faults */
+ unsigned long task_numa_fault_tot;
+
+ /* Number of pages that triggered NUMA faults for each [nid] */
+ unsigned long task_numa_fault[0];
+
+ /* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1488,6 +1505,7 @@ struct task_struct {
unsigned int numa_scan_period;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
+ struct task_balancenuma *task_balancenuma;
#endif /* CONFIG_BALANCE_NUMA */

struct rcu_head rcu;
@@ -2022,7 +2040,6 @@ extern unsigned int sysctl_balance_numa_scan_delay;
extern unsigned int sysctl_balance_numa_scan_period_min;
extern unsigned int sysctl_balance_numa_scan_period_max;
extern unsigned int sysctl_balance_numa_scan_size;
-extern unsigned int sysctl_balance_numa_settle_count;

#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7..c8752f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -525,6 +525,20 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}

+#ifdef CONFIG_BALANCE_NUMA
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+ if (mm->mm_balancenuma)
+ kfree(mm->mm_balancenuma);
+
+ mm->mm_balancenuma = NULL;
+}
+#else
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+}
+#endif
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
@@ -539,6 +553,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
+ mm->mm_balancenuma = NULL;
mm_init_aio(mm);
mm_init_owner(mm, p);

@@ -548,6 +563,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
return mm;
}

+ free_mm_balancenuma(mm);
free_mm(mm);
return NULL;
}
@@ -597,6 +613,7 @@ void __mmdrop(struct mm_struct *mm)
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
+ free_mm_balancenuma(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -854,6 +871,7 @@ fail_nocontext:
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
*/
+ free_mm_balancenuma(mm);
mm_free_pgd(mm);
free_mm(mm);
return NULL;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d9fc26..9472d5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1543,6 +1543,7 @@ static void __sched_fork(struct task_struct *p)
p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ p->task_balancenuma = NULL;
p->numa_scan_period = sysctl_balance_numa_scan_delay;
p->numa_work.next = &p->numa_work;
#endif /* CONFIG_BALANCE_NUMA */
@@ -1787,6 +1788,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
+ free_task_balancenuma(prev);
+
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a816bbe..abcf7f5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -836,15 +836,234 @@ unsigned int sysctl_balance_numa_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_balance_numa_scan_delay = 1000;

+#define BALANCENUMA_SCALE 1000
+static inline unsigned long balancenuma_weight(unsigned long nid_faults,
+ unsigned long total_faults)
+{
+ if (nid_faults > total_faults)
+ nid_faults = total_faults;
+
+ return nid_faults * BALANCENUMA_SCALE / total_faults;
+}
+
+static inline unsigned long balancenuma_task_weight(struct task_struct *p,
+ int nid)
+{
+ struct task_balancenuma *task_balancenuma = p->task_balancenuma;
+ unsigned long nid_faults, total_faults;
+
+ nid_faults = task_balancenuma->task_numa_fault[nid];
+ total_faults = task_balancenuma->task_numa_fault_tot;
+ return balancenuma_weight(nid_faults, total_faults);
+}
+
+static inline unsigned long balancenuma_mm_weight(struct task_struct *p,
+ int nid)
+{
+ struct mm_balancenuma *mm_balancenuma = p->mm->mm_balancenuma;
+ unsigned long nid_faults, total_faults;
+
+ nid_faults = mm_balancenuma->mm_numa_fault[nid];
+ total_faults = mm_balancenuma->mm_numa_fault_tot;
+
+ /* It's possible for total_faults to decay to 0 in parallel so check */
+ return total_faults ? balancenuma_weight(nid_faults, total_faults) : 0;
+}
+
+/*
+ * Examines all other nodes examining remote tasks to see if there would
+ * be fewer remote numa faults if tasks swapped home nodes
+ */
+static void task_numa_find_placement(struct task_struct *p)
+{
+ struct cpumask *allowed = tsk_cpus_allowed(p);
+ int this_cpu = smp_processor_id();
+ int this_nid = numa_node_id();
+ long p_task_weight, p_mm_weight;
+ long weight_diff_max = 0;
+ struct task_struct *selected_task = NULL;
+ int selected_nid = -1;
+ int nid;
+
+ p_task_weight = balancenuma_task_weight(p, this_nid);
+ p_mm_weight = balancenuma_mm_weight(p, this_nid);
+
+ /* Examine a task on every other node */
+ for_each_online_node(nid) {
+ int cpu;
+ for_each_cpu_and(cpu, cpumask_of_node(nid), allowed) {
+ struct rq *rq;
+ struct mm_struct *other_mm;
+ struct task_struct *other_task;
+ long this_weight, other_weight, p_weight;
+ long other_diff, this_diff;
+
+ if (!cpu_online(cpu) || idle_cpu(cpu))
+ continue;
+
+ /* Racy check if a task is running on the other rq */
+ rq = cpu_rq(cpu);
+ other_mm = rq->curr->mm;
+ if (!other_mm || !other_mm->mm_balancenuma)
+ continue;
+
+ /* Effectively pin the other task to get fault stats */
+ raw_spin_lock_irq(&rq->lock);
+ other_task = rq->curr;
+ other_mm = other_task->mm;
+
+ /* Ensure the other task has usable stats */
+ if (!other_task->task_balancenuma ||
+ !other_task->task_balancenuma->task_numa_fault_tot ||
+ !other_mm ||
+ !other_mm->mm_balancenuma ||
+ !other_mm->mm_balancenuma->mm_numa_fault_tot) {
+ raw_spin_unlock_irq(&rq->lock);
+ continue;
+ }
+
+ /* Ensure the other task can be swapped */
+ if (!cpumask_test_cpu(this_cpu,
+ tsk_cpus_allowed(other_task))) {
+ raw_spin_unlock_irq(&rq->lock);
+ continue;
+ }
+
+ /*
+ * Read the fault statistics. If the remote task is a
+ * thread in the process then use the task statistics.
+ * Otherwise use the per-mm statistics.
+ */
+ if (other_mm == p->mm) {
+ this_weight = balancenuma_task_weight(p, nid);
+ other_weight = balancenuma_task_weight(other_task, nid);
+ p_weight = p_task_weight;
+ } else {
+ this_weight = balancenuma_mm_weight(p, nid);
+ other_weight = balancenuma_mm_weight(other_task, nid);
+ p_weight = p_mm_weight;
+ }
+
+ raw_spin_unlock_irq(&rq->lock);
+
+ /*
+ * other_diff: How much does the current task prefer to
+ * run on the remote node than the task that is
+ * currently running there?
+ */
+ other_diff = this_weight - other_weight;
+
+ /*
+ * this_diff: How much does the current task prefer to
+ * run on the remote NUMA node compared to the current
+ * node?
+ */
+ this_diff = this_weight - p_weight;
+
+ /*
+ * Would swapping the tasks reduce the overall
+ * cross-node NUMA faults?
+ */
+ if (other_diff > 0 && this_diff > 0) {
+ long weight_diff = other_diff + this_diff;
+
+ /* Remember the best candidate. */
+ if (weight_diff > weight_diff_max) {
+ weight_diff_max = weight_diff;
+ selected_nid = nid;
+ selected_task = other_task;
+ }
+ }
+ }
+ }
+
+ /* Swap the task on the selected target node */
+ if (selected_nid != -1) {
+ sched_setnode(p, selected_nid);
+ sched_setnode(selected_task, this_nid);
+ }
+}
+
static void task_numa_placement(struct task_struct *p)
{
+ unsigned long task_total, mm_total;
+ struct mm_balancenuma *mm_balancenuma;
+ struct task_balancenuma *task_balancenuma;
+ unsigned long mm_max_weight, task_max_weight;
+ int this_nid, nid, mm_selected_nid, task_selected_nid;
+
int seq = ACCESS_ONCE(p->mm->numa_scan_seq);

if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;

- /* FIXME: Scheduling placement policy hints go here */
+ this_nid = numa_node_id();
+ mm_balancenuma = p->mm->mm_balancenuma;
+ task_balancenuma = p->task_balancenuma;
+
+ /* If the task has no NUMA hinting page faults, use current nid */
+ mm_total = ACCESS_ONCE(mm_balancenuma->mm_numa_fault_tot);
+ if (!mm_total)
+ return;
+ task_total = task_balancenuma->task_numa_fault_tot;
+ if (!task_total)
+ return;
+
+ /*
+ * Identify the NUMA node where this thread (task_struct), and
+ * the process (mm_struct) as a whole, has the largest number
+ * of NUMA faults
+ */
+ mm_selected_nid = task_selected_nid = -1;
+ mm_max_weight = task_max_weight = 0;
+ for_each_online_node(nid) {
+ unsigned long mm_nid_fault, task_nid_fault;
+ unsigned long mm_numa_weight, task_numa_weight;
+
+ /* Read the number of task and mm faults on node */
+ mm_nid_fault = ACCESS_ONCE(mm_balancenuma->mm_numa_fault[nid]);
+ task_nid_fault = task_balancenuma->task_numa_fault[nid];
+
+ /*
+ * The weights are the relative number of pte_numa faults that
+ * were handled on this node in comparison to all pte_numa faults
+ * overall
+ */
+ mm_numa_weight = balancenuma_weight(mm_nid_fault, mm_total);
+ task_numa_weight = balancenuma_weight(task_nid_fault, task_total);
+ if (mm_numa_weight > mm_max_weight) {
+ mm_max_weight = mm_numa_weight;
+ mm_selected_nid = nid;
+ }
+ if (task_numa_weight > task_max_weight) {
+ task_max_weight = task_numa_weight;
+ task_selected_nid = nid;
+ }
+
+ /* Decay the stats by a factor of 2 */
+ p->mm->mm_balancenuma->mm_numa_fault[nid] >>= 1;
+ }
+
+ /* Recheck for a usable task_numa_fault_tot after decaying */
+ if (!task_balancenuma->task_numa_fault_tot ||
+ !mm_balancenuma->mm_numa_fault_tot)
+ return;
+
+ /*
+ * If this NUMA node is the selected one based on process
+ * memory and task NUMA faults then set the home node.
+ * There should be no need to requeue the task.
+ */
+ if (task_selected_nid == this_nid && mm_selected_nid == this_nid) {
+ p->numa_scan_period = min(sysctl_balance_numa_scan_period_max,
+ p->numa_scan_period * 2);
+ p->home_node = this_nid;
+ return;
+ }
+
+ p->numa_scan_period = sysctl_balance_numa_scan_period_min;
+ task_numa_find_placement(p);
}

/*
@@ -854,7 +1073,30 @@ void task_numa_fault(int node, int pages, bool misplaced)
{
struct task_struct *p = current;

- /* FIXME: Allocate task-specific structure for placement policy here */
+ if (!p->task_balancenuma) {
+ int size = sizeof(struct task_balancenuma) +
+ (sizeof(unsigned long) * nr_node_ids);
+ p->task_balancenuma = kzalloc(size, GFP_KERNEL);
+ if (!p->task_balancenuma)
+ return;
+ }
+
+ if (!p->mm->mm_balancenuma) {
+ int size = sizeof(struct mm_balancenuma) +
+ (sizeof(unsigned long) * nr_node_ids);
+ p->mm->mm_balancenuma = kzalloc(size, GFP_KERNEL);
+ if (!p->mm->mm_balancenuma) {
+ kfree(p->task_balancenuma);
+ p->task_balancenuma = NULL;
+ return;
+ }
+ }
+
+ /* Record fault statistics */
+ p->task_balancenuma->task_numa_fault_tot++;
+ p->task_balancenuma->task_numa_fault[node]++;
+ p->mm->mm_balancenuma->mm_numa_fault_tot++;
+ p->mm->mm_balancenuma->mm_numa_fault[node]++;

/*
* task_numa_placement can be expensive so only call it if pages were
@@ -864,6 +1106,21 @@ void task_numa_fault(int node, int pages, bool misplaced)
task_numa_placement(p);
}

+static void reset_ptenuma_scan(struct task_struct *p)
+{
+ ACCESS_ONCE(p->mm->numa_scan_seq)++;
+
+ if (p->mm && p->mm->mm_balancenuma)
+ p->mm->mm_balancenuma->mm_numa_fault_tot >>= 1;
+ if (p->task_balancenuma) {
+ int nid;
+ p->task_balancenuma->task_numa_fault_tot >>= 1;
+ for_each_online_node(nid) {
+ p->task_balancenuma->task_numa_fault[nid] >>= 1;
+ }
+ }
+}
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
@@ -912,7 +1169,7 @@ void task_numa_work(struct callback_head *work)
down_read(&mm->mmap_sem);
vma = find_vma(mm, offset);
if (!vma) {
- ACCESS_ONCE(mm->numa_scan_seq)++;
+ reset_ptenuma_scan(p);
offset = 0;
vma = mm->mmap;
}
@@ -937,14 +1194,12 @@ void task_numa_work(struct callback_head *work)
* It is possible to reach the end of the VMA list but the last few VMAs are
* not guaranteed to the vma_migratable. If they are not, we would find the
* !migratable VMA on the next scan but not reset the scanner to the start
- * so check it now.
+ * so we must check it now.
*/
- if (!vma) {
- ACCESS_ONCE(mm->numa_scan_seq)++;
- offset = 0;
- vma = mm->mmap;
- }
- mm->numa_scan_offset = offset;
+ if (vma)
+ mm->numa_scan_offset = offset;
+ else
+ reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);
}

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3f0e5a1..92df3d4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -502,6 +502,20 @@ DECLARE_PER_CPU(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))

+
+#ifdef CONFIG_BALANCE_NUMA
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+ if (p->task_balancenuma)
+ kfree(p->task_balancenuma);
+ p->task_balancenuma = NULL;
+}
+#else
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+}
+#endif /* CONFIG_BALANCE_NUMA */
+
#ifdef CONFIG_SMP

#define rcu_dereference_check_sched_domain(p) \
--
1.7.9.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/