[PATCH 41/46] sched: numa: Introduce per-mm and per-task structures

From: Mel Gorman
Date: Wed Nov 21 2012 - 05:24:10 EST


NOTE: This is heavily based on "autonuma: CPU follows memory algorithm"
and "autonuma: mm_autonuma and task_autonuma data structures"

At the most basic level, any placement policy is going to make some
sort of smart decision based on per-mm and per-task statistics. This
patch simply introduces the structures with basic fault statistics
that can be expanded upon or replaced later. It may be that a placement
policy can approximate without needing both structures in which case
they can be safely deleted later while still having a comparison point
to ensure the approximation is accurate.

[dhillf@xxxxxxxxx: Use @pages parameter for fault statistics]
Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
---
include/linux/mm_types.h | 26 ++++++++++++++++++++++++++
include/linux/sched.h | 18 ++++++++++++++++++
kernel/fork.c | 18 ++++++++++++++++++
kernel/sched/core.c | 3 +++
kernel/sched/fair.c | 25 ++++++++++++++++++++++++-
kernel/sched/sched.h | 14 ++++++++++++++
6 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6b478ff..9588a91 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -312,6 +312,29 @@ struct mm_rss_stat {
atomic_long_t count[NR_MM_COUNTERS];
};

+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-mm structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults.
+ */
+struct mm_balancenuma {
+ /*
+ * Number of pages that have triggered NUMA faults for this mm, as
+ * accumulated by task_numa_fault(). Intended to decay so that only
+ * recent events are tracked (decay is not implemented yet).
+ */
+ unsigned long mm_numa_fault_tot;
+
+ /*
+ * Number of pages that have triggered NUMA faults for each [nid];
+ * allocated with nr_node_ids entries. Also intended to decay.
+ */
+ unsigned long mm_numa_fault[];
+
+ /* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
@@ -415,6 +438,9 @@ struct mm_struct {

/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
+
+ /* this is used by the scheduler and the page allocator */
+ struct mm_balancenuma *mm_balancenuma;
#endif
struct uprobes_state uprobes_state;
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1cccfc3..7b6625a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1188,6 +1188,23 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};

+#ifdef CONFIG_BALANCE_NUMA
+/*
+ * Per-task structure that contains the NUMA memory placement statistics
+ * generated by pte_numa faults. It is allocated by task_numa_fault() when
+ * the first pte_numa fault is handled, with nr_node_ids array entries.
+ */
+struct task_balancenuma {
+ /* Total number of eligible pages that triggered NUMA faults */
+ unsigned long task_numa_fault_tot;
+
+ /* Number of pages that triggered NUMA faults for each [nid] */
+ unsigned long task_numa_fault[];
+
+ /* do not add more variables here, the above array size is dynamic */
+};
+#endif /* CONFIG_BALANCE_NUMA */
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1488,6 +1505,7 @@ struct task_struct {
unsigned int numa_scan_period;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
+ struct task_balancenuma *task_balancenuma;
#endif /* CONFIG_BALANCE_NUMA */

struct rcu_head rcu;
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7..c8752f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -525,6 +525,20 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}

+#ifdef CONFIG_BALANCE_NUMA
+/* Release the per-mm NUMA fault statistics and clear the pointer */
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+ /* kfree(NULL) is a no-op, so no NULL check is needed */
+ kfree(mm->mm_balancenuma);
+ mm->mm_balancenuma = NULL;
+}
+#else
+static inline void free_mm_balancenuma(struct mm_struct *mm)
+{
+}
+#endif
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
@@ -539,6 +553,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
spin_lock_init(&mm->page_table_lock);
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
+ mm->mm_balancenuma = NULL;
mm_init_aio(mm);
mm_init_owner(mm, p);

@@ -548,6 +563,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
return mm;
}

+ free_mm_balancenuma(mm);
free_mm(mm);
return NULL;
}
@@ -597,6 +613,7 @@ void __mmdrop(struct mm_struct *mm)
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
+ free_mm_balancenuma(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -854,6 +871,7 @@ fail_nocontext:
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
*/
+ free_mm_balancenuma(mm);
mm_free_pgd(mm);
free_mm(mm);
return NULL;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d9fc26..9472d5d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1543,6 +1543,7 @@ static void __sched_fork(struct task_struct *p)
p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ p->task_balancenuma = NULL;
p->numa_scan_period = sysctl_balance_numa_scan_delay;
p->numa_work.next = &p->numa_work;
#endif /* CONFIG_BALANCE_NUMA */
@@ -1787,6 +1788,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
+ free_task_balancenuma(prev);
+
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 462de9b..fc8f95d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -855,7 +855,30 @@ void task_numa_fault(int node, int pages)
{
struct task_struct *p = current;

- /* FIXME: Allocate task-specific structure for placement policy here */
+ if (!p->task_balancenuma) {
+ int size = sizeof(struct task_balancenuma) +
+ (sizeof(unsigned long) * nr_node_ids);
+ p->task_balancenuma = kzalloc(size, GFP_KERNEL);
+ if (!p->task_balancenuma)
+ return;
+ }
+
+ if (!p->mm->mm_balancenuma) {
+ int size = sizeof(struct mm_balancenuma) +
+ (sizeof(unsigned long) * nr_node_ids);
+ p->mm->mm_balancenuma = kzalloc(size, GFP_KERNEL);
+ if (!p->mm->mm_balancenuma) {
+ kfree(p->task_balancenuma);
+ p->task_balancenuma = NULL;
+ return;
+ }
+ }
+
+ /* Record fault statistics */
+ p->task_balancenuma->task_numa_fault_tot += pages;
+ p->task_balancenuma->task_numa_fault[node] += pages;
+ p->mm->mm_balancenuma->mm_numa_fault_tot += pages;
+ p->mm->mm_balancenuma->mm_numa_fault[node] += pages;

/*
* Assume that as faults occur that pages are getting properly placed
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3f0e5a1..92df3d4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -502,6 +502,20 @@ DECLARE_PER_CPU(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))

+
+#ifdef CONFIG_BALANCE_NUMA
+/* Free the per-task NUMA fault statistics; kfree(NULL) is a no-op */
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+ kfree(p->task_balancenuma);
+ p->task_balancenuma = NULL;
+}
+#else
+static inline void free_task_balancenuma(struct task_struct *p)
+{
+}
+#endif /* CONFIG_BALANCE_NUMA */
+
#ifdef CONFIG_SMP

#define rcu_dereference_check_sched_domain(p) \
--
1.7.9.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/