Re: [PATCH 7/8] sched: Split accounting of NUMA hinting faults that pass two-stage filter

From: Srikar Dronamraju
Date: Fri Jun 28 2013 - 03:01:38 EST


* Mel Gorman <mgorman@xxxxxxx> [2013-06-26 15:38:06]:

> Ideally it would be possible to distinguish between NUMA hinting faults
> that are private to a task and those that are shared. This would require
> that the last task that accessed a page for a hinting fault would be
> recorded which would increase the size of struct page. Instead this patch
> approximates private pages by assuming that faults that pass the two-stage
> filter are private pages and all others are shared. The preferred NUMA
> node is then selected based on where the maximum number of approximately
> private faults were measured.
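
If I am reading the classification right, it boils down to something
like this sketch (my paraphrase of the code below, not the patch
verbatim):

	/*
	 * Approximation from the changelog: treat a fault as private
	 * when the faulting task is running on the same node that
	 * last faulted on this page.
	 */
	int priv = (cpu_to_node(task_cpu(p)) == last_nid);
	p->numa_faults_buffer[task_faults_idx(node, priv)] += pages << migrated;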

Should we consider only private faults when selecting the preferred node?
I would think that if tasks have shared pages, then moving all tasks that
share the same pages to the node where those shared pages reside would be
preferable. No? If yes, how does the preferred node logic help to achieve
that?
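
For instance, I was imagining something like the below in
task_numa_placement() (purely illustrative, untested):

	/*
	 * Hypothetical: let shared faults count towards node selection
	 * too, so that tasks touching the same pages gravitate to the
	 * node holding those pages.
	 */
	faults = p->numa_faults[task_faults_idx(nid, 1)] +
		 p->numa_faults[task_faults_idx(nid, 0)];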

>
> Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
> ---
> include/linux/sched.h | 4 ++--
> kernel/sched/fair.c | 32 ++++++++++++++++++++++----------
> mm/huge_memory.c | 7 ++++---
> mm/memory.c | 9 ++++++---
> 4 files changed, 34 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 82a6136..a41edea 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1600,10 +1600,10 @@ struct task_struct {
> #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>
> #ifdef CONFIG_NUMA_BALANCING
> -extern void task_numa_fault(int node, int pages, bool migrated);
> +extern void task_numa_fault(int last_node, int node, int pages, bool migrated);
> extern void set_numabalancing_state(bool enabled);
> #else
> -static inline void task_numa_fault(int node, int pages, bool migrated)
> +static inline void task_numa_fault(int last_node, int node, int pages, bool migrated)
> {
> }
> static inline void set_numabalancing_state(bool enabled)
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 99951a8..490e601 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -833,6 +833,11 @@ find_idlest_cpu_node(int this_cpu, int nid)
> return idlest_cpu;
> }
>
> +static inline int task_faults_idx(int nid, int priv)
> +{
> + return 2 * nid + priv;
> +}
> +
> static void task_numa_placement(struct task_struct *p)
> {
> int seq, nid, max_nid = 0;
> @@ -849,13 +854,19 @@ static void task_numa_placement(struct task_struct *p)
> /* Find the node with the highest number of faults */
> for (nid = 0; nid < nr_node_ids; nid++) {
> unsigned long faults;
> + int priv, i;
>
> - /* Decay existing window and copy faults since last scan */
> - p->numa_faults[nid] >>= 1;
> - p->numa_faults[nid] += p->numa_faults_buffer[nid];
> - p->numa_faults_buffer[nid] = 0;
> + for (priv = 0; priv < 2; priv++) {
> + i = task_faults_idx(nid, priv);
> +
> + /* Decay existing window and copy faults since last scan */
> + p->numa_faults[i] >>= 1;
> + p->numa_faults[i] += p->numa_faults_buffer[i];
> + p->numa_faults_buffer[i] = 0;
> + }
>
> - faults = p->numa_faults[nid];
> + /* Find maximum private faults */
> + faults = p->numa_faults[task_faults_idx(nid, 1)];
> if (faults > max_faults) {
> max_faults = faults;
> max_nid = nid;
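
(Aside, mostly spelling out the new layout for myself: task_faults_idx()
flattens a [nid][priv] pair into a single index, so on a two-node
machine the counters end up as

	numa_faults[0]	/* node 0, shared  */
	numa_faults[1]	/* node 0, private */
	numa_faults[2]	/* node 1, shared  */
	numa_faults[3]	/* node 1, private */

and the max-search above deliberately looks only at the odd, private
slots.)
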
> @@ -887,24 +898,25 @@ static void task_numa_placement(struct task_struct *p)
> /*
> * Got a PROT_NONE fault for a page on @node.
> */
> -void task_numa_fault(int node, int pages, bool migrated)
> +void task_numa_fault(int last_nid, int node, int pages, bool migrated)
> {
> struct task_struct *p = current;
> + int priv = (cpu_to_node(task_cpu(p)) == last_nid);
>
> if (!sched_feat_numa(NUMA))
> return;
>
> /* Allocate buffer to track faults on a per-node basis */
> if (unlikely(!p->numa_faults)) {
> - int size = sizeof(*p->numa_faults) * nr_node_ids;
> + int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
>
> /* numa_faults and numa_faults_buffer share the allocation */
> - p->numa_faults = kzalloc(size * 2, GFP_KERNEL);
> + p->numa_faults = kzalloc(size * 2, GFP_KERNEL);
> if (!p->numa_faults)
> return;
>
> BUG_ON(p->numa_faults_buffer);
> - p->numa_faults_buffer = p->numa_faults + nr_node_ids;
> + p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
> }
>
> /*
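
(One more layout note: since "size" above already covers both the shared
and private counters for every node, the single allocation splits as

	p->numa_faults        -> 2 * nr_node_ids counters
	p->numa_faults_buffer -> 2 * nr_node_ids counters

which is why numa_faults_buffer starts at offset 2 * nr_node_ids.)
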
> @@ -918,7 +930,7 @@ void task_numa_fault(int node, int pages, bool migrated)
> task_numa_placement(p);
>
> /* Record the fault, double the weight if pages were migrated */
> - p->numa_faults_buffer[node] += pages << migrated;
> + p->numa_faults_buffer[task_faults_idx(node, priv)] += pages << migrated;
> }
>
> static void reset_ptenuma_scan(struct task_struct *p)
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index e2f7f5aa..7cd7114 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1292,7 +1292,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> {
> struct page *page;
> unsigned long haddr = addr & HPAGE_PMD_MASK;
> - int target_nid;
> + int target_nid, last_nid;
> int current_nid = -1;
> bool migrated;
>
> @@ -1307,6 +1307,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> if (current_nid == numa_node_id())
> count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
>
> + last_nid = page_nid_last(page);
> target_nid = mpol_misplaced(page, vma, haddr);
> if (target_nid == -1) {
> put_page(page);
> @@ -1332,7 +1333,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> if (!migrated)
> goto check_same;
>
> - task_numa_fault(target_nid, HPAGE_PMD_NR, true);
> + task_numa_fault(last_nid, target_nid, HPAGE_PMD_NR, true);
> return 0;
>
> check_same:
> @@ -1347,7 +1348,7 @@ clear_pmdnuma:
> out_unlock:
> spin_unlock(&mm->page_table_lock);
> if (current_nid != -1)
> - task_numa_fault(current_nid, HPAGE_PMD_NR, false);
> + task_numa_fault(last_nid, current_nid, HPAGE_PMD_NR, false);
> return 0;
> }
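
One thing worth calling out for anyone else reading this hunk: the
page_nid_last() read is done before mpol_misplaced(), which I believe
matters because the two-stage filter inside mpol_misplaced() updates
the page's last nid via page_nid_xchg_last():

	last_nid = page_nid_last(page);	/* must happen first */
	target_nid = mpol_misplaced(page, vma, haddr);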
>
> diff --git a/mm/memory.c b/mm/memory.c
> index ba94dec..c28bf52 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3536,7 +3536,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> {
> struct page *page = NULL;
> spinlock_t *ptl;
> - int current_nid = -1;
> + int current_nid = -1, last_nid;
> int target_nid;
> bool migrated = false;
>
> @@ -3566,6 +3566,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> return 0;
> }
>
> + last_nid = page_nid_last(page);
> current_nid = page_to_nid(page);
> target_nid = numa_migrate_prep(page, vma, addr, current_nid);
> pte_unmap_unlock(ptep, ptl);
> @@ -3586,7 +3587,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
>
> out:
> if (current_nid != -1)
> - task_numa_fault(current_nid, 1, migrated);
> + task_numa_fault(last_nid, current_nid, 1, migrated);
> return 0;
> }
>
> @@ -3602,6 +3603,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> spinlock_t *ptl;
> bool numa = false;
> int local_nid = numa_node_id();
> + int last_nid;
>
> spin_lock(&mm->page_table_lock);
> pmd = *pmdp;
> @@ -3654,6 +3656,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> * migrated to.
> */
> curr_nid = local_nid;
> + last_nid = page_nid_last(page);
> target_nid = numa_migrate_prep(page, vma, addr,
> page_to_nid(page));
> if (target_nid == -1) {
> @@ -3666,7 +3669,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
> migrated = migrate_misplaced_page(page, target_nid);
> if (migrated)
> curr_nid = target_nid;
> - task_numa_fault(curr_nid, 1, migrated);
> + task_numa_fault(last_nid, curr_nid, 1, migrated);
>
> pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> }
> --
> 1.8.1.4

--
Thanks and Regards
Srikar Dronamraju
