[PATCH] mm: mempolicy: N:M interleave policy for tiered memory nodes

From: Johannes Weiner
Date: Tue Jun 07 2022 - 13:24:13 EST


From: Hasan Al Maruf <hasanalmaruf@xxxxxx>

Existing interleave policy spreads out pages evenly across a set of
specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
have CPU-less memory nodes whose peak bandwidth and
latency-vs-bandwidth characteristics differ from those of toptier
memory. In such systems, we will want to
use the additional bandwidth provided by lowtier memory for
bandwidth-intensive applications. However, the default 1:1 interleave
can lead to suboptimal bandwidth distribution.

Introduce an N:M interleave policy, where N pages allocated to the
top-tier nodes are followed by M pages allocated to lowtier nodes.
This provides the capability to steer the fraction of memory traffic
that goes to toptier vs. lowtier nodes. For example, 4:1 interleave
leads to an 80%/20% traffic breakdown between toptier and lowtier.
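
As an illustration (a userspace sketch only, not kernel code; the node
IDs, the tier assignment, and the 4:1 setting are made-up assumptions),
the following program mimics the per-task counting the patch does for
an interleave policy over toptier nodes 0,1 and lowtier node 2:

#include <stdio.h>

int main(void)
{
	int toptier[] = { 0, 1 }, lowtier[] = { 2 };
	int ratio[2] = { 4, 1 };	/* vm.numa_tier_interleave = "4 1" */
	int il_count = 0, t = 0, l = 0;

	for (int page = 0; page < 10; page++) {
		int nid;

		if (il_count < ratio[0])	/* toptier share first... */
			nid = toptier[t++ % 2];
		else				/* ...then the lowtier share */
			nid = lowtier[l++ % 1];

		if (++il_count >= ratio[0] + ratio[1])
			il_count = 0;

		printf("page %d -> node %d\n", page, nid);
	}
	return 0;
}

It prints the repeating node sequence 0 1 0 1 2, i.e. eight out of
every ten pages land on the toptier nodes, matching the 80%/20% split
above.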

The ratio is configured through a new sysctl:

vm.numa_tier_interleave = toptier lowtier
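
For example, "echo 4 1 >/proc/sys/vm/numa_tier_interleave" requests
four toptier placements for every lowtier one.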

We have run experiments on bandwidth-intensive production services on
CXL-based tiered memory systems, where lowtier CXL memory has, when
compared to the toptier memory directly connected to the CPU:

- ~half of the peak bandwidth
- ~80ns higher idle latency
- steeper latency vs. bandwidth curve

Results show that regular interleaving leads to a ~40% performance
regression over baseline; 5:1 interleaving shows an ~8% improvement
over baseline. We have found that the optimal distribution changes
with the hardware characteristics: slower CXL memory shifts the
optimal breakdown from 5:1 to (e.g.) 8:1.

The sysctl only applies to processes and vmas with an "interleave"
policy and has no bearing on contexts using prefer or bind policies.
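For example, a task started with "numactl --interleave=all"
(MPOL_INTERLEAVE) follows the configured ratio, while tasks running
under --membind or --preferred policies are unaffected.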

It defaults to a setting of "1 1", which represents even interleaving,
and so is backward compatible with existing setups.

Signed-off-by: Hasan Al Maruf <hasanalmaruf@xxxxxx>
Signed-off-by: Hao Wang <haowang3@xxxxxx>
Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
---
Documentation/admin-guide/sysctl/vm.rst | 16 ++++++
include/linux/mempolicy.h | 2 +
include/linux/sched.h | 1 +
kernel/sysctl.c | 10 ++++
mm/mempolicy.c | 67 +++++++++++++++++++++++--
5 files changed, 93 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 747e325ebcd0..0247a828ec50 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -55,6 +55,7 @@ files can be found in mm/swap.c.
- nr_hugepages_mempolicy
- nr_overcommit_hugepages
- nr_trim_pages (only if CONFIG_MMU=n)
+- numa_tier_interleave
- numa_zonelist_order
- oom_dump_tasks
- oom_kill_allocating_task
@@ -597,6 +598,21 @@ The default value is 1.
See Documentation/admin-guide/mm/nommu-mmap.rst for more information.


+numa_tier_interleave
+====================
+
+This sysctl is for tiered NUMA systems. It takes a pair of values that
+configure an N:M distribution between toptier and lowtier nodes for
+the interleave memory allocation policy.
+
+The first value sets the share of pages placed on toptier nodes; the
+second value sets the share placed on lowtier nodes.
+
+Allowed values range from 1 to 100, inclusive.
+
+The default value is "1 1", i.e. an even distribution.
+
+
numa_zonelist_order
===================

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 668389b4b53d..4bd0f2a67052 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -54,6 +54,8 @@ struct mempolicy {
} w;
};

+extern int numa_tier_interleave[2];
+
/*
* Support for managing mempolicy data objects (clone, copy, destroy)
* The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fc42f7213dd9..7351cf31579b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1236,6 +1236,7 @@ struct task_struct {
/* Protected by alloc_lock: */
struct mempolicy *mempolicy;
short il_prev;
+ short il_count;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50870a1db114..cfb238c6e0da 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -21,6 +21,7 @@

#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
@@ -2139,6 +2140,15 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "numa_tier_interleave",
+ .data = &numa_tier_interleave,
+ .maxlen = sizeof(numa_tier_interleave),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
#endif
{
.procname = "hugetlb_shm_group",
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e4a409b8ac0b..3b532536cd44 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -120,6 +120,9 @@ static struct kmem_cache *sn_cache;
policied. */
enum zone_type policy_zone = 0;

+/* Toptier:lowtier interleaving ratio */
+int numa_tier_interleave[2] = { 1, 1 };
+
/*
* run-time system-wide default policy => local allocation
*/
@@ -871,8 +874,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
task_lock(current);
old = current->mempolicy;
current->mempolicy = new;
- if (new && new->mode == MPOL_INTERLEAVE)
+ if (new && new->mode == MPOL_INTERLEAVE) {
current->il_prev = MAX_NUMNODES-1;
+ current->il_count = 0;
+ }
task_unlock(current);
mpol_put(old);
ret = 0;
@@ -1881,15 +1886,47 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
return nd;
}

+static unsigned next_node_tier(int nid, struct mempolicy *policy, bool toptier)
+{
+ unsigned next = nid, start = nid;
+
+ do {
+ next = next_node_in(next, policy->nodes);
+ if (next == MAX_NUMNODES)
+ break;
+ if (toptier == node_is_toptier(next))
+ break;
+ } while (next != start);
+ return next;
+}
+
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
unsigned next;
struct task_struct *me = current;

- next = next_node_in(me->il_prev, policy->nodes);
+ if (numa_tier_interleave[0] > 1 || numa_tier_interleave[1] > 1) {
+ /*
+ * When N:M interleaving is configured, allocate N
+ * pages over toptier nodes first, then the remainder
+ * on lowtier ones.
+ */
+ if (me->il_count < numa_tier_interleave[0])
+ next = next_node_tier(me->il_prev, policy, true);
+ else
+ next = next_node_tier(me->il_prev, policy, false);
+ me->il_count++;
+ if (me->il_count >=
+ numa_tier_interleave[0] + numa_tier_interleave[1])
+ me->il_count = 0;
+ } else {
+ next = next_node_in(me->il_prev, policy->nodes);
+ }
+
if (next < MAX_NUMNODES)
me->il_prev = next;
+
return next;
}

@@ -1963,7 +2000,31 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
nnodes = nodes_weight(nodemask);
if (!nnodes)
return numa_node_id();
- target = (unsigned int)n % nnodes;
+
+ if (numa_tier_interleave[0] > 1 || numa_tier_interleave[1] > 1) {
+ unsigned vnnodes = 0;
+ int vtarget;
+
+ /*
+ * When N:M interleaving is configured, calculate a
+ * virtual target for @n in an N:M-scaled nodelist...
+ */
+ for_each_node_mask(nid, nodemask)
+ vnnodes += numa_tier_interleave[!node_is_toptier(nid)];
+ vtarget = (int)((unsigned int)n % vnnodes);
+
+ /* ...then map it back to the physical nodelist */
+ target = 0;
+ for_each_node_mask(nid, nodemask) {
+ vtarget -= numa_tier_interleave[!node_is_toptier(nid)];
+ if (vtarget < 0)
+ break;
+ target++;
+ }
+ } else {
+ target = (unsigned int)n % nnodes;
+ }
+
nid = first_node(nodemask);
for (i = 0; i < target; i++)
nid = next_node(nid, nodemask);
--
2.36.1