[RFC PATCH v4 13/27] mm/mempolicy: NP_OPS_MEMPOLICY - support private node mempolicy

From: Gregory Price

Date: Sun Feb 22 2026 - 03:50:57 EST


Some private nodes want userland to directly allocate from the node
via set_mempolicy() and mbind() - but don't want that node as normal
allocatable system memory in the fallback lists.

Add NP_OPS_MEMPOLICY flag requiring NP_OPS_MIGRATION (since mbind can
drive migrations). Only allow private nodes in policy nodemasks if
all private nodes in the mask support NP_OPS_MEMPOLICY. This prevents
__GFP_PRIVATE from unlocking nodes without NP_OPS_MEMPOLICY support.

Add __GFP_PRIVATE to mempolicy migration sites so moves to opted-in
private nodes succeed.

Update the sysfs "has_memory" attribute to include N_MEMORY_PRIVATE
nodes with NP_OPS_MEMPOLICY set, allowing existing numactl userland
tools to work without modification.

Signed-off-by: Gregory Price <gourry@xxxxxxxxxx>
---
drivers/base/node.c | 22 +++++++++++++-
include/linux/node_private.h | 40 +++++++++++++++++++++++++
include/uapi/linux/mempolicy.h | 1 +
mm/mempolicy.c | 54 ++++++++++++++++++++++++++++++----
mm/page_alloc.c | 5 ++++
5 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index e587f5781135..c08b5a948779 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -953,6 +953,10 @@ int node_private_set_ops(int nid, const struct node_private_ops *ops)
(!ops->migrate_to || !ops->folio_migrate))
return -EINVAL;

+ if ((ops->flags & NP_OPS_MEMPOLICY) &&
+ !(ops->flags & NP_OPS_MIGRATION))
+ return -EINVAL;
+
mutex_lock(&node_private_lock);
np = rcu_dereference_protected(NODE_DATA(nid)->node_private,
lockdep_is_held(&node_private_lock));
@@ -1145,6 +1149,21 @@ static ssize_t show_node_state(struct device *dev,
nodemask_pr_args(&node_states[na->state]));
}

+/* has_memory includes N_MEMORY + N_MEMORY_PRIVATE that support mempolicy. */
+static ssize_t show_has_memory(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ nodemask_t mask = node_states[N_MEMORY];
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY_PRIVATE) {
+ if (node_private_has_flag(nid, NP_OPS_MEMPOLICY))
+ node_set(nid, mask);
+ }
+
+ return sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&mask));
+}
+
#define _NODE_ATTR(name, state) \
{ __ATTR(name, 0444, show_node_state, NULL), state }

@@ -1155,7 +1174,8 @@ static struct node_attr node_state_attr[] = {
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
#endif
- [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+ [N_MEMORY] = { __ATTR(has_memory, 0444, show_has_memory, NULL),
+ N_MEMORY },
[N_MEMORY_PRIVATE] = _NODE_ATTR(has_private_memory, N_MEMORY_PRIVATE),
[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
[N_GENERIC_INITIATOR] = _NODE_ATTR(has_generic_initiator,
diff --git a/include/linux/node_private.h b/include/linux/node_private.h
index 0c5be1ee6e60..e9b58afa366b 100644
--- a/include/linux/node_private.h
+++ b/include/linux/node_private.h
@@ -86,6 +86,8 @@ struct node_private_ops {

/* Allow user/kernel migration; requires migrate_to and folio_migrate */
#define NP_OPS_MIGRATION BIT(0)
+/* Allow mempolicy-directed allocation and mbind migration to this node */
+#define NP_OPS_MEMPOLICY BIT(1)

/**
* struct node_private - Per-node container for N_MEMORY_PRIVATE nodes
@@ -276,6 +278,34 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid,

return ret;
}
+
+static inline bool node_mpol_eligible(int nid)
+{
+ bool ret;
+
+ if (!node_state(nid, N_MEMORY_PRIVATE))
+ return node_state(nid, N_MEMORY);
+
+ rcu_read_lock();
+ ret = node_private_has_flag(nid, NP_OPS_MEMPOLICY);
+ rcu_read_unlock();
+ return ret;
+}
+
+static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
+{
+ int nid;
+ bool eligible = false;
+
+ for_each_node_mask(nid, *nodes) {
+ if (!node_state(nid, N_MEMORY_PRIVATE))
+ continue;
+ if (!node_mpol_eligible(nid))
+ return false;
+ eligible = true;
+ }
+ return eligible;
+}
#endif /* CONFIG_MEMORY_HOTPLUG */

#else /* !CONFIG_NUMA */
@@ -364,6 +394,16 @@ static inline int node_private_migrate_to(struct list_head *folios, int nid,
return -ENODEV;
}

+static inline bool node_mpol_eligible(int nid)
+{
+ return false;
+}
+
+static inline bool nodes_private_mpol_allowed(const nodemask_t *nodes)
+{
+ return false;
+}
+
static inline int node_private_register(int nid, struct node_private *np)
{
return -ENODEV;
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 8fbbe613611a..b606eae983c8 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -64,6 +64,7 @@ enum {
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */
+#define MPOL_F_PRIVATE (1 << 5) /* policy targets private node; use __GFP_PRIVATE */

/*
* Enabling zone reclaim means the page allocator will attempt to fulfill
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2b0f9762d171..8ac014950e88 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -406,8 +406,6 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
- int ret;
-
/*
* Default (pol==NULL) resp. local memory policies are not a
* subject of any remapping. They also do not need any special
@@ -416,9 +414,12 @@ static int mpol_set_nodemask(struct mempolicy *pol,
if (!pol || pol->mode == MPOL_LOCAL)
return 0;

- /* Check N_MEMORY */
+ /* Check N_MEMORY and N_MEMORY_PRIVATE */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
+ nodes_and(nsc->mask2, cpuset_current_mems_allowed,
+ node_states[N_MEMORY_PRIVATE]);
+ nodes_or(nsc->mask1, nsc->mask1, nsc->mask2);

VM_BUG_ON(!nodes);

@@ -432,8 +433,13 @@ static int mpol_set_nodemask(struct mempolicy *pol,
else
pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

- ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
- return ret;
+ /* All private nodes in the mask must have NP_OPS_MEMPOLICY. */
+ if (nodes_private_mpol_allowed(&nsc->mask2))
+ pol->flags |= MPOL_F_PRIVATE;
+ else if (nodes_intersects(nsc->mask2, node_states[N_MEMORY_PRIVATE]))
+ return -EINVAL;
+
+ return mpol_ops[pol->mode].create(pol, &nsc->mask2);
}

/*
@@ -500,6 +506,7 @@ static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
nodemask_t tmp;
+ int nid;

if (pol->flags & MPOL_F_STATIC_NODES)
nodes_and(tmp, pol->w.user_nodemask, *nodes);
@@ -514,6 +521,21 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
if (nodes_empty(tmp))
tmp = *nodes;

+ /*
+ * Drop private nodes that don't have mempolicy support.
+ * cpusets guarantees at least one N_MEMORY node in effective_mems
+ * and mems_allowed, so dropping private nodes here is safe.
+ */
+ for_each_node_mask(nid, tmp) {
+ if (node_state(nid, N_MEMORY_PRIVATE) &&
+ !node_private_has_flag(nid, NP_OPS_MEMPOLICY))
+ node_clear(nid, tmp);
+ }
+ if (nodes_intersects(tmp, node_states[N_MEMORY_PRIVATE]))
+ pol->flags |= MPOL_F_PRIVATE;
+ else
+ pol->flags &= ~MPOL_F_PRIVATE;
+
pol->nodes = tmp;
}

@@ -661,6 +683,9 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
}
if (!queue_folio_required(folio, qp))
return;
+ if (folio_is_private_node(folio) &&
+ !folio_private_flags(folio, NP_OPS_MIGRATION))
+ return;
if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(walk->vma) ||
!migrate_folio_add(folio, qp->pagelist, qp->flags))
@@ -717,6 +742,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
+ if (folio_is_private_node(folio) &&
+ !folio_private_flags(folio, NP_OPS_MIGRATION))
+ continue;
if (folio_test_large(folio) && max_nr != 1)
nr = folio_pte_batch(folio, pte, ptent, max_nr);
/*
@@ -1451,6 +1479,9 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src,
else
gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;

+ if (pol->flags & MPOL_F_PRIVATE)
+ gfp |= __GFP_PRIVATE;
+
return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
#else
@@ -2280,6 +2311,15 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
nodemask = &pol->nodes;
if (pol->home_node != NUMA_NO_NODE)
*nid = pol->home_node;
+ else if ((pol->flags & MPOL_F_PRIVATE) &&
+ !node_isset(*nid, pol->nodes)) {
+ /*
+ * Private nodes are not in N_MEMORY nodes' zonelists.
+ * When the preferred nid (usually numa_node_id()) can't
+ * reach the policy nodes, start from a policy node.
+ */
+ *nid = first_node(pol->nodes);
+ }
/*
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
@@ -2533,6 +2573,10 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
gfp |= __GFP_NOWARN;

pol = get_vma_policy(vma, addr, order, &ilx);
+
+ if (pol->flags & MPOL_F_PRIVATE)
+ gfp |= __GFP_PRIVATE;
+
folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
mpol_cond_put(pol);
return folio;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a1b35421d78..ec6c1f8e85d8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3849,8 +3849,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
* if another process has NUMA bindings and is causing
* kswapd wakeups on only some nodes. Avoid accidental
* "node_reclaim_mode"-like behavior in this case.
+ *
+ * Nodes without kswapd (some private nodes) are never
+ * skipped - skipping them would make some mempolicies
+ * silently fall back to DRAM even when the node is eligible.
*/
if (skip_kswapd_nodes &&
+ zone->zone_pgdat->kswapd &&
!waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
skipped_kswapd_nodes = true;
continue;
--
2.53.0