[PATCH] mm/mempolicy: add sysfs interface to override NUMA node bandwidth

From: YeeLi

Date: Thu Mar 12 2026 - 05:14:49 EST


From: yeeli <seven.yi.lee@xxxxxxxxx>

Automatic tuning for weighted interleaving [1] provides real benefits on
systems with CXL support. However, platforms that lack HMAT or CDAT
information cannot make use of this feature.

If the bandwidth reported by firmware or the device deviates from the
actual measured bandwidth, administrators also lack a clear way to adjust
the per-node weight values.

Introduce an optional Kconfig option, CONFIG_NUMA_BW_MANUAL_OVERRIDE
(default n), which exposes read/write per-node bandwidth sysfs attributes
under:

/sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN

The sysfs files are created and removed dynamically on node hotplug
events, in sync with the existing weighted_interleave/nodeN attributes.

Userspace can write a single bandwidth value (in MB/s) to override both
read_bandwidth and write_bandwidth for the corresponding NUMA node. The
value is then propagated to the internal node_bw_table via
mempolicy_set_node_perf().

This interface is intended for debugging and experimentation only.

[1] Link:
https://lkml.kernel.org/r/20250505182328.4148265-1-joshua.hahnjy@xxxxxxxxx

Signed-off-by: yeeli <seven.yi.lee@xxxxxxxxx>
---
mm/Kconfig | 20 +++++++
mm/mempolicy.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 168 insertions(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index bd0ea5454af8..40554df18edc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1441,6 +1441,26 @@ config NUMA_EMU
into virtual nodes when booted with "numa=fake=N", where N is the
number of nodes. This is only useful for debugging.

+config NUMA_BW_MANUAL_OVERRIDE
+ bool "Allow manual override of per-NUMA-node bandwidth for weighted interleave"
+ depends on NUMA && SYSFS
+ default n
+ help
+ This option exposes writable sysfs attributes under
+ /sys/kernel/mm/mempolicy/weighted_interleave/bw_nodeN. Writing a single
+ value (in MB/s) overrides both the read and write bandwidth of NUMA node N.
+
+ These values update the internal node_bw_table and can influence
+ weighted interleave auto-tuning (if enabled).
+
+ WARNING: This is intended for debugging, development, or platforms
+ with incorrect HMAT/CDAT firmware data. Overriding hardware-reported
+ bandwidth can lead to suboptimal performance, instability, or
+ incorrect resource allocation decisions.
+
+ Say N unless you are actively developing or debugging bandwidth-aware
+ memory policies.
+
config ARCH_HAS_USER_SHADOW_STACK
bool
help
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 68a98ba57882..0b7f42491748 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -226,6 +226,7 @@ int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)

bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+
if (!new_bw)
return -ENOMEM;

@@ -3614,6 +3615,9 @@ struct iw_node_attr {
struct sysfs_wi_group {
struct kobject wi_kobj;
struct mutex kobj_lock;
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+ struct iw_node_attr *bw_attrs[MAX_NUMNODES]; /* bw_nodeN attributes, indexed by node id */
+#endif
struct iw_node_attr *nattrs[];
};

@@ -3855,6 +3859,128 @@ static int sysfs_wi_node_add(int nid)
return ret;
}

+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+static ssize_t bw_node_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct iw_node_attr *node_attr;
+
+ node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
+
+ /*A Node without CDAT or HMAT*/
+ if (!node_bw_table)
+ return sprintf(buf, "N/A\n");
+
+ if (!node_bw_table[node_attr->nid])
+ return sprintf(buf, "0\n");
+
+ return sprintf(buf, "%u(MB/s)\n", node_bw_table[node_attr->nid]);
+}
+
+/*
+ * Parse one unsigned bandwidth value from userspace and apply it as both the
+ * read and write bandwidth of this attribute's node.
+ * Returns @count on success. On failure returns the kstrtouint() error
+ * (-EINVAL for malformed input, -ERANGE for a value that does not fit in an
+ * unsigned int) or the error from mempolicy_set_node_perf().
+ */
+static ssize_t bw_node_store(struct kobject *kobj,
+			     struct kobj_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct iw_node_attr *node_attr =
+		container_of(attr, struct iw_node_attr, kobj_attr);
+	struct access_coordinate coords = {};
+	unsigned int val;
+	int ret;
+
+	/* Reject bad input before anything derived from it is built. */
+	ret = kstrtouint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	coords.read_bandwidth = val;
+	coords.write_bandwidth = val;
+
+	ret = mempolicy_set_node_perf(node_attr->nid, &coords);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+/*
+ * Create the "bw_node<nid>" file under wi_group->wi_kobj and record it in
+ * wi_group->bw_attrs[nid]. Returns 0, -EINVAL for an out-of-range @nid,
+ * -ENOMEM, -EEXIST if the slot is already populated, or the error from
+ * sysfs_create_file().
+ */
+static int sysfs_bw_node_add(int nid)
+{
+	struct iw_node_attr *entry;
+	char *fname;
+	int err;
+
+	if (nid < 0 || nid >= nr_node_ids) {
+		pr_err("invalid node id: %d\n", nid);
+		return -EINVAL;
+	}
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	fname = kasprintf(GFP_KERNEL, "bw_node%d", nid);
+	if (!fname) {
+		kfree(entry);
+		return -ENOMEM;
+	}
+
+	sysfs_attr_init(&entry->kobj_attr.attr);
+	entry->nid = nid;
+	entry->kobj_attr.show = bw_node_show;
+	entry->kobj_attr.store = bw_node_store;
+	entry->kobj_attr.attr.mode = 0644;
+	entry->kobj_attr.attr.name = fname;
+
+	mutex_lock(&wi_group->kobj_lock);
+	if (wi_group->bw_attrs[nid])
+		err = -EEXIST;
+	else
+		err = sysfs_create_file(&wi_group->wi_kobj,
+					&entry->kobj_attr.attr);
+	if (!err)
+		wi_group->bw_attrs[nid] = entry;
+	mutex_unlock(&wi_group->kobj_lock);
+
+	if (err) {
+		kfree(fname);
+		kfree(entry);
+	}
+	return err;
+}
+
+/* Remove the bw_node<nid> file, free its attribute, clear bw_attrs[nid]. */
+static void sysfs_bw_node_delete(int nid)
+{
+	struct iw_node_attr *attr;
+
+	if (nid < 0 || nid >= nr_node_ids)
+		return;
+	mutex_lock(&wi_group->kobj_lock);
+	attr = wi_group->bw_attrs[nid];
+	if (attr) {
+		/* Clear the slot being freed so a later add does not see it. */
+		wi_group->bw_attrs[nid] = NULL;
+		sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+		kfree(attr->kobj_attr.attr.name);
+		kfree(attr);
+	}
+	mutex_unlock(&wi_group->kobj_lock);
+}
+#endif
+
static int wi_node_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -3868,9 +3994,22 @@ static int wi_node_notifier(struct notifier_block *nb,
if (err)
pr_err("failed to add sysfs for node%d during hotplug: %d\n",
nid, err);
+
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+ err = sysfs_bw_node_add(nid);
+ if (err)
+ pr_err("failed to add sysfs bw_node%d: %d\n",
+ nid, err);
+#endif
break;
+
case NODE_REMOVED_LAST_MEMORY:
sysfs_wi_node_delete(nid);
+
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+ sysfs_bw_node_delete(nid);
+#endif
+
break;
}

@@ -3906,6 +4045,15 @@ static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
nid, err);
goto err_cleanup_kobj;
}
+
+#ifdef CONFIG_NUMA_BW_MANUAL_OVERRIDE
+ err = sysfs_bw_node_add(nid);
+ if (err) {
+ pr_err("failed to add sysfs bw_node%d during init: %d\n", nid, err);
+ goto err_cleanup_kobj;
+ }
+#endif
+
}

hotplug_node_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
--
2.34.1