[PATCH] fs,mpam,x86/resctrl: Track reference count for L3 monitor domains

From: Tony Luck

Date: Thu May 21 2026 - 18:14:27 EST


There are race conditions[1] when the last CPU of a domain is taken offline
and a worker thread may access the domain structure after it is freed.

Add a rdt_l3_mon_domain::kref to track users of the domain. Don't try
to cancel worker threads when CPUs are taken offline. Just set the
target CPU for the thread to nr_cpu_ids to indicate the worker needs
to take action next time it runs.

Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
Link: https://sashiko.dev/#/patchset/20260515193944.15114-1-tony.luck%40intel.com [1]
---
include/linux/resctrl.h | 4 +++
arch/x86/kernel/cpu/resctrl/core.c | 16 +++++++++--
drivers/resctrl/mpam_resctrl.c | 15 +++++++++-
fs/resctrl/monitor.c | 46 +++++++++++++++++++++++-------
fs/resctrl/rdtgroup.c | 16 +++--------
5 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 006e57fd7ca5..715cf62a5864 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -4,6 +4,7 @@

#include <linux/cacheinfo.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/pid.h>
#include <linux/resctrl_types.h>
@@ -181,6 +182,7 @@ struct mbm_cntr_cfg {
/**
* struct rdt_l3_mon_domain - group of CPUs sharing RDT_RESOURCE_L3 monitoring
* @hdr: common header for different domain types
+ * @kref: count of active users
* @ci_id: cache info id for this domain
* @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold
* @mbm_states: Per-event pointer to the MBM event's saved state.
@@ -195,6 +197,7 @@ struct mbm_cntr_cfg {
*/
struct rdt_l3_mon_domain {
struct rdt_domain_hdr hdr;
+ struct kref kref;
unsigned int ci_id;
unsigned long *rmid_busy_llc;
struct mbm_state *mbm_states[QOS_NUM_L3_MBM_EVENTS];
@@ -515,6 +518,7 @@ void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain
void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr);
void resctrl_online_cpu(unsigned int cpu);
void resctrl_offline_cpu(unsigned int cpu);
+void resctrl_arch_l3_mon_domain_release(struct kref *kref);

/*
* Architecture hook called at beginning of first file system mount attempt.
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 7667cf7c4e94..438016f3097c 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -396,6 +396,17 @@ static void l3_mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom)
kfree(hw_dom);
}

+void resctrl_arch_l3_mon_domain_release(struct kref *kref)
+{
+ struct rdt_hw_l3_mon_domain *hw_dom;
+ struct rdt_l3_mon_domain *d;
+
+ d = container_of(kref, struct rdt_l3_mon_domain, kref);
+ hw_dom = resctrl_to_arch_mon_dom(d);
+
+ l3_mon_domain_free(hw_dom);
+}
+
static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
@@ -539,6 +550,7 @@ static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct
d->hdr.id = id;
d->hdr.type = RESCTRL_MON_DOMAIN;
d->hdr.rid = RDT_RESOURCE_L3;
+ kref_init(&d->kref);
ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
if (!ci) {
pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
@@ -680,18 +692,16 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)

switch (r->rid) {
case RDT_RESOURCE_L3: {
- struct rdt_hw_l3_mon_domain *hw_dom;
struct rdt_l3_mon_domain *d;

if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3))
return;

d = container_of(hdr, struct rdt_l3_mon_domain, hdr);
- hw_dom = resctrl_to_arch_mon_dom(d);
resctrl_offline_mon_domain(r, hdr);
list_del_rcu(&hdr->list);
synchronize_rcu();
- l3_mon_domain_free(hw_dom);
+ kref_put(&d->kref, resctrl_arch_l3_mon_domain_release);
break;
}
case RDT_RESOURCE_PERF_PKG: {
diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c
index 226ff6f532fa..15f688f18fb6 100644
--- a/drivers/resctrl/mpam_resctrl.c
+++ b/drivers/resctrl/mpam_resctrl.c
@@ -1391,6 +1391,8 @@ mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res)
if (!dom)
return ERR_PTR(-ENOMEM);

+ kref_init(&dom->resctrl_mon_dom.kref);
+
if (r->alloc_capable) {
dom->ctrl_comp = ctrl_comp;

@@ -1548,6 +1550,17 @@ int mpam_resctrl_online_cpu(unsigned int cpu)
return 0;
}

+void resctrl_arch_l3_mon_domain_release(struct kref *kref)
+{
+ struct mpam_resctrl_dom *dom;
+ struct rdt_l3_mon_domain *d;
+
+ d = container_of(kref, struct rdt_l3_mon_domain, kref);
+ dom = container_of(d, struct mpam_resctrl_dom, resctrl_mon_dom);
+
+ kfree(dom);
+}
+
void mpam_resctrl_offline_cpu(unsigned int cpu)
{
struct mpam_resctrl_res *res;
@@ -1589,7 +1602,7 @@ void mpam_resctrl_offline_cpu(unsigned int cpu)
}

if (ctrl_dom_empty && mon_dom_empty)
- kfree(dom);
+ kref_put(&dom->resctrl_mon_dom.kref, resctrl_arch_l3_mon_domain_release);
}
}

diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index 9fd901c78dc6..b4af302bbad1 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -799,15 +799,26 @@ void cqm_handle_limbo(struct work_struct *work)

d = container_of(work, struct rdt_l3_mon_domain, cqm_limbo.work);

+ if (d->cqm_work_cpu == nr_cpu_ids)
+ goto reschedule;
+
__check_limbo(d, false);

if (has_busy_rmid(d)) {
+reschedule:
d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
RESCTRL_PICK_ANY_CPU);
+ if (d->cqm_work_cpu == nr_cpu_ids)
+ goto out_release;
schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
delay);
+ goto out_unlock;
}

+out_release:
+ kref_put(&d->kref, resctrl_arch_l3_mon_domain_release);
+
+out_unlock:
mutex_unlock(&rdtgroup_mutex);
cpus_read_unlock();
}
@@ -829,8 +840,11 @@ void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_
cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
dom->cqm_work_cpu = cpu;

- if (cpu < nr_cpu_ids)
- schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+ if (cpu < nr_cpu_ids) {
+ kref_get(&dom->kref);
+ if (!schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay))
+ kref_put(&dom->kref, resctrl_arch_l3_mon_domain_release);
+ }
}

void mbm_handle_overflow(struct work_struct *work)
@@ -844,15 +858,17 @@ void mbm_handle_overflow(struct work_struct *work)
cpus_read_lock();
mutex_lock(&rdtgroup_mutex);

+ r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ d = container_of(work, struct rdt_l3_mon_domain, mbm_over.work);
+
/*
- * If the filesystem has been unmounted this work no longer needs to
- * run.
+ * If the filesystem has been unmounted this work no longer needs to run.
*/
if (!resctrl_mounted || !resctrl_arch_mon_capable())
- goto out_unlock;
+ goto out_release;

- r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
- d = container_of(work, struct rdt_l3_mon_domain, mbm_over.work);
+ if (d->mbm_work_cpu == nr_cpu_ids)
+ goto reschedule;

list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
mbm_update(r, d, prgrp);
@@ -869,9 +885,16 @@ void mbm_handle_overflow(struct work_struct *work)
* Re-check for housekeeping CPUs. This allows the overflow handler to
* move off a nohz_full CPU quickly.
*/
+reschedule:
d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
RESCTRL_PICK_ANY_CPU);
- schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
+ if (d->mbm_work_cpu != nr_cpu_ids) {
+ schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
+ goto out_unlock;
+ }
+
+out_release:
+ kref_put(&d->kref, resctrl_arch_l3_mon_domain_release);

out_unlock:
mutex_unlock(&rdtgroup_mutex);
@@ -901,8 +924,11 @@ void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long del
cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
dom->mbm_work_cpu = cpu;

- if (cpu < nr_cpu_ids)
- schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+ if (cpu < nr_cpu_ids) {
+ kref_get(&dom->kref);
+ if (!schedule_delayed_work_on(cpu, &dom->mbm_over, delay))
+ kref_put(&dom->kref, resctrl_arch_l3_mon_domain_release);
+ }
}

int setup_rmid_lru_list(void)
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 5dfdaa6f9d8f..a4830a2364da 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -4332,8 +4332,6 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h
goto out_unlock;

d = container_of(hdr, struct rdt_l3_mon_domain, hdr);
- if (resctrl_is_mbm_enabled())
- cancel_delayed_work(&d->mbm_over);
if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) {
/*
* When a package is going down, forcefully
@@ -4344,7 +4342,6 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *h
* package never comes back.
*/
__check_limbo(d, true);
- cancel_delayed_work(&d->cqm_limbo);
}

domain_destroy_l3_mon_state(d);
@@ -4524,15 +4521,10 @@ void resctrl_offline_cpu(unsigned int cpu)

d = get_mon_domain_from_cpu(cpu, l3);
if (d) {
- if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
- cancel_delayed_work(&d->mbm_over);
- mbm_setup_overflow_handler(d, 0, cpu);
- }
- if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) &&
- cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
- cancel_delayed_work(&d->cqm_limbo);
- cqm_setup_limbo_handler(d, 0, cpu);
- }
+ if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu)
+ d->mbm_work_cpu = nr_cpu_ids;
+ if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && cpu == d->cqm_work_cpu)
+ d->cqm_work_cpu = nr_cpu_ids;
}

out_unlock:
--
2.54.0