[patch V2 0/4] sched/mmcid: Cure mode transition woes
From: Thomas Gleixner
Date: Mon Feb 02 2026 - 04:39:44 EST
This is a follow up to the V1 submission:
https://lore.kernel.org/20260129210219.452851594@xxxxxxxxxx
Ihor and Shrikanth reported hard lockups which can be tracked back to the recent
rewrite of the MM_CID management code.
1) The from task to CPU ownership transition lacks the intermediate
transition mode, which can lead to CID pool exhaustion and a
subsequent live lock. That intermediate mode was implemented for the
reverse operation already but omitted for this transition as the
original analysis missed a few possible scheduling scenarios.
2) Weakly ordered architectures can observe inconsistent state which
causes them to make the wrong decision. That leads to the same problem
as with #1.
The following series addresses these issues and fixes another, albeit harmless,
inconsistent state hiccup which was found when analysing the above issues.
With these issues addressed the last change optimizes the bitmap
utilization in the transition modes.
The series applies on Linus tree and passes the selftests and a thread pool
emulator which stress tests the ownership transitions.
Changes vs. V1:
- Move the mm_cid_fixup_tasks_to_cpus() wrapping where it belongs (patch 1)
- Add barriers before and after the fixup functions to prevent CPU
reordering of the mode stores - Mathieu
- Update change logs - Mathieu
Delta patch against V1 is below
Thanks,
tglx
---
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -133,7 +133,6 @@ struct mm_cid_pcpu {
* as that is modified by mmget()/mm_put() by other entities which
* do not actually share the MM.
* @pcpu_thrs: Threshold for switching back from per CPU mode
- * @mode_change: Mode change in progress
* @update_deferred: A deferred switch back to per task mode is pending.
*/
struct mm_mm_cid {
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10445,6 +10445,12 @@ static bool mm_update_max_cids(struct mm
/* Flip the mode and set the transition flag to bridge the transfer */
WRITE_ONCE(mc->mode, mc->mode ^ (MM_CID_TRANSIT | MM_CID_ONCPU));
+ /*
+ * Order the store against the subsequent fixups so that
+ * acquire(rq::lock) cannot be reordered by the CPU before the
+ * store.
+ */
+ smp_mb();
return true;
}
@@ -10487,6 +10493,16 @@ static inline void mm_update_cpus_allowe
irq_work_queue(&mc->irq_work);
}
+static inline void mm_cid_complete_transit(struct mm_struct *mm, unsigned int mode)
+{
+ /*
+ * Ensure that the store removing the TRANSIT bit cannot be
+ * reordered by the CPU before the fixups have been completed.
+ */
+ smp_mb();
+ WRITE_ONCE(mm->mm_cid.mode, mode);
+}
+
static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
if (cid_on_cpu(t->mm_cid.cid)) {
@@ -10530,8 +10546,7 @@ static void mm_cid_fixup_cpus_to_tasks(s
}
}
}
- /* Clear the transition bit in the mode */
- WRITE_ONCE(mm->mm_cid.mode, 0);
+ mm_cid_complete_transit(mm, 0);
}
static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
@@ -10603,8 +10618,7 @@ static void mm_cid_fixup_tasks_to_cpus(v
struct mm_struct *mm = current->mm;
mm_cid_do_fixup_tasks_to_cpus(mm);
- /* Clear the transition bit in the mode */
- WRITE_ONCE(mm->mm_cid.mode, MM_CID_ONCPU);
+ mm_cid_complete_transit(mm, MM_CID_ONCPU);
}
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3914,8 +3914,7 @@ static __always_inline void mm_cid_sched
/*
* If transition mode is done, transfer ownership when the CID is
- * within the convergion range. Otherwise the next schedule in will
- * have to allocate or converge
+ * within the convergence range to optimize the next schedule in.
*/
if (!cid_in_transit(mode) && cid < READ_ONCE(mm->mm_cid.max_cids)) {
if (cid_on_cpu(mode))