[RFC PATCH] sched: Introduce mm_cid runqueue cache
From: Mathieu Desnoyers
Date: Mon Mar 27 2023 - 15:53:32 EST
Introduce a per-runqueue cache containing { mm, mm_cid } entries.
Keep track of the recently allocated mm_cid for each mm rather than
freeing it immediately. This eliminates most atomic ops when
context switching back and forth between threads belonging to
different memory spaces in multi-threaded scenarios (many processes,
each with many threads).
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Cc: Aaron Lu <aaron.lu@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
kernel/sched/core.c | 45 +++++++++----
kernel/sched/deadline.c | 3 +
kernel/sched/fair.c | 1 +
kernel/sched/rt.c | 2 +
kernel/sched/sched.h | 138 ++++++++++++++++++++++++++++++++++------
5 files changed, 158 insertions(+), 31 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..e91fc3b810e1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2329,6 +2329,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
lockdep_assert_rq_held(rq);
deactivate_task(rq, p, DEQUEUE_NOCLOCK);
+ rq_cid_cache_remove_mm_locked(rq, p->mm, false);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
@@ -2516,6 +2517,7 @@ int push_cpu_stop(void *arg)
// XXX validate p is still the highest prio task
if (task_rq(p) == rq) {
deactivate_task(rq, p, 0);
+ rq_cid_cache_remove_mm_locked(rq, p->mm, false);
set_task_cpu(p, lowest_rq->cpu);
activate_task(lowest_rq, p, 0);
resched_curr(lowest_rq);
@@ -3215,6 +3217,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
rq_pin_lock(dst_rq, &drf);
deactivate_task(src_rq, p, 0);
+ rq_cid_cache_remove_mm_locked(src_rq, p->mm, false);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
check_preempt_curr(dst_rq, p, 0);
@@ -3852,6 +3855,8 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
+ rq_cid_cache_remove_mm(task_rq(p), p->mm, false);
__smp_call_single_queue(cpu, &p->wake_entry.llist);
}
@@ -4269,6 +4274,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
+ rq_cid_cache_remove_mm(task_rq(p), p->mm, false);
set_task_cpu(p, cpu);
}
#else
@@ -5114,7 +5120,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
- switch_mm_cid(prev, next);
+ switch_mm_cid(rq, prev, next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -6253,6 +6259,7 @@ static bool try_steal_cookie(int this, int that)
goto next;
deactivate_task(src, p, 0);
+ rq_cid_cache_remove_mm_locked(src, p->mm, false);
set_task_cpu(p, this);
activate_task(dst, p, 0);
@@ -11386,42 +11393,54 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
void sched_mm_cid_exit_signals(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- unsigned long flags;
+ struct rq_flags rf;
+ struct rq *rq;
if (!mm)
return;
- local_irq_save(flags);
- mm_cid_put(mm, t->mm_cid);
+ preempt_disable();
+ rq = this_rq();
+ rq_lock_irqsave(rq, &rf);
t->mm_cid = -1;
t->mm_cid_active = 0;
- local_irq_restore(flags);
+ rq_cid_cache_remove_mm_locked(rq, mm, true);
+ rq_unlock_irqrestore(rq, &rf);
+ preempt_enable();
}
void sched_mm_cid_before_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- unsigned long flags;
+ struct rq_flags rf;
+ struct rq *rq;
if (!mm)
return;
- local_irq_save(flags);
- mm_cid_put(mm, t->mm_cid);
+ preempt_disable();
+ rq = this_rq();
+ rq_lock_irqsave(rq, &rf);
t->mm_cid = -1;
t->mm_cid_active = 0;
- local_irq_restore(flags);
+ rq_cid_cache_remove_mm_locked(rq, mm, true);
+ rq_unlock_irqrestore(rq, &rf);
+ preempt_enable();
}
void sched_mm_cid_after_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- unsigned long flags;
+ struct rq_flags rf;
+ struct rq *rq;
if (!mm)
return;
- local_irq_save(flags);
- t->mm_cid = mm_cid_get(mm);
+ preempt_disable();
+ rq = this_rq();
+ rq_lock_irqsave(rq, &rf);
+ t->mm_cid = mm_cid_get(rq, mm);
t->mm_cid_active = 1;
- local_irq_restore(flags);
+ rq_unlock_irqrestore(rq, &rf);
+ preempt_enable();
rseq_set_notify_resume(t);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 71b24371a6f7..34bb47442912 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -729,6 +729,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
raw_spin_unlock(&dl_b->lock);
+ rq_cid_cache_remove_mm_locked(rq, p->mm, false);
set_task_cpu(p, later_rq->cpu);
double_unlock_balance(later_rq, rq);
@@ -2357,6 +2358,7 @@ static int push_dl_task(struct rq *rq)
}
deactivate_task(rq, next_task, 0);
+ rq_cid_cache_remove_mm_locked(rq, next_task->mm, false);
set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
ret = 1;
@@ -2445,6 +2447,7 @@ static void pull_dl_task(struct rq *this_rq)
push_task = get_push_task(src_rq);
} else {
deactivate_task(src_rq, p, 0);
+ rq_cid_cache_remove_mm_locked(src_rq, p->mm, false);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..70ed6aef87ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8542,6 +8542,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
lockdep_assert_rq_held(env->src_rq);
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
+ rq_cid_cache_remove_mm_locked(env->src_rq, p->mm, false);
set_task_cpu(p, env->dst_cpu);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0a11f44adee5..3ad325db1db3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2156,6 +2156,7 @@ static int push_rt_task(struct rq *rq, bool pull)
}
deactivate_task(rq, next_task, 0);
+ rq_cid_cache_remove_mm_locked(rq, next_task->mm, false);
set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0);
resched_curr(lowest_rq);
@@ -2429,6 +2430,7 @@ static void pull_rt_task(struct rq *this_rq)
push_task = get_push_task(src_rq);
} else {
deactivate_task(src_rq, p, 0);
+ rq_cid_cache_remove_mm_locked(src_rq, p->mm, false);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
resched = true;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3e8df6d31c1e..b2e12857e2c3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -947,6 +947,19 @@ struct balance_callback {
void (*func)(struct rq *rq);
};
+#ifdef CONFIG_SCHED_MM_CID
+# define RQ_CID_CACHE_SIZE 8 /* Number of { mm, mm_cid } slots in each runqueue's cache */
+struct rq_cid_entry {
+ struct mm_struct *mm; /* NULL if unset; removal from the cache resets it to NULL */
+ int mm_cid; /* cid recorded for mm; left as-is (stale) once mm is reset to NULL */
+};
+
+struct rq_cid_cache {
+ struct rq_cid_entry entry[RQ_CID_CACHE_SIZE];
+ unsigned int head; /* Index of the slot the next rq_cid_cache_add() overwrites */
+};
+#endif
+
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -1161,6 +1174,9 @@ struct rq {
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
#endif
+#ifdef CONFIG_SCHED_MM_CID
+ struct rq_cid_cache cid_cache;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3249,6 +3265,92 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
}
#ifdef CONFIG_SCHED_MM_CID
+
+static inline void mm_cid_put(struct mm_struct *mm, int cid)
+{ /* Give @cid back to @mm by clearing its bit in mm_cidmask(mm). */
+ lockdep_assert_irqs_disabled(); /* asserts that IRQs are disabled on entry */
+ if (cid < 0) /* nothing to release, e.g. the -1 returned by a failed cache lookup */
+ return;
+ raw_spin_lock(&mm->cid_lock); /* mm->cid_lock is held across the mask update */
+ __cpumask_clear_cpu(cid, mm_cidmask(mm));
+ raw_spin_unlock(&mm->cid_lock);
+}
+
+static inline struct rq_cid_entry *rq_cid_cache_lookup(struct rq *rq, struct mm_struct *mm)
+{ /* Return the slot of @rq's cid cache whose mm equals @mm, or NULL if there is none. */
+ struct rq_cid_cache *cid_cache = &rq->cid_cache;
+ int i;
+
+ for (i = 0; i < RQ_CID_CACHE_SIZE; i++) { /* linear scan over every slot, unused holes included */
+ struct rq_cid_entry *entry = &cid_cache->entry[i];
+
+ if (entry->mm == mm) /* NOTE(review): a NULL @mm would match an unused slot; confirm callers never pass NULL */
+ return entry;
+ }
+ return NULL; /* @mm has no entry in this runqueue's cache */
+}
+
+/*
+ * Drop the entry for @mm from @rq's cache and return the cid it held, or -1
+ * when @mm has no entry. Removal from cache simply leaves an unused hole: the
+ * slot's mm is reset to NULL, entries are not compacted and head is not moved.
+ * The cid is not released here; that is left to the caller.
+ */
+static inline int rq_cid_cache_lookup_remove(struct rq *rq, struct mm_struct *mm)
+{
+ struct rq_cid_entry *entry = rq_cid_cache_lookup(rq, mm);
+
+ if (!entry)
+ return -1;
+ entry->mm = NULL; /* Remove from cache */
+ return entry->mm_cid; /* the slot's mm_cid field itself is left untouched */
+}
+
+static inline void rq_cid_cache_remove_mm_locked(struct rq *rq, struct mm_struct *mm, bool release_mm)
+{ /* Prune @mm's entry from @rq's cid cache and release the cid it held; no-op for a NULL @mm. */
+ int cid; /* NOTE(review): the _locked suffix suggests the caller holds @rq's lock; nothing here asserts it */
+
+ if (!mm)
+ return;
+ /*
+ * Do not remove the cache entry for a runqueue that runs a task which
+ * currently uses the target mm. Passing release_mm = true bypasses
+ * this check and drops the entry unconditionally, as the exit and
+ * execve paths do.
+ */
+ if (!release_mm && rq->curr->mm == mm)
+ return;
+ cid = rq_cid_cache_lookup_remove(rq, mm); /* -1 when @mm was not cached */
+ mm_cid_put(mm, cid); /* ignores a negative cid */
+}
+
+static inline void rq_cid_cache_remove_mm(struct rq *rq, struct mm_struct *mm, bool release_mm)
+{ /* Wrapper around rq_cid_cache_remove_mm_locked() that takes and releases @rq's lock itself. */
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf); /* irqsave variant: mm_cid_put() further down asserts IRQs are disabled */
+ rq_cid_cache_remove_mm_locked(rq, mm, release_mm);
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+/*
+ * Record { mm, cid } in @rq's cache at the head slot, then move head forward,
+ * wrapping at RQ_CID_CACHE_SIZE.
+ *
+ * Replacement is round-robin: a lookup hit moves neither the entry nor head,
+ * so the oldest inserted entry is overwritten first (FIFO, a cheap
+ * approximation of LRU). The head slot is always the one used, even when
+ * other slots are unused holes.
+ *
+ * Only need to clear the cid mask bit from its own mm_cidmask(mm) when we
+ * overwrite an old entry from the cache. Note that this is not needed if the
+ * overwritten entry is an unused hole. This access to the old_mm from an
+ * unrelated thread requires that the cache entry for a given mm gets pruned
+ * from the cache when a task is dequeued from the runqueue.
+ * NOTE(review): the visible call sites prune before set_task_cpu() and on
+ * exit/execve; confirm that covers every path on which old_mm could be freed
+ * while still cached here.
+ */
+static inline void rq_cid_cache_add(struct rq *rq, struct mm_struct *mm, int cid)
+{
+ struct rq_cid_cache *cid_cache = &rq->cid_cache;
+ struct mm_struct *old_mm;
+ struct rq_cid_entry *entry;
+ unsigned int pos;
+
+ pos = cid_cache->head;
+ entry = &cid_cache->entry[pos];
+ old_mm = entry->mm;
+ if (old_mm) /* slot still in use: release the evicted mm's cid first */
+ mm_cid_put(old_mm, entry->mm_cid);
+ entry->mm = mm;
+ entry->mm_cid = cid;
+ cid_cache->head = (pos + 1) % RQ_CID_CACHE_SIZE;
+}
+
static inline int __mm_cid_get(struct mm_struct *mm)
{
struct cpumask *cpumask;
@@ -3262,28 +3364,26 @@ static inline int __mm_cid_get(struct mm_struct *mm)
return cid;
}
-static inline void mm_cid_put(struct mm_struct *mm, int cid)
+static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
{
- lockdep_assert_irqs_disabled();
- if (cid < 0)
- return;
- raw_spin_lock(&mm->cid_lock);
- __cpumask_clear_cpu(cid, mm_cidmask(mm));
- raw_spin_unlock(&mm->cid_lock);
-}
-
-static inline int mm_cid_get(struct mm_struct *mm)
-{
- int ret;
+ struct rq_cid_entry *entry;
+ int cid;
lockdep_assert_irqs_disabled();
+ entry = rq_cid_cache_lookup(rq, mm);
+ if (entry) {
+ cid = entry->mm_cid;
+ goto end;
+ }
raw_spin_lock(&mm->cid_lock);
- ret = __mm_cid_get(mm);
+ cid = __mm_cid_get(mm);
raw_spin_unlock(&mm->cid_lock);
- return ret;
+ rq_cid_cache_add(rq, mm, cid);
+end:
+ return cid;
}
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
if (prev->mm_cid_active) {
if (next->mm_cid_active && next->mm == prev->mm) {
@@ -3295,15 +3395,17 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n
prev->mm_cid = -1;
return;
}
- mm_cid_put(prev->mm, prev->mm_cid);
+ /* Leave the prev mm_cid in the cid rq cache. */
prev->mm_cid = -1;
}
if (next->mm_cid_active)
- next->mm_cid = mm_cid_get(next->mm);
+ next->mm_cid = mm_cid_get(rq, next->mm);
}
#else
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
+static inline void rq_cid_cache_remove_mm_locked(struct rq *rq, struct mm_struct *mm, bool release_mm) { }
+static inline void rq_cid_cache_remove_mm(struct rq *rq, struct mm_struct *mm, bool release_mm) { }
#endif
#endif /* _KERNEL_SCHED_SCHED_H */
--
2.25.1