Re: [RFC PATCH v8] sched: Fix performance regression introduced by mm_cid

From: Mathieu Desnoyers
Date: Tue Apr 18 2023 - 12:01:21 EST


On 2023-04-18 07:21, Aaron Lu wrote:
> On Mon, Apr 17, 2023 at 11:08:31AM -0400, Mathieu Desnoyers wrote:
> > Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL
> > sysbench regression reported by Aaron Lu.
>
> For postgres_sysbench on SPR:
> sched_mm_cid_migrate_to() is in the range of 0.1x% - 0.4x%, mm_cid_get()
> is in the range of 0.1x% - 0.3x%. Other cid functions are pretty minor.
>
> For hackbench on SPR:
> sched_mm_cid_migrate_to() is about 3%-4%, mm_cid_get() is about 7%-8%,
> other cid functions are pretty minor.

It's a bit higher than I would have expected for hackbench.

Can you run with the attached schedstats patch applied and boot with
schedstats=enable? Please let me know how the counters behave, because
I cannot reproduce anything unexpected on my machine.
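
Note that the schedstat_inc() calls only count when schedstats is
active, so the new fields will stay at zero unless you boot with
schedstats=enable or flip the kernel.sched_schedstats sysctl to 1.
To collect the numbers, a minimal user-space helper along these lines
(just an illustration, not part of the patch) dumps the per-cpu lines
of /proc/schedstat; run it before and after the benchmark and diff the
two dumps to get the per-cpu deltas:

#include <stdio.h>
#include <string.h>

/*
 * Print every "cpuN ..." line from /proc/schedstat. With the attached
 * patch and schedstats enabled, the mm_cid counters appear at the end
 * of each of those lines.
 */
int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");
	char line[4096];

	if (!f) {
		perror("fopen /proc/schedstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "cpu", 3))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}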

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
commit 6cf6005b9c5d69e0f8d98900e9397401bf804fa4
Author: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Date: Tue Apr 18 11:15:50 2023 -0400

schedstats mm_cid

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ce08ca68fdf4..93c56512f0d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11658,12 +11658,15 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
 	src_cid = __sched_mm_cid_migrate_from_try_steal_cid(t, src_cid);
 	if (src_cid == -1)
 		return;
+	schedstat_inc(dst_rq->mm_cid_migrate_steal);
 	if (!mm_cid_is_unset(dst_cid)) {
+		schedstat_inc(dst_rq->mm_cid_migrate_clear);
 		__mm_cid_put(mm, src_cid);
 		return;
 	}
 	/* Move src_cid to dst cpu. */
 	mm_cid_snapshot_time(mm);
+	schedstat_inc(dst_rq->mm_cid_migrate_move);
 	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
 }

@@ -11741,6 +11744,7 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
 	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
 	if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
 		return;
+	schedstat_inc(rq->mm_cid_task_work_clear_old);
 	sched_mm_cid_remote_clear(mm, cpu);
 }

@@ -11754,6 +11758,7 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
 	cid = READ_ONCE(pcpu_cid->cid);
 	if (!mm_cid_is_valid(cid) || cid < weight)
 		return;
+	schedstat_inc(this_rq()->mm_cid_task_work_clear_compact);
 	sched_mm_cid_remote_clear(mm, cpu);
 }

@@ -11789,6 +11794,7 @@ static void task_mm_cid_work(struct callback_head *work)
 		return;
 	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
 		return;
+	schedstat_inc(this_rq()->mm_cid_task_work_nr_run);
 	cidmask = mm_cidmask(mm);
 	/* Clear cids that were not recently used. */
 	for_each_possible_cpu(cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8ce48b24221e..b9a5e8201922 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1154,6 +1154,18 @@ struct rq {
 	u64			core_forceidle_start;
 #endif

+	unsigned int		mm_cid_task_work_nr_run;
+	unsigned int		mm_cid_task_work_clear_old;
+	unsigned int		mm_cid_task_work_clear_compact;
+	unsigned int		mm_cid_get_cached;
+	unsigned int		mm_cid_get_alloc;
+	unsigned int		mm_cid_get_put_lazy;
+	unsigned int		mm_cid_put_lazy;
+	unsigned int		mm_cid_put;
+	unsigned int		mm_cid_migrate_steal;
+	unsigned int		mm_cid_migrate_clear;
+	unsigned int		mm_cid_migrate_move;
+
 	/* Scratch cpumask to be temporarily used under rq_lock */
 	cpumask_var_t		scratch_mask;

@@ -3285,6 +3297,7 @@ static inline void mm_cid_put_lazy(struct task_struct *t)
 	if (!mm_cid_is_lazy_put(cid) ||
 	    !try_cmpxchg(&pcpu_cid->cid, &cid, MM_CID_UNSET))
 		return;
+	schedstat_inc(this_rq()->mm_cid_put_lazy);
 	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
 }

@@ -3318,6 +3331,7 @@ static inline void mm_cid_put(struct mm_struct *mm)
 	cid = mm_cid_pcpu_unset(pcpu_cid);
 	if (cid == MM_CID_UNSET)
 		return;
+	schedstat_inc(this_rq()->mm_cid_put);
 	__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
 }

@@ -3421,14 +3435,18 @@ static inline int mm_cid_get(struct mm_struct *mm)
 	pcpu_cid = this_cpu_ptr(mm->pcpu_cid);
 	cid = READ_ONCE(pcpu_cid->cid);
 	if (mm_cid_is_valid(cid)) {
+		schedstat_inc(rq->mm_cid_get_cached);
 		mm_cid_snapshot_time(mm);
 		return cid;
 	}
 	if (mm_cid_is_lazy_put(cid)) {
-		if (try_cmpxchg(&pcpu_cid->cid, &cid, MM_CID_UNSET))
+		if (try_cmpxchg(&pcpu_cid->cid, &cid, MM_CID_UNSET)) {
+			schedstat_inc(rq->mm_cid_get_put_lazy);
 			__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+		}
 	}
 	cid = __mm_cid_get(mm);
+	schedstat_inc(rq->mm_cid_get_alloc);
 	WRITE_ONCE(pcpu_cid->cid, cid);
 	return cid;
 }
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 857f837f52cb..7c947a1beafa 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -133,12 +133,34 @@ static int show_schedstat(struct seq_file *seq, void *v)

 		/* runqueue-specific stats */
 		seq_printf(seq,
-		    "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
+		    "cpu%d %u 0 %u %u %u %u %llu %llu %lu "
+		    "mm_cid_task_work_nr_run: %u "
+		    "mm_cid_task_work_clear_old: %u "
+		    "mm_cid_task_work_clear_compact: %u "
+		    "mm_cid_get_cached: %u "
+		    "mm_cid_get_alloc: %u "
+		    "mm_cid_get_put_lazy: %u "
+		    "mm_cid_put_lazy: %u "
+		    "mm_cid_put: %u "
+		    "mm_cid_migrate_steal: %u "
+		    "mm_cid_migrate_clear: %u "
+		    "mm_cid_migrate_move: %u",
 		    cpu, rq->yld_count,
 		    rq->sched_count, rq->sched_goidle,
 		    rq->ttwu_count, rq->ttwu_local,
 		    rq->rq_cpu_time,
-		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount,
+		    rq->mm_cid_task_work_nr_run,
+		    rq->mm_cid_task_work_clear_old,
+		    rq->mm_cid_task_work_clear_compact,
+		    rq->mm_cid_get_cached,
+		    rq->mm_cid_get_alloc,
+		    rq->mm_cid_get_put_lazy,
+		    rq->mm_cid_put_lazy,
+		    rq->mm_cid_put,
+		    rq->mm_cid_migrate_steal,
+		    rq->mm_cid_migrate_clear,
+		    rq->mm_cid_migrate_move);

 		seq_printf(seq, "\n");
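
For reference, with schedstats active each per-cpu line of
/proc/schedstat should then end with the labeled mm_cid fields from the
format string above, i.e. something of this shape (values are
placeholders, not measurements):

cpu0 <yld> 0 <sched> <goidle> <ttwu> <ttwu_local> <cpu_time> <run_delay> <pcount> mm_cid_task_work_nr_run: <n> mm_cid_task_work_clear_old: <n> mm_cid_task_work_clear_compact: <n> mm_cid_get_cached: <n> mm_cid_get_alloc: <n> mm_cid_get_put_lazy: <n> mm_cid_put_lazy: <n> mm_cid_put: <n> mm_cid_migrate_steal: <n> mm_cid_migrate_clear: <n> mm_cid_migrate_move: <n>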