Hi Peter,
On 3/25/25 8:09 PM, Peter Zijlstra wrote:
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e5c38718ff5..f8eafe440369 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1379,6 +1379,10 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_SCHED_CACHE
+ struct callback_head cache_work;
+#endif
IIUC this work updates stats for the whole mm, so it seems
unnecessary for each task of the process to repeat the same thing.
Hence it would be better to move this work to mm_struct.
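Something like the below (untested; the mm_sched_work / mm_sched_scan
field names are made up) would let only one task of the process queue
the scan:

	/* include/linux/mm_types.h */
	#ifdef CONFIG_SCHED_CACHE
		struct callback_head	mm_sched_work;
		unsigned long		mm_sched_scan;	/* bit 0: work queued */
	#endif

	/* tick path: queue at most one scan per mm */
	if (!test_and_set_bit(0, &mm->mm_sched_scan))
		task_work_add(current, &mm->mm_sched_work, TWA_RESUME);

	/* task_cache_work() then does clear_bit(0, &mm->mm_sched_scan) */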
+
+static inline
+void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
+{
+ struct mm_struct *mm = p->mm;
+ struct mm_sched *pcpu_sched;
+ unsigned long epoch;
+
+ /*
+ * init_task and kthreads don't have an mm
+ */
+ if (!mm || !mm->pcpu_sched)
+ return;
+
+ pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
+
+ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
+ __update_mm_sched(rq, pcpu_sched);
+ pcpu_sched->runtime += delta_exec;
+ rq->cpu_runtime += delta_exec;
+ epoch = rq->cpu_epoch;
+ }
+
+ /*
+ * If this task hasn't hit task_cache_work() for a while, invalidate
+ * its preferred state.
+ */
+ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
+ mm->mm_sched_cpu = -1;
+ pcpu_sched->occ = -1;
+ }
This seems too late. account_mm_sched() is called when p is runnable,
so if the whole process sleeps for a while before being woken up, ttwu
will pick up the outdated value.
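Maybe the staleness check should also run on the wakeup path, e.g.
something like below (untested sketch; I'm assuming the series'
select_cache_cpu() wakeup hook and reusing EPOCH_OLD):

	static int select_cache_cpu(struct task_struct *p, int prev_cpu)
	{
		struct mm_struct *mm = p->mm;
		int cpu;

		if (!mm || p->nr_cpus_allowed == 1)
			return prev_cpu;

		cpu = READ_ONCE(mm->mm_sched_cpu);
		if (cpu < 0)
			return prev_cpu;

		/*
		 * Racy read of cpu_epoch; good enough to filter out a
		 * hint that hasn't been refreshed for EPOCH_OLD epochs.
		 */
		if (READ_ONCE(cpu_rq(cpu)->cpu_epoch) -
		    READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD)
			return prev_cpu;

		return cpu;
	}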
+
+static void task_cache_work(struct callback_head *work)
+{
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ unsigned long m_a_occ = 0;
+ int cpu, m_a_cpu = -1;
+ cpumask_var_t cpus;
+
+ WARN_ON_ONCE(work != &p->cache_work);
+
+ work->next = work;
+
+ if (p->flags & PF_EXITING)
+ return;
+
+ if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
+ return;
+
+ scoped_guard (cpus_read_lock) {
+ cpumask_copy(cpus, cpu_online_mask);
+
+ for_each_cpu(cpu, cpus) {
+ /* XXX sched_cluster_active */
+ struct sched_domain *sd = per_cpu(sd_llc, cpu);
+ unsigned long occ, m_occ = 0, a_occ = 0;
+ int m_cpu = -1, nr = 0, i;
+
+ for_each_cpu(i, sched_domain_span(sd)) {
+ occ = fraction_mm_sched(cpu_rq(i),
+ per_cpu_ptr(mm->pcpu_sched, i));
+ a_occ += occ;
+ if (occ > m_occ) {
+ m_occ = occ;
+ m_cpu = i;
+ }
It would be possible to cause task stacking on this hint cpu, since
the hint is updated less frequently than wakeups occur.
And although the occupancy heuristic looks reasonable, IMHO it
doesn't make much sense to compare between cpus that share the LLC:
a non-hint cpu with warmer L1/L2$ in the same LLC as the hint cpu
seems preferable.
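For example, the hint could be taken at LLC granularity on the wakeup
side, something like (untested sketch, again assuming the series'
select_cache_cpu()):

	static int select_cache_cpu(struct task_struct *p, int prev_cpu)
	{
		struct mm_struct *mm = p->mm;
		int hint;

		if (!mm)
			return prev_cpu;

		hint = READ_ONCE(mm->mm_sched_cpu);
		if (hint < 0)
			return prev_cpu;

		/*
		 * prev_cpu already sits in the hot LLC; prefer it for
		 * its warmer L1/L2 and let select_idle_sibling() pick
		 * the final cpu within that LLC.
		 */
		if (cpus_share_cache(hint, prev_cpu))
			return prev_cpu;

		return hint;
	}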
Do you think it's appropriate to hint only at the hottest LLC? Then
tasks could hopefully be woken up on the 'right' LLC, on the premise
that this wouldn't cause much imbalance between LLCs.

I will do some tests and return with more feedback.