[RFC 1/1] mm: Add per-task TLB flush counters

From: Joe Damato
Date: Tue Sep 13 2022 - 21:53:50 EST


TLB shootdowns are tracked globally, but on a busy system it can be
difficult to determine which tasks are generating or receiving them.

Add two per-task counter fields (only x86 is instrumented in this RFC):
- nrtlbflush: number of remote TLB flush events received
- ngtlbflush: number of remote TLB flush events generated

Expose those fields as two new trailing values in /proc/[pid]/stat
(ngtlbflush first, then nrtlbflush) so that they can be analyzed
alongside similar metrics (e.g. min_flt and maj_flt).

Signed-off-by: Joe Damato <jdamato@xxxxxxxxxx>
---
arch/x86/mm/tlb.c | 2 ++
fs/proc/array.c | 9 +++++++++
include/linux/sched.h | 6 ++++++
include/linux/sched/signal.h | 1 +
kernel/exit.c | 6 ++++++
kernel/fork.c | 1 +
6 files changed, 25 insertions(+)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c1e31e9..58f7c59 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -745,6 +745,7 @@ static void flush_tlb_func(void *info)
if (!local) {
inc_irq_stat(irq_tlb_count);
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ current->nrtlbflush++;

/* Can only happen on remote CPUs */
if (f->mm && f->mm != loaded_mm)
@@ -895,6 +896,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
* would not happen.
*/
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ current->ngtlbflush++;
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
else
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 49283b81..435afdc 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -469,6 +469,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
unsigned long long start_time;
unsigned long cmin_flt = 0, cmaj_flt = 0;
unsigned long min_flt = 0, maj_flt = 0;
+ unsigned long ngtlbflush = 0, nrtlbflush = 0;
u64 cutime, cstime, utime, stime;
u64 cgtime, gtime;
unsigned long rsslim = 0;
@@ -530,11 +531,15 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
+ ngtlbflush += t->ngtlbflush;
+ nrtlbflush += t->nrtlbflush;
gtime += task_gtime(t);
} while_each_thread(task, t);

min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
+ ngtlbflush += sig->ngtlbflush;
+ nrtlbflush += sig->nrtlbflush;
thread_group_cputime_adjusted(task, &utime, &stime);
gtime += sig->gtime;

@@ -554,6 +559,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
+ nrtlbflush = task->nrtlbflush;
+ ngtlbflush = task->ngtlbflush;
task_cputime_adjusted(task, &utime, &stime);
gtime = task_gtime(task);
}
@@ -643,6 +650,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
else
seq_puts(m, " 0");

+ seq_put_decimal_ull(m, " ", ngtlbflush);
+ seq_put_decimal_ull(m, " ", nrtlbflush);
seq_putc(m, '\n');
if (mm)
mmput(mm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5cdf746..2a0d879 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1047,6 +1047,12 @@ struct task_struct {
unsigned long min_flt;
unsigned long maj_flt;

+ /* Number of TLB flushes generated by this task */
+ unsigned long ngtlbflush;
+
+ /* Number of TLB flushes received by this task */
+ unsigned long nrtlbflush;
+
/* Empty if CONFIG_POSIX_CPUTIMERS=n */
struct posix_cputimers posix_cputimers;

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 2009926..4e0b09c 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -189,6 +189,7 @@ struct signal_struct {
struct prev_cputime prev_cputime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
+ unsigned long ngtlbflush, nrtlbflush;
unsigned long inblock, oublock, cinblock, coublock;
unsigned long maxrss, cmaxrss;
struct task_io_accounting ioac;
diff --git a/kernel/exit.c b/kernel/exit.c
index 35e0a31..5a72755 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -141,6 +141,8 @@ static void __exit_signal(struct task_struct *tsk)
sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
+ sig->ngtlbflush += tsk->ngtlbflush;
+ sig->nrtlbflush += tsk->nrtlbflush;
sig->nvcsw += tsk->nvcsw;
sig->nivcsw += tsk->nivcsw;
sig->inblock += task_io_get_inblock(tsk);
@@ -1095,6 +1097,10 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
p->maj_flt + sig->maj_flt + sig->cmaj_flt;
+ psig->ngtlbflush +=
+ p->ngtlbflush + sig->ngtlbflush;
+ psig->nrtlbflush +=
+ p->nrtlbflush + sig->nrtlbflush;
psig->cnvcsw +=
p->nvcsw + sig->nvcsw + sig->cnvcsw;
psig->cnivcsw +=
diff --git a/kernel/fork.c b/kernel/fork.c
index b339918..5fa9f64 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1555,6 +1555,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
struct mm_struct *mm, *oldmm;

tsk->min_flt = tsk->maj_flt = 0;
+ tsk->ngtlbflush = tsk->nrtlbflush = 0;
tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
--
2.7.4