[RFC PATCH 3/5] x86/ibs: Enable per-process IBS from sched switch path
From: Bharata B Rao
Date:  Wed Feb 08 2023 - 02:37:09 EST
Program IBS for access profiling for threads from the
task sched switch path. IBS is programmed with a period
that corresponds to the incoming thread. Kernel threads are
excluded from this.
The sample period is currently kept at a fixed value of 10000.
Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
 arch/x86/mm/ibs.c     | 27 +++++++++++++++++++++++++++
 include/linux/sched.h |  1 +
 kernel/sched/core.c   |  1 +
 kernel/sched/fair.c   |  1 +
 kernel/sched/sched.h  |  5 +++++
 5 files changed, 35 insertions(+)
diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
index adbc587b1767..a479029e9262 100644
--- a/arch/x86/mm/ibs.c
+++ b/arch/x86/mm/ibs.c
@@ -8,6 +8,7 @@
 #include <asm/perf_event.h> /* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
 #include <asm/apic.h>
 
+#define IBS_SAMPLE_PERIOD      10000
 static u64 ibs_config __read_mostly;
 
 struct ibs_access_work {
@@ -15,6 +16,31 @@ struct ibs_access_work {
 	u64 laddr, paddr;
 };
 
+void hw_access_sched_in(struct task_struct *prev, struct task_struct *curr)
+{
+	u64 config = 0;
+	unsigned int period;
+
+	if (!static_branch_unlikely(&hw_access_hints))
+		return;
+
+	/* Disable IBS for kernel thread */
+	if (!curr->mm)
+		goto out;
+
+	if (curr->numa_sample_period)
+		period = curr->numa_sample_period;
+	else
+		period = IBS_SAMPLE_PERIOD;
+
+
+	config = (period >> 4)  & IBS_OP_MAX_CNT;
+	config |= (period & IBS_OP_MAX_CNT_EXT_MASK);
+	config |= ibs_config;
+out:
+	wrmsrl(MSR_AMD64_IBSOPCTL, config);
+}
+
 void task_ibs_access_work(struct callback_head *work)
 {
 	struct ibs_access_work *iwork = container_of(work, struct ibs_access_work, work);
@@ -198,6 +224,7 @@ int __init ibs_access_profiling_init(void)
 			  x86_amd_ibs_access_profile_startup,
 			  x86_amd_ibs_access_profile_teardown);
 
+	static_branch_enable(&hw_access_hints);
 	pr_info("IBS access profiling setup for NUMA Balancing\n");
 	return 0;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 19dd4ee07436..66c532418d38 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1254,6 +1254,7 @@ struct task_struct {
 	int				numa_scan_seq;
 	unsigned int			numa_scan_period;
 	unsigned int			numa_scan_period_max;
+	unsigned int			numa_sample_period;
 	int				numa_preferred_nid;
 	unsigned long			numa_migrate_retry;
 	/* Migration stamp: */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e838feb6adc5..1c13fed8bebc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5165,6 +5165,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	prev_state = READ_ONCE(prev->__state);
 	vtime_task_switch(prev);
 	perf_event_task_sched_in(prev, current);
+	hw_access_sched_in(prev, current);
 	finish_task(prev);
 	tick_nohz_task_switch();
 	finish_lock_switch(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c9b9e62da779..3f617c799821 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3094,6 +3094,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 	p->node_stamp			= 0;
 	p->numa_scan_seq		= mm ? mm->numa_scan_seq : 0;
 	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
+	p->numa_sample_period		= 0;
 	p->numa_migrate_retry		= 0;
 	/* Protect against double add, see task_tick_numa and task_numa_work */
 	p->numa_work.next		= &p->numa_work;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..953d16c802d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1723,11 +1723,16 @@ extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *p, struct task_struct *t,
 			int cpu, int scpu);
 extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+void hw_access_sched_in(struct task_struct *prev, struct task_struct *curr);
 #else
 static inline void
 init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 {
 }
+static inline void hw_access_sched_in(struct task_struct *prev,
+				      struct task_struct *curr)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_SMP
-- 
2.25.1