[RFC PATCH v7 7/7] x86/mm/ibs: In-kernel driver for AMD IBS Memory Profiler
From: Bharata B Rao
Date: Mon May 04 2026 - 02:11:34 EST
Use the IBS (Instruction Based Sampling) Memory Profiler feature
present in AMD Zen6 processors for memory access tracking. The
access information obtained from the IBS Memory Profiler is fed to
the pghot sub-system for further action using the
pghot_record_access(..., PGHOT_HWHINTS, ...) API.
The IBS Memory Profiler as a page hotness source is enabled by the
new config option AMD_IBS_MEMPROF (which selects the new hidden
umbrella symbol HWMEM_PROFILER) and is also gated by the
existing pghot_src_hwhints static key set via debugfs.
More details about IBS Memory Profiler can be obtained from
the AMD document titled "AMD64 Zen6 Instruction Based Sampling (IBS)
Extensions and Features".
Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
arch/x86/Kconfig | 16 ++
arch/x86/include/asm/ibs-caps.h | 8 +
arch/x86/include/asm/ibs-mprof.h | 46 +++++
arch/x86/include/asm/msr-index.h | 8 +
arch/x86/mm/Makefile | 1 +
arch/x86/mm/ibs-mprof.c | 308 +++++++++++++++++++++++++++++++
include/linux/cpuhotplug.h | 1 +
include/linux/vm_event_item.h | 6 +
mm/Kconfig | 9 +
mm/vmstat.c | 6 +
10 files changed, 409 insertions(+)
create mode 100644 arch/x86/include/asm/ibs-mprof.h
create mode 100644 arch/x86/mm/ibs-mprof.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 99bb5217649a..f06c0c44ecce 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1514,6 +1514,22 @@ config AMD_MEM_ENCRYPT
This requires an AMD processor that supports Secure Memory
Encryption (SME).
+config AMD_IBS_MEMPROF
+ bool "AMD IBS Memory Profiler"
+ depends on X86_64 && CPU_SUP_AMD
+ depends on PGHOT
+ select HWMEM_PROFILER
+ help
+ Use the AMD Instruction Based Sampling (IBS) Memory Profiler
+ facility (present on Zen6 and later AMD CPUs) to feed
+ hardware-observed memory accesses into the pghot subsystem
+ for hot-page detection and promotion.
+
+ When disabled, no IBS Memory Profiler MSRs are programmed and
+ the corresponding NMI handler is not installed.
+
+ If unsure, say N.
+
# Common NUMA Features
config NUMA
bool "NUMA Memory Allocation and Scheduler Support"
diff --git a/arch/x86/include/asm/ibs-caps.h b/arch/x86/include/asm/ibs-caps.h
index ddf6c512c8f9..1f6c4058a0e3 100644
--- a/arch/x86/include/asm/ibs-caps.h
+++ b/arch/x86/include/asm/ibs-caps.h
@@ -29,6 +29,7 @@
#define IBS_CAPS_FETCHLAT (1U<<14)
#define IBS_CAPS_BIT63_FILTER (1U<<15)
#define IBS_CAPS_STRMST_RMTSOCKET (1U<<16)
+#define IBS_CAPS_MEM_PROFILER (1U<<18)
#define IBS_CAPS_OPDTLBPGSIZE (1U<<19)
#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \
@@ -42,6 +43,13 @@
#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8)
#define IBSCTL_LVT_OFFSET_MASK 0x0F
+/*
+ * IBS Memprofiler setup
+ */
+#define IBSCTL_MPROF_LVT_OFFSET_VALID (1ULL << 24)
+#define IBSCTL_MPROF_LVT_OFFSET_SHIFT 16
+#define IBSCTL_MPROF_LVT_OFFSET_MASK (0xFULL << IBSCTL_MPROF_LVT_OFFSET_SHIFT)
+
/* IBS fetch bits/masks */
#define IBS_FETCH_L3MISSONLY (1ULL << 59)
#define IBS_FETCH_RAND_EN (1ULL << 57)
diff --git a/arch/x86/include/asm/ibs-mprof.h b/arch/x86/include/asm/ibs-mprof.h
new file mode 100644
index 000000000000..91b1ce51d667
--- /dev/null
+++ b/arch/x86/include/asm/ibs-mprof.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IBS_MPROF_H
+#define _ASM_X86_IBS_MPROF_H
+
+/* BIT_ULL() is used below; include it so this header is self-contained. */
+#include <linux/bits.h>
+
+/*
+ * Register bit definitions for the AMD IBS Memory Profiler MSRs.
+ *
+ * All bits are documented here for clarity even if the current
+ * driver doesn't use all of them.
+ */
+
+/*
+ * MSR_AMD64_IBS_MPROF_DATA2 bits
+ *
+ * The data source is split across two fields: the low part is DATA2[2:0]
+ * (DATASRC_MASK) and the high part is DATA2[7:6] (DATASRC_MASK_HIGH).
+ * The high part is shifted right by DATASRC_MASK_HIGH_SHIFT so that it
+ * lands in bits 4:3 of the combined value; the IBS_MPROF_DATA2_DATASRC_*
+ * constants below are values of that combined field.
+ */
+#define IBS_MPROF_DATA2_DATASRC_MASK		0x7
+#define IBS_MPROF_DATA2_DATASRC_MASK_HIGH	0xC0
+#define IBS_MPROF_DATA2_DATASRC_MASK_HIGH_SHIFT	0x3
+#define IBS_MPROF_DATA2_DATASRC_LCL_CCX		0x1
+#define IBS_MPROF_DATA2_DATASRC_PEER_CCX_NEAR	0x2
+#define IBS_MPROF_DATA2_DATASRC_DRAM		0x3
+#define IBS_MPROF_DATA2_DATASRC_CCX_FAR		0x5
+#define IBS_MPROF_DATA2_DATASRC_EXT_MEM		0x8
+#define IBS_MPROF_DATA2_RMT_NODE		BIT_ULL(4)
+#define IBS_MPROF_DATA2_RMT_SOCKET		BIT_ULL(9)
+
+/* MSR_AMD64_IBS_MPROF_DATA3 bits */
+#define IBS_MPROF_DATA3_LDOP			BIT_ULL(0)
+#define IBS_MPROF_DATA3_STOP			BIT_ULL(1)
+#define IBS_MPROF_DATA3_DCMISS			BIT_ULL(7)
+#define IBS_MPROF_DATA3_LADDR_VALID		BIT_ULL(17)
+#define IBS_MPROF_DATA3_PADDR_VALID		BIT_ULL(18)
+#define IBS_MPROF_DATA3_L2MISS			BIT_ULL(20)
+#define IBS_MPROF_DATA3_SW_PREFETCH		BIT_ULL(21)
+
+/* MSR_AMD64_IBS_MPROF_CTL bits */
+#define IBS_MPROF_CTL_CNT_CTL			BIT_ULL(19)
+#define IBS_MPROF_CTL_VAL			BIT_ULL(18)
+#define IBS_MPROF_CTL_ENABLE			BIT_ULL(17)
+#define IBS_MPROF_CTL_L3MISSONLY		BIT_ULL(16)
+#define IBS_MPROF_CTL_MAXCNT_MASK		0x0000FFFFULL
+#define IBS_MPROF_CTL_MAXCNT_EXT_MASK		(0x7FULL << 20) /* separate upper 7 bits */
+
+/* MSR_AMD64_IBS_MPROF_CTL2 bits */
+#define IBS_MPROF_CTL2_DISABLE			BIT_ULL(0)
+#define IBS_MPROF_CTL2_EXCLUDE_USER		BIT_ULL(1)
+#define IBS_MPROF_CTL2_EXCLUDE_KERNEL		BIT_ULL(2)
+
+/*
+ * Sampling period programmed into the MAXCNT fields of MPROF_CTL.
+ * The driver drops the low 4 bits when programming it.
+ * NOTE(review): unit is presumably ops between samples - confirm
+ * against the AMD IBS documentation.
+ */
+#define IBS_MPROF_SAMPLE_PERIOD			10000
+
+#endif /* _ASM_X86_IBS_MPROF_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index a14a0f43e04a..c44b68940f43 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -1315,4 +1315,12 @@
* a #GP
*/
+/* AMD IBS Memory Profiler MSRs */
+#define MSR_AMD64_IBS_MPROF_CTL 0xc0010380
+#define MSR_AMD64_IBS_MPROF_CTL2 0xc0010381
+#define MSR_AMD64_IBS_MPROF_DATA2 0xc0010382
+#define MSR_AMD64_IBS_MPROF_DATA3 0xc0010383
+#define MSR_AMD64_IBS_MPROF_LINADDR 0xc0010384
+#define MSR_AMD64_IBS_MPROF_PHYADDR 0xc0010385
+
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 3a5364853eab..050a7379d9f7 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -59,3 +59,4 @@ obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
+obj-$(CONFIG_AMD_IBS_MEMPROF) += ibs-mprof.o
diff --git a/arch/x86/mm/ibs-mprof.c b/arch/x86/mm/ibs-mprof.c
new file mode 100644
index 000000000000..b3d59b21c8c9
--- /dev/null
+++ b/arch/x86/mm/ibs-mprof.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "amd_ibs_memprof: " fmt
+
+#include <linux/init.h>
+#include <linux/pghot.h>
+#include <linux/percpu.h>
+#include <linux/workqueue.h>
+#include <linux/irq_work.h>
+#include <linux/mm.h>
+#include <linux/vm_event_item.h>
+#include <linux/vmstat.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/ibs-mprof.h>
+#include <asm/ibs-caps.h>
+#include <asm/nmi.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+
+#define IBS_NR_SAMPLES 150 /* Percpu sample buffer size */
+
+/*
+ * Per-CPU flag set by the NMI handler when it queues the irq_work and
+ * cleared by the work handler. While it is set, the NMI handler does
+ * not queue the irq_work again.
+ */
+static DEFINE_PER_CPU(bool, mprof_work_pending);
+
+/*
+ * Basic access info captured for each memory access.
+ * Filled in by the NMI handler and later handed, field by field,
+ * to pghot_record_access() by the work handler.
+ */
+struct mprof_sample {
+ unsigned long pfn; /* PFN of the reported physical address */
+ unsigned long time; /* jiffies when accessed */
+ int nid; /* Accessing node ID, if known */
+};
+
+/*
+ * Percpu buffer of access samples. Samples are accumulated here
+ * before pushing them to pghot sub-system for further action.
+ *
+ * The buffer is used as a ring: the producer (mprof_push_sample())
+ * advances head, the consumer (mprof_pop_sample()) advances tail.
+ * head == tail means empty; the producer refuses to advance head onto
+ * tail, so at most IBS_NR_SAMPLES - 1 samples are held at a time.
+ */
+struct mprof_sample_pcpu {
+ struct mprof_sample samples[IBS_NR_SAMPLES];
+ int head, tail; /* next slot to write / next slot to read */
+};
+
+/* Per-CPU sample rings; allocated in mprof_access_profiling_init(). */
+static struct mprof_sample_pcpu __percpu *mprof_s;
+
+/*
+ * Per-CPU work item that pushes the percpu access samples to the pghot
+ * sub-system, and the per-CPU irq_work that the NMI handler queues in
+ * order to get that work item scheduled.
+ */
+static DEFINE_PER_CPU(struct work_struct, mprof_work);
+static DEFINE_PER_CPU(struct irq_work, mprof_irq_work);
+
+/*
+ * Record the IBS-reported access sample in percpu buffer.
+ * Called from IBS NMI handler.
+ *
+ * Returns true if the sample was stored in this CPU's ring, false if
+ * the ring was full and the sample was dropped.
+ */
+static bool mprof_push_sample(unsigned long pfn, int nid, unsigned long time)
+{
+ struct mprof_sample_pcpu *pcpu = raw_cpu_ptr(mprof_s);
+ int head = READ_ONCE(pcpu->head);
+ int tail = READ_ONCE(pcpu->tail);
+ int next = head + 1;
+
+ /* Wrap around at the end of the array. */
+ if (next >= IBS_NR_SAMPLES)
+ next = 0;
+
+ /* Advancing head onto tail would make a full ring look empty. */
+ if (next == tail)
+ return false;
+
+ pcpu->samples[head].pfn = pfn;
+ pcpu->samples[head].time = time;
+ pcpu->samples[head].nid = nid;
+
+ /* Publish head only after the slot contents have been written. */
+ smp_store_release(&pcpu->head, next);
+ return true;
+}
+
+/*
+ * Remove the oldest sample from this CPU's ring and copy it to @s.
+ * Called from the work handler.
+ *
+ * Returns true if a sample was copied to @s, false if the ring was
+ * empty (in which case @s is left untouched).
+ */
+static bool mprof_pop_sample(struct mprof_sample *s)
+{
+ struct mprof_sample_pcpu *pcpu = raw_cpu_ptr(mprof_s);
+ int tail = READ_ONCE(pcpu->tail);
+ /* Pairs with the smp_store_release() of head in mprof_push_sample(). */
+ int head = smp_load_acquire(&pcpu->head);
+ int next = tail + 1;
+
+ if (head == tail)
+ return false;
+
+ /* Wrap around at the end of the array. */
+ if (next >= IBS_NR_SAMPLES)
+ next = 0;
+
+ *s = pcpu->samples[tail];
+
+ /* Release the slot to the producer only after it has been copied out. */
+ WRITE_ONCE(pcpu->tail, next);
+ return true;
+}
+
+/*
+ * Remove access samples from percpu buffer and send them
+ * to pghot sub-system for further action.
+ *
+ * mprof_work_pending is cleared *before* the ring is drained. If it were
+ * cleared afterwards, a sample pushed by an NMI between the final (empty)
+ * pop and the clear would find the flag still set, skip queueing the
+ * irq_work, and then sit in the ring until the next sample arrives on
+ * this CPU. With the flag cleared first, such an NMI queues the irq_work
+ * again; at worst this handler runs one extra time and finds the ring
+ * empty.
+ */
+static void mprof_work_handler(struct work_struct *work)
+{
+	struct mprof_sample s;
+
+	this_cpu_write(mprof_work_pending, false);
+	/* Keep the compiler from hoisting the ring reads above the clear. */
+	barrier();
+
+	while (mprof_pop_sample(&s))
+		pghot_record_access(s.pfn, s.nid, PGHOT_HWHINTS, s.time);
+}
+
+/*
+ * irq_work callback queued by the NMI handler: schedules this CPU's
+ * mprof_work item on the CPU the callback is running on.
+ */
+static void mprof_irq_handler(struct irq_work *i)
+{
+	/*
+	 * FIXME: pending samples on a CPU that goes offline before the
+	 * work runs may be lost or migrated to the wrong CPU's ring;
+	 * needs a teardown-time drain.
+	 */
+	schedule_work_on(smp_processor_id(), this_cpu_ptr(&mprof_work));
+}
+
+/*
+ * Arm the IBS Memory Profiler on the current CPU.
+ *
+ * Programs MPROF_CTL with CNT_CTL, ENABLE and L3MISSONLY set plus the
+ * sampling period, then programs MPROF_CTL2 to exclude kernel RIPs.
+ */
+static void mprof_enable_profiling(void)
+{
+	const u64 period = IBS_MPROF_SAMPLE_PERIOD;
+	u64 ctl;
+
+	ctl = IBS_MPROF_CTL_CNT_CTL | IBS_MPROF_CTL_ENABLE |
+	      IBS_MPROF_CTL_L3MISSONLY;
+
+	/*
+	 * The period is split over two fields: period bits 19:4 go into
+	 * the low MAXCNT field and period bits 26:20 stay in place in the
+	 * extended field. The lower 4 bits of the period are always 0000b.
+	 */
+	ctl |= (period >> 4) & IBS_MPROF_CTL_MAXCNT_MASK;
+	ctl |= period & IBS_MPROF_CTL_MAXCNT_EXT_MASK;
+	wrmsrq(MSR_AMD64_IBS_MPROF_CTL, ctl);
+
+	/* Exclude samples that have bit 63 of their RIP set. */
+	wrmsrq(MSR_AMD64_IBS_MPROF_CTL2, IBS_MPROF_CTL2_EXCLUDE_KERNEL);
+}
+
+/*
+ * Stop the IBS Memory Profiler on the current CPU.
+ *
+ * @mem_ctl: current MPROF_CTL value as read by the caller; it is written
+ *           back with the ENABLE and VAL bits cleared, after which
+ *           MPROF_CTL2 is set to DISABLE.
+ */
+static void mprof_disable_profiling(u64 mem_ctl)
+{
+	const u64 clear_bits = IBS_MPROF_CTL_ENABLE | IBS_MPROF_CTL_VAL;
+
+	wrmsrq(MSR_AMD64_IBS_MPROF_CTL, mem_ctl & ~clear_bits);
+	wrmsrq(MSR_AMD64_IBS_MPROF_CTL2, IBS_MPROF_CTL2_DISABLE);
+}
+
+/*
+ * IBS NMI handler: Process the memory access info reported by IBS.
+ *
+ * Reads the MSRs to collect all the information about the reported
+ * memory access, validates the access, stores the valid sample and
+ * schedules the work on this CPU to further process the sample.
+ *
+ * Returns NMI_DONE when MPROF_CTL does not have the VAL bit set (the NMI
+ * is not claimed), NMI_HANDLED otherwise. Whenever the NMI is claimed,
+ * profiling is stopped on entry and re-armed before returning, whether
+ * or not the sample was kept.
+ */
+static int mprof_overflow_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ u64 mem_ctl, mem_data3, mem_data2, paddr, data_src;
+ unsigned long pfn;
+ struct page *page;
+
+ /* No valid sample latched: leave the NMI for other handlers. */
+ rdmsrq(MSR_AMD64_IBS_MPROF_CTL, mem_ctl);
+ if (!(mem_ctl & IBS_MPROF_CTL_VAL))
+ return NMI_DONE;
+
+ /* Stop sampling while the sample MSRs are read; re-armed at "handled". */
+ mprof_disable_profiling(mem_ctl);
+ count_vm_event(HWHINT_TOTAL_EVENTS);
+
+ rdmsrq(MSR_AMD64_IBS_MPROF_DATA3, mem_data3);
+ rdmsrq(MSR_AMD64_IBS_MPROF_DATA2, mem_data2);
+
+ /* Combine DATA2[2:0] with DATA2[7:6] moved down to bits 4:3. */
+ data_src = mem_data2 & IBS_MPROF_DATA2_DATASRC_MASK;
+ data_src |= ((mem_data2 & IBS_MPROF_DATA2_DATASRC_MASK_HIGH) >>
+ IBS_MPROF_DATA2_DATASRC_MASK_HIGH_SHIFT);
+
+ /* Only DRAM and extended-memory sources have their own counters. */
+ switch (data_src) {
+ case IBS_MPROF_DATA2_DATASRC_DRAM:
+ count_vm_event(HWHINT_DRAM_ACCESSES);
+ break;
+ case IBS_MPROF_DATA2_DATASRC_EXT_MEM:
+ count_vm_event(HWHINT_EXTMEM_ACCESSES);
+ break;
+ }
+
+ /* Is linear addr valid? */
+ if (!(mem_data3 & IBS_MPROF_DATA3_LADDR_VALID))
+ goto handled;
+
+ /* Is phys addr valid? */
+ if (!(mem_data3 & IBS_MPROF_DATA3_PADDR_VALID))
+ goto handled;
+ rdmsrq(MSR_AMD64_IBS_MPROF_PHYADDR, paddr);
+
+ /* Drop samples whose PFN does not map to an online page. */
+ pfn = PHYS_PFN(paddr);
+ page = pfn_to_online_page(pfn);
+ if (!page)
+ goto handled;
+
+ /*
+ * Use the accessing CPU's node as the migration target. On
+ * topologies where all CPUs reside on toptier nodes (the common
+ * case), this is the desired behaviour. Topologies that place
+ * CPUs on lower-tier nodes are rejected later by
+ * pghot_record_access() via the src_nid == nid early return.
+ */
+ /* A false return means the ring is full and the sample is dropped. */
+ if (!mprof_push_sample(pfn, numa_node_id(), jiffies))
+ goto handled;
+
+ /* Queue the irq_work only if none is already outstanding on this CPU. */
+ if (!this_cpu_read(mprof_work_pending)) {
+ this_cpu_write(mprof_work_pending, true);
+ irq_work_queue(this_cpu_ptr(&mprof_irq_work));
+ }
+ /* Counted only for samples that were actually stored in the ring. */
+ count_vm_event(HWHINT_USEFUL_EVENTS);
+
+handled:
+ mprof_enable_profiling();
+ return NMI_HANDLED;
+}
+
+/*
+ * Read the Memory Profiler LVT offset from MSR_AMD64_IBSCTL.
+ *
+ * Returns the offset field when the MPROF_LVT_OFFSET_VALID bit is set,
+ * -EINVAL otherwise.
+ */
+static int get_mprof_lvt_offset(void)
+{
+	u64 ibsctl;
+
+	rdmsrq(MSR_AMD64_IBSCTL, ibsctl);
+	if (ibsctl & IBSCTL_MPROF_LVT_OFFSET_VALID)
+		return (ibsctl & IBSCTL_MPROF_LVT_OFFSET_MASK) >>
+		       IBSCTL_MPROF_LVT_OFFSET_SHIFT;
+
+	return -EINVAL;
+}
+
+/*
+ * CPU hotplug startup callback: route the Memory Profiler's extended LVT
+ * entry to NMI delivery and start profiling on @cpu.
+ *
+ * Failures to find a valid LVT offset or to set up the LVT entry are
+ * only logged; the callback always returns 0, and profiling is simply
+ * not started in those cases.
+ */
+static int x86_amd_ibs_mprof_startup(unsigned int cpu)
+{
+	int lvt_off = get_mprof_lvt_offset();
+
+	if (lvt_off < 0)
+		pr_warn("offset not valid on cpu #%d\n", cpu);
+	else if (setup_APIC_eilvt(lvt_off, 0, APIC_DELIVERY_MODE_NMI, 0))
+		pr_warn("APIC setup failed on cpu #%d\n", cpu);
+	else
+		mprof_enable_profiling();
+
+	return 0;
+}
+
+/*
+ * CPU hotplug teardown callback: reprogram the Memory Profiler's extended
+ * LVT entry (when its offset is valid) and stop profiling on @cpu.
+ * Always returns 0.
+ */
+static int x86_amd_ibs_mprof_teardown(unsigned int cpu)
+{
+	int lvt_off = get_mprof_lvt_offset();
+	u64 ctl;
+
+	/*
+	 * Switch the entry back to fixed delivery.
+	 * NOTE(review): the trailing 1 is presumably the mask bit - confirm
+	 * against setup_APIC_eilvt().
+	 */
+	if (lvt_off >= 0)
+		setup_APIC_eilvt(lvt_off, 0, APIC_DELIVERY_MODE_FIXED, 1);
+
+	rdmsrq(MSR_AMD64_IBS_MPROF_CTL, ctl);
+	mprof_disable_profiling(ctl);
+
+	return 0;
+}
+
+/*
+ * Driver init: detect the IBS Memory Profiler, allocate the per-CPU
+ * sample rings, initialise the per-CPU work/irq_work items, install the
+ * NMI handler and register the CPU hotplug callbacks that start and stop
+ * profiling on each CPU.
+ *
+ * Always returns 0 so that a missing capability or a setup failure does
+ * not fail the initcall; failures are logged and everything set up so
+ * far is undone.
+ */
+static int __init mprof_access_profiling_init(void)
+{
+	u32 mprof_caps = 0;
+	int cpu, ret;
+
+	/*
+	 * Only query the IBS capability leaf when the CPU advertises IBS and
+	 * the leaf is within the supported extended range. Reading it
+	 * unconditionally can return unrelated data on CPUs that do not
+	 * implement the leaf, which could be misread as the capability bit.
+	 */
+	if (boot_cpu_has(X86_FEATURE_IBS) &&
+	    cpuid_eax(0x80000000) >= IBS_CPUID_FEATURES)
+		mprof_caps = cpuid_eax(IBS_CPUID_FEATURES);
+
+	if (!(mprof_caps & IBS_CAPS_MEM_PROFILER)) {
+		pr_info("capability is unavailable for access profiling\n");
+		return 0;
+	}
+
+	mprof_s = alloc_percpu_gfp(struct mprof_sample_pcpu, GFP_KERNEL | __GFP_ZERO);
+	if (!mprof_s) {
+		pr_err("alloc_percpu_gfp failed\n");
+		return 0;
+	}
+
+	for_each_possible_cpu(cpu) {
+		INIT_WORK(per_cpu_ptr(&mprof_work, cpu), mprof_work_handler);
+		init_irq_work(per_cpu_ptr(&mprof_irq_work, cpu), mprof_irq_handler);
+	}
+
+	/*
+	 * The handler must be in place before any CPU starts profiling;
+	 * do not enable sampling NMIs if it could not be installed.
+	 */
+	ret = register_nmi_handler(NMI_LOCAL, mprof_overflow_handler, 0, "ibs-memprof");
+	if (ret) {
+		pr_err("register_nmi_handler failed: %d\n", ret);
+		goto out_free;
+	}
+
+	ret = cpuhp_setup_state(CPUHP_AP_MM_AMD_IBS_MEMPROF_STARTING,
+				"x86/amd/ibs_mprof:starting",
+				x86_amd_ibs_mprof_startup,
+				x86_amd_ibs_mprof_teardown);
+	if (ret) {
+		pr_err("cpuhp_setup_state failed: %d\n", ret);
+		goto out_nmi;
+	}
+
+	pr_info("IBS Memory Profiler setup for memory access profiling\n");
+	return 0;
+
+out_nmi:
+	unregister_nmi_handler(NMI_LOCAL, "ibs-memprof");
+out_free:
+	free_percpu(mprof_s);
+	/* Do not leave a dangling pointer to the freed per-CPU area. */
+	mprof_s = NULL;
+	return 0;
+}
+
+device_initcall(mprof_access_profiling_init);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 22ba327ec227..feaa3f571726 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -150,6 +150,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
CPUHP_AP_PERF_X86_STARTING,
CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
+ CPUHP_AP_MM_AMD_IBS_MEMPROF_STARTING,
CPUHP_AP_PERF_XTENSA_STARTING,
CPUHP_AP_ARM_VFP_STARTING,
CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 58d510711bd4..a9c04a9735c6 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -179,6 +179,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGHOT_RECORDED_ACCESSES,
PGHOT_RECORDED_HINTFAULTS,
PGHOT_RECORDED_HWHINTS,
+#ifdef CONFIG_HWMEM_PROFILER
+ HWHINT_TOTAL_EVENTS,
+ HWHINT_DRAM_ACCESSES,
+ HWHINT_EXTMEM_ACCESSES,
+ HWHINT_USEFUL_EVENTS,
+#endif /* CONFIG_HWMEM_PROFILER */
#endif /* CONFIG_PGHOT */
NR_VM_EVENT_ITEMS
};
diff --git a/mm/Kconfig b/mm/Kconfig
index cc4b5685ecd4..674cfcea7bb0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1494,6 +1494,15 @@ config PGHOT_PRECISE
4 bytes per page against the default one byte per page. Preferable
to enable this on systems with multiple nodes in toptier.
+config HWMEM_PROFILER
+ bool
+ depends on PGHOT
+ help
+ Umbrella symbol enabled by any in-kernel driver that forwards
+ hardware-observed memory accesses to the pghot subsystem (for
+ example AMD_IBS_MEMPROF on x86_64). Drivers select this; users
+ do not enable it directly.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/vmstat.c b/mm/vmstat.c
index da668ff05032..06e7ae06519e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1493,6 +1493,12 @@ const char * const vmstat_text[] = {
[I(PGHOT_RECORDED_ACCESSES)] = "pghot_recorded_accesses",
[I(PGHOT_RECORDED_HINTFAULTS)] = "pghot_recorded_hintfaults",
[I(PGHOT_RECORDED_HWHINTS)] = "pghot_recorded_hwhints",
+#ifdef CONFIG_HWMEM_PROFILER
+ [I(HWHINT_TOTAL_EVENTS)] = "hwhint_total_events",
+ [I(HWHINT_DRAM_ACCESSES)] = "hwhint_dram_accesses",
+ [I(HWHINT_EXTMEM_ACCESSES)] = "hwhint_extmem_accesses",
+ [I(HWHINT_USEFUL_EVENTS)] = "hwhint_useful_events",
+#endif /* CONFIG_HWMEM_PROFILER */
#endif /* CONFIG_PGHOT */
#undef I
#endif /* CONFIG_VM_EVENT_COUNTERS */
--
2.34.1