[RFC v2] mm: introduce page pin owner
From: Minchan Kim
Date: Tue Dec 28 2021 - 12:59:13 EST
A Contiguous Memory Allocator(CMA) allocation can fail if any page
within the requested range has an elevated refcount(a pinned page).
Debugging such failures is difficult, because the struct pages only
show a combined refcount, and do not show the callstacks or
backtraces of the code that acquired each refcount. So the source
of the page pins remains a mystery, at the time of CMA failure.
In order to solve this without adding too much overhead, just do
nothing most of the time, which is pretty low overhead. However,
once a CMA failure occurs, then mark the page (this requires a
pointer's worth of space in struct page, but it uses page extensions
to get that), and start tracing the subsequent put_page() calls.
As the program finishes up, each page pin will be undone, and
traced with a backtrace. The programmer reads the trace output and
sees the list of all page pinning code paths.
This will consume an additional 8 bytes per 4KB page, or an
additional 0.2% of RAM. In addition to the storage space, it will
have some performance cost, due to increasing the size of struct
page so that it is greater than the cacheline size (or multiples
thereof) of popular (x86, ...) CPUs.
The idea can apply every user of migrate_pages as well as CMA to
know the reason why the page migration failed. To support it,
the implementation takes "enum migrate_reason" string as filter
of the tracepoint(see below).
Usage)
trace_dir="/sys/kernel/tracing"
echo 1 > $trace_dir/options/stacktrace
echo 1 > $trace_dir/events/page_pin_owner/enable
echo "contig_range" > $trace_dir/events/page_pin_owner/report_page_pinners/filter
===
example:
If you are interested in compaction failure, you want to use
"compaction" as a filter instead of "contig_range".
For the other filter strings, you can refer migrate_reason_names
in mm/debug.c
===
..
run workload
..
cat $trace_dir/trace
..
bash-7442 [007] ...1. 288.504690: report_page_pinners: pfn=0x91430 flags=uptodate|swapbacked count=2 mapcount=0 mapping=ffff88f5480a2709 mt=5 reason=contig_range err=-11
bash-7442 [007] ...1. 288.504691: <stack trace>
=> trace_event_raw_event_report_page_pinners
=> __report_page_pinners
=> migrate_pages
=> alloc_contig_range
=> cma_alloc.cold
=> cma_alloc_write
=> simple_attr_write
=> debugfs_attr_write
=> full_proxy_write
=> vfs_write
=> ksys_write
=> do_syscall_64
=> entry_SYSCALL_64_after_hwframe
..
..
bash-7442 [007] ...1. 288.506426: page_pin_owner_put: pfn=0x91430 flags=uptodate|swapbacked count=2 mapcount=0 mapping=ffff88f5480a2709 mt=5
bash-7442 [007] ...1. 288.506427: <stack trace>
=> trace_event_raw_event_page_pin_owner_put
=> __page_pin_owner_put
=> put_page
=> putback_movable_pages
=> alloc_contig_range
=> cma_alloc.cold
=> cma_alloc_write
=> simple_attr_write
=> debugfs_attr_write
=> full_proxy_write
=> vfs_write
=> ksys_write
=> do_syscall_64
=> entry_SYSCALL_64_after_hwframe
..
cc1-31104 [001] d..2. 288.507632: page_pin_owner_put: pfn=0x91430 flags=uptodate|swapbacked count=1 mapcount=0 mapping=ffff88f5480a2709 mt=4
cc1-31104 [001] d..2. 288.507636: <stack trace>
=> trace_event_raw_event_page_pin_owner_put
=> __page_pin_owner_put
=> release_pages
=> tlb_flush_mmu
=> unmap_page_range
=> unmap_vmas
=> exit_mmap
=> mmput
=> do_exit
=> do_group_exit
=> __x64_sys_exit_group
=> do_syscall_64
=> entry_SYSCALL_64_after_hwframe
..
make-31157 [007] d..3. 288.508333: page_pin_owner_put: pfn=0x91430 flags=uptodate|lru|swapbacked count=0 mapcount=0 mapping=ffff88f5480a2709 mt=5
make-31157 [007] d..3. 288.508335: <stack trace>
=> trace_event_raw_event_page_pin_owner_put
=> __page_pin_owner_put
=> release_pages
=> __pagevec_lru_add
=> lru_add_drain_cpu
=> lru_add_drain
=> exit_mmap
=> mmput
=> begin_new_exec
=> load_elf_binary
=> bprm_execve
=> do_execveat_common.isra.0
=> __x64_sys_execve
=> do_syscall_64
=> entry_SYSCALL_64_after_hwframe
Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
---
* from v1 - https://lore.kernel.org/lkml/20211206184730.858850-1-minchan@xxxxxxxxxx/
* PagePinOwner naming suggestion - jhubbard@
* Description/Kconfig suggestion - jhubbard@
* General migrate_page supports - huww98@
include/linux/mm.h | 7 ++-
include/linux/page_ext.h | 3 +
include/linux/page_pin_owner.h | 48 ++++++++++++++
include/trace/events/page_pin_owner.h | 81 ++++++++++++++++++++++++
mm/Kconfig.debug | 15 +++++
mm/Makefile | 1 +
mm/migrate.c | 5 +-
mm/page_alloc.c | 3 +
mm/page_ext.c | 4 ++
mm/page_pin_owner.c | 91 +++++++++++++++++++++++++++
10 files changed, 256 insertions(+), 2 deletions(-)
create mode 100644 include/linux/page_pin_owner.h
create mode 100644 include/trace/events/page_pin_owner.h
create mode 100644 mm/page_pin_owner.c
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 82be88c1fdbb..5c437faf129c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -26,6 +26,7 @@
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
+#include <linux/page_pin_owner.h>
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
@@ -714,8 +715,12 @@ static inline unsigned int folio_order(struct folio *folio)
*/
static inline int put_page_testzero(struct page *page)
{
+ int ret;
+
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
- return page_ref_dec_and_test(page);
+ ret = page_ref_dec_and_test(page);
+ page_pin_owner_put(page);
+ return ret;
}
static inline int folio_put_testzero(struct folio *folio)
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index fabb2e1e087f..3236efd5ab07 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -23,6 +23,9 @@ enum page_ext_flags {
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
#endif
+#if defined(CONFIG_PAGE_PIN_OWNER)
+ PAGE_EXT_PIN_OWNER,
+#endif
};
/*
diff --git a/include/linux/page_pin_owner.h b/include/linux/page_pin_owner.h
new file mode 100644
index 000000000000..e68adcdd6799
--- /dev/null
+++ b/include/linux/page_pin_owner.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_PAGE_PIN_OWNER_H
+#define __LINUX_PAGE_PIN_OWNER_H
+
+#include <linux/jump_label.h>
+
+#ifdef CONFIG_PAGE_PIN_OWNER
+extern struct static_key_false page_pin_owner_inited;
+extern struct page_ext_operations page_pin_owner_ops;
+
+void __report_page_pinners(struct page *page, int reason, int err);
+void __page_pin_owner_put(struct page *page);
+void __reset_page_pin_owner(struct page *page, unsigned int order);
+
+static inline void reset_page_pin_owner(struct page *page, unsigned int order)
+{
+ if (static_branch_unlikely(&page_pin_owner_inited))
+ __reset_page_pin_owner(page, order);
+}
+
+static inline void report_page_pinners(struct page *page, int reason, int err)
+{
+ if (!static_branch_unlikely(&page_pin_owner_inited))
+ return;
+
+ __report_page_pinners(page, reason, err);
+}
+
+static inline void page_pin_owner_put(struct page *page)
+{
+ if (!static_branch_unlikely(&page_pin_owner_inited))
+ return;
+
+ __page_pin_owner_put(page);
+}
+
+#else
+static inline void reset_page_pin_owner(struct page *page, unsigned int order)
+{
+}
+static inline void report_page_pinners(struct page *page, int reason, int err)
+{
+}
+static inline void page_pin_owner_put(struct page *page)
+{
+}
+#endif /* CONFIG_PAGE_PIN_OWNER */
+#endif /*__LINUX_PAGE_PIN_OWNER_H */
diff --git a/include/trace/events/page_pin_owner.h b/include/trace/events/page_pin_owner.h
new file mode 100644
index 000000000000..0096297312f5
--- /dev/null
+++ b/include/trace/events/page_pin_owner.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM page_pin_owner
+
+#if !defined(_TRACE_PAGE_PIN_OWNER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_PIN_OWNER_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+#include <trace/events/mmflags.h>
+
+TRACE_EVENT(page_pin_owner_put,
+
+ TP_PROTO(struct page *page),
+
+ TP_ARGS(page),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(unsigned long, flags)
+ __field(int, count)
+ __field(int, mapcount)
+ __field(void *, mapping)
+ __field(int, mt)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = page_to_pfn(page);
+ __entry->flags = page->flags;
+ __entry->count = page_ref_count(page);
+ __entry->mapcount = page_mapcount(page);
+ __entry->mapping = page->mapping;
+ __entry->mt = get_pageblock_migratetype(page);
+ ),
+
+ TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d",
+ __entry->pfn,
+ show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)),
+ __entry->count,
+ __entry->mapcount, __entry->mapping, __entry->mt)
+);
+
+TRACE_EVENT(report_page_pinners,
+
+ TP_PROTO(struct page *page, const char *reason, int err),
+
+ TP_ARGS(page, reason, err),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, pfn)
+ __field(unsigned long, flags)
+ __field(int, count)
+ __field(int, mapcount)
+ __field(void *, mapping)
+ __field(int, mt)
+ __field(const char *, reason)
+ __field(int, err)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = page_to_pfn(page);
+ __entry->flags = page->flags;
+ __entry->count = page_ref_count(page);
+ __entry->mapcount = page_mapcount(page);
+ __entry->mapping = page->mapping;
+ __entry->mt = get_pageblock_migratetype(page);
+ __entry->reason = reason;
+ __entry->err = err;
+ ),
+
+ TP_printk("pfn=0x%lx flags=%s count=%d mapcount=%d mapping=%p mt=%d reason=%s err=%d",
+ __entry->pfn,
+ show_page_flags(__entry->flags & ((1UL << NR_PAGEFLAGS) - 1)),
+ __entry->count, __entry->mapcount, __entry->mapping,
+ __entry->mt, __entry->reason, __entry->err)
+);
+
+#endif /* _TRACE_PAGE_PIN_OWNER_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5bd5bb097252..ed2d5e6450b7 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -86,6 +86,21 @@ config PAGE_TABLE_CHECK_ENFORCED
If unsure say "n".
+config PAGE_PIN_OWNER
+ bool "Track page pinner"
+ depends on DEBUG_KERNEL && TRACEPOINTS && STACKTRACE_SUPPORT
+ select PAGE_EXTENSION
+ select STACKTRACE
+ help
+ This keeps track of what call chain is the pinner of a page, may
+ help to find contiguous page allocation(memory-hotplug, compaction,
+ cma and so on) failure. Even if you include this feature in your
+ build, it is disabled by default. You should pass "page_pin_owner=on"
+ to boot parameter in order to enable it. Eats a fair amount of memory
+ if enabled.
+
+ If unsure, say N.
+
config PAGE_POISONING
bool "Poison pages after freeing"
help
diff --git a/mm/Makefile b/mm/Makefile
index 588d3113f3b0..ca89f4fa38ce 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
+obj-$(CONFIG_PAGE_PIN_OWNER) += page_pin_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
diff --git a/mm/migrate.c b/mm/migrate.c
index c7da064b4781..aa8a2c21da8e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1105,6 +1105,8 @@ static int unmap_and_move(new_page_t get_new_page,
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
+ else
+ report_page_pinners(page, reason, rc);
out:
if (rc != -EAGAIN) {
@@ -1273,7 +1275,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (rc == MIGRATEPAGE_SUCCESS) {
move_hugetlb_state(hpage, new_hpage, reason);
put_new_page = NULL;
- }
+ } else
+ report_page_pinners(hpage, reason, rc);
out_unlock:
unlock_page(hpage);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b5d62e1c8d81..98125b1b6e7e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -64,6 +64,7 @@
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
+#include <linux/page_pin_owner.h>
#include <linux/page_table_check.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
@@ -1310,6 +1311,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
+ reset_page_pin_owner(page, order);
page_table_check_free(page, order);
return false;
}
@@ -1350,6 +1352,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
+ reset_page_pin_owner(page, order);
page_table_check_free(page, order);
if (!PageHighMem(page)) {
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2e66d934d63f..3c0df97b9b8b 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -7,6 +7,7 @@
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
+#include <linux/page_pin_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
@@ -79,6 +80,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_TABLE_CHECK
&page_table_check_ops,
#endif
+#ifdef CONFIG_PAGE_PIN_OWNER
+ &page_pin_owner_ops,
+#endif
};
unsigned long page_ext_size = sizeof(struct page_ext);
diff --git a/mm/page_pin_owner.c b/mm/page_pin_owner.c
new file mode 100644
index 000000000000..736617df093c
--- /dev/null
+++ b/mm/page_pin_owner.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/page_pin_owner.h>
+#include <linux/migrate.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_pin_owner.h>
+
+static bool page_pin_owner_enabled;
+DEFINE_STATIC_KEY_FALSE(page_pin_owner_inited);
+EXPORT_SYMBOL(page_pin_owner_inited);
+
+static int __init early_page_pin_owner_param(char *buf)
+{
+ return kstrtobool(buf, &page_pin_owner_enabled);
+}
+early_param("page_pin_owner", early_page_pin_owner_param);
+
+static bool need_page_pin_owner(void)
+{
+ return page_pin_owner_enabled;
+}
+
+static void init_page_pin_owner(void)
+{
+ if (!page_pin_owner_enabled)
+ return;
+
+ static_branch_enable(&page_pin_owner_inited);
+}
+
+struct page_ext_operations page_pin_owner_ops = {
+ .need = need_page_pin_owner,
+ .init = init_page_pin_owner,
+};
+
+void __reset_page_pin_owner(struct page *page, unsigned int order)
+{
+ struct page_ext *page_ext;
+ int i;
+
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ return;
+
+ for (i = 0; i < (1 << order); i++) {
+ if (!test_bit(PAGE_EXT_PIN_OWNER, &page_ext->flags))
+ break;
+
+ clear_bit(PAGE_EXT_PIN_OWNER, &page_ext->flags);
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+void __report_page_pinners(struct page *page, int reason, int err)
+{
+ struct page_ext *page_ext;
+
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ return;
+
+ test_and_set_bit(PAGE_EXT_PIN_OWNER, &page_ext->flags);
+ trace_report_page_pinners(page, migrate_reason_names[reason], err);
+}
+
+void __page_pin_owner_put(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ if (unlikely(!page_ext))
+ return;
+
+ if (!test_bit(PAGE_EXT_PIN_OWNER, &page_ext->flags))
+ return;
+
+ trace_page_pin_owner_put(page);
+}
+EXPORT_SYMBOL(__page_pin_owner_put);
+
+static int __init page_pin_owner_init(void)
+{
+ if (!static_branch_unlikely(&page_pin_owner_inited)) {
+ pr_info("page_pin_owner is disabled\n");
+ return 0;
+ }
+
+ pr_info("page_pin_owner is enabled\n");
+ return 0;
+}
+late_initcall(page_pin_owner_init)
--
2.34.1.448.ga2b2bfdf31-goog