[PATCH RFC v1 3/3] powerpc/memtrace: Don't offline memory blocks via offline_pages()

From: David Hildenbrand
Date: Tue Dec 17 2019 - 07:39:18 EST


offline_pages() should not be called outside of the MM core. Especially,
having to manually fiddle with the memory block states is a sign that
this is not a good idea. To offline memory block devices cleanly,
device_offline() should be used. This is the only remaining caller of
offline_pages(), except the official device_offline() way.

E.g., when trying to allocate right now we trigger messages like
[ 11.227817] page:c00c000000182000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0
[ 11.228056] raw: 007ffff000000000 c000000001538860 c000000001538860 0000000000000000
[ 11.228070] raw: 0000000000000000 0000000000000001 00000001ffffffff 0000000000000000
[ 11.228097] page dumped because: unmovable page

and theoretically we might end up looping quite a long time trying to
offline memory, which would have to be canceled by the user manually
(CTRL-C).

Memtrace needs to identify+allocate multiple consecutive memory blocks.
It also has to remove the memory blocks to remove all page tables
(HW requirement).

Let's use alloc_contig_pages() to allocate memory that spans multiple
memory block devices. We can then set all pages PageOffline() to allow
these pages to get isolated. A temporary memory notifier can then make
offlining of these pages succeed by dropping its reference to the pages
on MEM_GOING_OFFLINE events(as documented in include/linux/page-flags.h
for PageOffline() pages). Error handling is a bit tricky.

Note1: ZONE_MOVABLE memory blocks won't be considered. Not sure if that
was ever really relevant. (unmovable data would end up on these memory
blocks for a tiny little time frame)

Note2: We don't have to care about online_page_callback_t, as we forbid
re-onlining from our memory notifier.

Note3: I was told this feature is never used along with DIMM-based memory
hotunplug - otherwise bad things will happen when a DIMM would try to
remove "alread-removed" memory (that is still in use).

Tested under QEMU with powernv emulation (kernel + initramfs).

$ mount -t debugfs none /sys/kernel/debug/
$ cat /sys/devices/system/memory/block_size_bytes
10000000
$ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable
[ 19.809790] Offlined Pages 4096
[ 19.835842] Offlined Pages 4096
[ 19.853136] memtrace: Allocated trace memory on node 0 at 0x0000000040000000

Unfortunately, QEMU does not support NUMA for powernv yet, so I cannot
test that.

Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
Cc: Paul Mackerras <paulus@xxxxxxxxx>
Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: David Hildenbrand <david@xxxxxxxxxx>
Cc: Allison Randal <allison@xxxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Anshuman Khandual <anshuman.khandual@xxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxxxx>
Cc: Oscar Salvador <osalvador@xxxxxxx>
Cc: Balbir Singh <bsingharora@xxxxxxxxx>
Cc: Rashmica Gupta <rashmica.g@xxxxxxxxx>
Cc: linuxppc-dev@xxxxxxxxxxxxxxxx
Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
---
arch/powerpc/platforms/powernv/Kconfig | 1 +
arch/powerpc/platforms/powernv/memtrace.c | 175 ++++++++++++++--------
2 files changed, 112 insertions(+), 64 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 938803eab0ad..571a0fa9f055 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -29,6 +29,7 @@ config OPAL_PRD
config PPC_MEMTRACE
bool "Enable removal of RAM from kernel mappings for tracing"
depends on PPC_POWERNV && MEMORY_HOTREMOVE
+ select CONTIG_ALLOC
help
Enabling this option allows for the removal of memory (RAM)
from the kernel mappings to be used for hardware tracing.
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 2d2a0a2acd60..fe1e8f3926a1 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -76,83 +76,130 @@ static int memtrace_free_node(int nid, unsigned long start, unsigned long size)
return ret;
}

-static int check_memblock_online(struct memory_block *mem, void *arg)
-{
- if (mem->state != MEM_ONLINE)
- return -1;
-
- return 0;
-}
-
-static int change_memblock_state(struct memory_block *mem, void *arg)
-{
- unsigned long state = (unsigned long)arg;
-
- mem->state = state;
-
- return 0;
-}
+struct memtrace_alloc_info {
+ struct notifier_block memory_notifier;
+ unsigned long base_pfn;
+ unsigned long nr_pages;
+};

-/* called with device_hotplug_lock held */
-static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
+static int memtrace_memory_notifier_cb(struct notifier_block *nb,
+ unsigned long action, void *arg)
{
- const unsigned long start = PFN_PHYS(start_pfn);
- const unsigned long size = PFN_PHYS(nr_pages);
-
- if (walk_memory_blocks(start, size, NULL, check_memblock_online))
- return false;
-
- walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
- change_memblock_state);
-
- if (offline_pages(start_pfn, nr_pages)) {
- walk_memory_blocks(start, size, (void *)MEM_ONLINE,
- change_memblock_state);
- return false;
+ struct memtrace_alloc_info *info = container_of(nb,
+ struct memtrace_alloc_info,
+ memory_notifier);
+ unsigned long pfn, start_pfn, end_pfn;
+ const struct memory_notify *mhp = arg;
+ static bool going_offline;
+
+ /* Ignore ranges that don't overlap. */
+ if (mhp->start_pfn + mhp->nr_pages <= info->base_pfn ||
+ info->base_pfn + info->nr_pages <= mhp->start_pfn)
+ return NOTIFY_OK;
+
+ start_pfn = max_t(unsigned long, info->base_pfn, mhp->start_pfn);
+ end_pfn = min_t(unsigned long, info->base_pfn + info->nr_pages,
+ mhp->start_pfn + mhp->nr_pages);
+
+ /*
+ * Drop our reference to the allocated (PageOffline()) pages, but
+ * reaquire them in case offlining fails. We might get called for
+ * MEM_CANCEL_OFFLINE but not for MEM_GOING_OFFLINE in case another
+ * notifier aborted offlining.
+ */
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ for (pfn = start_pfn; pfn < end_pfn; pfn++)
+ page_ref_dec(pfn_to_page(pfn));
+ going_offline = true;
+ break;
+ case MEM_CANCEL_OFFLINE:
+ if (going_offline)
+ for (pfn = start_pfn; pfn < end_pfn; pfn++)
+ page_ref_inc(pfn_to_page(pfn));
+ going_offline = false;
+ break;
+ case MEM_GOING_ONLINE:
+ /*
+ * While our notifier is active, user space could
+ * offline+re-online this memory. Disallow any such activity.
+ */
+ return notifier_to_errno(-EBUSY);
}
-
- walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
- change_memblock_state);
-
-
- return true;
+ return NOTIFY_OK;
}

static u64 memtrace_alloc_node(u32 nid, u64 size)
{
- u64 start_pfn, end_pfn, nr_pages, pfn;
- u64 base_pfn;
- u64 bytes = memory_block_size_bytes();
+ const unsigned long memory_block_bytes = memory_block_size_bytes();
+ const unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct memtrace_alloc_info info = {
+ .memory_notifier = {
+ .notifier_call = memtrace_memory_notifier_cb,
+ },
+ };
+ unsigned long base_pfn, to_remove_pfn, pfn;
+ struct page *page;
+ int ret;

if (!node_spanned_pages(nid))
return 0;

- start_pfn = node_start_pfn(nid);
- end_pfn = node_end_pfn(nid);
- nr_pages = size >> PAGE_SHIFT;
-
- /* Trace memory needs to be aligned to the size */
- end_pfn = round_down(end_pfn - nr_pages, nr_pages);
-
- lock_device_hotplug();
- for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
- if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
- /*
- * Remove memory in memory block size chunks so that
- * iomem resources are always split to the same size and
- * we never try to remove memory that spans two iomem
- * resources.
- */
- end_pfn = base_pfn + nr_pages;
- for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) {
- __remove_memory(nid, pfn << PAGE_SHIFT, bytes);
- }
- unlock_device_hotplug();
- return base_pfn << PAGE_SHIFT;
- }
+ /*
+ * Try to allocate memory (that might span multiple memory blocks)
+ * on the requested node. Trace memory needs to be aligned to the size,
+ * which is guaranteed by alloc_contig_pages().
+ */
+ page = alloc_contig_pages(nr_pages, __GFP_THISNODE, nid, NULL);
+ if (!page)
+ return 0;
+ to_remove_pfn = base_pfn = page_to_pfn(page);
+ info.base_pfn = base_pfn;
+ info.nr_pages = nr_pages;
+
+ /* PageOffline() allows to isolate the memory when offlining. */
+ for (pfn = base_pfn; pfn < base_pfn + nr_pages; pfn++)
+ __SetPageOffline(pfn_to_page(pfn));
+
+ /* A temporary memory notifier allows to offline the isolated memory. */
+ ret = register_memory_notifier(&info.memory_notifier);
+ if (ret)
+ goto out_free_pages;
+
+ /*
+ * Try to offline and remove all involved memory blocks. This will
+ * only fail in the unlikely event that another memory notifier NACKs
+ * the offlining request - no memory has to be migrated.
+ *
+ * Remove memory in memory block size chunks so that iomem resources
+ * are always split to the same size and we never try to remove memory
+ * that spans two iomem resources.
+ */
+ for (; to_remove_pfn < base_pfn + nr_pages;
+ to_remove_pfn += PHYS_PFN(memory_block_bytes)) {
+ ret = offline_and_remove_memory(nid, PFN_PHYS(to_remove_pfn),
+ memory_block_bytes);
+ if (ret)
+ goto out_readd_memory;
}
- unlock_device_hotplug();

+ unregister_memory_notifier(&info.memory_notifier);
+ return PFN_PHYS(base_pfn);
+out_readd_memory:
+ /* Unregister before adding+onlining (notifer blocks onlining). */
+ unregister_memory_notifier(&info.memory_notifier);
+ if (to_remove_pfn != base_pfn) {
+ ret = memtrace_free_node(nid, PFN_PHYS(base_pfn),
+ PFN_PHYS(to_remove_pfn - base_pfn));
+ if (ret)
+ /* Even more unlikely, log and ignore. */
+ pr_err("Failed to add trace memory to node %d\n", nid);
+ }
+out_free_pages:
+ /* Only free memory that was not temporarily offlined+removed. */
+ for (pfn = to_remove_pfn; pfn < base_pfn + nr_pages; pfn++)
+ __ClearPageOffline(pfn_to_page(pfn));
+ free_contig_range(to_remove_pfn, nr_pages - (to_remove_pfn - base_pfn));
return 0;
}

--
2.23.0