[PATCH v3 3/6] mm: handle poisoning of pfn without struct pages

From: ankita
Date: Wed Apr 05 2023 - 14:02:58 EST


From: Ankit Agrawal <ankita@xxxxxxxxxx>

The kernel MM does not currently handle ECC errors / poison on a memory
region that is not backed by struct pages. In this series, the mapping
request from QEMU to the device memory is executed using remap_pfn_range().
Hence, add a new mechanism to handle memory failure on such memory.

Make the kernel MM expose a function that allows modules managing the
device memory to register a failure function and the address space that is
associated with the device memory. MM maintains this information in an
interval tree. The registered memory failure function is used by MM to
notify the module of the poisoned PFN, so that the module may take any
required action. For example, the module may use the information to track
the poisoned pages.

In this implementation, the kernel MM follows a sequence that is (mostly)
similar to the memory_failure() handler for struct page backed memory:
1. memory_failure() is triggered on reception of a poison error. The
absence of a struct page is detected and consequently memory_failure_pfn()
is executed.
2. memory_failure_pfn() calls the newly introduced failure handler exposed
by the module managing the poisoned memory to notify it of the problematic
PFN.
3. memory_failure_pfn() unmaps the stage-2 mapping to the PFN.
4. memory_failure_pfn() collects the processes mapped to the PFN.
5. memory_failure_pfn() sends SIGBUS (BUS_MCEERR_AO) to all the processes
mapping the faulty PFN using kill_procs().
6. A later access to the faulty PFN by an operation in the VM is trapped
and user_mem_abort() is called.
7. user_mem_abort() calls __gfn_to_pfn_memslot() on the PFN, and the
following execution path is followed: __gfn_to_pfn_memslot() ->
hva_to_pfn() -> hva_to_pfn_remapped() -> fixup_user_fault() ->
handle_mm_fault() -> handle_pte_fault() -> do_fault(). do_fault() is
expected to return VM_FAULT_HWPOISON on the PFN (it currently does not
and is fixed as part of another patch in the series).
8. __gfn_to_pfn_memslot() then returns KVM_PFN_ERR_HWPOISON, which causes
the poison to be reported with SIGBUS (BUS_MCEERR_AR) sent to the QEMU
process through kvm_send_hwpoison_signal().

Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
---
include/linux/memory-failure.h | 22 +++++
include/linux/mm.h | 1 +
include/ras/ras_event.h | 1 +
mm/memory-failure.c | 148 +++++++++++++++++++++++++++++----
4 files changed, 154 insertions(+), 18 deletions(-)
create mode 100644 include/linux/memory-failure.h

diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h
new file mode 100644
index 000000000000..9a579960972a
--- /dev/null
+++ b/include/linux/memory-failure.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_FAILURE_H
+#define _LINUX_MEMORY_FAILURE_H
+
+#include <linux/interval_tree.h>
+
+struct pfn_address_space;
+
+/**
+ * struct pfn_address_space_ops - callbacks for a registered PFN address space
+ * @failure: invoked by memory_failure_pfn() with the owning @pfn_space and
+ *           the poisoned @pfn, so that the module can take any required
+ *           action (for example, track the poisoned PFN).
+ */
+struct pfn_address_space_ops {
+ void (*failure)(struct pfn_address_space *pfn_space, unsigned long pfn);
+};
+
+/**
+ * struct pfn_address_space - device memory region registered for poison handling
+ * @node: interval tree node linking this region into the MM's tree of
+ *        registered regions; node.start and node.last are the inclusive
+ *        bounds of the region, expressed in PFN units.
+ * @ops: callbacks for this region; ops->failure is called when a poisoned
+ *       PFN falls inside the region.
+ * @mapping: address space through which the region is mapped; on poison its
+ *           mappings of the PFN are collected for signalling and unmapped.
+ */
+struct pfn_address_space {
+ struct interval_tree_node node;
+ const struct pfn_address_space_ops *ops;
+ struct address_space *mapping;
+};
+
+int register_pfn_address_space(struct pfn_address_space *pfn_space);
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space);
+
+#endif /* _LINUX_MEMORY_FAILURE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..e3ef52d3d45a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3530,6 +3530,7 @@ enum mf_action_page_type {
MF_MSG_BUDDY,
MF_MSG_DAX,
MF_MSG_UNSPLIT_THP,
+ MF_MSG_PFN,
MF_MSG_UNKNOWN,
};

diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index cbd3ddd7c33d..5c62a4d17172 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -373,6 +373,7 @@ TRACE_EVENT(aer_event,
EM ( MF_MSG_BUDDY, "free buddy page" ) \
EM ( MF_MSG_DAX, "dax page" ) \
EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \
+ EM ( MF_MSG_PFN, "non struct page pfn" ) \
EMe ( MF_MSG_UNKNOWN, "unknown page" )

/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fae9baf3be16..2c1a2ec42f7b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -38,6 +38,7 @@

#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/memory-failure.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
@@ -62,6 +63,7 @@
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include <linux/pfn_t.h>
#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"
@@ -122,6 +124,10 @@ const struct attribute_group memory_failure_attr_group = {
.attrs = memory_failure_attr,
};

+static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
+
+static DEFINE_MUTEX(pfn_space_lock);
+
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
@@ -399,15 +405,14 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*
- * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
- * filesystem with a memory failure handler has claimed the
- * memory_failure event. In all other cases, page->index and
- * page->mapping are sufficient for mapping the page back to its
+ * Note: @pgoff is used either when @p is a fsdax page and a filesystem
+ * with a memory failure handler has claimed the memory_failure event, or
+ * when the PFN is not backed by a struct page. In all other cases,
+ * page->index and page->mapping suffice for mapping the page back to its
* corresponding user virtual address.
*/
-static void add_to_kill(struct task_struct *tsk, struct page *p,
- pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
- struct list_head *to_kill)
+static void add_to_kill(struct task_struct *tsk, struct page *p, pgoff_t pgoff,
+ struct vm_area_struct *vma, struct list_head *to_kill)
{
struct to_kill *tk;

@@ -417,13 +422,20 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
return;
}

- tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p)) {
- if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
- tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+ /*
+ * A VM_PFNMAP VMA has no struct page behind it, so the user address
+ * must be derived from @pgoff. Test the VMA flag with '&': OR-ing in
+ * PFN_MAP (a pfn_t flag) is always true and would send every caller,
+ * including ordinary page-backed ones, down this branch.
+ */
+ if (vma->vm_flags & VM_PFNMAP) {
+ tk->addr =
+ vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ tk->size_shift = PAGE_SHIFT;
+ } else if (is_zone_device_page(p)) {
+ if (pgoff != FSDAX_INVALID_PGOFF)
+ tk->addr = vma_pgoff_address(pgoff, 1, vma);
+ else
+ tk->addr = page_address_in_vma(p, vma);
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
- } else
+ } else {
+ tk->addr = page_address_in_vma(p, vma);
tk->size_shift = page_shift(compound_head(p));
+ }

/*
* Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -617,13 +629,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
i_mmap_unlock_read(mapping);
}

-#ifdef CONFIG_FS_DAX
/*
* Collect processes when the error hit a fsdax page.
*/
-static void collect_procs_fsdax(struct page *page,
- struct address_space *mapping, pgoff_t pgoff,
- struct list_head *to_kill)
+static void collect_procs_pgoff(struct page *page,
+ struct address_space *mapping, pgoff_t pgoff,
+ struct list_head *to_kill)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -643,7 +654,6 @@ static void collect_procs_fsdax(struct page *page,
read_unlock(&tasklist_lock);
i_mmap_unlock_read(mapping);
}
-#endif /* CONFIG_FS_DAX */

/*
* Collect the processes who have the corrupted page mapped to kill.
@@ -835,6 +845,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
+ [MF_MSG_PFN] = "non struct page pfn",
[MF_MSG_UNKNOWN] = "unknown page",
};

@@ -1745,7 +1756,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,

SetPageHWPoison(page);

- collect_procs_fsdax(page, mapping, index, &to_kill);
+ collect_procs_pgoff(page, mapping, index, &to_kill);
unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
index, mf_flags);
unlock:
@@ -2052,6 +2063,99 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
return rc;
}

+/**
+ * register_pfn_address_space - Register PA region for poison notification.
+ * @pfn_space: structure containing region range and callback function on
+ *             poison detection.
+ *
+ * This function is called by a kernel module to register a PA region and
+ * a callback function with the kernel. On detection of poison, the
+ * kernel code will go through all registered regions and call the
+ * appropriate callback function associated with the range. The kernel
+ * module is responsible for tracking the poisoned pages.
+ *
+ * @pfn_space->node.start and @pfn_space->node.last are inclusive bounds in
+ * PFN units. The failure callback and the mapping are mandatory because
+ * memory_failure_pfn() uses both unconditionally for every matching region.
+ *
+ * Return: 0 if successfully registered,
+ *         -EINVAL if @pfn_space is incomplete or its range is inverted,
+ *         -EBUSY if the region is already registered
+ */
+int register_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	phys_addr_t start, size;
+
+	if (!pfn_space || !pfn_space->ops || !pfn_space->ops->failure ||
+	    !pfn_space->mapping ||
+	    pfn_space->node.last < pfn_space->node.start)
+		return -EINVAL;
+
+	/*
+	 * PFN_PHYS() widens to phys_addr_t before shifting, so high PFNs do
+	 * not wrap where unsigned long is narrower than a physical address.
+	 */
+	start = PFN_PHYS(pfn_space->node.start);
+	size = PFN_PHYS(pfn_space->node.last - pfn_space->node.start + 1);
+
+	if (!request_mem_region(start, size, ""))
+		return -EBUSY;
+
+	mutex_lock(&pfn_space_lock);
+	interval_tree_insert(&pfn_space->node, &pfn_space_itree);
+	mutex_unlock(&pfn_space_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_pfn_address_space);
+
+/**
+ * unregister_pfn_address_space - Unregister a PA region from poison
+ *                                notification.
+ * @pfn_space: structure containing region range to be unregistered.
+ *
+ * This function is called by a kernel module to unregister the PA region
+ * from the kernel's poison tracking. @pfn_space must describe the same
+ * range that was passed to register_pfn_address_space().
+ */
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	mutex_lock(&pfn_space_lock);
+	interval_tree_remove(&pfn_space->node, &pfn_space_itree);
+	mutex_unlock(&pfn_space_lock);
+
+	/* Mirror the start/size computation used at registration time. */
+	release_mem_region(PFN_PHYS(pfn_space->node.start),
+			   PFN_PHYS(pfn_space->node.last -
+				    pfn_space->node.start + 1));
+}
+EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
+
+/*
+ * Handle a memory failure on a PFN that is not backed by a struct page.
+ *
+ * Each registered pfn_address_space whose range contains @pfn is notified
+ * through its failure callback; the tasks mapping the PFN through that
+ * address space are collected and the mapping of the PFN is removed. The
+ * collected tasks are then handed to kill_procs().
+ *
+ * Returns 0 if at least one registered region contained @pfn, -EBUSY
+ * otherwise.
+ */
+static int memory_failure_pfn(unsigned long pfn, int flags)
+{
+	struct interval_tree_node *node;
+	int rc = -EBUSY;
+	LIST_HEAD(tokill);
+
+	mutex_lock(&pfn_space_lock);
+	/*
+	 * Modules register with MM the address space mapping of the device
+	 * memory they manage. Iterate to identify exactly which address
+	 * spaces cover this failing PFN.
+	 */
+	for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node;
+	     node = interval_tree_iter_next(node, pfn, pfn)) {
+		struct pfn_address_space *pfn_space =
+			container_of(node, struct pfn_address_space, node);
+
+		rc = 0;
+
+		/*
+		 * The module managing the device memory must be told about
+		 * the memory failure so that it can track the poisoned PFN.
+		 */
+		pfn_space->ops->failure(pfn_space, pfn);
+
+		collect_procs_pgoff(NULL, pfn_space->mapping, pfn, &tokill);
+
+		/*
+		 * Widen before shifting: the byte offset is a loff_t and may
+		 * not fit in unsigned long on 32-bit builds.
+		 */
+		unmap_mapping_range(pfn_space->mapping,
+				    (loff_t)pfn << PAGE_SHIFT, PAGE_SIZE, 0);
+	}
+	mutex_unlock(&pfn_space_lock);
+
+	/*
+	 * Unlike System-RAM there is no possibility to swap in a different
+	 * physical page at a given virtual address, so all userspace
+	 * consumption of direct PFN memory necessitates SIGBUS (i.e.
+	 * MF_MUST_KILL)
+	 */
+	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+	kill_procs(&tokill, true, false, pfn, flags);
+
+	pr_err("%#lx: recovery action for %s: %s\n",
+	       pfn, action_page_types[MF_MSG_PFN],
+	       action_name[rc ? MF_FAILED : MF_RECOVERED]);
+
+	return rc;
+}
+
static DEFINE_MUTEX(mf_mutex);

/**
@@ -2093,6 +2197,11 @@ int memory_failure(unsigned long pfn, int flags)
if (!(flags & MF_SW_SIMULATED))
hw_memory_failure = true;

+ if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) {
+ res = memory_failure_pfn(pfn, flags);
+ goto unlock_mutex;
+ }
+
p = pfn_to_online_page(pfn);
if (!p) {
res = arch_memory_failure(pfn, flags);
@@ -2106,6 +2215,9 @@ int memory_failure(unsigned long pfn, int flags)
pgmap);
goto unlock_mutex;
}
+
+ res = memory_failure_pfn(pfn, flags);
+ goto unlock_mutex;
}
pr_err("%#lx: memory outside kernel control\n", pfn);
res = -ENXIO;
--
2.17.1