[RFC PATCH v2 5/6] mm, fsdax: refactor dax handler in memory-failure

From: Shiyang Ruan
Date: Sun Nov 22 2020 - 19:42:10 EST


With the ->memory_failure() implemented in pmem device and
->storage_lost() in XFS, we are able to track files or metadata
and process them further.

We don't track files by page->mapping, page->index any more, so
some of functions who obtain ->mapping, ->index from struct page
parameter need to be changed by directly passing mapping and index.

Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxxxxx>
---
fs/dax.c | 18 +++----
include/linux/dax.h | 5 +-
include/linux/mm.h | 8 +++
mm/memory-failure.c | 127 +++++++++++++++++++++++++++-----------------
4 files changed, 94 insertions(+), 64 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 5b47834f2e1b..34471acde683 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -379,14 +379,14 @@ static struct page *dax_busy_page(void *entry)
}

/*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * dax_lock - Lock the DAX entry corresponding to a page
* @page: The page whose entry we want to lock
*
* Context: Process context.
* Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
* not be locked.
*/
-dax_entry_t dax_lock_page(struct page *page)
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index)
{
XA_STATE(xas, NULL, 0);
void *entry;
@@ -394,8 +394,6 @@ dax_entry_t dax_lock_page(struct page *page)
/* Ensure page->mapping isn't freed while we look at it */
rcu_read_lock();
for (;;) {
- struct address_space *mapping = READ_ONCE(page->mapping);
-
entry = NULL;
if (!mapping || !dax_mapping(mapping))
break;
@@ -413,11 +411,7 @@ dax_entry_t dax_lock_page(struct page *page)

xas.xa = &mapping->i_pages;
xas_lock_irq(&xas);
- if (mapping != page->mapping) {
- xas_unlock_irq(&xas);
- continue;
- }
- xas_set(&xas, page->index);
+ xas_set(&xas, index);
entry = xas_load(&xas);
if (dax_is_locked(entry)) {
rcu_read_unlock();
@@ -433,10 +427,10 @@ dax_entry_t dax_lock_page(struct page *page)
return (dax_entry_t)entry;
}

-void dax_unlock_page(struct page *page, dax_entry_t cookie)
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+ dax_entry_t cookie)
{
- struct address_space *mapping = page->mapping;
- XA_STATE(xas, &mapping->i_pages, page->index);
+ XA_STATE(xas, &mapping->i_pages, index);

if (S_ISCHR(mapping->host->i_mode))
return;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b52f084aa643..a8d697eb886c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -150,8 +150,9 @@ int dax_writeback_mapping_range(struct address_space *mapping,

struct page *dax_layout_busy_page(struct address_space *mapping);
struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock(struct address_space *mapping, pgoff_t index);
+void dax_unlock(struct address_space *mapping, pgoff_t index,
+ dax_entry_t cookie);
#else
static inline bool bdev_dax_supported(struct block_device *bdev,
int blocksize)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 872b51ebe57b..729448ed10b7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1141,6 +1141,14 @@ static inline bool is_device_private_page(const struct page *page)
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}

+static inline bool is_device_fsdax_page(const struct page *page)
+{
+ return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+ IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
+ is_zone_device_page(page) &&
+ page->pgmap->type == MEMORY_DEVICE_FS_DAX;
+}
+
static inline bool is_pci_p2pdma_page(const struct page *page)
{
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c0bb186bba62..f8f80458746e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -120,6 +120,9 @@ static int hwpoison_filter_dev(struct page *p)
if (PageSlab(p))
return -EINVAL;

+ if (is_device_fsdax_page(p))
+ return 0;
+
mapping = page_mapping(p);
if (mapping == NULL || mapping->host == NULL)
return -EINVAL;
@@ -290,9 +293,8 @@ void shake_page(struct page *p, int access)
EXPORT_SYMBOL_GPL(shake_page);

static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, unsigned long address)
{
- unsigned long address = vma_address(page, vma);
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -333,8 +335,8 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+ struct address_space *mapping, pgoff_t pgoff,
+ struct vm_area_struct *vma, struct list_head *to_kill)
{
struct to_kill *tk;

@@ -344,12 +346,18 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
return;
}

- tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
- else
- tk->size_shift = page_shift(compound_head(p));
-
+ if (is_device_fsdax_page(p)) {
+ tk->addr = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ tk->size_shift = dev_pagemap_mapping_shift(p, vma, tk->addr);
+ } else {
+ tk->addr = page_address_in_vma(p, vma);
+ if (is_zone_device_page(p)) {
+ tk->size_shift = dev_pagemap_mapping_shift(p, vma,
+ vma_address(p, vma));
+ } else
+ tk->size_shift = page_shift(compound_head(p));
+ }
/*
* Send SIGKILL if "tk->addr == -EFAULT". Also, as
* "tk->size_shift" is always non-zero for !is_zone_device_page(),
@@ -495,7 +503,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, NULL, 0, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -505,24 +513,20 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
/*
* Collect processes when the error hit a file mapped page.
*/
-static void collect_procs_file(struct page *page, struct list_head *to_kill,
- int force_early)
+static void collect_procs_file(struct page *page, struct address_space *mapping,
+ pgoff_t pgoff, struct list_head *to_kill, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff;

i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
pgoff = page_to_pgoff(page);
for_each_process(tsk) {
struct task_struct *t = task_early_kill(tsk, force_early);
-
if (!t)
continue;
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
- pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
/*
* Send early kill signal to tasks where a vma covers
* the page but the corrupted page is not necessarily
@@ -530,8 +534,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
- if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ if (vma->vm_mm == t->mm) {
+ add_to_kill(t, page, mapping, pgoff, vma,
+ to_kill);
+ }
}
}
read_unlock(&tasklist_lock);
@@ -550,7 +556,8 @@ static void collect_procs(struct page *page, struct list_head *tokill,
if (PageAnon(page))
collect_procs_anon(page, tokill, force_early);
else
- collect_procs_file(page, tokill, force_early);
+ collect_procs_file(page, page->mapping, page_to_pgoff(page),
+ tokill, force_early);
}

static const char *action_name[] = {
@@ -1221,14 +1228,14 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
return res;
}

-static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
- struct dev_pagemap *pgmap)
+static int memory_failure_dev_pagemap_kill_procs(unsigned long pfn, int flags,
+ struct address_space *mapping, pgoff_t index)
{
struct page *page = pfn_to_page(pfn);
const bool unmap_success = true;
unsigned long size = 0;
struct to_kill *tk;
- LIST_HEAD(tokill);
+ LIST_HEAD(to_kill);
int rc = -EBUSY;
loff_t start;
dax_entry_t cookie;
@@ -1240,28 +1247,9 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* also prevents changes to the mapping of this pfn until
* poison signaling is complete.
*/
- cookie = dax_lock_page(page);
+ cookie = dax_lock(mapping, index);
if (!cookie)
- goto out;
-
- if (hwpoison_filter(page)) {
- rc = 0;
- goto unlock;
- }
-
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- /*
- * TODO: Handle HMM pages which may need coordination
- * with device-side memory.
- */
goto unlock;
- }
-
- /*
- * Use this flag as an indication that the dax page has been
- * remapped UC to prevent speculative consumption of poison.
- */
- SetPageHWPoison(page);

/*
* Unlike System-RAM there is no possibility to swap in a
@@ -1270,9 +1258,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* SIGBUS (i.e. MF_MUST_KILL)
*/
flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs_file(page, mapping, index, &to_kill,
+ flags & MF_ACTION_REQUIRED);

- list_for_each_entry(tk, &tokill, nd)
+ list_for_each_entry(tk, &to_kill, nd)
if (tk->size_shift)
size = max(size, 1UL << tk->size_shift);
if (size) {
@@ -1282,13 +1271,51 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* actual size of the mapping being torn down is
* communicated in siginfo, see kill_proc()
*/
- start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, start + size, 0);
+ start = (index << PAGE_SHIFT) & ~(size - 1);
+ unmap_mapping_range(mapping, start, start + size, 0);
}
- kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
+
+ kill_procs(&to_kill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
unlock:
- dax_unlock_page(page, cookie);
+ dax_unlock(mapping, index, cookie);
+ return rc;
+}
+
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+ struct dev_pagemap *pgmap)
+{
+ struct page *page = pfn_to_page(pfn);
+ struct mf_recover_controller mfrc = {
+ .recover_fn = memory_failure_dev_pagemap_kill_procs,
+ .pfn = pfn,
+ .flags = flags,
+ };
+ int rc;
+
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ /*
+ * TODO: Handle HMM pages which may need coordination
+ * with device-side memory.
+ */
+ goto out;
+ }
+
+ if (hwpoison_filter(page)) {
+ rc = 0;
+ goto out;
+ }
+
+ /*
+ * Use this flag as an indication that the dax page has been
+ * remapped UC to prevent speculative consumption of poison.
+ */
+ SetPageHWPoison(page);
+
+ /* call driver to handle the memory failure */
+ if (pgmap->ops->memory_failure)
+ rc = pgmap->ops->memory_failure(pgmap, &mfrc);
+
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
--
2.29.2