Re: [PATCH v3 1/3] mm: memfd/hugetlb: introduce memfd-based userspace MFR policy
From: William Roche
Date: Wed Feb 04 2026 - 12:32:06 EST
On 2/3/26 20:23, Jiaqi Yan wrote:
[...]
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3b4c152c5c73a..8b0f5aa49711f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -551,6 +551,18 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
}
folio_unlock(folio);
+
+ /*
+ * There may be pending HWPoison-ed folios when a memfd is being
+ * removed or part of it is being truncated.
+ *
+ * HugeTLBFS' error_remove_folio keeps the HWPoison-ed folios in
+ * page cache until mm wants to drop the folio at the end of the
+ * of the filemap. At this point, if memory failure was delayed
"of the" is repeated across these two lines.
+ * by MFD_MF_KEEP_UE_MAPPED in the past, we can now deal with it.
+ */
+ filemap_offline_hwpoison_folio(mapping, folio);
+
return ret;
}
@@ -582,13 +594,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
const pgoff_t end = lend >> PAGE_SHIFT;
struct folio_batch fbatch;
pgoff_t next, index;
- int i, freed = 0;
+ int i, j, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
folio_batch_init(&fbatch);
next = lstart >> PAGE_SHIFT;
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+ for (i = 0, j = 0; i < folio_batch_count(&fbatch); ++i) {
struct folio *folio = fbatch.folios[i];
u32 hash = 0;
@@ -603,8 +615,17 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
index, truncate_op))
freed++;
+ /*
+ * Skip HWPoison-ed hugepages, which should no
+ * longer be hugetlb if successfully dissolved.
+ */
+ if (folio_test_hugetlb(folio))
+ fbatch.folios[j++] = folio;
+
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
+ fbatch.nr = j;
+
folio_batch_release(&fbatch);
cond_resched();
}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index e51b8ef0cebd9..7fadf1772335d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -879,10 +879,17 @@ int dissolve_free_hugetlb_folios(unsigned long start_pfn,
#ifdef CONFIG_MEMORY_FAILURE
extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
+extern bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
+ struct address_space *mapping);
#else
static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
}
+static inline bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio
A comma is missing after "struct folio *folio".
+ struct address_space *mapping)
+{
+ return false;
+}
#endif
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ec442af3f8861..53772c29451eb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -211,6 +211,7 @@ enum mapping_flags {
AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't
account usage to user cgroups */
AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */
+ AS_MF_KEEP_UE_MAPPED = 12, /* For MFD_MF_KEEP_UE_MAPPED. */
/* Bits 16-25 are used for FOLIO_ORDER */
AS_FOLIO_ORDER_BITS = 5,
AS_FOLIO_ORDER_MIN = 16,
@@ -356,6 +357,16 @@ static inline bool mapping_no_data_integrity(const struct address_space *mapping
return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags);
}
+static inline bool mapping_mf_keep_ue_mapped(const struct address_space *mapping)
+{
+ return test_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
+}
+
+static inline void mapping_set_mf_keep_ue_mapped(struct address_space *mapping)
+{
+ set_bit(AS_MF_KEEP_UE_MAPPED, &mapping->flags);
+}
+
static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
{
return mapping->gfp_mask;
@@ -1303,6 +1314,18 @@ void replace_page_cache_folio(struct folio *old, struct folio *new);
void delete_from_page_cache_batch(struct address_space *mapping,
struct folio_batch *fbatch);
bool filemap_release_folio(struct folio *folio, gfp_t gfp);
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Provided by memory failure to offline HWPoison-ed folio managed by memfd.
+ */
+void filemap_offline_hwpoison_folio(struct address_space *mapping,
+ struct folio *folio);
+#else
+static inline void filemap_offline_hwpoison_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+}
+#endif
loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
int whence);
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 273a4e15dfcff..d9875da551b7f 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -12,6 +12,12 @@
#define MFD_NOEXEC_SEAL 0x0008U
/* executable */
#define MFD_EXEC 0x0010U
+/*
+ * Keep owned folios mapped when uncorrectable memory errors (UE) causes
+ * memory failure (MF) within the folio. Only at the end of the mapping
+ * will its HWPoison-ed folios be dealt with.
+ */
+#define MFD_MF_KEEP_UE_MAPPED 0x0020U
/*
* Huge page size encoding when MFD_HUGETLB is specified, and a huge page
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a1832da0f6236..2a161c281da2a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5836,9 +5836,11 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(folio_test_hwpoison(folio))) {
- ret = VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- goto backout_unlocked;
+ if (!mapping_mf_keep_ue_mapped(mapping)) {
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
+ goto backout_unlocked;
+ }
}
/* Check for page in userfault range. */
diff --git a/mm/memfd.c b/mm/memfd.c
index ab5312aff14b9..f9fdf014b67ba 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -340,7 +340,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | \
+ MFD_NOEXEC_SEAL | MFD_EXEC | MFD_MF_KEEP_UE_MAPPED)
static int check_sysctl_memfd_noexec(unsigned int *flags)
{
@@ -414,6 +415,8 @@ static int sanitize_flags(unsigned int *flags_ptr)
if (!(flags & MFD_HUGETLB)) {
if (flags & ~MFD_ALL_FLAGS)
return -EINVAL;
+ if (flags & MFD_MF_KEEP_UE_MAPPED)
+ return -EINVAL;
} else {
/* Allow huge page size encoding in flags. */
if (flags & ~(MFD_ALL_FLAGS |
@@ -486,6 +489,16 @@ static struct file *alloc_file(const char *name, unsigned int flags)
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
+ /*
+ * MFD_MF_KEEP_UE_MAPPED can only be specified in memfd_create;
+ * no API to update it once memfd is created. MFD_MF_KEEP_UE_MAPPED
+ * is not seal-able.
+ *
+ * For now MFD_MF_KEEP_UE_MAPPED is only supported by HugeTLBFS.
+ */
+ if (flags & MFD_MF_KEEP_UE_MAPPED)
+ mapping_set_mf_keep_ue_mapped(file->f_mapping);
+
if (flags & MFD_NOEXEC_SEAL) {
inode->i_mode &= ~0111;
file_seals = memfd_file_seals_ptr(file);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 58b34f5d2c05d..b9cecbbe08dae 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -410,6 +410,8 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
unsigned long addr)
{
struct to_kill *tk;
+ const struct folio *folio;
+ struct address_space *mapping;
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
@@ -420,8 +422,19 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p,
tk->addr = addr;
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
- else
- tk->size_shift = folio_shift(page_folio(p));
+ else {
+ folio = page_folio(p);
+ mapping = folio_mapping(folio);
+ if (mapping && mapping_mf_keep_ue_mapped(mapping))
+ /*
+ * Let userspace know the radius of HWPoison is
+ * the size of raw page; accessing other pages
+ * inside the folio is still ok.
+ */
+ tk->size_shift = PAGE_SHIFT;
+ else
+ tk->size_shift = folio_shift(folio);
+ }
/*
* Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -844,6 +857,8 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
int flags)
{
int ret;
+ struct folio *folio;
+ struct address_space *mapping;
struct hwpoison_walk priv = {
.pfn = pfn,
};
@@ -861,8 +876,14 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
* ret = 0 when poison page is a clean page and it's dropped, no
* SIGBUS is needed.
*/
- if (ret == 1 && priv.tk.addr)
+ if (ret == 1 && priv.tk.addr) {
+ folio = pfn_folio(pfn);
+ mapping = folio_mapping(folio);
+ if (mapping && mapping_mf_keep_ue_mapped(mapping))
+ priv.tk.size_shift = PAGE_SHIFT;
+
kill_proc(&priv.tk, pfn, flags);
+ }
mmap_read_unlock(p->mm);
return ret > 0 ? -EHWPOISON : 0;
@@ -1206,6 +1227,13 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
+ /*
+ * MF still needs to holds a refcount for the deferred actions in
"to hold" (without the trailing "s")
+ * filemap_offline_hwpoison_folio.
+ */
+ if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+ return res;
+
if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
@@ -1602,6 +1630,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
{
LIST_HEAD(tokill);
bool unmap_success;
+ bool keep_mapped;
int forcekill;
bool mlocked = folio_test_mlocked(folio);
@@ -1629,8 +1658,12 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
- if (!unmap_success)
+ keep_mapped = hugetlb_should_keep_hwpoison_mapped(folio, folio->mapping);
We should use folio_mapping(folio) instead of folio->mapping.
But more importantly, this function can be called on non-hugetlb folios, in which case hugetlb_should_keep_hwpoison_mapped() triggers its WARN_ON_ONCE. So shouldn't the caller first make sure that we are dealing with a hugetlb folio?
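Something like this, maybe (untested):

	keep_mapped = folio_test_hugetlb(folio) &&
		      hugetlb_should_keep_hwpoison_mapped(folio,
							  folio_mapping(folio));

so that a regular folio can no longer hit the WARN_ON_ONCE.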
+ if (!keep_mapped)
+ unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
+
+ unmap_success = !folio_mapped(folio);
+ if (!keep_mapped && !unmap_success)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));
@@ -1655,7 +1688,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
!unmap_success;
kill_procs(&tokill, forcekill, pfn, flags);
- return unmap_success;
+ return unmap_success || keep_mapped;
}
static int identify_page_state(unsigned long pfn, struct page *p,
@@ -1896,6 +1929,13 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
unsigned long count = 0;
head = llist_del_all(raw_hwp_list_head(folio));
+ /*
+ * If filemap_offline_hwpoison_folio_hugetlb is handling this folio,
+ * it has already taken off the head of the llist.
+ */
+ if (head == NULL)
+ return 0;
+
llist_for_each_entry_safe(p, next, head, node) {
if (move_flag)
SetPageHWPoison(p->page);
@@ -1912,7 +1952,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
struct llist_head *head;
struct raw_hwp_page *raw_hwp;
struct raw_hwp_page *p;
- int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
+ struct address_space *mapping = folio->mapping;
Same here - we should use folio_mapping(folio) instead of folio->mapping.
+ bool has_hwpoison = folio_test_set_hwpoison(folio);
/*
* Once the hwpoison hugepage has lost reliable raw error info,
@@ -1931,8 +1972,15 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
if (raw_hwp) {
raw_hwp->page = page;
llist_add(&raw_hwp->node, head);
+ if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+ /*
+ * A new raw HWPoison page. Don't return HWPOISON.
+ * Error event will be counted in action_result().
+ */
+ return 0;
+
/* the first error event will be counted in action_result(). */
- if (ret)
+ if (has_hwpoison)
num_poisoned_pages_inc(page_to_pfn(page));
} else {
/*
@@ -1947,7 +1995,8 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
*/
__folio_free_raw_hwp(folio, false);
}
- return ret;
+
+ return has_hwpoison ? -EHWPOISON : 0;
}
static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag)
@@ -1980,6 +2029,18 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
folio_free_raw_hwp(folio, true);
}
+bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
+ struct address_space *mapping)
+{
+ if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
+ return false;
+
+ if (!mapping)
+ return false;
+
+ return mapping_mf_keep_ue_mapped(mapping);
+}
The definition of this function above should be wrapped in #ifdef CONFIG_MEMORY_FAILURE / #endif, to match its declaration in hugetlb.h.
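I.e., something like (same body as above, just wrapped):

#ifdef CONFIG_MEMORY_FAILURE
bool hugetlb_should_keep_hwpoison_mapped(struct folio *folio,
					 struct address_space *mapping)
{
	if (WARN_ON_ONCE(!folio_test_hugetlb(folio)))
		return false;

	if (!mapping)
		return false;

	return mapping_mf_keep_ue_mapped(mapping);
}
#endif /* CONFIG_MEMORY_FAILURE */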
+
/*
* Called from hugetlb code with hugetlb_lock held.
*
@@ -2037,6 +2098,51 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return ret;
}
+static void filemap_offline_hwpoison_folio_hugetlb(struct folio *folio)
+{
+ int ret;
+ struct llist_node *head;
+ struct raw_hwp_page *curr, *next;
+
+ /*
+ * Since folio is still in the folio_batch, drop the refcount
+ * elevated by filemap_get_folios.
+ */
+ folio_put_refs(folio, 1);
+ head = llist_del_all(raw_hwp_list_head(folio));
+
+ /*
+ * Release refcounts held by try_memory_failure_hugetlb, one per
+ * HWPoison-ed page in the raw hwp list.
+ *
+ * Set HWPoison flag on each page so that free_has_hwpoisoned()
+ * can exclude them during dissolve_free_hugetlb_folio().
+ */
+ llist_for_each_entry_safe(curr, next, head, node) {
+ folio_put(folio);
+ SetPageHWPoison(curr->page);
+ kfree(curr);
+ }
+
+ /* Refcount now should be zero and ready to dissolve folio. */
+ ret = dissolve_free_hugetlb_folio(folio);
+ if (ret)
+ pr_err("failed to dissolve hugetlb folio: %d\n", ret);
+}
+
+void filemap_offline_hwpoison_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ WARN_ON_ONCE(!mapping);
+
+ if (!folio_test_hwpoison(folio))
+ return;
+
+ /* Pending MFR currently only exist for hugetlb. */
+ if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+ filemap_offline_hwpoison_folio_hugetlb(folio);
Shouldn't we also check here that we are dealing with a hugetlb folio before calling hugetlb_should_keep_hwpoison_mapped(folio, mapping)?
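Something like (untested):

	/* Pending MFR currently only exists for hugetlb. */
	if (folio_test_hugetlb(folio) &&
	    hugetlb_should_keep_hwpoison_mapped(folio, mapping))
		filemap_offline_hwpoison_folio_hugetlb(folio);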
+}
+
/*
* Taking refcount of hugetlb pages needs extra care about race conditions
* with basic operations like hugepage allocation/free/demotion.
Don't we also need to take repeated errors into account in try_memory_failure_hugetlb()?
Something like this:
@@ -2036,9 +2099,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
{
int res, rv;
struct page *p = pfn_to_page(pfn);
- struct folio *folio;
+ struct folio *folio = page_folio(p);
unsigned long page_flags;
bool migratable_cleared = false;
+ struct address_space *mapping = folio_mapping(folio);
*hugetlb = 1;
retry:
@@ -2060,15 +2124,17 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
rv = kill_accessing_process(current, pfn, flags);
if (res == MF_HUGETLB_PAGE_PRE_POISONED)
action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
- else
+ else {
+ if (hugetlb_should_keep_hwpoison_mapped(folio, mapping))
+ return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_DELAYED);
action_result(pfn, MF_MSG_HUGE, MF_FAILED);
+ }
return rv;
default:
WARN_ON((res != MF_HUGETLB_FREED) && (res != MF_HUGETLB_IN_USED));
break;
}
- folio = page_folio(p);
folio_lock(folio);
if (hwpoison_filter(p)) {
So that we don't call action_result(pfn, MF_MSG_HUGE, MF_FAILED) for a repeated error?
--
2.47.3