Re: [PATCH v2 1/1] mm/ksm: recover from memory failure on KSM page by migrating to healthy duplicate

From: David Hildenbrand

Date: Tue Oct 28 2025 - 05:44:43 EST

On 16.10.25 12:18, Longlong Xia wrote:

From: Longlong Xia <xialonglong@xxxxxxxxxx>

When a hardware memory error occurs on a KSM page, the current
behavior is to kill all processes mapping that page. This can
be overly aggressive when KSM has multiple duplicate pages in
a chain where other duplicates are still healthy.

This patch introduces a recovery mechanism that attempts to
migrate mappings from the failing KSM page to a newly
allocated KSM page or another healthy duplicate already
present in the same chain, before falling back to the
process-killing procedure.

The recovery process works as follows:
1. Identify if the failing KSM page belongs to a stable node chain.
2. Locate a healthy duplicate KSM page within the same chain.
3. For each process mapping the failing page:
a. Attempt to allocate a new KSM page copy from healthy duplicate
KSM page. If successful, migrate the mapping to this new KSM page.
b. If allocation fails, migrate the mapping to the existing healthy
duplicate KSM page.
4. If all migrations succeed, remove the failing KSM page from the chain.
5. Only if recovery fails (e.g., no healthy duplicate found or migration
error) does the kernel fall back to killing the affected processes.

Signed-off-by: Longlong Xia <xialonglong@xxxxxxxxxx>
---
mm/ksm.c | 246 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 246 insertions(+)

diff --git a/mm/ksm.c b/mm/ksm.c
index 160787bb121c..9099bad1ab35 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3084,6 +3084,246 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
}
#ifdef CONFIG_MEMORY_FAILURE
+static struct ksm_stable_node *find_chain_head(struct ksm_stable_node *dup_node)
+{
+ struct ksm_stable_node *stable_node, *dup;
+ struct rb_node *node;
+ int nid;
+
+ if (!is_stable_node_dup(dup_node))
+ return NULL;
+
+ for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+ node = rb_first(root_stable_tree + nid);
+ for (; node; node = rb_next(node)) {
+ stable_node = rb_entry(node,
+ struct ksm_stable_node,
+ node);

Put that into a single line for readability, please.

You can also consider factoring out this inner loop in a helper function.

+
+ if (!is_stable_node_chain(stable_node))
+ continue;
+
+ hlist_for_each_entry(dup, &stable_node->hlist,
+ hlist_dup) {

Single line, or properly indent.

+ if (dup == dup_node)
+ return stable_node;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static struct folio *find_healthy_folio(struct ksm_stable_node *chain_head,
+ struct ksm_stable_node *failing_node,
+ struct ksm_stable_node **healthy_dupdup)
+{
+ struct ksm_stable_node *dup;
+ struct hlist_node *hlist_safe;
+ struct folio *healthy_folio;
+
+ if (!is_stable_node_chain(chain_head) || !is_stable_node_dup(failing_node))
+ return NULL;
+
+ hlist_for_each_entry_safe(dup, hlist_safe, &chain_head->hlist, hlist_dup) {
+ if (dup == failing_node)
+ continue;
+
+ healthy_folio = ksm_get_folio(dup, KSM_GET_FOLIO_TRYLOCK);
+ if (healthy_folio) {
+ *healthy_dupdup = dup;
+ return healthy_folio;
+ }
+ }
+
+ return NULL;
+}
+
+static struct page *create_new_stable_node_dup(struct ksm_stable_node *chain_head,
+ struct folio *healthy_folio,
+ struct ksm_stable_node **new_stable_node)
+{
+ int nid;
+ unsigned long kpfn;
+ struct page *new_page = NULL;
+
+ if (!is_stable_node_chain(chain_head))
+ return NULL;
+
+ new_page = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
+ if (!new_page)
+ return NULL;
+
+ copy_highpage(new_page, folio_page(healthy_folio, 0));
+
+ *new_stable_node = alloc_stable_node();
+ if (!*new_stable_node) {
+ __free_page(new_page);
+ return NULL;
+ }
+
+ INIT_HLIST_HEAD(&(*new_stable_node)->hlist);
+ kpfn = page_to_pfn(new_page);
+ (*new_stable_node)->kpfn = kpfn;
+ nid = get_kpfn_nid(kpfn);
+ DO_NUMA((*new_stable_node)->nid = nid);
+ (*new_stable_node)->rmap_hlist_len = 0;
+
+ (*new_stable_node)->head = STABLE_NODE_DUP_HEAD;
+ hlist_add_head(&(*new_stable_node)->hlist_dup, &chain_head->hlist);
+ ksm_stable_node_dups++;
+ folio_set_stable_node(page_folio(new_page), *new_stable_node);
+ folio_add_lru(page_folio(new_page));

There seems to be a lot of copy-paste. For example, why no reuse stable_node_chain_add_dup()?

Or why not try to reuse stable_tree_insert() in the first place?

Try to reuse or factor out instead of copy-pasting, please.

+
+ return new_page;
+}
+
+static int replace_failing_page(struct vm_area_struct *vma, struct page *page,
+ struct page *kpage, unsigned long addr)
+{
+ struct folio *kfolio = page_folio(kpage);
+ struct mm_struct *mm = vma->vm_mm;
+ struct folio *folio = page_folio(page);
+ pmd_t *pmd;
+ pte_t *ptep;
+ pte_t newpte;
+ spinlock_t *ptl;
+ int err = -EFAULT;
+ struct mmu_notifier_range range;
+
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
+ goto out;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
+ addr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!ptep)
+ goto out_mn;
+
+ folio_get(kfolio);
+ folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE);
+ newpte = mk_pte(kpage, vma->vm_page_prot);
+
+ flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at(mm, addr, ptep, newpte);
+
+ folio_remove_rmap_pte(folio, page, vma);
+ if (!folio_mapped(folio))
+ folio_free_swap(folio);
+ folio_put(folio);
+
+ pte_unmap_unlock(ptep, ptl);
+ err = 0;
+out_mn:
+ mmu_notifier_invalidate_range_end(&range);
+out:
+ return err;
+}

This is a lot of copy-paste from replace_page(). Isn't there a way to avoid this duplication by unifying both functions in some way?

+
+static void migrate_to_target_dup(struct ksm_stable_node *failing_node,
+ struct folio *failing_folio,
+ struct folio *target_folio,
+ struct ksm_stable_node *target_dup)
+{
+ struct ksm_rmap_item *rmap_item;
+ struct hlist_node *hlist_safe;
+ int err;
+
+ hlist_for_each_entry_safe(rmap_item, hlist_safe, &failing_node->hlist, hlist) {
+ struct mm_struct *mm = rmap_item->mm;
+ unsigned long addr = rmap_item->address & PAGE_MASK;

Can be const.

+ struct vm_area_struct *vma;
+
+ if (!mmap_read_trylock(mm))
+ continue;
+
+ if (ksm_test_exit(mm)) {
+ mmap_read_unlock(mm);
+ continue;
+ }
+
+ vma = vma_lookup(mm, addr);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ continue;
+ }
+
+ if (!folio_trylock(target_folio)) {

Can't we leave the target folio locked the whole time? The caller already locked it, why not keep it locked until we're done?

+ mmap_read_unlock(mm);
+ continue;
+ }
+
+ err = replace_failing_page(vma, &failing_folio->page,
+ folio_page(target_folio, 0), addr);
+ if (!err) {
+ hlist_del(&rmap_item->hlist);
+ rmap_item->head = target_dup;
+ hlist_add_head(&rmap_item->hlist, &target_dup->hlist);
+ target_dup->rmap_hlist_len++;
+ failing_node->rmap_hlist_len--;
+ }
+
+ folio_unlock(target_folio);
+ mmap_read_unlock(mm);
+ }
+
+}
+
+static bool ksm_recover_within_chain(struct ksm_stable_node *failing_node)
+{
+ struct folio *failing_folio = NULL;
+ struct ksm_stable_node *healthy_dupdup = NULL;
+ struct folio *healthy_folio = NULL;
+ struct ksm_stable_node *chain_head = NULL;
+ struct page *new_page = NULL;
+ struct ksm_stable_node *new_stable_node = NULL;

Only initialize what needs initialization (nothing in here?) and combine where possible.

Like

struct folio *failing_folio, *healthy_folio;

+
+ if (!is_stable_node_dup(failing_node))
+ return false;
+
+ guard(mutex)(&ksm_thread_mutex);
+ failing_folio = ksm_get_folio(failing_node, KSM_GET_FOLIO_NOLOCK);
+ if (!failing_folio)
+ return false;
+
+ chain_head = find_chain_head(failing_node);
+ if (!chain_head)
+ return NULL;
+
+ healthy_folio = find_healthy_folio(chain_head, failing_node, &healthy_dupdup);
+ if (!healthy_folio) {
+ folio_put(failing_folio);
+ return false;
+ }
+
+ new_page = create_new_stable_node_dup(chain_head, healthy_folio, &new_stable_node);

Why are you returning a page here and not a folio?

--
Cheers

David / dhildenb