Re: [PATCH 2/3] ksm: perform a range-walk in break_ksm
From: David Hildenbrand
Date: Wed Oct 29 2025 - 10:45:21 EST
On 28.10.25 14:19, Pedro Demarchi Gomes wrote:
Make break_ksm() receive an address range and change
break_ksm_pmd_entry() to perform a range-walk and return the address of
the first ksm page found.
This change allows break_ksm() to skip unmapped regions instead of
iterating every page address. When unmerging large sparse VMAs, this
significantly reduces runtime, as confirmed by benchmark test (see
cover letter).
Instead of referencing the cover letter, please include the benchmark data directly in the patch description here; the cover letter does not end up in the git history.
Suggested-by: David Hildenbrand <david@xxxxxxxxxx>
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@xxxxxxxxx>
---
mm/ksm.c | 88 +++++++++++++++++++++++++++++++-------------------------
1 file changed, 49 insertions(+), 39 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c
index 2a9a7fd4c777..1d1ef0554c7c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -607,34 +607,54 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
-static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+struct break_ksm_arg {
+ unsigned long addr;
+};
You can avoid that struct entirely by simply passing a pointer to the address (unsigned long *) as walk->private.
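Something like this (untested) on the caller side, passing the address of a local unsigned long:

	unsigned long found_addr;
	...
	ksm_page = walk_page_range_vma(vma, addr, end, ops, &found_addr);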
+
+static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct page *page = NULL;
+ struct page *page;
spinlock_t *ptl;
- pte_t *pte;
- pte_t ptent;
- int ret;
+ pte_t *start_ptep = NULL, *ptep, pte;
Is there a need to initialize start_ptep?
+ int ret = 0;
+ struct mm_struct *mm = walk->mm;
+ struct break_ksm_arg *private = (struct break_ksm_arg *) walk->private;
Prefer reverse xmas tree:
struct break_ksm_arg *private = (struct break_ksm_arg *) walk->private;
struct mm_struct *mm = walk->mm;
...
With the break_ksm_arg simplification you'd have:
unsigned long *found_addr = (unsigned long *)walk->private;
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte)
+ if (ksm_test_exit(walk->mm))
return 0;
- ptent = ptep_get(pte);
- if (pte_present(ptent)) {
- page = vm_normal_page(walk->vma, addr, ptent);
- } else if (!pte_none(ptent)) {
- swp_entry_t entry = pte_to_swp_entry(ptent);
- /*
- * As KSM pages remain KSM pages until freed, no need to wait
- * here for migration to end.
- */
- if (is_migration_entry(entry))
- page = pfn_swap_entry_to_page(entry);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
I assume that's not a problem for the other break_ksm() callsites, which did not perform this check before.
+
+ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!start_ptep)
+ return 0;
+
+ for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
Best to declare pte and folio (see last mail) here in the loop:
pte_t pte = ptep_get(ptep);
struct folio *folio = NULL;
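With folio initialized to NULL like that, the lookups could then be something like (untested), so the rest of the loop body only deals with the folio:

	if (pte_present(pte)) {
		struct page *page = vm_normal_page(walk->vma, addr, pte);

		if (page)
			folio = page_folio(page);
	} else if (!pte_none(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		/* KSM pages remain KSM pages until freed, no need to wait for migration. */
		if (is_migration_entry(entry))
			folio = page_folio(pfn_swap_entry_to_page(entry));
	}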
+ pte = ptep_get(ptep);
+ page = NULL;
+ if (pte_present(pte)) {
+ page = vm_normal_page(walk->vma, addr, pte);
+ } else if (!pte_none(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ page = pfn_swap_entry_to_page(entry);
+ }
+ /* return 1 if the page is an normal ksm page or KSM-placed zero page */
+ ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(pte);
+ if (ret) {
+ private->addr = addr;
+ goto out_unlock;
+ }
I suggest you call "ret" "found" instead.
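E.g. (untested), combined with the folio handling and the found_addr simplification sketched above, and with "int found = 0;" declared at the top:

	found = (folio && folio_test_ksm(folio)) || is_ksm_zero_pte(pte);
	if (found) {
		*found_addr = addr;
		goto out_unlock;
	}

and then "return found;" at the end of the function.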
}
- /* return 1 if the page is an normal ksm page or KSM-placed zero page */
- ret = (page && folio_test_ksm(page_folio(page))) || is_ksm_zero_pte(ptent);
- pte_unmap_unlock(pte, ptl);
+out_unlock:
+ pte_unmap_unlock(ptep, ptl);
return ret;
}
@@ -661,9 +681,11 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, bool lock_vma)
{
vm_fault_t ret = 0;
+ struct break_ksm_arg break_ksm_arg;
const struct mm_walk_ops *ops = lock_vma ?
&break_ksm_lock_vma_ops : &break_ksm_ops;
@@ -671,11 +693,10 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
int ksm_page;
cond_resched();
- ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
- if (WARN_ON_ONCE(ksm_page < 0))
+ ksm_page = walk_page_range_vma(vma, addr, end, ops, &break_ksm_arg);
+ if (ksm_page <= 0)
return ksm_page;
- if (!ksm_page)
- return 0;
+ addr = break_ksm_arg.addr;
ret = handle_mm_fault(vma, addr,
FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
NULL);
@@ -761,7 +782,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
- break_ksm(vma, addr, false);
+ break_ksm(vma, addr, addr + 1, false);
Better to use addr + PAGE_SIZE
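I.e.:

	break_ksm(vma, addr, addr + PAGE_SIZE, false);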
--
Cheers
David / dhildenb