Re: [PATCH v8 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region()

From: Bert Karwatzki
Date: Fri Oct 04 2024 - 04:51:45 EST


As a sidenode I've tried to figure out the minimal patch it takes to trigger the
crash. The first commit that triggers the bug is f8d112a4e657 ("mm/mmap: avoid
zeroing vma tree in mmap_region()") so I split this commit into two parts, the
first part gives a working kernel while the second part gives the bug.

First part (to be applied on top of commit 94f59ea591f
("mm: clean up unmap_region() argument list") in linux-next):
From efdd914b012b35d34d1cae469f199023e82f7a15 Mon Sep 17 00:00:00 2001
From: Bert Karwatzki <spasswolf@xxxxxx>
Date: Thu, 3 Oct 2024 23:59:30 +0200
Subject: [PATCH 1/2] mm: mmap: Prepare to avoid zeroing vma tree in
mmap_region()

In an attempt to create a minimal patch to trigger the mapletree
bug apply parts of patch [v8 14/21] which still give a working kernel.

Signed-off-by: Bert Karwatzki <spasswolf@xxxxxx>
---
mm/mmap.c | 39 ++++++++++++++++++---------------------
mm/vma.c | 54 ++++++++++++++++++++++++++++++++++++++++++------------
mm/vma.h | 22 ++++++++++++++++------
3 files changed, 76 insertions(+), 39 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 304dc085533a..da9bfb899eec 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1373,23 +1373,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long merge_start = addr, merge_end = end;
bool writable_file_mapping = false;
pgoff_t vm_pgoff;
- int error;
+ int error = -ENOMEM;
VMA_ITERATOR(vmi, mm, addr);
+ unsigned long nr_pages, nr_accounted;

- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
+ nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);

- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, end);
-
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
- }
+ /*
+ * Check against address space limit.
+ * MAP_FIXED may remove pages of mappings that intersects with requested
+ * mapping. Account for the pages it would unmap.
+ */
+ if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;

/* Find the first overlapping VMA */
vma = vma_find(&vmi, end);
@@ -1410,6 +1406,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,

/* Unmap any existing mapping in the area */
vms_complete_munmap_vmas(&vms, &mas_detach);
+ vms.vma_count = 0;
next = vms.next;
prev = vms.prev;
vma = NULL;
@@ -1425,8 +1422,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
+ charged -= nr_accounted;
if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
+ goto clear_tree_failed;
+ vms.nr_accounted = 0;
vm_flags |= VM_ACCOUNT;
}

@@ -1475,10 +1474,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* not unmapped, but the maps are removed from the list.
*/
vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
+ if (!vma)
goto unacct_error;
- }

vma_iter_config(&vmi, addr, end);
vma_set_range(vma, addr, end, pgoff);
@@ -1605,7 +1602,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return addr;

close_and_free_vma:
- if (file && vma->vm_ops && vma->vm_ops->close)
+ if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);

if (file || vma->vm_file) {
@@ -1627,7 +1624,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,

clear_tree_failed:
if (vms.vma_count)
- abort_munmap_vmas(&mas_detach);
+ abort_munmap_vmas(&mas_detach, false);
gather_failed:
validate_mm(mm);
return error;
@@ -1960,7 +1957,7 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma, true);
+ remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
count++;
cond_resched();
vma = vma_next(&vmi);
diff --git a/mm/vma.c b/mm/vma.c
index 83c5c46c67b9..648c58da8ad4 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -136,10 +136,10 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
/*
* Close a vm structure and free it.
*/
-void remove_vma(struct vm_area_struct *vma, bool unreachable)
+void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
{
might_sleep();
- if (vma->vm_ops && vma->vm_ops->close)
+ if (!closed && vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
@@ -521,7 +521,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_iter_store(vmi, vma);

vma_complete(&vp, vmi, vma->vm_mm);
- validate_mm(vma->vm_mm);
return 0;

nomem:
@@ -645,11 +644,14 @@ void vma_complete(struct vma_prepare *vp,
uprobe_mmap(vp->insert);
}

-static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
- struct ma_state *mas_detach, bool mm_wr_locked)
+static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach, bool mm_wr_locked)
{
struct mmu_gather tlb;

+ if (!vms->clear_ptes) /* Nothing to do */
+ return;
+
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
@@ -658,11 +660,31 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
lru_add_drain();
tlb_gather_mmu(&tlb, vms->mm);
update_hiwater_rss(vms->mm);
- unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, vms->vma_count, mm_wr_locked);
+ unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
+ vms->vma_count, mm_wr_locked);
+
mas_set(mas_detach, 1);
/* start and end may be different if there is no prev or next vma. */
- free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
+ free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
+ vms->unmap_end, mm_wr_locked);
tlb_finish_mmu(&tlb);
+ vms->clear_ptes = false;
+}
+
+void vms_clean_up_area(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach, bool mm_wr_locked)
+{
+ struct vm_area_struct *vma;
+
+ if (!vms->nr_pages)
+ return;
+
+ vms_clear_ptes(vms, mas_detach, mm_wr_locked);
+ mas_set(mas_detach, 0);
+ mas_for_each(mas_detach, vma, ULONG_MAX)
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+ vms->closed_vm_ops = true;
}

/*
@@ -686,7 +708,10 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
if (vms->unlock)
mmap_write_downgrade(mm);

- vms_complete_pte_clear(vms, mas_detach, !vms->unlock);
+ if (!vms->nr_pages)
+ return;
+
+ vms_clear_ptes(vms, mas_detach, !vms->unlock);
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
/* Stat accounting */
@@ -702,7 +727,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
/* Remove and clean up vmas */
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- remove_vma(vma, false);
+ remove_vma(vma, /* = */ false, vms->closed_vm_ops);

vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
@@ -846,13 +871,14 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
while (vma_iter_addr(vms->vmi) > vms->start)
vma_iter_prev_range(vms->vmi);

+ vms->clear_ptes = true;
return 0;

userfaultfd_error:
munmap_gather_failed:
end_split_failed:
modify_vma_failed:
- abort_munmap_vmas(mas_detach);
+ abort_munmap_vmas(mas_detach, /* closed = */ false);
start_split_failed:
map_count_exceeded:
return error;
@@ -897,7 +923,7 @@ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
return 0;

clear_tree_failed:
- abort_munmap_vmas(&mas_detach);
+ abort_munmap_vmas(&mas_detach, /* closed = */ false);
gather_failed:
validate_mm(mm);
return error;
@@ -1615,17 +1641,21 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
}

unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
+ unsigned long addr, unsigned long end,
+ unsigned long *nr_accounted)
{
VMA_ITERATOR(vmi, mm, addr);
struct vm_area_struct *vma;
unsigned long nr_pages = 0;

+ *nr_accounted = 0;
for_each_vma_range(vmi, vma, end) {
unsigned long vm_start = max(addr, vma->vm_start);
unsigned long vm_end = min(end, vma->vm_end);

nr_pages += PHYS_PFN(vm_end - vm_start);
+ if (vma->vm_flags & VM_ACCOUNT)
+ *nr_accounted += PHYS_PFN(vm_end - vm_start);
}

return nr_pages;
diff --git a/mm/vma.h b/mm/vma.h
index 82ba48174341..64b44f5a0a11 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -48,6 +48,8 @@ struct vma_munmap_struct {
unsigned long stack_vm;
unsigned long data_vm;
bool unlock; /* Unlock after the munmap */
+ bool clear_ptes; /* If there are outstanding PTE to be cleared */
+ bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */
};

#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@@ -96,14 +98,13 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
unsigned long start, unsigned long end, struct list_head *uf,
bool unlock)
{
+ vms->mm = current->mm;
vms->vmi = vmi;
vms->vma = vma;
if (vma) {
- vms->mm = vma->vm_mm;
vms->start = start;
vms->end = end;
} else {
- vms->mm = NULL;
vms->start = vms->end = 0;
}
vms->unlock = unlock;
@@ -113,6 +114,8 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
vms->unmap_start = FIRST_USER_ADDRESS;
vms->unmap_end = USER_PGTABLES_CEILING;
+ vms->clear_ptes = false;
+ vms->closed_vm_ops = false;
}
#endif

@@ -122,18 +125,24 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
struct ma_state *mas_detach);

+void vms_clean_up_area(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach, bool mm_wr_locked);
+
/*
* abort_munmap_vmas - Undo any munmap work and free resources
*
* Reattach any detached vmas and free up the maple tree used to track the vmas.
*/
-static inline void abort_munmap_vmas(struct ma_state *mas_detach)
+static inline void abort_munmap_vmas(struct ma_state *mas_detach, bool closed)
{
struct vm_area_struct *vma;

mas_set(mas_detach, 0);
- mas_for_each(mas_detach, vma, ULONG_MAX)
+ mas_for_each(mas_detach, vma, ULONG_MAX) {
vma_mark_detached(vma, false);
+ if (closed && vma->vm_ops && vma->vm_ops->open)
+ vma->vm_ops->open(vma);
+ }

__mt_destroy(mas_detach->tree);
}
@@ -147,7 +156,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);

-void remove_vma(struct vm_area_struct *vma, bool unreachable);
+void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);

void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
@@ -261,7 +270,8 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);
unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end);
+ unsigned long addr, unsigned long end,
+ unsigned long *nr_accounted);

static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
--
2.45.2

And here the second part which causes the bug (goes on top of the patch above):
From 312feab01ec0a2562a1e44e231de12cc3dc3cce5 Mon Sep 17 00:00:00 2001
From: Bert Karwatzki <spasswolf@xxxxxx>
Date: Fri, 4 Oct 2024 10:28:12 +0200
Subject: [PATCH 2/2] mm: mmap: Avoid zeroing vma tree in mmap_region()

Now apply the parts of patch [v8 14/21] which give the crash.

Signed-off-by: Bert Karwatzki <spasswolf@xxxxxx>
---
mm/mmap.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index da9bfb899eec..405e0432c78e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1399,14 +1399,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (error)
goto gather_failed;

- /* Remove any existing mappings from the vma tree */
- error = vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL);
- if (error)
- goto clear_tree_failed;
-
- /* Unmap any existing mapping in the area */
- vms_complete_munmap_vmas(&vms, &mas_detach);
- vms.vma_count = 0;
next = vms.next;
prev = vms.prev;
vma = NULL;
@@ -1424,7 +1416,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
charged = len >> PAGE_SHIFT;
charged -= nr_accounted;
if (security_vm_enough_memory_mm(mm, charged))
- goto clear_tree_failed;
+ goto abort_munmap;
vms.nr_accounted = 0;
vm_flags |= VM_ACCOUNT;
}
@@ -1484,6 +1476,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,

if (file) {
vma->vm_file = get_file(file);
+ /*
+ * call_mmap() may map PTE, so ensure there are no existing PTEs
+ * call the vm_ops close function if one exists.
+ */
+ vms_clean_up_area(&vms, &mas_detach, true);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
@@ -1574,6 +1571,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
expanded:
perf_event_mmap(vma);

+ /* Unmap any existing mapping in the area */
+ vms_complete_munmap_vmas(&vms, &mas_detach);
+
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
@@ -1622,9 +1622,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (charged)
vm_unacct_memory(charged);

-clear_tree_failed:
- if (vms.vma_count)
- abort_munmap_vmas(&mas_detach, false);
+abort_munmap:
+ if (vms.nr_pages)
+ abort_munmap_vmas(&mas_detach, vms.closed_vm_ops);
gather_failed:
validate_mm(mm);
return error;
--
2.45.2

So the moving vms_complete_munmap_vmas() to the end of mmap_region seems
to be problematic.

Bert Karwatzki