Re: [PATCH v8 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region()

From: Lorenzo Stoakes
Date: Fri Oct 04 2024 - 05:00:22 EST


On Fri, Oct 04, 2024 at 10:51:05AM +0200, Bert Karwatzki wrote:
> As a sidenote I've tried to figure out the minimal patch it takes to trigger the
> crash. The first commit that triggers the bug is f8d112a4e657 ("mm/mmap: avoid
> zeroing vma tree in mmap_region()") so I split this commit into two parts, the
> first part gives a working kernel while the second part gives the bug.
>
> First part (to be applied on top of commit 94f59ea591f
> ("mm: clean up unmap_region() argument list") in linux-next):
> From efdd914b012b35d34d1cae469f199023e82f7a15 Mon Sep 17 00:00:00 2001
> From: Bert Karwatzki <spasswolf@xxxxxx>
> Date: Thu, 3 Oct 2024 23:59:30 +0200
> Subject: [PATCH 1/2] mm: mmap: Prepare to avoid zeroing vma tree in
> mmap_region()
>
> In an attempt to create a minimal patch to trigger the mapletree
> bug apply parts of patch [v8 14/21] which still give a working kernel.
>

Thanks, this aligns with expectations as 'zeroing' the range will collapse
the overwritten VMA entries and render the subsequent merge trivial.

It seems to be the state of the maple tree that's the issue: the tree appears
to be fine, but then we attempt the merge (and the store), and that is what
triggers the problem.

The v5 version of the printk patch which eliminates kernel pointer hashing
in the dumps should give us hopefully enough information to repro the tree,
as otherwise all attempts to do so have failed (this is a very, very
stubborn problem).

Kernel pointer hashing is an issue here because the dumping code uses %p to
output entries, including the last entry that stores the number of entries
(so we have no way of validating what this is) and also some pointers like
the parent pointer are tagged, so the tagging causes the hash to generate a
meaningless value.

So at this point we definitely need this (and I think I will also write a
patch to make the debug code not hash pointers here :)

Thanks again for all your efforts, all much appreciated!


> Signed-off-by: Bert Karwatzki <spasswolf@xxxxxx>
> ---
> mm/mmap.c | 39 ++++++++++++++++++---------------------
> mm/vma.c | 54 ++++++++++++++++++++++++++++++++++++++++++------------
> mm/vma.h | 22 ++++++++++++++++------
> 3 files changed, 76 insertions(+), 39 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 304dc085533a..da9bfb899eec 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1373,23 +1373,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> unsigned long merge_start = addr, merge_end = end;
> bool writable_file_mapping = false;
> pgoff_t vm_pgoff;
> - int error;
> + int error = -ENOMEM;
> VMA_ITERATOR(vmi, mm, addr);
> + unsigned long nr_pages, nr_accounted;
>
> - /* Check against address space limit. */
> - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> - unsigned long nr_pages;
> + nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
>
> - /*
> - * MAP_FIXED may remove pages of mappings that intersects with
> - * requested mapping. Account for the pages it would unmap.
> - */
> - nr_pages = count_vma_pages_range(mm, addr, end);
> -
> - if (!may_expand_vm(mm, vm_flags,
> - (len >> PAGE_SHIFT) - nr_pages))
> - return -ENOMEM;
> - }
> + /*
> + * Check against address space limit.
> + * MAP_FIXED may remove pages of mappings that intersects with requested
> + * mapping. Account for the pages it would unmap.
> + */
> + if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
> + return -ENOMEM;
>
> /* Find the first overlapping VMA */
> vma = vma_find(&vmi, end);
> @@ -1410,6 +1406,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
> /* Unmap any existing mapping in the area */
> vms_complete_munmap_vmas(&vms, &mas_detach);
> + vms.vma_count = 0;
> next = vms.next;
> prev = vms.prev;
> vma = NULL;
> @@ -1425,8 +1422,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> */
> if (accountable_mapping(file, vm_flags)) {
> charged = len >> PAGE_SHIFT;
> + charged -= nr_accounted;
> if (security_vm_enough_memory_mm(mm, charged))
> - return -ENOMEM;
> + goto clear_tree_failed;
> + vms.nr_accounted = 0;
> vm_flags |= VM_ACCOUNT;
> }
>
> @@ -1475,10 +1474,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> * not unmapped, but the maps are removed from the list.
> */
> vma = vm_area_alloc(mm);
> - if (!vma) {
> - error = -ENOMEM;
> + if (!vma)
> goto unacct_error;
> - }
>
> vma_iter_config(&vmi, addr, end);
> vma_set_range(vma, addr, end, pgoff);
> @@ -1605,7 +1602,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> return addr;
>
> close_and_free_vma:
> - if (file && vma->vm_ops && vma->vm_ops->close)
> + if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close)
> vma->vm_ops->close(vma);
>
> if (file || vma->vm_file) {
> @@ -1627,7 +1624,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
> clear_tree_failed:
> if (vms.vma_count)
> - abort_munmap_vmas(&mas_detach);
> + abort_munmap_vmas(&mas_detach, false);
> gather_failed:
> validate_mm(mm);
> return error;
> @@ -1960,7 +1957,7 @@ void exit_mmap(struct mm_struct *mm)
> do {
> if (vma->vm_flags & VM_ACCOUNT)
> nr_accounted += vma_pages(vma);
> - remove_vma(vma, true);
> + remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
> count++;
> cond_resched();
> vma = vma_next(&vmi);
> diff --git a/mm/vma.c b/mm/vma.c
> index 83c5c46c67b9..648c58da8ad4 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -136,10 +136,10 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> /*
> * Close a vm structure and free it.
> */
> -void remove_vma(struct vm_area_struct *vma, bool unreachable)
> +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
> {
> might_sleep();
> - if (vma->vm_ops && vma->vm_ops->close)
> + if (!closed && vma->vm_ops && vma->vm_ops->close)
> vma->vm_ops->close(vma);
> if (vma->vm_file)
> fput(vma->vm_file);
> @@ -521,7 +521,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> vma_iter_store(vmi, vma);
>
> vma_complete(&vp, vmi, vma->vm_mm);
> - validate_mm(vma->vm_mm);
> return 0;
>
> nomem:
> @@ -645,11 +644,14 @@ void vma_complete(struct vma_prepare *vp,
> uprobe_mmap(vp->insert);
> }
>
> -static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> - struct ma_state *mas_detach, bool mm_wr_locked)
> +static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
> + struct ma_state *mas_detach, bool mm_wr_locked)
> {
> struct mmu_gather tlb;
>
> + if (!vms->clear_ptes) /* Nothing to do */
> + return;
> +
> /*
> * We can free page tables without write-locking mmap_lock because VMAs
> * were isolated before we downgraded mmap_lock.
> @@ -658,11 +660,31 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> lru_add_drain();
> tlb_gather_mmu(&tlb, vms->mm);
> update_hiwater_rss(vms->mm);
> - unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, vms->vma_count, mm_wr_locked);
> + unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
> + vms->vma_count, mm_wr_locked);
> +
> mas_set(mas_detach, 1);
> /* start and end may be different if there is no prev or next vma. */
> - free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
> + free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
> + vms->unmap_end, mm_wr_locked);
> tlb_finish_mmu(&tlb);
> + vms->clear_ptes = false;
> +}
> +
> +void vms_clean_up_area(struct vma_munmap_struct *vms,
> + struct ma_state *mas_detach, bool mm_wr_locked)
> +{
> + struct vm_area_struct *vma;
> +
> + if (!vms->nr_pages)
> + return;
> +
> + vms_clear_ptes(vms, mas_detach, mm_wr_locked);
> + mas_set(mas_detach, 0);
> + mas_for_each(mas_detach, vma, ULONG_MAX)
> + if (vma->vm_ops && vma->vm_ops->close)
> + vma->vm_ops->close(vma);
> + vms->closed_vm_ops = true;
> }
>
> /*
> @@ -686,7 +708,10 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
> if (vms->unlock)
> mmap_write_downgrade(mm);
>
> - vms_complete_pte_clear(vms, mas_detach, !vms->unlock);
> + if (!vms->nr_pages)
> + return;
> +
> + vms_clear_ptes(vms, mas_detach, !vms->unlock);
> /* Update high watermark before we lower total_vm */
> update_hiwater_vm(mm);
> /* Stat accounting */
> @@ -702,7 +727,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
> /* Remove and clean up vmas */
> mas_set(mas_detach, 0);
> mas_for_each(mas_detach, vma, ULONG_MAX)
> - remove_vma(vma, false);
> + remove_vma(vma, /* = */ false, vms->closed_vm_ops);
>
> vm_unacct_memory(vms->nr_accounted);
> validate_mm(mm);
> @@ -846,13 +871,14 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
> while (vma_iter_addr(vms->vmi) > vms->start)
> vma_iter_prev_range(vms->vmi);
>
> + vms->clear_ptes = true;
> return 0;
>
> userfaultfd_error:
> munmap_gather_failed:
> end_split_failed:
> modify_vma_failed:
> - abort_munmap_vmas(mas_detach);
> + abort_munmap_vmas(mas_detach, /* closed = */ false);
> start_split_failed:
> map_count_exceeded:
> return error;
> @@ -897,7 +923,7 @@ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> return 0;
>
> clear_tree_failed:
> - abort_munmap_vmas(&mas_detach);
> + abort_munmap_vmas(&mas_detach, /* closed = */ false);
> gather_failed:
> validate_mm(mm);
> return error;
> @@ -1615,17 +1641,21 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
> }
>
> unsigned long count_vma_pages_range(struct mm_struct *mm,
> - unsigned long addr, unsigned long end)
> + unsigned long addr, unsigned long end,
> + unsigned long *nr_accounted)
> {
> VMA_ITERATOR(vmi, mm, addr);
> struct vm_area_struct *vma;
> unsigned long nr_pages = 0;
>
> + *nr_accounted = 0;
> for_each_vma_range(vmi, vma, end) {
> unsigned long vm_start = max(addr, vma->vm_start);
> unsigned long vm_end = min(end, vma->vm_end);
>
> nr_pages += PHYS_PFN(vm_end - vm_start);
> + if (vma->vm_flags & VM_ACCOUNT)
> + *nr_accounted += PHYS_PFN(vm_end - vm_start);
> }
>
> return nr_pages;
> diff --git a/mm/vma.h b/mm/vma.h
> index 82ba48174341..64b44f5a0a11 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -48,6 +48,8 @@ struct vma_munmap_struct {
> unsigned long stack_vm;
> unsigned long data_vm;
> bool unlock; /* Unlock after the munmap */
> + bool clear_ptes; /* If there are outstanding PTE to be cleared */
> + bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */
> };
>
> #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> @@ -96,14 +98,13 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
> unsigned long start, unsigned long end, struct list_head *uf,
> bool unlock)
> {
> + vms->mm = current->mm;
> vms->vmi = vmi;
> vms->vma = vma;
> if (vma) {
> - vms->mm = vma->vm_mm;
> vms->start = start;
> vms->end = end;
> } else {
> - vms->mm = NULL;
> vms->start = vms->end = 0;
> }
> vms->unlock = unlock;
> @@ -113,6 +114,8 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
> vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
> vms->unmap_start = FIRST_USER_ADDRESS;
> vms->unmap_end = USER_PGTABLES_CEILING;
> + vms->clear_ptes = false;
> + vms->closed_vm_ops = false;
> }
> #endif
>
> @@ -122,18 +125,24 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
> void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
> struct ma_state *mas_detach);
>
> +void vms_clean_up_area(struct vma_munmap_struct *vms,
> + struct ma_state *mas_detach, bool mm_wr_locked);
> +
> /*
> * abort_munmap_vmas - Undo any munmap work and free resources
> *
> * Reattach any detached vmas and free up the maple tree used to track the vmas.
> */
> -static inline void abort_munmap_vmas(struct ma_state *mas_detach)
> +static inline void abort_munmap_vmas(struct ma_state *mas_detach, bool closed)
> {
> struct vm_area_struct *vma;
>
> mas_set(mas_detach, 0);
> - mas_for_each(mas_detach, vma, ULONG_MAX)
> + mas_for_each(mas_detach, vma, ULONG_MAX) {
> vma_mark_detached(vma, false);
> + if (closed && vma->vm_ops && vma->vm_ops->open)
> + vma->vm_ops->open(vma);
> + }
>
> __mt_destroy(mas_detach->tree);
> }
> @@ -147,7 +156,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
> unsigned long start, size_t len, struct list_head *uf,
> bool unlock);
>
> -void remove_vma(struct vm_area_struct *vma, bool unreachable);
> +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);
>
> void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
> struct vm_area_struct *prev, struct vm_area_struct *next);
> @@ -261,7 +270,8 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
> int mm_take_all_locks(struct mm_struct *mm);
> void mm_drop_all_locks(struct mm_struct *mm);
> unsigned long count_vma_pages_range(struct mm_struct *mm,
> - unsigned long addr, unsigned long end);
> + unsigned long addr, unsigned long end,
> + unsigned long *nr_accounted);
>
> static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
> {
> --
> 2.45.2
>
> And here the second part which causes the bug (goes on top of the patch above):
> From 312feab01ec0a2562a1e44e231de12cc3dc3cce5 Mon Sep 17 00:00:00 2001
> From: Bert Karwatzki <spasswolf@xxxxxx>
> Date: Fri, 4 Oct 2024 10:28:12 +0200
> Subject: [PATCH 2/2] mm: mmap: Avoid zeroing vma tree in mmap_region()
>
> Now apply the parts of patch [v8 14/21] which give the crash.
>
> Signed-off-by: Bert Karwatzki <spasswolf@xxxxxx>
> ---
> mm/mmap.c | 24 ++++++++++++------------
> 1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index da9bfb899eec..405e0432c78e 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1399,14 +1399,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> if (error)
> goto gather_failed;
>
> - /* Remove any existing mappings from the vma tree */
> - error = vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL);
> - if (error)
> - goto clear_tree_failed;
> -
> - /* Unmap any existing mapping in the area */
> - vms_complete_munmap_vmas(&vms, &mas_detach);
> - vms.vma_count = 0;
> next = vms.next;
> prev = vms.prev;
> vma = NULL;
> @@ -1424,7 +1416,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> charged = len >> PAGE_SHIFT;
> charged -= nr_accounted;
> if (security_vm_enough_memory_mm(mm, charged))
> - goto clear_tree_failed;
> + goto abort_munmap;
> vms.nr_accounted = 0;
> vm_flags |= VM_ACCOUNT;
> }
> @@ -1484,6 +1476,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
> if (file) {
> vma->vm_file = get_file(file);
> + /*
> + * call_mmap() may map PTE, so ensure there are no existing PTEs
> + * call the vm_ops close function if one exists.
> + */
> + vms_clean_up_area(&vms, &mas_detach, true);
> error = call_mmap(file, vma);
> if (error)
> goto unmap_and_free_vma;
> @@ -1574,6 +1571,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> expanded:
> perf_event_mmap(vma);
>
> + /* Unmap any existing mapping in the area */
> + vms_complete_munmap_vmas(&vms, &mas_detach);
> +
> vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> if (vm_flags & VM_LOCKED) {
> if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> @@ -1622,9 +1622,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> if (charged)
> vm_unacct_memory(charged);
>
> -clear_tree_failed:
> - if (vms.vma_count)
> - abort_munmap_vmas(&mas_detach, false);
> +abort_munmap:
> + if (vms.nr_pages)
> + abort_munmap_vmas(&mas_detach, vms.closed_vm_ops);
> gather_failed:
> validate_mm(mm);
> return error;
> --
> 2.45.2
>
> So the moving vms_complete_munmap_vmas() to the end of mmap_region seems
> to be problematic.
>
> Bert Karwatzki