[PATCH v3 16/66] mm/mmap: Change do_brk_flags() to expand existing VMA and add do_brk_munmap()

From: Liam Howlett
Date: Mon Oct 04 2021 - 21:31:54 EST


From: "Liam R. Howlett" <Liam.Howlett@xxxxxxxxxx>

Avoid allocating a new VMA when a vma modification can occur. When a
brk() can expand or contract a VMA, then the single store operation will
only modify one index of the maple tree instead of causing a node to
split or coalesce. This avoids unnecessary allocations/frees of maple
tree nodes and VMAs.

Use the advanced API for the maple tree to avoid unnecessary walks of
the tree.

Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
---
mm/mmap.c | 278 +++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 224 insertions(+), 54 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 9f047204fa93..b1f7c080b9da 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -188,17 +188,22 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
return next;
}

-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
- struct list_head *uf);
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf);
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct **brkvma,
+ unsigned long addr, unsigned long request,
+ unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *next;
+ struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk;
bool populate;
bool downgraded = false;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);

if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -238,37 +243,60 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto success;
}

- /*
- * Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_lock to read.
- */
- if (brk <= mm->brk) {
+ mas_set(&mas, newbrk);
+ mas_lock(&mas);
+ brkvma = mas_walk(&mas);
+ mas_unlock(&mas);
+ if (brkvma) { // munmap necessary, there is something at newbrk.
+ /*
+ * Always allow shrinking brk.
+ * do_brk_munmap() may downgrade mmap_lock to read.
+ */
int ret;

+ if (brkvma->vm_start >= oldbrk)
+ goto out; // mapping intersects with an existing non-brk vma.
/*
- * mm->brk must to be protected by write mmap_lock so update it
- * before downgrading mmap_lock. When __do_munmap() fails,
- * mm->brk will be restored from origbrk.
+ * mm->brk must be protected by write mmap_lock.
+ * do_brk_munmap() may downgrade the lock, so update it
+ * before calling do_brk_munmap().
*/
mm->brk = brk;
- ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
- if (ret < 0) {
- mm->brk = origbrk;
- goto out;
- } else if (ret == 1) {
+ mas.last = oldbrk - 1;
+ ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+ if (ret == 1) {
downgraded = true;
- }
- goto success;
- }
+ goto success;
+ } else if (!ret)
+ goto success;

+ mm->brk = origbrk;
+ goto out;
+ }
+ /* Only check if the next VMA is within the stack_guard_gap of the
+ * expansion area */
+ next = mas_next(&mas, newbrk + PAGE_SIZE + stack_guard_gap);
/* Check against existing mmap mappings. */
- next = find_vma(mm, oldbrk);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;

+ mas_lock(&mas);
+ brkvma = mas_prev(&mas, mm->start_brk);
+ mas_unlock(&mas);
+ if (brkvma) {
+ if(brkvma->vm_start >= oldbrk)
+ goto out; // Trying to map over another vma.
+
+ if (brkvma->vm_end <= min_brk) {
+ brkvma = NULL;
+ mas_reset(&mas);
+ }
+ }
+
/* Ok, looks good - let it rip. */
- if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+ if (do_brk_flags(&mas, &brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out;
+
mm->brk = brk;

success:
@@ -370,16 +398,16 @@ static void validate_mm(struct mm_struct *mm)
validate_mm_mt(mm);

while (vma) {
+#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
-
if (anon_vma) {
anon_vma_lock_read(anon_vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
anon_vma_unlock_read(anon_vma);
}
-
+#endif
highest_address = vm_end_gap(vma);
vma = vma->vm_next;
i++;
@@ -2009,6 +2037,7 @@ EXPORT_SYMBOL(get_unmapped_area);
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);

mmap_assert_locked(mm);
/* Check the cache first. */
@@ -2016,7 +2045,9 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
if (likely(vma))
return vma;

- vma = mt_find(&mm->mm_mt, &addr, ULONG_MAX);
+ rcu_read_lock();
+ vma = mas_find(&mas, -1);
+ rcu_read_unlock();
if (vma)
vmacache_update(addr, vma);

@@ -2501,7 +2532,6 @@ unlock_range(struct vm_area_struct *start, unsigned long limit)
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);
}
-
tmp = tmp->vm_next;
}
}
@@ -2734,16 +2764,105 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}

/*
- * this is really a simplified "do_mmap". it only handles
- * anonymous maps. eventually we may be able to do some
- * brk-specific accounting here.
+ * do_brk_munmap() - Unmap a partial vma.
+ * @mas: The maple tree state.
+ * @vma: The vma to be modified
+ * @newbrk: the start of the address to unmap
+ * @oldbrk: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ *
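+ * Returns: 1 on success with the mmap_lock downgraded, 0 on success without a
+ * downgrade, or an error code such as -ENOMEM on failure. Unmaps a partial VMA
+ * mapping. Does not handle alignment; downgrades the lock if possible.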
+ */
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct unmap;
+ unsigned long unmap_pages;
+ int ret = 1;
+
+ arch_unmap(mm, newbrk, oldbrk);
+
+ if (likely(vma->vm_start >= newbrk)) { // remove entire mapping(s)
+ mas_set(mas, newbrk);
+ if (vma->vm_start != newbrk)
+ mas_reset(mas); // cause a re-walk for the first overlap.
+ ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true);
+ goto munmap_full_vma;
+ }
+
+ vma_init(&unmap, mm);
+ unmap.vm_start = newbrk;
+ unmap.vm_end = oldbrk;
+ ret = userfaultfd_unmap_prep(&unmap, newbrk, oldbrk, uf);
+ if (ret)
+ return ret;
+ ret = 1;
+
+ // Change the oldbrk of vma to the newbrk of the munmap area
+ vma_adjust_trans_huge(vma, vma->vm_start, newbrk, 0);
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+
+ vma->vm_end = newbrk;
+ if (vma_mas_remove(&unmap, mas))
+ goto mas_store_fail;
+
+ vmacache_invalidate(vma->vm_mm);
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+
+ unmap_pages = vma_pages(&unmap);
+ if (unmap.vm_flags & VM_LOCKED) {
+ mm->locked_vm -= unmap_pages;
+ munlock_vma_pages_range(&unmap, newbrk, oldbrk);
+ }
+
+ mmap_write_downgrade(mm);
+ unmap_region(mm, &unmap, vma, newbrk, oldbrk);
+ /* Statistics */
+ vm_stat_account(mm, unmap.vm_flags, -unmap_pages);
+ if (unmap.vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(unmap_pages);
+
+munmap_full_vma:
+ validate_mm_mt(mm);
+ return ret;
+
+mas_store_fail:
+ vma->vm_end = oldbrk;
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ return -ENOMEM;
+}
+
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @mas: The maple tree state.
+ * @addr: The start address
+ * @len: The length of the increase
+ * @brkvma: Pointer to the brk vma; set to the new vma if one is created
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
*/
-static int do_brk_flags(unsigned long addr, unsigned long len,
- unsigned long flags, struct list_head *uf)
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct **brkvma,
+ unsigned long addr, unsigned long len,
+ unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- pgoff_t pgoff = addr >> PAGE_SHIFT;
+ struct vm_area_struct *prev = NULL, *vma;
int error;
unsigned long mapped_addr;
validate_mm_mt(mm);
@@ -2761,11 +2880,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
if (error)
return error;

- /* Clear old maps, set up prev and uf */
- if (munmap_vma_range(mm, addr, len, &prev, uf))
- return -ENOMEM;
-
- /* Check against address space limits *after* clearing old maps... */
+ /* Check against address space limits by the changed size */
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;

@@ -2775,28 +2890,65 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;

- /* Can we just expand an old private anonymous mapping? */
- vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
- if (vma)
- goto out;
+ mas->last = addr + len - 1;
+ if (*brkvma) {
+ vma = *brkvma;
+ /* Expand the existing vma if possible; almost never a singular
+ * list, so this will almost always fail. */

- /*
- * create a vma struct for an anonymous mapping
- */
- vma = vm_area_alloc(mm);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
+ if ((!vma->anon_vma ||
+ list_is_singular(&vma->anon_vma_chain)) &&
+ ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)){
+ mas->index = vma->vm_start;
+
+ vma_adjust_trans_huge(vma, addr, addr + len, 0);
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ vma->vm_end = addr + len;
+ vma->vm_flags |= VM_SOFTDIRTY;
+ mas_lock(mas);
+ if (mas_store_gfp(mas, vma, GFP_KERNEL))
+ goto mas_mod_fail;
+
+ mas_unlock(mas);
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ khugepaged_enter_vma_merge(vma, flags);
+ goto out;
+ }
+ prev = vma;
}
+ mas_lock(mas);
+ mas->index = addr;
+ mas_walk(mas);
+ mas_unlock(mas);
+
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto vma_alloc_fail;

vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_pgoff = pgoff;
+ vma->vm_pgoff = addr >> PAGE_SHIFT;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
- vma_link(mm, vma, prev);
+ mas_lock(mas);
+ if (vma_mas_store(vma, mas))
+ goto mas_store_fail;
+ mas_unlock(mas);
+
+ if (!prev)
+ prev = mas_prev(mas, 0);
+
+ __vma_link_list(mm, vma, prev);
+ mm->map_count++;
+ *brkvma = vma;
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
@@ -2806,15 +2958,32 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
vma->vm_flags |= VM_SOFTDIRTY;
validate_mm_mt(mm);
return 0;
+
+mas_store_fail:
+ vm_area_free(vma);
+vma_alloc_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
+
+mas_mod_fail:
+ mas_unlock(mas);
+ vma->vm_end = addr;
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ return -ENOMEM;
+
}

int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
unsigned long len;
int ret;
bool populate;
- LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, addr, addr);

len = PAGE_ALIGN(request);
if (len < request)
@@ -2825,10 +2994,11 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;

- ret = do_brk_flags(addr, len, flags, &uf);
+ // This vma left intentionally blank.
+ mas_walk(&mas);
+ ret = do_brk_flags(&mas, &vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
- userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
--
2.30.2