Re: [PATCH] mapletree-vs-khugepaged

From: Liam Howlett
Date: Wed Jun 01 2022 - 16:48:01 EST


* Liam R. Howlett <Liam.Howlett@xxxxxxxxxx> [220531 14:56]:
> * Liam R. Howlett <Liam.Howlett@xxxxxxxxxx> [220530 13:38]:
> > * Guenter Roeck <linux@xxxxxxxxxxxx> [220519 17:42]:
> > > On 5/19/22 07:35, Liam Howlett wrote:
> > > > * Guenter Roeck <linux@xxxxxxxxxxxx> [220517 10:32]:
> > > >

...

> > I have qemu 7.0 which seems to change the default memory size from 32MB
> > to 128MB. This can be seen on your log here:
> >
> > Memory: 27928K/32768K available (2827K kernel code, 160K rwdata, 432K rodata, 1016K init, 66K bss, 4840K reserved, 0K cma-reserved)
> >
> > With 128MB the kernel boots. With 64MB it also boots. 32MB fails with
> > an OOM. Looking into it more, I see that the OOM is caused by a
> > contiguous page allocation of 1MB (order 7 at 8K pages).

...

> > Does anyone have any idea why nommu would be getting this fragmented?
>
> Answer: Why, yes. Matthew does. Using alloc_pages_exact() means we
> allocate the huge chunk of memory then free the leftovers immediately.
> Those freed leftover pages are handed out on the next request - which
> happens to be the maple tree.
>
> It seems nommu is so close to OOMing already that this makes a
> difference. Attached is a patch which _almost_ solves the issue by
> making it less likely to use those pages, but it's still a matter of
> timing on if this will OOM anyways. It reduces the potential by a large
> margin, maybe 1/10 fail instead of 4/5 failing. This patch is probably
> worth taking on its own as it reduces memory fragmentation on
> short-lived allocations that use alloc_pages_exact().
>
> I changed the nommu code a bit to reduce memory usage as well. During a
> split event, I no longer delete then re-add the VMA and I only
> preallocate a single time for the two writes associated with a split. I
> also moved my pre-allocation ahead of the call path that does
> alloc_pages_exact(). This all but ensures we won't fragment the larger
> chunks of memory as we get enough nodes out of a single page to run at
> least through boot. However, the failure rate remained at 1/10 with
> this change.
>
> I had accepted the scenario that this all just worked before, but my
> setup is different than that of Guenter. I am using buildroot-2022.02.1
> and qemu 7.0 for my testing. My configuration OOMs 12/13 times without
> maple tree, so I think we actually lowered the memory pressure on boot
> with these changes. Obviously there is an element of timing that causes
> variation in the testing so exact numbers are not possible.

Andrew,

Please add the previous patch to the mm branch, it is not dependent on
the maple tree.

Please also include the attached patch as a fix for the maple tree nommu
OOM issue on top of "nommu: remove uses of VMA linked list". It
triggers much less for me than a straight up buildroot-2022.02.1 build
with qemu 7.0. I believe this will fix Guenter's issues with the maple
tree.

Thanks,
Liam
From c92dfca0283877f0c0d9b4e619e261c0da78eb74 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@xxxxxxxxxx>
Date: Fri, 20 May 2022 21:51:57 -0400
Subject: [PATCH] mm/nommu: Move preallocations and limit other allocations

Move the preallocations in do_mmap() ahead of allocations that may use
alloc_pages_exact() so that reclaimed areas of larger pages don't get
reused for the maple tree nodes. This will allow the larger blocks of
memory to reassemble after use.

Also avoid other unnecessary allocations for maple tree nodes by
overwriting the VMA on split instead of deleting and re-adding.

Signed-off-by: Liam R. Howlett <Liam.Howlett@xxxxxxxxxx>
---
mm/nommu.c | 114 ++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 86 insertions(+), 28 deletions(-)

diff --git a/mm/nommu.c b/mm/nommu.c
index b5bb12772cbf..c80797961067 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -557,25 +557,14 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
mas_store_prealloc(mas, NULL);
}

-/*
- * add a VMA into a process's mm_struct in the appropriate place in the list
- * and tree and add to the address space's page tree also if not an anonymous
- * page
- * - should be called with mm->mmap_lock held writelocked
- */
-static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
{
- struct address_space *mapping;
- MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
-
- BUG_ON(!vma->vm_region);
-
mm->map_count++;
vma->vm_mm = mm;

/* add the VMA to the mapping */
if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vma->vm_file->f_mapping;

i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
@@ -583,18 +572,46 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}

+/*
+ * mas_add_vma_to_mm() - Maple state variant of add_vma_to_mm().
+ * @mas: The maple state with preallocations.
+ * @mm: The mm_struct
+ * @vma: The vma to add
+ *
+ */
+static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ BUG_ON(!vma->vm_region);
+
+ setup_vma_to_mm(vma, mm);
/* add the VMA to the tree */
- vma_mas_store(vma, &mas);
+ vma_mas_store(vma, mas);
}

/*
- * delete a VMA from its owning mm_struct and address space
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * and tree and add to the address space's page tree also if not an anonymous
+ * page
+ * - should be called with mm->mmap_lock held writelocked
*/
-static void delete_vma_from_mm(struct vm_area_struct *vma)
+static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
- MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0);
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+ mas_add_vma_to_mm(&mas, mm, vma);
+ return 0;
+}

+static void cleanup_vma_from_mm(struct vm_area_struct *vma)
+{
vma->vm_mm->map_count--;
/* remove the VMA from the mapping */
if (vma->vm_file) {
@@ -607,9 +624,24 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
+/*
+ * delete a VMA from its owning mm_struct and address space
+ */
+static int delete_vma_from_mm(struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0);
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+ cleanup_vma_from_mm(vma);

/* remove from the MM's tree and list */
vma_mas_remove(vma, &mas);
+ return 0;
}

/*
@@ -1019,6 +1051,7 @@ unsigned long do_mmap(struct file *file,
vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);

*populate = 0;

@@ -1037,6 +1070,10 @@ unsigned long do_mmap(struct file *file,
* now know into VMA flags */
vm_flags = determine_vm_flags(file, prot, flags, capabilities);

+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ goto error_maple_preallocate;
+
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
if (!region)
@@ -1186,7 +1223,7 @@ unsigned long do_mmap(struct file *file,
current->mm->total_vm += len >> PAGE_SHIFT;

share:
- add_vma_to_mm(current->mm, vma);
+ mas_add_vma_to_mm(&mas, current->mm, vma);

/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1212,11 +1249,13 @@ unsigned long do_mmap(struct file *file,

sharing_violation:
up_write(&nommu_region_sem);
+ mas_destroy(&mas);
pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;

error_getting_vma:
+ mas_destroy(&mas);
kmem_cache_free(vm_region_jar, region);
pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
len, current->pid);
@@ -1224,10 +1263,17 @@ unsigned long do_mmap(struct file *file,
return -ENOMEM;

error_getting_region:
+ mas_destroy(&mas);
pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;
+
+error_maple_preallocate:
+ pr_warn("Allocation of vma tree for process %d failed\n", current->pid);
+ show_free_areas(0, NULL);
+ return -ENOMEM;
+
}

unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
@@ -1293,6 +1339,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *new;
struct vm_region *region;
unsigned long npages;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);

/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region) */
@@ -1328,7 +1375,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);

- delete_vma_from_mm(vma);
down_write(&nommu_region_sem);
delete_nommu_region(vma->vm_region);
if (new_below) {
@@ -1341,8 +1387,17 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
add_nommu_region(vma->vm_region);
add_nommu_region(new->vm_region);
up_write(&nommu_region_sem);
- add_vma_to_mm(mm, vma);
- add_vma_to_mm(mm, new);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+
+ setup_vma_to_mm(vma, mm);
+ setup_vma_to_mm(new, mm);
+ mas_set_range(&mas, vma->vm_start, vma->vm_end - 1);
+ mas_store(&mas, vma);
+ vma_mas_store(new, &mas);
return 0;
}

@@ -1358,12 +1413,14 @@ static int shrink_vma(struct mm_struct *mm,

/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ return -ENOMEM;
if (from > vma->vm_start)
vma->vm_end = from;
else
vma->vm_start = to;
- add_vma_to_mm(mm, vma);
+ if (add_vma_to_mm(mm, vma))
+ return -ENOMEM;

/* cut the backing region down to size */
region = vma->vm_region;
@@ -1394,7 +1451,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *vma;
unsigned long end;
- int ret;
+ int ret = 0;

len = PAGE_ALIGN(len);
if (len == 0)
@@ -1444,9 +1501,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
}

erase_whole_vma:
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ ret = -ENOMEM;
delete_vma(mm, vma);
- return 0;
+ return ret;
}

int vm_munmap(unsigned long addr, size_t len)
@@ -1485,12 +1543,12 @@ void exit_mmap(struct mm_struct *mm)
*/
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
- delete_vma_from_mm(vma);
+ cleanup_vma_from_mm(vma);
delete_vma(mm, vma);
cond_resched();
}
- mmap_write_unlock(mm);
__mt_destroy(&mm->mm_mt);
+ mmap_write_unlock(mm);
}

int vm_brk(unsigned long addr, unsigned long len)
--
2.35.1