RE: my test code and result///[PATCH] mm/mmap: fix the adjusted length error

From: chenjianhong (A)
Date: Fri May 17 2019 - 02:51:38 EST


[test code] The following is my test code.
/*
* first, we allocate a large amount of virtual memory;
* second, we allocate hugepage memory by shmat, and release one
* of the hugepage memory block;
* third, we allocate hugepage memory by shmat again, this will fail.
*/
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/shm.h>

/* Size in bytes of each malloc() issued by main(): 0x4000000 = 64 MiB. */
int size = 0x4000000;
/* Segment identifiers returned by shmget(), one per segment. */
int shmid[5];
/* Attach addresses returned by shmat(), indexed like shmid[]. */
void *shm[5];
/* System V IPC keys passed to shmget(), one per segment. */
int key[5] = {234, 235, 236, 237, 238};
/* Requested size in bytes of each segment: 2 MiB, 64 MiB, 16 MiB, 4 MiB, 10 MiB. */
unsigned long seg_size[5] = {0x200000, 0x4000000, 0x1000000,
0x400000, 0xa00000};

/*
 * Create the five hugepage-backed shared memory segments described by
 * key[] and seg_size[], storing each identifier in shmid[].
 *
 * If any shmget() call fails, the error is reported on stderr, every
 * segment created so far is marked for removal, and -1 is returned.
 * Returns 0 when all five segments were created.
 */
int init_memory(void)
{
    int created = 0;
    int undo;

    while (created < 5) {
        int id = shmget((key_t)key[created], seg_size[created],
                        0666 | IPC_CREAT | SHM_HUGETLB);

        shmid[created] = id;
        if (id == -1) {
            fprintf(stderr, "shmget[%d] error(%d)\n",
                    created, errno);
            /* Roll back: remove only the segments that were created. */
            for (undo = 0; undo < created; undo++) {
                shmctl(shmid[undo], IPC_RMID, 0);
            }
            return -1;
        }
        created++;
    }
    return 0;
}

/*
 * Detach every address recorded in shm[] and mark every segment in
 * shmid[] for removal. The results of shmdt()/shmctl() are not checked.
 * Always returns 0.
 */
int del_segmem(void)
{
    int idx = 0;

    while (idx < 5) {
        shmdt(shm[idx]);
        shmctl(shmid[idx], IPC_RMID, 0);
        idx++;
    }
    return 0;
}

/*
 * Step two of the reproducer: attach all five segments, print the
 * process memory map, detach segment 1, and print the map again.
 *
 * Returns 0 on success, or -1 as soon as one shmat() call fails (segments
 * attached before the failure stay attached).
 */
int fun_C(void)
{
    /* Shell command that looks up the process by name and prints its maps. */
    static const char dump_maps_cmd[] =
        "pid=`ps -e | grep memory | awk '{print $1}'`;cat /proc/$pid/maps";
    int idx;

    printf("-----------------------fun_C shmat-----------------------\n");
    for (idx = 0; idx < 5; idx++) {
        void *addr = shmat(shmid[idx], 0, 0);

        shm[idx] = addr;
        if (addr == (void *)-1) {
            fprintf(stderr, "shmat[%d] failed %d\n", idx, errno);
            return -1;
        }
    }

    sleep(2);
    system(dump_maps_cmd);

    /* Detach segment 1 so that a gap of its size appears in the map. */
    shmdt(shm[1]);
    printf("-----------------------after fun_C shmdt-----------------------\n");
    system(dump_maps_cmd);
    printf("-----------------------fun_C ok-----------------------\n");
    return 0;
}

/*
 * Step three of the reproducer: attach segment 1 again, print the
 * process memory map, detach the segment, and print the map once more.
 *
 * Returns 0 on success, or -1 if shmat() fails; in that case the segment
 * size and the errno value left by shmat() are reported on stderr.
 */
int fun_A(void)
{
    int i = 1;
    int attach_errno;

    shm[i] = shmat(shmid[i], 0, 0);
    /*
     * Save errno immediately: the printf() below is a library call and
     * may overwrite it before the failure is reported.
     */
    attach_errno = errno;
    printf("-----------------------fun_A shmat-----------------------\n");
    if (shm[i] == (void *)-1) {
        /* seg_size[] holds unsigned long, so the conversion must be %lx. */
        fprintf(stderr, "funa shmat[%d] size(0x%08lx)failed %d\n",
                i, seg_size[i], attach_errno);
        return -1;
    }
    system("pid=`ps -e | grep memory | awk '{print $1}'`;cat /proc/$pid/maps");
    sleep(2);
    shmdt(shm[i]);
    printf("-----------------------fun_A shmdt-----------------------\n");
    system("pid=`ps -e | grep memory | awk '{print $1}'`;cat /proc/$pid/maps");
    printf("-----------------------fun_A ok-----------------------\n");
    return 0;
}

/*
* first, we allocate a large amount of virtual memory;
* second, we allocate hugepage memory by shmat, and release one
* of the hugepage memory block;
* third, we allocate hugepage memory by shmat again, this will fail.
*/

/*
 * Reproducer entry point.
 *
 * 1. Issue 52 malloc() calls of `size` bytes each to use up virtual
 *    address space; the allocations are deliberately never freed.
 * 2. Create the segments, then attach them all and detach segment 1
 *    (fun_C).
 * 3. Try to attach segment 1 again (fun_A).
 *
 * Returns 0 when every step succeeded and -1 otherwise.
 */
int main(int argc,char * argv[])
{
    /* Volatile sink so an optimizing compiler cannot discard the malloc() calls. */
    void *volatile reserved;
    int i;
    int ret = 0;

    (void)argc;
    (void)argv;
    (void)reserved;

    for (i = 0; i < 52; i++) {
        reserved = malloc(size);//first
    }

    /*
     * init_memory() removes whatever it created before reporting failure,
     * so del_segmem() must not run here: it would issue IPC_RMID on stale
     * or never-assigned ids.
     */
    if (init_memory() != 0) {
        return -1;
    }

    /* Without the layout prepared by fun_C() the third step proves nothing. */
    if (fun_C() != 0) {//second
        ret = -1;
        goto failed_memory;
    }
    sleep(5);
    ret = fun_A();//third
    if (ret != 0) {
        goto failed_memory;
    }
    sleep(3);
failed_memory:
    del_segmem();
    return ret;
}

[Test result]
-----------------------fun_C shmat-----------------
00008000-00009000 r-xp 00000000 00:12 290 /tmp/memory_mmap
00011000-00012000 rw-p 00001000 00:12 290 /tmp/memory_mmap
27589000-f75bd000 rw-p 00000000 00:00 0
f75bd000-f76e4000 r-xp 00000000 01:00 560 /lib/libc-2.11.1.so
f76e4000-f76ec000 ---p 00127000 01:00 560 /lib/libc-2.11.1.so
f76ec000-f76ee000 r--p 00127000 01:00 560 /lib/libc-2.11.1.so
f76ee000-f76ef000 rw-p 00129000 01:00 560 /lib/libc-2.11.1.so
f76ef000-f76f2000 rw-p 00000000 00:00 0
f76f2000-f7713000 r-xp 00000000 01:00 583 /lib/libgcc_s.so.1
f7713000-f771a000 ---p 00021000 01:00 583 /lib/libgcc_s.so.1
f771a000-f771b000 rw-p 00020000 01:00 583 /lib/libgcc_s.so.1
f771b000-f7738000 r-xp 00000000 01:00 543 /lib/ld-2.11.1.so
f773c000-f773d000 rw-p 00000000 00:00 0
f773d000-f773f000 rw-p 00000000 00:00 0
f773f000-f7740000 r--p 0001c000 01:00 543 /lib/ld-2.11.1.so
f7740000-f7741000 rw-p 0001d000 01:00 543 /lib/ld-2.11.1.so
f7800000-f7a00000 rw-s 00000000 00:0e 327680 /SYSV000000ea (deleted)
f7a00000-fba00000 rw-s 00000000 00:0e 360449 /SYSV000000eb (deleted)
fba00000-fca00000 rw-s 00000000 00:0e 393218 /SYSV000000ec (deleted)
fca00000-fce00000 rw-s 00000000 00:0e 425987 /SYSV000000ed (deleted)
fce00000-fd800000 rw-s 00000000 00:0e 458756 /SYSV000000ee (deleted)
ff9df000-ffa00000 rw-p 00000000 00:00 0 [stack]
ffff0000-ffff1000 r-xp 00000000 00:00 0 [vectors]
-----------------------after fun_C shmdt-----------
00008000-00009000 r-xp 00000000 00:12 290 /tmp/memory_mmap
00011000-00012000 rw-p 00001000 00:12 290 /tmp/memory_mmap
27589000-f75bd000 rw-p 00000000 00:00 0
f75bd000-f76e4000 r-xp 00000000 01:00 560 /lib/libc-2.11.1.so
f76e4000-f76ec000 ---p 00127000 01:00 560 /lib/libc-2.11.1.so
f76ec000-f76ee000 r--p 00127000 01:00 560 /lib/libc-2.11.1.so
f76ee000-f76ef000 rw-p 00129000 01:00 560 /lib/libc-2.11.1.so
f76ef000-f76f2000 rw-p 00000000 00:00 0
f76f2000-f7713000 r-xp 00000000 01:00 583 /lib/libgcc_s.so.1
f7713000-f771a000 ---p 00021000 01:00 583 /lib/libgcc_s.so.1
f771a000-f771b000 rw-p 00020000 01:00 583 /lib/libgcc_s.so.1
f771b000-f7738000 r-xp 00000000 01:00 543 /lib/ld-2.11.1.so
f773c000-f773d000 rw-p 00000000 00:00 0
f773d000-f773f000 rw-p 00000000 00:00 0
f773f000-f7740000 r--p 0001c000 01:00 543 /lib/ld-2.11.1.so
f7740000-f7741000 rw-p 0001d000 01:00 543 /lib/ld-2.11.1.so
f7800000-f7a00000 rw-s 00000000 00:0e 327680 /SYSV000000ea (deleted)
fba00000-fca00000 rw-s 00000000 00:0e 393218 /SYSV000000ec (deleted)
fca00000-fce00000 rw-s 00000000 00:0e 425987 /SYSV000000ed (deleted)
fce00000-fd800000 rw-s 00000000 00:0e 458756 /SYSV000000ee (deleted)
ff9df000-ffa00000 rw-p 00000000 00:00 0 [stack]
ffff0000-ffff1000 r-xp 00000000 00:00 0 [vectors]
-----------------------fun_C ok--------------------
-----------------------fun_A shmat-----------------
funa shmat[1] size(0x04000000)failed 12

-----Original Message-----
From: chenjianhong (A)
Sent: Friday, May 17, 2019 2:07 PM
To: gregkh@xxxxxxxxxxxxxxxxxxx; akpm@xxxxxxxxxxxxxxxxxxxx; mhocko@xxxxxxxx; vbabka@xxxxxxx;
kirill.shutemov@xxxxxxxxxxxxxxx; yang.shi@xxxxxxxxxxxxxxxxx; jannh@xxxxxxxxxx; steve.capper@xxxxxxx;
tiny.windzz@xxxxxxxxx; walken@xxxxxxxxxx
Cc: chenjianhong (A) <chenjianhong2@xxxxxxxxxx>; linux-kernel@xxxxxxxxxxxxxxx; linux-mm@xxxxxxxxx;
stable@xxxxxxxxxxxxxxx
Subject: [PATCH] mm/mmap: fix the adjusted length error

In Linux version 4.4, a 32-bit process may fail to allocate 64M of hugepage memory with shmat() even
though there is a 64M memory gap in the process's address space.

The problem is caused by the adjusted search length, introduced by
commit db4fbfb9523c935 ("mm: vm_unmapped_area() lookup function"). To account for the worst-case
alignment overhead, the functions unmapped_area() and unmapped_area_topdown() adjust the search length
before searching for an available vma gap. This is an estimated length, the sum of the desired length and the
longest alignment offset, which can cause a misjudgement if the system has very little virtual memory left.
For example, if the longest memory gap available is 64M, we can't get it from the system by allocating
64M of hugepage memory via the shmat function. The reason is that the search requires a longer length: the sum of the
desired length (64M) and the longest alignment offset.

To fix this error, we can calculate the alignment offset of gap_start or gap_end to get the desired gap_start
or gap_end value before searching for the available gap. In this way, we don't need to adjust the search
length.

Procedure to reproduce the problem:
1. allocate a lot of virtual memory segments via shmat and malloc
2. release one of the biggest memory segments via shmdt
3. attach the biggest memory segment via shmat

e.g.
process maps:
00008000-00009000 r-xp 00000000 00:12 3385 /tmp/memory_mmap
00011000-00012000 rw-p 00001000 00:12 3385 /tmp/memory_mmap
27536000-f756a000 rw-p 00000000 00:00 0
f756a000-f7691000 r-xp 00000000 01:00 560 /lib/libc-2.11.1.so
f7691000-f7699000 ---p 00127000 01:00 560 /lib/libc-2.11.1.so
f7699000-f769b000 r--p 00127000 01:00 560 /lib/libc-2.11.1.so
f769b000-f769c000 rw-p 00129000 01:00 560 /lib/libc-2.11.1.so
f769c000-f769f000 rw-p 00000000 00:00 0
f769f000-f76c0000 r-xp 00000000 01:00 583 /lib/libgcc_s.so.1
f76c0000-f76c7000 ---p 00021000 01:00 583 /lib/libgcc_s.so.1
f76c7000-f76c8000 rw-p 00020000 01:00 583 /lib/libgcc_s.so.1
f76c8000-f76e5000 r-xp 00000000 01:00 543 /lib/ld-2.11.1.so
f76e9000-f76ea000 rw-p 00000000 00:00 0
f76ea000-f76ec000 rw-p 00000000 00:00 0
f76ec000-f76ed000 r--p 0001c000 01:00 543 /lib/ld-2.11.1.so
f76ed000-f76ee000 rw-p 0001d000 01:00 543 /lib/ld-2.11.1.so
f7800000-f7a00000 rw-s 00000000 00:0e 0 /SYSV000000ea (deleted)
fba00000-fca00000 rw-s 00000000 00:0e 65538 /SYSV000000ec (deleted)
fca00000-fce00000 rw-s 00000000 00:0e 98307 /SYSV000000ed (deleted)
fce00000-fd800000 rw-s 00000000 00:0e 131076 /SYSV000000ee (deleted)
ff913000-ff934000 rw-p 00000000 00:00 0 [stack]
ffff0000-ffff1000 r-xp 00000000 00:00 0 [vectors]

From 0xf7a00000 to 0xfba00000 there is a 64M memory gap, but we can't get it from the kernel.

Signed-off-by: jianhong chen <chenjianhong2@xxxxxxxxxx>
Cc: stable@xxxxxxxxxxxxxxx
---
mm/mmap.c | 43 +++++++++++++++++++++++++++++--------------
1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index bd7b9f2..c5a5782 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1865,6 +1865,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return error;
}

+static inline unsigned long gap_start_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_start offset to adjust gap address to the
+ * desired alignment
+ */
+ return (info->align_offset - addr) & info->align_mask;
+}
+
+static inline unsigned long gap_end_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_end offset to adjust gap address to the desired alignment */
+ return (addr - info->align_offset) & info->align_mask;
+}
+
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
@@ -1879,10 +1895,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;

- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;

/* Adjust search limits by the desired length */
if (info->high_limit < length)
@@ -1914,6 +1927,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}

gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
+ gap_start += gap_start_offset(info, gap_start);
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
@@ -1942,6 +1956,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
+ gap_start += gap_start_offset(info, gap_start);
gap_end = vm_start_gap(vma);
goto check_current;
}
@@ -1951,17 +1966,17 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
+ gap_start += gap_start_offset(info, gap_start);
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;

found:
/* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
+ if (gap_start < info->low_limit) {
gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
+ gap_start += gap_start_offset(info, gap_start);
+ }

VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);

@@ -1974,16 +1989,14 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;

- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;

/*
* Adjust search limits by the desired length.
* See implementation comment at top of unmapped_area().
*/
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < length)
return -ENOMEM;
high_limit = gap_end - length;
@@ -2020,6 +2033,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
gap_end = vm_start_gap(vma);
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < low_limit)
return -ENOMEM;
if (gap_start <= high_limit &&
@@ -2054,13 +2068,14 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)

found:
/* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
+ if (gap_end > info->high_limit) {
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
+ }

found_highest:
/* Compute highest gap address at the desired alignment */
gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;

VM_BUG_ON(gap_end < info->low_limit);
VM_BUG_ON(gap_end < gap_start);
--
1.8.5.6