Re: mlockall(MCL_CURRENT) blocking infinitely

From: Robert Stupp
Date: Wed Nov 06 2019 - 07:25:30 EST


Oh! That (s/__get_user_pages()/__get_user_pages_locked()/) seems to fix
it.

Attached a patch (gup-mlockall.txt, against v5.3.8) - although I've
got no clue how patch submission works here.
gup-debug-mlockall.txt contains the debug printk()s I've added plus the
code change.
dmesg-out.txt.gz is the dmesg output of the latter for test.c


On Wed, 2019-11-06 at 12:26 +0100, Robert Stupp wrote:
> Maybe a naive and wrong idea, but would it work to call
> __get_user_pages_locked() instead of __get_user_pages() in
> populate_vma_page_range() ?
>
> On Wed, 2019-11-06 at 11:25 +0100, Robert Stupp wrote:
> > Here's one more dmesg output with more information captured in
> > __get_user_pages() as well. It basically confirms that
> > handle_mm_fault() returns VM_FAULT_RETRY.
> >
> > I'm not sure where and what to change ("fix with a FOLL_TRIED
> > somewhere") to make it work. My (uneducated) impression is that
> > only __get_user_pages() needs to be changed - but I might be wrong.
> >

commit d2a1ce44cd81b8ccf426a6c40a3a0d9b2b51e076
Author: Robert Stupp <snazy@xxxxxxxx>
Date: Wed Nov 6 12:58:12 2019 +0100

make mlockall(MCL_CURRENT) aware of new VM_FAULT_RETRY conditions

mlockall(MCL_CURRENT) can run into a busy/endless loop in
__mm_populate(). This can happen since commit
6b4c9f4469819a0c1a38a0a4541337e0f9bf6c11 ("filemap: drop the mmap_sem
for all blocking operations", introduced in Linux 5.1), which added new
VM_FAULT_RETRY conditions.

The fix here is to replace the call to __get_user_pages() with a
call to __get_user_pages_locked() in populate_vma_page_range().

This change seems to fix the behavior for mlockall(MCL_CURRENT) but
does not change the behavior for other usages of
populate_vma_page_range() in find_extend_vma() and
mprotect_fixup().

(Text by Johannes Weiner hannes-at-cmpxchg.org):
The only way this can occur is if populate_vma_page_range() returns
0 and we don't advance the iteration position (if it returned an
error, we wouldn't reset nend and move on to the next vma as
ignore_errors is 1 for mlockall.)

populate_vma_page_range() returns 0 when the first page is not
found and faultin_page() returns -EBUSY (if it were processing
pages, or if the error from faultin_page() would be a different
one, we would return the number of pages processed or -error).

faultin_page() returns -EBUSY when VM_FAULT_RETRY is set, i.e. we
dropped the mmap_sem in order to initiate IO and require a retry.
That is consistent with the bisect result (new VM_FAULT_RETRY
conditions).

At this point, regular page fault would retry with FAULT_FLAG_TRIED
to indicate that the mmap_sem cannot be dropped a second time.
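
To illustrate the above, here is a condensed (not verbatim) sketch of
the __mm_populate() loop in mm/gup.c, with the VMA lookup and flag
checks left out; when populate_vma_page_range() returns 0, nend ends up
equal to nstart and the loop never advances:

	for (nstart = start; nstart < end; nstart = nend) {
		/* find_vma() and VMA flag checks omitted in this sketch */
		nend = min(end, vma->vm_end);
		ret = populate_vma_page_range(vma, nstart, nend, &locked);
		if (ret < 0) {
			if (ignore_errors) {	/* 1 for mlockall */
				ret = 0;
				continue;	/* real errors move on to the next VMA */
			}
			break;
		}
		/*
		 * If the first page hit VM_FAULT_RETRY, ret is 0, so
		 * nend == nstart and the next iteration retries the
		 * same address forever - the busy loop seen here.
		 */
		nend = nstart + ret * PAGE_SIZE;
		ret = 0;
	}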

diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..e319ffb625b1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1227,8 +1227,8 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
- NULL, NULL, nonblocking);
+ return __get_user_pages_locked(current, mm, start, nr_pages,
+ NULL, NULL, nonblocking, gup_flags);
}

/*
commit 8c2e4c28bde0b10ba488d32d43d644612b2df737
Author: Robert Stupp <snazy@xxxxxxxx>
Date: Wed Nov 6 12:29:53 2019 +0100

Replace __get_user_pages() with __get_user_pages_locked() in populate_vma_page_range()

diff --git a/mm/gup.c b/mm/gup.c
index 5c9825745bb2..c0b61b3130a8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1065,14 +1065,15 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(fixup_user_fault);

-static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
+static __always_inline long __get_user_pages_locked_x(struct task_struct *tsk,
struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
int *locked,
- unsigned int flags)
+ unsigned int flags,
+ int ign)
{
long ret, pages_done;
bool lock_dropped;
@@ -1090,8 +1091,8 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
pages_done = 0;
lock_dropped = false;
for (;;) {
- ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
- vmas, locked);
+ ret = __get_user_pages_x(tsk, mm, start, nr_pages, flags, pages,
+ vmas, locked, ign);
if (!locked)
/* VM_FAULT_RETRY couldn't trigger, bypass */
return ret;
@@ -1133,8 +1134,8 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
*locked = 1;
lock_dropped = true;
down_read(&mm->mmap_sem);
- ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
- pages, NULL, NULL);
+ ret = __get_user_pages_x(tsk, mm, start, 1, flags | FOLL_TRIED,
+ pages, NULL, NULL, ign);
if (ret != 1) {
BUG_ON(ret > 1);
if (!pages_done)
@@ -1159,6 +1160,17 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
}
return pages_done;
}
+static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ int *locked,
+ unsigned int flags)
+{
+ return __get_user_pages_locked_x(tsk, mm, start, nr_pages, pages, vmas, locked, flags, 1);
+}

/*
* get_user_pages_remote() - pin user pages in memory
@@ -1291,8 +1303,10 @@ long populate_vma_page_range_x(struct vm_area_struct *vma,
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
- return __get_user_pages_x(current, mm, start, nr_pages, gup_flags,
- NULL, NULL, nonblocking, ign);
+ return __get_user_pages_locked_x(current, mm, start, nr_pages,
+ NULL, NULL, nonblocking, gup_flags, ign);
+// return __get_user_pages_x(current, mm, start, nr_pages, gup_flags,
+// NULL, NULL, nonblocking, ign);
}
long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *nonblocking)

commit 8b0bed7848a32c74512e8278c94de16eea390274
Author: Robert Stupp <snazy@xxxxxxxx>
Date: Wed Nov 6 09:50:32 2019 +0100

More printk

diff --git a/mm/gup.c b/mm/gup.c
index 3bc25fd44433..5c9825745bb2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -627,8 +627,8 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
* *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
* If it is, *@nonblocking will be set to 0 and -EBUSY returned.
*/
-static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
- unsigned long address, unsigned int *flags, int *nonblocking)
+static int faultin_page_x(struct task_struct *tsk, struct vm_area_struct *vma,
+ unsigned long address, unsigned int *flags, int *nonblocking, int ign)
{
unsigned int fault_flags = 0;
vm_fault_t ret;
@@ -650,8 +650,14 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
}

ret = handle_mm_fault(vma, address, fault_flags);
+ if (!ign) {
+ printk(KERN_WARNING "faultin_page handle_mm_fault --> ret = %u\n", ret);
+ }
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
+ if (!ign) {
+ printk(KERN_WARNING "faultin_page handle_mm_fault --> err = %d\n", err);
+ }

if (err)
return err;
@@ -666,6 +672,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
}

if (ret & VM_FAULT_RETRY) {
+ if (!ign) {
+ printk(KERN_WARNING "faultin_page-->EBUSY VM_FAULT_RETRY non-blocking?%d FAULT_FLAG_RETRY_NOWAIT?%d\n", nonblocking?1:0, (fault_flags & FAULT_FLAG_RETRY_NOWAIT)?1:0);
+ }
if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*nonblocking = 0;
return -EBUSY;
@@ -682,8 +691,16 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
*/
if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
*flags |= FOLL_COW;
+ if (!ign) {
+ printk(KERN_WARNING "faultin_page-->0\n");
+ }
return 0;
}
+static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+ unsigned long address, unsigned int *flags, int *nonblocking)
+{
+ return faultin_page_x(tsk, vma, address, flags, nonblocking, 1);
+}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
@@ -788,15 +805,18 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
-static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+static long __get_user_pages_x(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *nonblocking)
+ struct vm_area_struct **vmas, int *nonblocking, int ign)
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
struct follow_page_context ctx = { NULL };

+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages start=%lx nr_pages=%lu gup_flags=%x ctx.page_mask=%u\n", start, nr_pages, gup_flags, ctx.page_mask);
+
if (!nr_pages)
return 0;

@@ -817,11 +837,25 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,

/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
+ if (!ign) {
+ if (!vma)
+ printk(KERN_WARNING "__get_user_pages @1 vma==NULL\n");
+ else
+ printk(KERN_WARNING "__get_user_pages @1 start=%lx vma->vm_start=%lx vma->vm_end=%lx vma->vm_flags=%lx\n", start, vma->vm_start, vma->vm_end, vma->vm_flags);
+ }
vma = find_extend_vma(mm, start);
+ if (!ign) {
+ if (!vma)
+ printk(KERN_WARNING "__get_user_pages @2 vma==NULL\n");
+ else
+ printk(KERN_WARNING "__get_user_pages @2 start=%lx vma->vm_start=%lx vma->vm_end=%lx vma->vm_flags=%lx\n", start, vma->vm_start, vma->vm_end, vma->vm_flags);
+ }
if (!vma && in_gate_area(mm, start)) {
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
pages ? &pages[i] : NULL);
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @3 get_gate_page --> %ld\n", ret);
if (ret)
goto out;
ctx.page_mask = 0;
@@ -829,6 +863,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}

if (!vma || check_vma_flags(vma, gup_flags)) {
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @4 EFAULT\n");
ret = -EFAULT;
goto out;
}
@@ -836,6 +872,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
gup_flags, nonblocking);
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @5 follow_hugetlb_page --> %ld\n", i);
continue;
}
}
@@ -845,15 +883,21 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
* potentially allocating memory.
*/
if (fatal_signal_pending(current)) {
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @6 ERESTARTSYS\n");
ret = -ERESTARTSYS;
goto out;
}
cond_resched();

page = follow_page_mask(vma, start, foll_flags, &ctx);
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @7 follow_page_mask --> %d ctx.page_mask=%u\n", page ? 1 : 0, ctx.page_mask);
if (!page) {
- ret = faultin_page(tsk, vma, start, &foll_flags,
- nonblocking);
+ ret = faultin_page_x(tsk, vma, start, &foll_flags,
+ nonblocking, ign);
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @8 faultin_page --> %ld\n", ret);
switch (ret) {
case 0:
goto retry;
@@ -869,6 +913,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
BUG();
} else if (PTR_ERR(page) == -EEXIST) {
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @8 EEXIST\n");
/*
* Proper page table entry exists, but no corresponding
* struct page.
@@ -876,6 +922,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
goto next_page;
} else if (IS_ERR(page)) {
ret = PTR_ERR(page);
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @8 IS_ERR -> ret=%ld\n", ret);
goto out;
}
if (pages) {
@@ -890,17 +938,31 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
ctx.page_mask = 0;
}
page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @9 page_increm=%u ctx.page_mask=%u\n", page_increm, ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
+ if (!ign)
+ printk(KERN_WARNING "__get_user_pages @10 i=%ld start=%lx nr_pages=%ld\n", i, start, nr_pages);
} while (nr_pages);
out:
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
+ if (!ign) {
+ printk(KERN_WARNING "__get_user_pages LEAVE i=%ld ret=%ld\n", i, ret);
+ }
return i ? i : ret;
}
+static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *nonblocking)
+{
+ return __get_user_pages_x(tsk, mm, start, nr_pages, gup_flags, pages, vmas, nonblocking, 1);
+}

static bool vma_permits_fault(struct vm_area_struct *vma,
unsigned int fault_flags)
@@ -1193,8 +1255,9 @@ EXPORT_SYMBOL(get_user_pages_remote);
* If @nonblocking is non-NULL, it must held for read only and may be
* released. If it's released, *@nonblocking will be set to 0.
*/
-long populate_vma_page_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking)
+long populate_vma_page_range_x(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking,
+ int ign)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
@@ -1228,8 +1291,13 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
- NULL, NULL, nonblocking);
+ return __get_user_pages_x(current, mm, start, nr_pages, gup_flags,
+ NULL, NULL, nonblocking, ign);
+}
+long populate_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking)
+{
+ return populate_vma_page_range_x(vma, start, end, nonblocking, 1);
}

/*
@@ -1292,7 +1360,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
* double checks the vma flags, so that it won't mlock pages
* if the vma was already munlocked.
*/
- ret = populate_vma_page_range(vma, nstart, nend, &locked);
+ ret = populate_vma_page_range_x(vma, nstart, nend, &locked, ign);
if (ret < 0) {
if (!ign)
printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-2 %ld\n", start, len, end, ignore_errors, ret);

commit 74da9b1c9bbaf66320d973ff7c8dc63962b6dc7d
Author: Robert Stupp <snazy@xxxxxxxx>
Date: Tue Nov 5 17:18:49 2019 +0100

Add debug things

diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..3bc25fd44433 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -3,6 +3,7 @@
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>
+#include <linux/printk.h>

#include <linux/mm.h>
#include <linux/memremap.h>
@@ -1241,14 +1242,23 @@ long populate_vma_page_range(struct vm_area_struct *vma,
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
struct mm_struct *mm = current->mm;
- unsigned long end, nstart, nend;
+ unsigned long end, nstart, nend = 0L;
struct vm_area_struct *vma = NULL;
int locked = 0;
long ret = 0;
+ unsigned long nstart_prev = 0L - 1L, nend_prev = 0L - 1L;
+ int ign;

end = start + len;

+ printk(KERN_WARNING "_mm_populate %lx %lx %lx %d ENTER\n", start, len, end, ignore_errors);
+
for (nstart = start; nstart < end; nstart = nend) {
+ ign = nstart == nstart_prev && nend == nend_prev;
+ nstart_prev = nstart;
+ nend_prev = nend;
+ if (!ign)
+ printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP %lx %d %ld\n", start, len, end, ignore_errors, nstart, locked, ret);
/*
* We want to fault in pages for [nstart; end) address range.
* Find first corresponding VMA.
@@ -1259,6 +1269,8 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
vma = find_vma(mm, nstart);
} else if (nstart >= vma->vm_end)
vma = vma->vm_next;
+ if (!ign && vma)
+ printk(KERN_WARNING "_mm_populate %lx %lx %lx %d vma->vm_start=%lx vma->vm_end=%lx vma->vm_flags=%lx\n", start, len, end, ignore_errors, vma->vm_start, vma->vm_end, vma->vm_flags);
if (!vma || vma->vm_start >= end)
break;
/*
@@ -1266,8 +1278,13 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
* range with the first VMA. Also, skip undesirable VMA types.
*/
nend = min(end, vma->vm_end);
- if (vma->vm_flags & (VM_IO | VM_PFNMAP))
- continue;
+ if (!ign)
+ printk(KERN_WARNING "_mm_populate %lx %lx %lx %d nend=%lx %lx %lx\n", start, len, end, ignore_errors, nend, end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+ if (!ign)
+ printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-1 %lx\n", start, len, end, ignore_errors, vma->vm_flags);
+ continue;
+ }
if (nstart < vma->vm_start)
nstart = vma->vm_start;
/*
@@ -1277,6 +1294,8 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
*/
ret = populate_vma_page_range(vma, nstart, nend, &locked);
if (ret < 0) {
+ if (!ign)
+ printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-2 %ld\n", start, len, end, ignore_errors, ret);
if (ignore_errors) {
ret = 0;
continue; /* continue at next VMA */
@@ -1284,8 +1303,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
break;
}
nend = nstart + ret * PAGE_SIZE;
+ if (!ign)
+ printk(KERN_WARNING "_mm_populate %lx %lx %lx %d LOOP-3 ret=%ld nend=%lx\n", start, len, end, ignore_errors, ret, nend);
ret = 0;
}
+ printk(KERN_WARNING "_mm_populate END %lu %lu %d\n", start, len, locked);
if (locked)
up_read(&mm->mmap_sem);
return ret; /* 0 or negative error code */

Attachment: dmesg-out.txt.gz
Description: application/gzip

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

int main(int argc, char** argv) {
    if (argc > 1) {
        printf("Sleeping... %s\n", argv[1]);
        sleep(atoi(argv[1]));
    }
    printf("Before mlockall(MCL_CURRENT)\n");
    mlockall(MCL_CURRENT);
    printf("After mlockall(MCL_CURRENT)\n");
}