Re: [PATCH] mm: Fix mmap MAP_POPULATE for DAX pmd mapping
From: Toshi Kani
Date: Mon Nov 23 2015 - 17:19:30 EST
On Mon, 2015-11-23 at 12:53 -0800, Dan Williams wrote:
> On Mon, Nov 23, 2015 at 12:04 PM, Toshi Kani <toshi.kani@xxxxxxx> wrote:
> > The following oops was observed when mmap() with MAP_POPULATE
> > pre-faulted pmd mappings of a DAX file. follow_trans_huge_pmd()
> > expects that a target address has a struct page.
> >
> > BUG: unable to handle kernel paging request at ffffea0012220000
> > follow_trans_huge_pmd+0xba/0x390
> > follow_page_mask+0x33d/0x420
> > __get_user_pages+0xdc/0x800
> > populate_vma_page_range+0xb5/0xe0
> > __mm_populate+0xc5/0x150
> > vm_mmap_pgoff+0xd5/0xe0
> > SyS_mmap_pgoff+0x1c1/0x290
> > SyS_mmap+0x1b/0x30
> >
> > Fix it by making the PMD pre-fault handling consistent with PTE.
> > After pre-faulted in faultin_page(), follow_page_mask() calls
> > follow_trans_huge_pmd(), which is changed to call follow_pfn_pmd()
> > for VM_PFNMAP or VM_MIXEDMAP. follow_pfn_pmd() handles FOLL_TOUCH
> > and returns with -EEXIST.
>
> As of 4.4.-rc2 DAX pmd mappings are disabled. So we have time to do
> something more comprehensive in 4.5.
Yes, I noticed during my testing that I could not use pmd...
> > Reported-by: Mauricio Porto <mauricio.porto@xxxxxxx>
> > Signed-off-by: Toshi Kani <toshi.kani@xxxxxxx>
> > Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> > Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> > Cc: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>
> > Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
> > Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
> > ---
> > mm/huge_memory.c | 34 ++++++++++++++++++++++++++++++++++
> > 1 file changed, 34 insertions(+)
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index d5b8920..f56e034 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> [..]
> > @@ -1288,6 +1315,13 @@ struct page *follow_trans_huge_pmd(struct
> > vm_area_struct *vma,
> > if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
> > goto out;
> >
> > + /* pfn map does not have a struct page */
> > + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) {
> > + ret = follow_pfn_pmd(vma, addr, pmd, flags);
> > + page = ERR_PTR(ret);
> > + goto out;
> > + }
> > +
> > page = pmd_page(*pmd);
> > VM_BUG_ON_PAGE(!PageHead(page), page);
> > if (flags & FOLL_TOUCH) {
>
> I think it is already problematic that dax pmd mappings are getting
> confused with transparent huge pages.
We had the same issue with dax pte mapping [1], and this change extends the pfn
map handling to pmd. So, this problem is not specific to pmd.
[1] https://lkml.org/lkml/2015/6/23/181
> They're more closely related to
> a hugetlbfs pmd mappings in that they are mapping an explicit
> allocation. I have some pending patches to address this dax-pmd vs
> hugetlb-pmd vs thp-pmd classification that I will post shortly.
Not sure which way is better, but I am certainly interested in your changes.
> By the way, I'm collecting DAX pmd regression tests [1], is this just
> a simple crash upon using MAP_POPULATE?
>
> [1]: https://github.com/pmem/ndctl/blob/master/lib/test-dax-pmd.c
Yes, this issue is easy to reproduce with MAP_POPULATE. In case it helps,
attached are the test I used for testing the patches. Sorry, the code is messy
since it was only intended for my internal use...
- The test was originally written for the pte change [1] and comments in
test.sh (ex. mlock fail, ok) reflect the results without the pte change.
- For the pmd test, I modified test-mmap.c to call posix_memalign() before
mmap(). By calling free(), the 2MB-aligned address from posix_memalign() can be
used for mmap(). This keeps the mmap'd address aligned on 2MB.
- I created test file(s) with dd (i.e. all blocks written) in my test.
- The other infinite loop issue (fixed by my other patch) was found by the test
case with option "-LMSr".
Thanks,
-Toshi
Attachment:
test.sh
Description: application/shellscript
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <string.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#define MB(a) ((a) * 1024UL * 1024UL)
static struct timeval start_tv, stop_tv;
// Calculate the difference between two time values.
void tvsub(struct timeval *tdiff, struct timeval *t1, struct timeval *t0)
{
tdiff->tv_sec = t1->tv_sec - t0->tv_sec;
tdiff->tv_usec = t1->tv_usec - t0->tv_usec;
if (tdiff->tv_usec < 0)
tdiff->tv_sec--, tdiff->tv_usec += 1000000;
}
// Start timing now.
void start()
{
(void) gettimeofday(&start_tv, (struct timezone *) 0);
}
// Stop timing and return real time in microseconds.
unsigned long long stop()
{
struct timeval tdiff;
(void) gettimeofday(&stop_tv, (struct timezone *) 0);
tvsub(&tdiff, &stop_tv, &start_tv);
return (tdiff.tv_sec * 1000000 + tdiff.tv_usec);
}
void test_write(unsigned long *p, size_t size)
{
int i;
unsigned long *wp, tmp;
unsigned long long timeval;
start();
for (i=0, wp=p; i<(size/sizeof(wp)); i++)
*wp++ = 1;
timeval = stop();
printf("Write: %10llu usec\n", timeval);
}
void test_read(unsigned long *p, size_t size)
{
int i;
unsigned long *wp, tmp;
unsigned long long timeval;
start();
for (i=0, wp=p; i<(size/sizeof(wp)); i++)
tmp = *wp++;
timeval = stop();
printf("Read : %10llu usec\n", timeval);
}
int main(int argc, char **argv)
{
int fd, i, opt, ret;
int oflags, mprot, mflags = 0;
int is_read_only = 0, is_mlock = 0, is_mlockall = 0;
int mlock_skip = 0, read_test = 0, write_test = 0;
void *mptr = NULL;
unsigned long *p;
struct stat stat;
size_t size, cpy_size;
const char *file_name = NULL;
while ((opt = getopt(argc, argv, "LRMSApsrw")) != -1) {
switch (opt) {
case 'L':
file_name = "/mnt/pmem0/4Gfile";
break;
case 'R':
printf("> mmap: read-only\n");
is_read_only = 1;
break;
case 'M':
printf("> mlock\n");
is_mlock = 1;
break;
case 'S':
printf("> mlock - skip first ite\n");
mlock_skip = 1;
break;
case 'A':
printf("> mlockall\n");
is_mlockall = 1;
break;
case 'p':
printf("> MAP_POPULATE\n");
mflags |= MAP_POPULATE;
break;
case 's':
printf("> MAP_SHARED\n");
mflags |= MAP_SHARED;
break;
case 'r':
printf("> read-test\n");
read_test = 1;
break;
case 'w':
printf("> write-test\n");
write_test = 1;
break;
}
}
if (!file_name) {
file_name = "/mnt/pmem1/32Kfile";
}
if (!(mflags & MAP_SHARED)) {
printf("> MAP_PRIVATE\n");
mflags |= MAP_PRIVATE;
}
if (is_read_only) {
oflags = O_RDONLY;
mprot = PROT_READ;
} else {
oflags = O_RDWR;
mprot = PROT_READ|PROT_WRITE;
}
fd = open(file_name, oflags);
if (fd == -1) {
perror("open failed");
exit(1);
}
ret = fstat(fd, &stat);
if (ret < 0) {
perror("fstat failed");
exit(1);
}
size = stat.st_size;
printf("> open %s size 0x%x flags 0x%x\n", file_name, size, oflags);
ret = posix_memalign(&mptr, MB(2), size);
if (ret ==0)
free(mptr);
printf("> mmap mprot 0x%x flags 0x%x\n", mprot, mflags);
p = mmap(mptr, size, mprot, mflags, fd, 0x0);
if (!p) {
perror("mmap failed");
exit(1);
}
if ((long unsigned)p & (MB(2)-1))
printf("> mmap: NOT 2MB aligned: 0x%p\n", p);
else
printf("> mmap: 2MB aligned: 0x%p\n", p);
#if 0 /* SIZE LIMIT */
if (size >= MB(2))
cpy_size = MB(32);
else
#endif
cpy_size = size;
for (i=0; i<3; i++) {
if (is_mlock && !mlock_skip) {
printf("> mlock 0x%p\n", p);
ret = mlock(p, size);
if (ret < 0) {
perror("mlock failed");
exit(1);
}
} else if (is_mlockall) {
printf("> mlockall\n");
ret = mlockall(MCL_CURRENT|MCL_FUTURE);
if (ret < 0) {
perror("mlockall failed");
exit(1);
}
}
printf("===== %d =====\n", i+1);
if (write_test)
test_write(p, cpy_size);
if (read_test)
test_read(p, cpy_size);
if (is_mlock && !mlock_skip) {
printf("> munlock 0x%p\n", p);
ret = munlock(p, size);
if (ret < 0) {
perror("munlock failed");
exit(1);
}
} else if (is_mlockall) {
printf("> munlockall\n");
ret = munlockall();
if (ret < 0) {
perror("munlockall failed");
exit(1);
}
}
/* skip, if requested, only the first iteration */
mlock_skip = 0;
}
printf("> munmap 0x%p\n", p);
munmap(p, size);
}