[RFC PATCH 1/2] mm, mincore2(): retrieve dax and tlb-size attributes of an address range
From: Dan Williams
Date: Sun Sep 11 2016 - 13:34:50 EST
As evidenced by this bug report [1], userspace libraries are interested
in whether a mapping is DAX mapped, i.e. no intervening page cache.
Rather than using the ambiguous VM_MIXEDMAP flag in smaps, provide an
explicit "is dax" indication as a new flag in the page vector populated
by mincore.
There are also cases, particularly when testing and validating a
configuration, where it is useful to know the hardware mapping geometry
of the pages in a given process address range. Consider filesystem-dax,
where a configuration needs to take care to align partitions and block
allocations before huge page mappings might be used, or
anonymous-transparent-huge-pages, where a process is opportunistically
assigned large pages. mincore2() allows these configurations to be
surveyed and validated.
The implementation takes advantage of the unused bits in the per-page
byte returned for each PAGE_SIZE extent of a given address range. The
new format of each vector byte is:
(TLB_SHIFT - PAGE_SHIFT) << 2 | vma_is_dax() << 1 | page_present
[1]: https://lkml.org/lkml/2016/9/7/61
Cc: Arnd Bergmann <arnd@xxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: Xiao Guangrong <guangrong.xiao@xxxxxxxxxxxxxxx>
Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
include/linux/syscalls.h | 2 +
include/uapi/asm-generic/mman-common.h | 3 +
kernel/sys_ni.c | 1
mm/mincore.c | 126 +++++++++++++++++++++++++-------
4 files changed, 104 insertions(+), 28 deletions(-)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d02239022bd0..4aa2ee7e359a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -467,6 +467,8 @@ asmlinkage long sys_munlockall(void);
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
asmlinkage long sys_mincore(unsigned long start, size_t len,
unsigned char __user * vec);
+asmlinkage long sys_mincore2(unsigned long start, size_t len,
+ unsigned char __user * vec, int flags);
asmlinkage long sys_pivot_root(const char __user *new_root,
const char __user *put_old);
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 58274382a616..05037343f0da 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -72,4 +72,7 @@
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK 0x3f
+#define MINCORE_DAX 1 /* indicate pages that are dax-mapped */
+#define MINCORE_ORDER 2 /* retrieve hardware mapping-size-order */
+
#endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2c5e3a8e00d7..e14b87834054 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -197,6 +197,7 @@ cond_syscall(sys_mlockall);
cond_syscall(sys_munlockall);
cond_syscall(sys_mlock2);
cond_syscall(sys_mincore);
+cond_syscall(sys_mincore2);
cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
cond_syscall(sys_remap_file_pages);
diff --git a/mm/mincore.c b/mm/mincore.c
index c0b5ba965200..15f9eb5de65b 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -15,25 +15,62 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+#define MINCORE_DAX_MASK 2
+#define MINCORE_DAX_SHIFT 1
+
+#define MINCORE_ORDER_MASK 0x7c
+#define MINCORE_ORDER_SHIFT 2
+
+struct mincore_params {
+ unsigned char *vec;
+ int flags;
+};
+
+static void mincore_set(unsigned char *vec, struct vm_area_struct *vma, int nr,
+ int flags)
+{
+ unsigned char mincore = 1;
+
+ if (!nr) {
+ *vec = 0;
+ return;
+ }
+
+ if ((flags & MINCORE_DAX) && vma_is_dax(vma))
+ mincore |= 1 << MINCORE_DAX_SHIFT;
+ if (flags & MINCORE_ORDER) {
+ unsigned char order = ilog2(nr);
+
+ WARN_ON((order << MINCORE_ORDER_SHIFT) & ~MINCORE_ORDER_MASK);
+ mincore |= order << MINCORE_ORDER_SHIFT;
+ }
+ memset(vec, mincore, nr);
+}
+
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
+ struct mincore_params *p = walk->private;
+ int nr = (end - addr) >> PAGE_SHIFT;
+ unsigned char *vec = p->vec;
unsigned char present;
- unsigned char *vec = walk->private;
/*
* Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
*/
present = pte && !huge_pte_none(huge_ptep_get(pte));
- for (; addr != end; vec++, addr += PAGE_SIZE)
- *vec = present;
- walk->private = vec;
+ if (!present)
+ memset(vec, 0, nr);
+ else
+ mincore_set(vec, walk->vma, nr, p->flags);
+ p->vec = vec + nr;
#else
BUG();
#endif
@@ -82,20 +119,24 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
}
static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
- struct vm_area_struct *vma, unsigned char *vec)
+ struct vm_area_struct *vma, unsigned char *vec,
+ int flags)
{
unsigned long nr = (end - addr) >> PAGE_SHIFT;
+ unsigned char present;
int i;
if (vma->vm_file) {
pgoff_t pgoff;
pgoff = linear_page_index(vma, addr);
- for (i = 0; i < nr; i++, pgoff++)
- vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ for (i = 0; i < nr; i++, pgoff++) {
+ present = mincore_page(vma->vm_file->f_mapping, pgoff);
+ mincore_set(vec + i, vma, present, flags);
+ }
} else {
for (i = 0; i < nr; i++)
- vec[i] = 0;
+ mincore_set(vec + i, vma, 0, flags);
}
return nr;
}
@@ -103,8 +144,11 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
static int mincore_unmapped_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- walk->private += __mincore_unmapped_range(addr, end,
- walk->vma, walk->private);
+ struct mincore_params *p = walk->private;
+ int nr = __mincore_unmapped_range(addr, end, walk->vma, p->vec,
+ p->flags);
+
+ p->vec += nr;
return 0;
}
@@ -114,18 +158,20 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
spinlock_t *ptl;
struct vm_area_struct *vma = walk->vma;
pte_t *ptep;
- unsigned char *vec = walk->private;
+ struct mincore_params *p = walk->private;
+ unsigned char *vec = p->vec;
int nr = (end - addr) >> PAGE_SHIFT;
+ int flags = p->flags;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
- memset(vec, 1, nr);
+ mincore_set(vec, vma, nr, flags);
spin_unlock(ptl);
goto out;
}
if (pmd_trans_unstable(pmd)) {
- __mincore_unmapped_range(addr, end, vma, vec);
+ __mincore_unmapped_range(addr, end, vma, vec, flags);
goto out;
}
@@ -135,9 +181,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (pte_none(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
- vma, vec);
+ vma, vec, flags);
else if (pte_present(pte))
- *vec = 1;
+ mincore_set(vec, vma, 1, flags);
else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
@@ -146,14 +192,17 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
* migration or hwpoison entries are always
* uptodate
*/
- *vec = 1;
+ mincore_set(vec, vma, 1, flags);
} else {
#ifdef CONFIG_SWAP
- *vec = mincore_page(swap_address_space(entry),
- entry.val);
+ unsigned char present;
+
+ present = mincore_page(swap_address_space(entry),
+ entry.val);
+ mincore_set(vec, vma, present, flags);
#else
WARN_ON(1);
- *vec = 1;
+ mincore_set(vec, vma, 1, flags);
#endif
}
}
@@ -161,7 +210,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
pte_unmap_unlock(ptep - 1, ptl);
out:
- walk->private += nr;
+ p->vec = vec + nr;
cond_resched();
return 0;
}
@@ -171,16 +220,21 @@ out:
* all the arguments, we hold the mmap semaphore: we should
* just return the amount of info we're asked for.
*/
-static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+static long do_mincore(unsigned long addr, unsigned long pages,
+ unsigned char *vec, int flags)
{
struct vm_area_struct *vma;
unsigned long end;
int err;
+ struct mincore_params p = {
+ .vec = vec,
+ .flags = flags,
+ };
struct mm_walk mincore_walk = {
.pmd_entry = mincore_pte_range,
.pte_hole = mincore_unmapped_range,
.hugetlb_entry = mincore_hugetlb,
- .private = vec,
+ .private = &p,
};
vma = find_vma(current->mm, addr);
@@ -195,13 +249,19 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
}
/*
- * The mincore(2) system call.
+ * The mincore2(2) system call.
*
- * mincore() returns the memory residency status of the pages in the
+ * mincore2() returns the memory residency status of the pages in the
* current process's address space specified by [addr, addr + len).
* The status is returned in a vector of bytes. The least significant
* bit of each byte is 1 if the referenced page is in memory, otherwise
- * it is zero.
+ * it is zero. When 'flags' is non-zero each byte additionally contains
+ * an indication of whether the referenced page in memory is a DAX
+ * mapping (bit 1, mask 0x02, of each vector byte), and/or the order of
+ * the mapping (bits 2 through 6, mask 0x7c, of each vector byte). The
+ * order is log2 of the number of PAGE_SIZE pages in the hardware
+ * mapping backing the given logical page. For example, a 2MB-dax-mapped
+ * huge page would correspond to 512 vector entries with the value 0x27.
*
* Because the status of a page can change after mincore() checks it
* but before it returns to the application, the returned vector may
@@ -218,8 +278,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
* mapped
* -EAGAIN - A kernel resource was temporarily unavailable.
*/
-SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
- unsigned char __user *, vec)
+SYSCALL_DEFINE4(mincore2, unsigned long, start, size_t, len,
+ unsigned char __user *, vec, int, flags)
{
long retval;
unsigned long pages;
@@ -229,6 +289,10 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
if (start & ~PAGE_MASK)
return -EINVAL;
+ /* Check that undefined flags are zero */
+ if (flags & ~(MINCORE_DAX | MINCORE_ORDER))
+ return -EINVAL;
+
/* ..and we need to be passed a valid user-space range */
if (!access_ok(VERIFY_READ, (void __user *) start, len))
return -ENOMEM;
@@ -251,7 +315,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
* the temporary buffer size.
*/
down_read(&current->mm->mmap_sem);
- retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
+ retval = do_mincore(start, min(pages, PAGE_SIZE), tmp, flags);
up_read(&current->mm->mmap_sem);
if (retval <= 0)
@@ -268,3 +332,9 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
free_page((unsigned long) tmp);
return retval;
}
+
+SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
+ unsigned char __user *, vec)
+{
+ return sys_mincore2(start, len, vec, 0);
+}