[PATCH v2 2/4] mm: introduce fincore()

From: Naoya Horiguchi
Date: Thu Jul 03 2014 - 17:54:34 EST


This patch provides a new system call fincore(2), which provides mincore()-
like information, i.e. page residency of a given file. But unlike mincore(),
fincore() has a mode flag which allows us to extract detailed information
about the page cache, such as pfns and page flags. This kind of information
is very helpful, for example, when applications want to know the file cache
status in order to control IO in their own way.

The details of the data format passed to userspace are explained in the
inline comments. In general, with the long entry format we can flexibly
choose which information is extracted, so you don't have to waste memory on
unnecessary information. And with the FINCORE_PGOFF flag we can skip hole
pages (pages not in memory), which avoids a flood of meaningless zero
entries when fincore() is called on an extremely large file of which only
a few pages are loaded into memory.

A basic test set is added in the next patch under tools/testing/selftests/fincore/.

ChangeLog v2:
- move definition of FINCORE_* to include/uapi/linux/pagecache.h
- add another parameter fincore_extra to sys_fincore()
- rename FINCORE_SKIP_HOLE to FINCORE_PGOFF and change bit order.
- add valid argument check (start should be inside file address range,
nr_pages should be positive)
- add end-of-file check (scan to the end of file even if the last page
is a hole)
- add access_ok(VERIFY_WRITE) (copied from mincore())
- update inline comments

Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
---
arch/x86/syscalls/syscall_64.tbl | 1 +
include/linux/syscalls.h | 4 +
include/uapi/linux/pagecache.h | 94 ++++++++++++
mm/Makefile | 2 +-
mm/fincore.c | 322 +++++++++++++++++++++++++++++++++++++++
5 files changed, 422 insertions(+), 1 deletion(-)
create mode 100644 mm/fincore.c

diff --git v3.16-rc3.orig/arch/x86/syscalls/syscall_64.tbl v3.16-rc3/arch/x86/syscalls/syscall_64.tbl
index ec255a1646d2..9d291b7081ca 100644
--- v3.16-rc3.orig/arch/x86/syscalls/syscall_64.tbl
+++ v3.16-rc3/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,7 @@
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
316 common renameat2 sys_renameat2
+317 common fincore sys_fincore

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git v3.16-rc3.orig/include/linux/syscalls.h v3.16-rc3/include/linux/syscalls.h
index b0881a0ed322..60795ee8f9ee 100644
--- v3.16-rc3.orig/include/linux/syscalls.h
+++ v3.16-rc3/include/linux/syscalls.h
@@ -65,6 +65,7 @@ struct old_linux_dirent;
struct perf_event_attr;
struct file_handle;
struct sigaltstack;
+struct fincore_extra;

#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -866,4 +867,7 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
unsigned long idx1, unsigned long idx2);
asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
+asmlinkage long sys_fincore(int fd, loff_t start, long nr_pages,
+ int mode, unsigned char __user *vec,
+ struct fincore_extra __user *extra);
#endif
diff --git v3.16-rc3.orig/include/uapi/linux/pagecache.h v3.16-rc3/include/uapi/linux/pagecache.h
index 15e879f7395f..cd53c69d6f56 100644
--- v3.16-rc3.orig/include/uapi/linux/pagecache.h
+++ v3.16-rc3/include/uapi/linux/pagecache.h
@@ -14,4 +14,98 @@ enum {

#define PAGECACHE_TAG_MASK ((1UL << __NR_PAGECACHE_TAGS) - 1)

+/*
+ * You can control how the buffer in userspace is filled with this mode
+ * parameters:
+ *
+ * - FINCORE_BMAP:
+ * the page status is returned in a vector of bytes.
+ * The least significant bit of each byte is 1 if the referenced page
+ * is in memory, otherwise it is zero.
+ *
+ * - FINCORE_PGOFF:
+ * if this flag is set, fincore() doesn't store any information about
+ * holes. Instead, each per-page record starts with a page offset entry,
+ * using 8 bytes. This mode is useful if we handle a large file of which
+ * only a few pages are in memory.
+ *
+ * - FINCORE_PFN:
+ * stores pfn, using 8 bytes.
+ *
+ * - FINCORE_PAGE_FLAGS:
+ * stores page flags, using 8 bytes. See definition of KPF_* for
+ * details of each bit.
+ *
+ * - FINCORE_PAGECACHE_TAGS:
+ * stores pagecache tags, using 8 bytes. See definition of PAGECACHE_TAG_*
+ * for details of each bit.
+ *
+ * FINCORE_BMAP must not be combined with any other flag. The data returned
+ * in this mode looks like this:
+ *
+ * page offset 0 1 2 3 4
+ * +---+---+---+---+---+
+ * | 1 | 0 | 0 | 1 | 1 | ...
+ * +---+---+---+---+---+
+ * <->
+ * 1 byte
+ *
+ * For FINCORE_PFN, page data is formatted like this:
+ *
+ * page offset 0 1 2 3 4
+ * +-------+-------+-------+-------+-------+
+ * | pfn | pfn | pfn | pfn | pfn | ...
+ * +-------+-------+-------+-------+-------+
+ * <----->
+ * 8 byte
+ *
+ * We can combine multiple flags among the flags in FINCORE_LONGENTRY_MASK.
+ * For example, when the mode is FINCORE_PFN|FINCORE_PAGE_FLAGS, the per-page
+ * information is stored like this:
+ *
+ * page offset 0 page offset 1 page offset 2 page offset 3
+ * (hole)
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | pfn | flags | pfn | flags | 0 | 0 | pfn | flags | ...
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * <-------------> <-------------> <-------------> <------------->
+ * 16 bytes 16 bytes 16 bytes 16 bytes
+ *
+ * When FINCORE_PGOFF is set, we store a page offset entry and skip holes.
+ * For example, the data format of mode FINCORE_PGOFF|FINCORE_PFN|
+ * FINCORE_PAGE_FLAGS|FINCORE_PAGECACHE_TAGS looks as follows:
+ *
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | pgoff | pfn | flags | tags | pgoff | pfn | flags | tags | ...
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * <-----------------------------> <----------------------------->
+ * 32 bytes 32 bytes
+ */
+#define FINCORE_BMAP 0x01 /* bytemap mode: one byte per page */
+#define FINCORE_PGOFF 0x02 /* store page offset and skip holes */
+#define FINCORE_PFN 0x04 /* store pfn */
+#define FINCORE_PAGE_FLAGS 0x08 /* store page flags */
+#define FINCORE_PAGECACHE_TAGS 0x10 /* store pagecache tags */
+
+#define FINCORE_MODE_MASK 0x1f /* all mode bits defined above */
+#define FINCORE_LONGENTRY_MASK (FINCORE_PGOFF | FINCORE_PFN | \
+ FINCORE_PAGE_FLAGS | FINCORE_PAGECACHE_TAGS)
+
+struct fincore_extra {
+ /*
+ * (output) the number of entries with valid data, this is useful
+ * if you set FINCORE_PGOFF and want to know the end of filled data.
+ * fincore() writes it after the page cache scan has finished,
+ * provided a fincore_extra pointer was passed.
+ */
+ unsigned long nr_entries;
+
+ /*
+ * (input) A mask of pagecache tags which selects what fields the
+ * user wants. Bits outside PAGECACHE_TAG_MASK are ignored.
+ * (output) A mask of pagecache tags returned from the kernel
+ * which tells userspace which data it actually filled, i.e. the
+ * input mask restricted to PAGECACHE_TAG_MASK.
+ * This variable is used only when FINCORE_PAGECACHE_TAGS is set.
+ */
+ unsigned long tags;
+};
+
#endif /* _UAPI_LINUX_PAGECACHE_H */
diff --git v3.16-rc3.orig/mm/Makefile v3.16-rc3/mm/Makefile
index 4064f3ec145e..cc9420221afd 100644
--- v3.16-rc3.orig/mm/Makefile
+++ v3.16-rc3/mm/Makefile
@@ -18,7 +18,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o balloon_compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- iov_iter.o $(mmu-y)
+ iov_iter.o fincore.o $(mmu-y)

obj-y += init-mm.o

diff --git v3.16-rc3.orig/mm/fincore.c v3.16-rc3/mm/fincore.c
new file mode 100644
index 000000000000..df7226658c0d
--- /dev/null
+++ v3.16-rc3/mm/fincore.c
@@ -0,0 +1,322 @@
+/*
+ * fincore(2) system call
+ *
+ * Copyright (C) 2014 NEC Corporation, Naoya Horiguchi
+ */
+
+#include <linux/syscalls.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <uapi/linux/pagecache.h>
+
+
+struct fincore_control { /* scan state of one fincore() call */
+ int mode; /* FINCORE_* mode flags given by the caller */
+ int width; /* width of each entry (in bytes) */
+ unsigned char *buffer; /* kmalloc'ed temporary buffer */
+ long buffer_size; /* size of the temporary buffer (in bytes) */
+ void *cursor; /* current position on the buffer */
+ pgoff_t pgstart; /* start point of page cache scan in each run
+ * of the while loop */
+ long nr_pages; /* number of pages to be copied to userspace
+ * (decreasing while scan proceeds) */
+ long scanned_offset; /* page offset of the latest scanned page */
+ unsigned long tags; /* pagecache tag mask */
+ struct address_space *mapping; /* mapping of the target file */
+};
+
+/*
+ * Collect the pagecache tags that are set on the page cache entry at @index
+ * of fc->mapping.  Only the tags selected in fc->tags are looked up; bit i
+ * of the return value is set when tag i is set on that entry.
+ *
+ * TODO: doing radix_tree_tag_get() for each tag is not optimal, but no easy
+ * way without degrading finely tuned radix tree routines.
+ */
+static unsigned long get_pagecache_tags(struct fincore_control *fc,
+					unsigned long index)
+{
+	int i;
+	unsigned long tags = 0;
+	struct radix_tree_root *root = &fc->mapping->page_tree;
+
+	for (i = 0; i < __NR_PAGECACHE_TAGS; i++) {
+		/* Skip tags the caller did not ask for. */
+		if (!(fc->tags & (1UL << i)))
+			continue;
+		/* Shift in unsigned long, the type of the mask being built. */
+		if (radix_tree_tag_get(root, index, i))
+			tags |= 1UL << i;
+	}
+	return tags;
+}
+
+/*
+ * Store @data, converted to @type, at fc->cursor and advance the cursor by
+ * sizeof(@type).  The caller is responsible for making sure that the entry
+ * fits into the buffer.  @fc and @data are parenthesized so that arbitrary
+ * expressions can be passed without changing how the cast and the member
+ * access bind.
+ */
+#define store_entry(fc, type, data) ({			\
+	*(type *)(fc)->cursor = (type)(data);		\
+	(fc)->cursor += sizeof(type);			\
+})
+
+/*
+ * Append the entry for one page cache page to the temporary buffer, using
+ * the layout selected by fc->mode.
+ */
+static void __do_fincore(struct fincore_control *fc, struct page *page,
+			 unsigned long index)
+{
+	const int mode = fc->mode;
+
+	VM_BUG_ON(!page);
+	VM_BUG_ON((unsigned long)fc->cursor - (unsigned long)fc->buffer
+		  >= fc->buffer_size);
+
+	/* Bytemap mode: a single byte per page and nothing else. */
+	if (mode & FINCORE_BMAP) {
+		store_entry(fc, unsigned char, PageUptodate(page));
+		return;
+	}
+
+	/* Long entry mode: one unsigned long per requested field, in order. */
+	if (mode & FINCORE_PGOFF)
+		store_entry(fc, unsigned long, index);
+	if (mode & FINCORE_PFN)
+		store_entry(fc, unsigned long, page_to_pfn(page));
+	if (mode & FINCORE_PAGE_FLAGS)
+		store_entry(fc, unsigned long, stable_page_flags(page));
+	if (mode & FINCORE_PAGECACHE_TAGS)
+		store_entry(fc, unsigned long, get_pagecache_tags(fc, index));
+}
+
+/*
+ * Traverse the page cache tree of fc->mapping, starting at fc->pgstart, and
+ * store per-page data into fc->buffer.  The temporary buffer is assumed to
+ * be zeroed in advance.  Due to this, zero entries for holes don't have to
+ * be stored one-by-one; fc->cursor is just moved to the position of the
+ * next on-memory page.
+ *
+ * Slots holding an exceptional entry (anything that is not a struct page
+ * pointer) are reported as holes, exactly like empty slots.
+ *
+ * fc->scanned_offset is updated to the page offset of the last scanned
+ * page, so the caller can continue from fc->scanned_offset + 1.
+ *
+ * Return value is the number of entries whose data is stored in fc->buffer.
+ * Without FINCORE_PGOFF this is always @nr_pages, because holes are counted
+ * as (zero) entries.
+ */
+static long do_fincore(struct fincore_control *fc, int nr_pages)
+{
+	pgoff_t pgend = fc->pgstart + nr_pages;
+	struct radix_tree_iter iter;
+	void **slot;
+	long nr = 0;
+
+	fc->cursor = fc->buffer;
+
+	rcu_read_lock();
+restart:
+	radix_tree_for_each_slot(slot, &fc->mapping->page_tree, &iter,
+				 fc->pgstart) {
+		long jump;
+		struct page *page;
+
+		fc->scanned_offset = iter.index;
+		/* Handle holes */
+		jump = iter.index - fc->pgstart - nr;
+		if (jump) {
+			if (!(fc->mode & FINCORE_PGOFF)) {
+				if (iter.index < pgend) {
+					fc->cursor += jump * fc->width;
+					nr = iter.index - fc->pgstart;
+				} else {
+					/*
+					 * Fill remaining buffer as hole. Next
+					 * call should start at offset pgend.
+					 */
+					nr = nr_pages;
+					fc->scanned_offset = pgend - 1;
+					break;
+				}
+			}
+		}
+repeat:
+		page = radix_tree_deref_slot(slot);
+		if (unlikely(!page))
+			/*
+			 * No need to increment nr and fc->cursor, because next
+			 * iteration should detect hole and update them there.
+			 */
+			continue;
+		else if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				WARN_ON(iter.index);
+				goto restart;
+			}
+			/*
+			 * Any other exceptional entry is not a struct page
+			 * pointer, so it must not be handed to __do_fincore()
+			 * (which reads page state and the pfn).  Treat it as
+			 * a hole; the next iteration accounts for the gap.
+			 */
+			continue;
+		} else {
+			if (!page_cache_get_speculative(page))
+				goto repeat;
+
+			/* Has the page moved? */
+			if (unlikely(page != *slot)) {
+				page_cache_release(page);
+				goto repeat;
+			}
+
+			__do_fincore(fc, page, iter.index);
+			page_cache_release(page);
+		}
+
+		if (++nr == nr_pages)
+			break;
+	}
+
+	/*
+	 * Without FINCORE_PGOFF every page offset in [pgstart, pgend) has an
+	 * entry (zero for holes), so the whole range counts as stored.
+	 */
+	if (!(fc->mode & FINCORE_PGOFF)) {
+		nr = nr_pages;
+		fc->scanned_offset = pgend - 1;
+	}
+
+	rcu_read_unlock();
+
+	return nr;
+}
+
+/*
+ * A mode is valid when it contains no unknown bits and selects exactly one
+ * of the two formats: either FINCORE_BMAP alone, or one or more of the
+ * flags in FINCORE_LONGENTRY_MASK.
+ */
+static inline bool fincore_validate_mode(int mode)
+{
+	bool bytemap = mode & FINCORE_BMAP;
+	bool longentry = mode & FINCORE_LONGENTRY_MASK;
+
+	if (mode & ~FINCORE_MODE_MASK)
+		return false;
+
+	/* Exactly one of the two formats must be requested. */
+	return bytemap != longentry;
+}
+
+/* Maximum number of pages handled by a single do_fincore() call. */
+#define FINCORE_LOOP_STEP 256L
+
+/*
+ * The fincore(2) system call
+ *
+ * @fd: file descriptor of the target file
+ * @start: starting address offset of the target file (in byte).
+ * This should be aligned to page cache size.
+ * @nr_pages: the number of pages whose data is passed to userspace.
+ * @mode: fincore mode flags to determine the entry's format
+ * @vec: pointer of the userspace buffer. The size must be equal to or
+ * larger than (@nr_pages * width), where width is the size of
+ * each entry. Only the part up to the end of the file is written.
+ * @extra: used to input/output additional information from/to userspace
+ *
+ * fincore() returns the memory residency status and additional info (like
+ * pfn and page flags) of the given file's pages.
+ *
+ * Depending on the fincore mode, caller can receive the different formatted
+ * information. See the comment on definition of FINCORE_*.
+ *
+ * Because the status of a page can change after fincore() checks it once,
+ * the returned vector may contain stale information.
+ *
+ * The scan is done in chunks of at most FINCORE_LOOP_STEP pages through a
+ * temporary kernel buffer. If @extra is given, the number of entries copied
+ * to @vec is stored in extra->nr_entries.
+ *
+ * return values:
+ * -EBADF: @fd isn't a valid open file descriptor, or it doesn't refer to
+ * a regular file
+ * -EFAULT: @vec or @extra points to an illegal address
+ * -EINVAL: @start is unaligned to page cache size or is out of file range.
+ * Or @nr_pages is non-positive. Or @mode is invalid.
+ * Or fincore_extra is not given in FINCORE_PAGECACHE_TAGS mode.
+ * -ENOMEM: the temporary buffer could not be allocated
+ * 0: fincore() is successfully done
+ */
+SYSCALL_DEFINE6(fincore, int, fd, loff_t, start, long, nr_pages,
+		int, mode, unsigned char __user *, vec,
+		struct fincore_extra __user *, extra)
+{
+	long ret = 0;
+	long step;
+	long nr = 0;
+	long pages_to_eof;
+	int pc_shift = PAGE_CACHE_SHIFT;
+	struct fd f;
+
+	struct fincore_control fc = {
+		.mode = mode,
+		.width = sizeof(unsigned char),
+	};
+
+	if (start < 0 || nr_pages <= 0)
+		return -EINVAL;
+
+	if (!fincore_validate_mode(mode))
+		return -EINVAL;
+
+	f = fdget(fd);
+	/* Nothing to put on failure: f.file must not be touched below. */
+	if (!f.file)
+		return -EBADF;
+
+	if (is_file_hugepages(f.file))
+		pc_shift = huge_page_shift(hstate_file(f.file));
+
+	/* 1UL: a huge page shift can exceed what an int shift can hold. */
+	if (!IS_ALIGNED(start, 1UL << pc_shift)) {
+		ret = -EINVAL;
+		goto fput;
+	}
+
+	/*
+	 * TODO: support /dev/mem, /proc/pid/mem for system/process wide
+	 * page survey, which would obsolete /proc/kpageflags, and
+	 * /proc/pid/pagemap.
+	 */
+	if (!S_ISREG(file_inode(f.file)->i_mode)) {
+		ret = -EBADF;
+		goto fput;
+	}
+
+	fc.pgstart = start >> pc_shift;
+	pages_to_eof = DIV_ROUND_UP(i_size_read(file_inode(f.file)),
+				    1UL << pc_shift) - fc.pgstart;
+	/* start is too large */
+	if (pages_to_eof <= 0) {
+		ret = -EINVAL;
+		goto fput;
+	}
+	/* Never go beyond the end of file */
+	fc.nr_pages = min(pages_to_eof, nr_pages);
+	fc.mapping = f.file->f_mapping;
+	if (mode & FINCORE_LONGENTRY_MASK)
+		fc.width = ((mode & FINCORE_PGOFF ? 1 : 0) +
+			    (mode & FINCORE_PFN ? 1 : 0) +
+			    (mode & FINCORE_PAGE_FLAGS ? 1 : 0) +
+			    (mode & FINCORE_PAGECACHE_TAGS ? 1 : 0)
+			   ) * sizeof(unsigned long);
+
+	if (mode & FINCORE_PAGECACHE_TAGS) {
+		if (!extra) {
+			ret = -EINVAL;
+			goto fput;
+		}
+		/* @extra is a user pointer: never dereference it directly. */
+		if (get_user(fc.tags, &extra->tags)) {
+			ret = -EFAULT;
+			goto fput;
+		}
+		fc.tags &= PAGECACHE_TAG_MASK;
+		/* Tell userspace which tags are actually going to be filled. */
+		if (put_user(fc.tags, &extra->tags)) {
+			ret = -EFAULT;
+			goto fput;
+		}
+	}
+
+	/*
+	 * Check only the part of @vec that can be written. fc.nr_pages is
+	 * bounded by the file size, so unlike the caller-supplied @nr_pages
+	 * the multiplication below cannot overflow.
+	 */
+	if (!access_ok(VERIFY_WRITE, vec, fc.nr_pages * fc.width)) {
+		ret = -EFAULT;
+		goto fput;
+	}
+
+	step = min(fc.nr_pages, FINCORE_LOOP_STEP);
+
+	fc.buffer_size = step * fc.width;
+	fc.buffer = kmalloc(fc.buffer_size, GFP_TEMPORARY);
+	if (!fc.buffer) {
+		ret = -ENOMEM;
+		goto fput;
+	}
+
+	while (fc.nr_pages > 0) {
+		long filled;
+
+		memset(fc.buffer, 0, fc.buffer_size);
+		filled = do_fincore(&fc, min(step, fc.nr_pages));
+		/* 0 means the end of the file was reached; < 0 is an error */
+		if (filled <= 0) {
+			ret = filled;
+			break;
+		}
+		if (copy_to_user(vec + nr * fc.width,
+				 fc.buffer, filled * fc.width)) {
+			ret = -EFAULT;
+			break;
+		}
+		fc.nr_pages -= filled;
+		fc.pgstart = fc.scanned_offset + 1;
+		nr += filled;
+	}
+
+	kfree(fc.buffer);
+
+	/*
+	 * Report how many entries were copied, even after a partial failure,
+	 * but never overwrite an error that has already been recorded.
+	 */
+	if (extra && put_user(nr, &extra->nr_entries) && !ret)
+		ret = -EFAULT;
+
+fput:
+	fdput(f);
+	return ret;
+}
--
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/