[RFC, PATCH] mm: map few pages around fault address if they are in page cache

From: Kirill A. Shutemov
Date: Fri Feb 07 2014 - 10:45:37 EST


On Thu, Feb 06, 2014 at 05:31:55PM -0800, Linus Torvalds wrote:
> On Thu, Feb 6, 2014 at 2:24 PM, Kirill A. Shutemov <kirill@xxxxxxxxxxxxx> wrote:
> >
> > If we want to reduce the number of page faults with less overhead, we
> > should probably concentrate on minor page faults -- populate ptes around
> > the fault address for pages which are already in the page cache. That
> > should cover the scripting use case pretty well.
>
> That's what my patch largely does. Except I screwed up and didn't use
> FAULT_FLAG_ALLOW_RETRY in fault_around().

I fail to see how you avoid touching backing storage.

> Anyway, my patch kind of works, but I'm starting to hate it. I think I
> want to try to extend the "->fault()" interface to allow
> filemap_fault() to just fill in multiple pages.
>
> We already have that "vmf->page" thing, we could make it a small array
> easily. That would allow proper gang lookup, and much more efficient
> "fill in multiple entries in one go" in mm/memory.c.

See patch below.

I extended the ->fault() interface: I added a pointer to an array of pages
plus an nr_pages count. If a ->fault() implementation knows about the new
interface and uses it, it fills the array and sets the VM_FAULT_AROUND bit
in its return code. Only filemap_fault() is converted at the moment.
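
For illustration, here is roughly what a converted ->fault() handler is
expected to do (a sketch only -- the helper names are made up, not part of
the patch):

static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        if (vmf->flags & FAULT_FLAG_AROUND) {
                /*
                 * Fill vmf->pages[0 .. nr_pages - 1] with referenced pages
                 * from the page cache (NULL for holes). The faulting page
                 * goes into its naturally aligned slot:
                 * (vmf->virtual_address >> PAGE_SHIFT) & (vmf->nr_pages - 1).
                 * Secondary pages are returned locked; the primary page
                 * follows the usual vmf->page rules.
                 */
                example_fill_pages_around(vma, vmf);    /* hypothetical */
                return VM_FAULT_AROUND | VM_FAULT_LOCKED;
        }

        /* Callers that don't ask for fault-around get the old behaviour. */
        vmf->page = example_find_page(vma, vmf);        /* hypothetical */
        return VM_FAULT_LOCKED;
}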

I haven't tested it much, but my KVM guest boots. There are still a few
places where the code needs to be fixed: __do_fault() and filemap_fault()
are too ugly and need to be cleaned up.

I don't have any performance data yet.

Any thoughts?

---
 include/linux/mm.h |  11 +++++
 mm/filemap.c       | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/memory.c        |  58 ++++++++++++++++++++--
 3 files changed, 204 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 35527173cf50..deb65c0850a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -181,6 +181,9 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_KILLABLE     0x20    /* The fault task is in SIGKILL killable region */
 #define FAULT_FLAG_TRIED        0x40    /* second try */
 #define FAULT_FLAG_USER         0x80    /* The fault originated in userspace */
+#define FAULT_FLAG_AROUND       0x100   /* Get ->nr_pages pages around fault
+                                         * address
+                                         */

 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -200,6 +203,13 @@ struct vm_fault {
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
+
+       int nr_pages;                   /* Number of pages to fault in,
+                                        * naturally aligned around
+                                        * virtual_address
+                                        */
+       struct page **pages;            /* Pointer to array to store nr_pages
+                                        * pages.
+                                        */
 };

 /*
@@ -962,6 +972,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_LOCKED 0x0200  /* ->fault locked the returned page */
 #define VM_FAULT_RETRY  0x0400  /* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800        /* huge page fault failed, fall back to small */
+#define VM_FAULT_AROUND 0x1000

 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */

diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a92021c..1cabb15a5847 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -884,6 +884,64 @@ repeat:
         return ret;
 }

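+/**
+ * find_get_pages_or_null - gang pagecache lookup, keeping holes
+ * @mapping:    The address_space to search
+ * @start:      The starting page index
+ * @nr_pages:   The number of slots in @pages
+ * @pages:      Where the resulting pages are placed
+ *
+ * Unlike find_get_pages(), pages[i] always corresponds to index start + i:
+ * a reference is taken on each page found in the page cache, and slots
+ * with no ordinary page present are left NULL.
+ */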
+void find_get_pages_or_null(struct address_space *mapping, pgoff_t start,
+                           unsigned int nr_pages, struct page **pages)
+{
+       struct radix_tree_iter iter;
+       void **slot;
+
+       if (unlikely(!nr_pages))
+               return;
+
+       memset(pages, 0, sizeof(struct page *) * nr_pages);
+
+       rcu_read_lock();
+restart:
+       radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+               struct page *page;
+repeat:
+               page = radix_tree_deref_slot(slot);
+               if (unlikely(!page))
+                       continue;
+
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               WARN_ON(iter.index);
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: so skip over it.
+                        */
+                       continue;
+               }
+
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
+
+               if (page->index - start >= nr_pages) {
+                       page_cache_release(page);
+                       break;
+               }
+
+               /* Has the page moved? */
+               if (unlikely(page != *slot)) {
+                       page_cache_release(page);
+                       goto repeat;
+               }
+
+               pages[page->index - start] = page;
+       }
+
+       rcu_read_unlock();
+}
+
 /**
  * find_get_pages_contig - gang contiguous pagecache lookup
  * @mapping:    The address_space to search
@@ -1595,6 +1653,67 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
                                    page, offset, ra->ra_pages);
 }

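+/*
+ * Try to lock and validate the secondary pages around the fault address.
+ * Pages that fall outside the vma, cannot be locked without sleeping, have
+ * been truncated, are not up to date, or lie beyond i_size are dropped
+ * from vmf->pages[] and their references are released.
+ */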
+static void lock_secondary_pages(struct vm_area_struct *vma,
+                                struct vm_fault *vmf)
+{
+       struct file *file = vma->vm_file;
+       struct address_space *mapping = file->f_mapping;
+       unsigned long address = (unsigned long)vmf->virtual_address;
+       int primary_idx = (address >> PAGE_SHIFT) & (vmf->nr_pages - 1);
+       pgoff_t size;
+       int i;
+
+       for (i = 0; i < vmf->nr_pages; i++) {
+               unsigned long addr = address + PAGE_SIZE * (i - primary_idx);
+
+               if (i == primary_idx || !vmf->pages[i])
+                       continue;
+
+               if (addr < vma->vm_start || addr >= vma->vm_end)
+                       goto put;
+               if (!trylock_page(vmf->pages[i]))
+                       goto put;
+               /* Truncated? */
+               if (unlikely(vmf->pages[i]->mapping != mapping))
+                       goto unlock;
+
+               /*
+                * We have a locked page in the page cache, now we need to
+                * check that it's up-to-date. If not, it is going to be due
+                * to an error.
+                */
+               if (unlikely(!PageUptodate(vmf->pages[i])))
+                       goto unlock;
+
+               size = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1)
+                       >> PAGE_CACHE_SHIFT;
+               if (unlikely(vmf->pages[i]->index >= size))
+                       goto unlock;
+
+               continue;
+unlock:
+               unlock_page(vmf->pages[i]);
+put:
+               put_page(vmf->pages[i]);
+               vmf->pages[i] = NULL;
+       }
+}
+
+static void unlock_and_put_secondary_pages(struct vm_fault *vmf)
+{
+       unsigned long address = (unsigned long)vmf->virtual_address;
+       int primary_idx = (address >> PAGE_SHIFT) & (vmf->nr_pages - 1);
+       int i;
+
+       for (i = 0; i < vmf->nr_pages; i++) {
+               if (i == primary_idx || !vmf->pages[i])
+                       continue;
+               unlock_page(vmf->pages[i]);
+               page_cache_release(vmf->pages[i]);
+               vmf->pages[i] = NULL;
+       }
+}
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:        vma in which the fault was taken
@@ -1617,6 +1736,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
         pgoff_t offset = vmf->pgoff;
         struct page *page;
         pgoff_t size;
+       unsigned long address = (unsigned long)vmf->virtual_address;
+       int primary_idx = (address >> PAGE_SHIFT) & (vmf->nr_pages - 1);
         int ret = 0;

         size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -1626,7 +1747,15 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
         /*
          * Do we have something in the page cache already?
          */
-       page = find_get_page(mapping, offset);
+       if (vmf->flags & FAULT_FLAG_AROUND) {
+               find_get_pages_or_null(mapping, offset - primary_idx,
+                                      vmf->nr_pages, vmf->pages);
+               page = vmf->pages[primary_idx];
+               lock_secondary_pages(vma, vmf);
+               ret |= VM_FAULT_AROUND;
+       } else
+               page = find_get_page(mapping, offset);
+
         if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
                 /*
                  * We found the page, so try async readahead before
@@ -1638,14 +1767,18 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                 do_sync_mmap_readahead(vma, ra, file, offset);
                 count_vm_event(PGMAJFAULT);
                 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-               ret = VM_FAULT_MAJOR;
+               ret |= VM_FAULT_MAJOR;
 retry_find:
                 page = find_get_page(mapping, offset);
                 if (!page)
                         goto no_cached_page;
+               if (vmf->flags & FAULT_FLAG_AROUND)
+                       vmf->pages[primary_idx] = page;
         }

         if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+               unlock_and_put_secondary_pages(vmf);
+               ret &= ~VM_FAULT_AROUND;
                 page_cache_release(page);
                 return ret | VM_FAULT_RETRY;
         }
@@ -1671,6 +1804,7 @@ retry_find:
          */
         size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
         if (unlikely(offset >= size)) {
+               unlock_and_put_secondary_pages(vmf);
                 unlock_page(page);
                 page_cache_release(page);
                 return VM_FAULT_SIGBUS;
@@ -1694,6 +1828,8 @@ no_cached_page:
         if (error >= 0)
                 goto retry_find;

+       unlock_and_put_secondary_pages(vmf);
+
         /*
          * An error return from page_cache_read can result if the
          * system is low on memory, or a problem occurs while trying
@@ -1722,6 +1858,8 @@ page_not_uptodate:
         if (!error || error == AOP_TRUNCATED_PAGE)
                 goto retry_find;

+       unlock_and_put_secondary_pages(vmf);
+
         /* Things didn't work out. Return zero to tell the mm layer so. */
         shrink_readahead_size_eio(file, ra);
         return VM_FAULT_SIGBUS;
diff --git a/mm/memory.c b/mm/memory.c
index 6768ce9e57d2..e94d86ac7d5a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3283,6 +3283,9 @@ oom:
         return VM_FAULT_OOM;
 }

+#define FAULT_AROUND_PAGES 32
+#define FAULT_AROUND_MASK (FAULT_AROUND_PAGES - 1)
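+
+/*
+ * The fault-around window is the naturally aligned block of
+ * FAULT_AROUND_PAGES pages containing the fault address: the faulting
+ * page sits at index (address >> PAGE_SHIFT) & FAULT_AROUND_MASK within
+ * the window. E.g. a fault at page index 0x105 considers pages
+ * 0x100-0x11f, with the primary page at index 5.
+ */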
+
 /*
  * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
@@ -3303,12 +3306,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         pte_t *page_table;
         spinlock_t *ptl;
         struct page *page;
+       struct page *pages[FAULT_AROUND_PAGES];
         struct page *cow_page;
         pte_t entry;
         int anon = 0;
         struct page *dirty_page = NULL;
         struct vm_fault vmf;
-       int ret;
+       int i, ret;
         int page_mkwrite = 0;

         /*
@@ -3336,14 +3340,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         vmf.flags = flags;
         vmf.page = NULL;

+       /* Do fault-around only for read faults on linear mappings */
+       if (flags & (FAULT_FLAG_WRITE | FAULT_FLAG_NONLINEAR)) {
+               vmf.nr_pages = 0;
+               vmf.pages = NULL;
+       } else {
+               vmf.nr_pages = FAULT_AROUND_PAGES;
+               vmf.pages = pages;
+               vmf.flags |= FAULT_FLAG_AROUND;
+       }
         ret = vma->vm_ops->fault(vma, &vmf);
+
+       /* ->fault() doesn't know about FAULT_FLAG_AROUND */
+       if ((vmf.flags & FAULT_FLAG_AROUND) && !(ret & VM_FAULT_AROUND))
+               vmf.flags &= ~FAULT_FLAG_AROUND;
         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                             VM_FAULT_RETRY)))
                 goto uncharge_out;

         if (unlikely(PageHWPoison(vmf.page))) {
-               if (ret & VM_FAULT_LOCKED)
-                       unlock_page(vmf.page);
+               if (ret & VM_FAULT_LOCKED) {
+                       if (vmf.flags & FAULT_FLAG_AROUND) {
+                               for (i = 0; i < FAULT_AROUND_PAGES; i++) {
+                                       if (pages[i])
+                                               unlock_page(pages[i]);
+                               }
+                       } else
+                               unlock_page(vmf.page);
+               }
                 ret = VM_FAULT_HWPOISON;
                 goto uncharge_out;
         }
@@ -3352,9 +3377,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
          * For consistency in subsequent calls, make the faulted page always
          * locked.
          */
-       if (unlikely(!(ret & VM_FAULT_LOCKED)))
+       if (unlikely(!(ret & VM_FAULT_LOCKED))) {
+               BUG_ON(ret & VM_FAULT_AROUND); /* FIXME */
                 lock_page(vmf.page);
-       else
+       } else
                 VM_BUG_ON(!PageLocked(vmf.page));

         /*
@@ -3401,6 +3427,28 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,

         page_table = pte_offset_map_lock(mm, pmd, address, &ptl);

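+       /*
+        * Map the extra pages returned by ->fault() into the empty ptes
+        * around the fault address. Each secondary page was locked by
+        * ->fault() and is unlocked here whether it gets mapped or skipped;
+        * the primary page is left to the regular code path below.
+        */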
+       for (i = 0; (vmf.flags & FAULT_FLAG_AROUND) && i < FAULT_AROUND_PAGES;
+            i++) {
+               int primary_idx = (address >> PAGE_SHIFT) & FAULT_AROUND_MASK;
+               pte_t *pte = page_table - primary_idx + i;
+               unsigned long addr = address + PAGE_SIZE * (i - primary_idx);
+
+               if (!pages[i])
+                       continue;
+               if (i == primary_idx)
+                       continue;
+               if (!pte_none(*pte))
+                       goto skip;
+               if (PageHWPoison(pages[i]))
+                       goto skip;
+               flush_icache_page(vma, pages[i]);
+               entry = mk_pte(pages[i], vma->vm_page_prot);
+               inc_mm_counter_fast(mm, MM_FILEPAGES);
+               page_add_file_rmap(pages[i]);
+               set_pte_at(mm, addr, pte, entry);
+               update_mmu_cache(vma, addr, pte);
+skip:
+               unlock_page(pages[i]);
+       }
+
         /*
          * This silly early PAGE_DIRTY setting removes a race
          * due to the bad i386 page protection. But it's valid
--
Kirill A. Shutemov