[PATCH 04/17] mm: gup: make get_user_pages_fast and __get_user_pages_fast latency conscious

From: Andrea Arcangeli
Date: Fri Oct 03 2014 - 13:11:30 EST


This teaches gup_fast and __gup_fast to re-enable irqs and to
cond_resched(), whenever possible, every BATCH_PAGES pages, so the
irq-off critical section stays bounded.

The same batching must be implemented by the other archs as well: it
is a requirement before more get_user_pages() callers can be converted
to get_user_pages_fast() as an optimization (instead of using
get_user_pages_unlocked(), which would be slower).
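
The control flow being introduced boils down to the following chunking
loop (a simplified sketch only, not the code in the diff below;
walk_one_batch() is a made-up name standing in for the irq-disabled
pagetable walk):

	static int gup_fast_sketch(unsigned long start, int nr_pages,
				   int write, struct page **pages)
	{
		int ret = 0;

		while (nr_pages) {
			int batch = min(nr_pages, BATCH_PAGES);
			int nr;

			local_irq_disable();
			/* irq-off walk, bounded to "batch" pages */
			nr = walk_one_batch(start, batch, write, pages);
			local_irq_enable();
			cond_resched();	/* also bound scheduler latency */

			ret += nr;
			nr_pages -= nr;
			if (nr != batch)
				break;	/* caller falls back to slow gup */
			start += (unsigned long)nr << PAGE_SHIFT;
			pages += nr;
		}

		return ret;
	}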

Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
---
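Note for reviewers (not meant for the git history): a minimal,
hypothetical caller showing the intended use of the batched fast path;
pin_user_buffer() and its error handling are illustrative only:

	#include <linux/mm.h>	/* get_user_pages_fast(), put_page() */

	static int pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
	{
		int i, got;

		/*
		 * The irq-off walk now runs in BATCH_PAGES chunks and
		 * only the remainder goes through the slow path.
		 */
		got = get_user_pages_fast(uaddr, nr_pages, 1, pages);
		if (got <= 0)
			return got ? got : -EFAULT;

		/* ... access pages[0..got-1] ... */

		for (i = 0; i < got; i++)
			put_page(pages[i]);

		return got == nr_pages ? 0 : -EFAULT;
	}
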
arch/x86/mm/gup.c | 234 ++++++++++++++++++++++++++++++++++--------------------
1 file changed, 149 insertions(+), 85 deletions(-)

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 2ab183b..917d8c1 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -12,6 +12,12 @@

#include <asm/pgtable.h>

+/*
+ * Keep irq disabled for no more than BATCH_PAGES pages.
+ * Matches PTRS_PER_PTE (half of it on non-PAE 32bit kernels).
+ */
+#define BATCH_PAGES 512
+
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
@@ -250,6 +256,40 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}

+static inline int __get_user_pages_fast_batch(unsigned long start,
+ unsigned long end,
+ int write, struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long next;
+ unsigned long flags;
+ pgd_t *pgdp;
+ int nr = 0;
+
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables and pages from being freed on x86.
+ *
+ * So long as we atomically load page table pointers versus teardown
+ * (which we do on x86, with the above PAE exception), we can follow the
+ * address down to the page and take a ref on it.
+ */
+ local_irq_save(flags);
+ pgdp = pgd_offset(mm, start);
+ do {
+ pgd_t pgd = *pgdp;
+
+ next = pgd_addr_end(start, end);
+ if (pgd_none(pgd))
+ break;
+ if (!gup_pud_range(pgd, start, next, write, pages, &nr))
+ break;
+ } while (pgdp++, start = next, start != end);
+ local_irq_restore(flags);
+
+ return nr;
+}
+
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
@@ -257,31 +297,55 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- unsigned long flags;
- pgd_t *pgdp;
- int nr = 0;
+ unsigned long len, end, batch_pages;
+ int nr, ret;

start &= PAGE_MASK;
- addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
+ /*
+ * get_user_pages() handles nr_pages == 0 gracefully, but
+ * gup_fast walks the first pagetable in a do {} while()
+ * fashion, so it cannot cope with nr_pages == 0. There's no
+ * point in being permissive about end < start either, so
+ * this check verifies both that nr_pages is nonzero and
+ * that "end" didn't overflow.
+ */
+ VM_BUG_ON(end <= start);
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
(void __user *)start, len)))
return 0;

- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
+ ret = 0;
+ for (;;) {
+ batch_pages = nr_pages;
+ if (batch_pages > BATCH_PAGES && !irqs_disabled())
+ batch_pages = BATCH_PAGES;
+ len = (unsigned long) batch_pages << PAGE_SHIFT;
+ end = start + len;
+ nr = __get_user_pages_fast_batch(start, end, write, pages);
+ VM_BUG_ON(nr > batch_pages);
+ nr_pages -= nr;
+ ret += nr;
+ if (!nr_pages || nr != batch_pages)
+ break;
+ start += len;
+ pages += batch_pages;
+ }
+
+ return ret;
+}
+
+static inline int get_user_pages_fast_batch(unsigned long start,
+ unsigned long end,
+ int write, struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long next;
+ pgd_t *pgdp;
+ int nr = 0;
+ unsigned long orig_start = start;
+
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables and pages from being freed on x86.
@@ -290,18 +354,24 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* (which we do on x86, with the above PAE exception), we can follow the
* address down to the the page and take a ref on it.
*/
- local_irq_save(flags);
- pgdp = pgd_offset(mm, addr);
+ local_irq_disable();
+ pgdp = pgd_offset(mm, start);
do {
pgd_t pgd = *pgdp;

- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
+ next = pgd_addr_end(start, end);
+ if (pgd_none(pgd)) {
+ VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT);
break;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ }
+ if (!gup_pud_range(pgd, start, next, write, pages, &nr)) {
+ VM_BUG_ON(nr >= (end-orig_start) >> PAGE_SHIFT);
break;
- } while (pgdp++, addr = next, addr != end);
- local_irq_restore(flags);
+ }
+ } while (pgdp++, start = next, start != end);
+ local_irq_enable();
+
+ cond_resched();

return nr;
}
@@ -326,80 +396,74 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- pgd_t *pgdp;
- int nr = 0;
+ unsigned long len, end, batch_pages;
+ int nr, ret;
+ unsigned long orig_start;

start &= PAGE_MASK;
- addr = start;
+ orig_start = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;

end = start + len;
- if (end < start)
- goto slow_irqon;
+ /*
+ * get_user_pages() handles nr_pages == 0 gracefully, but
+ * gup_fast walks the first pagetable in a do {} while()
+ * fashion, so it cannot cope with nr_pages == 0. There's no
+ * point in being permissive about end < start either, so
+ * this check verifies both that nr_pages is nonzero and
+ * that "end" didn't overflow.
+ */
+ VM_BUG_ON(end <= start);

+ nr = ret = 0;
#ifdef CONFIG_X86_64
if (end >> __VIRTUAL_MASK_SHIFT)
goto slow_irqon;
#endif
+ for (;;) {
+ batch_pages = min(nr_pages, BATCH_PAGES);
+ len = (unsigned long) batch_pages << PAGE_SHIFT;
+ end = start + len;
+ nr = get_user_pages_fast_batch(start, end, write, pages);
+ VM_BUG_ON(nr > batch_pages);
+ nr_pages -= nr;
+ ret += nr;
+ if (!nr_pages)
+ break;
+ if (nr < batch_pages)
+ goto slow_irqon;
+ start += len;
+ pages += batch_pages;
+ }

- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_disable();
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
- goto slow;
- } while (pgdp++, addr = next, addr != end);
- local_irq_enable();
-
- VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
- return nr;
-
- {
- int ret;
+ VM_BUG_ON(ret != (end - orig_start) >> PAGE_SHIFT);
+ return ret;

-slow:
- local_irq_enable();
slow_irqon:
- /* Try to get the remaining pages with get_user_pages */
- start += nr << PAGE_SHIFT;
- pages += nr;
-
- ret = get_user_pages_unlocked(current, mm, start,
- (end - start) >> PAGE_SHIFT,
- write, 0, pages);
-
- /* Have to be a bit careful with return values */
- if (nr > 0) {
- if (ret < 0)
- ret = nr;
- else
- ret += nr;
- }
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;

- return ret;
+ /*
+ * Until here "nr" was the last get_user_pages_fast_batch()
+ * retval and "ret" the sum of all of them. From now on "nr"
+ * holds that sum, and "ret" will become the
+ * get_user_pages_unlocked() retval.
+ */
+ nr = ret;
+
+ ret = get_user_pages_unlocked(current, mm, start,
+ (end - start) >> PAGE_SHIFT,
+ write, 0, pages);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
}
+
+ return ret;
}
--