Re: Linux 4.9-rc6

From: Eric Dumazet
Date: Wed Dec 21 2016 - 10:30:31 EST


On Sun, 2016-12-04 at 09:17 -0800, Eric Dumazet wrote:
> On Sun, 2016-12-04 at 03:10 -0800, Linus Torvalds wrote:
> >
> >
> > On Dec 4, 2016 02:43, "Thorsten Leemhuis" <regressions@xxxxxxxxxxxxx>
> > wrote:
> >
> >
> > What's the status of the patch below? From the discussion it looks a
> > lot like
> > it was developed to fix a regression in 4.9, but the patch
> > afaics has
> > hit neither mainline nor linux-next yet.
> >
> >
> > It's not a regression as far as I can tell. It's a small optimization.
> > Maybe.
> >
> >
> > It's not going into 4.9, and it's not even clear it's worth it later either,
> > unless somebody has numbers (which I haven't seen)
> >
> Right, the patch was not in any way ready for 4.9 ;)
>
> I'll try to complete this for next cycle.

I now have a hacky patch that also adds PMD alignment for large
allocations, and supports huge pages (this last part currently depends
on CONFIG_HAVE_ARCH_HUGE_VMAP, i.e. x86/arm64 so far).

Toshi Kani added pmd_set_huge() in commit e61ce6ade404e ("mm: change
ioremap to set up huge I/O mappings"); I am not sure why vmalloc() was
not considered (or I might have missed it completely).

It seems to provide about 25 cycles gain per random access for large
tables on my x86 lab hosts.

(I did a test with a program having 10 Million fds)

Allocations above 2 MB (pages >= 512), such as the Dentry cache,
Inode-cache, TCP established hash table, or large alloc_fdmem() ones,
might benefit from this.

lpaa23:~# grep large /proc/vmallocinfo
0xffffc90000009000-0xffffc9000000c000 12288 alloc_large_system_hash+0x189/0x253 pages=2 vmalloc N0=1 N1=1
0xffffc9000000c000-0xffffc9000000f000 12288 alloc_large_system_hash+0x189/0x253 pages=2 vmalloc N0=1 N1=1
0xffffc9000001e000-0xffffc9000009f000 528384 alloc_large_system_hash+0x189/0x253 pages=128 vmalloc N0=64 N1=64
0xffffc9000009f000-0xffffc900000e0000 266240 alloc_large_system_hash+0x189/0x253 pages=64 vmalloc N0=32 N1=32
0xffffc900001d9000-0xffffc900001dc000 12288 alloc_large_system_hash+0x189/0x253 pages=2 vmalloc N0=1 N1=1
0xffffc90000200000-0xffffc90010201000 268439552 alloc_large_system_hash+0x189/0x253 pages=65536 vmalloc vpages N0=32768 N1=32768
0xffffc90010400000-0xffffc90018401000 134221824 alloc_large_system_hash+0x189/0x253 pages=32768 vmalloc vpages N0=16384 N1=16384
0xffffc90018600000-0xffffc90018a01000 4198400 alloc_large_system_hash+0x189/0x253 pages=1024 vmalloc vpages N0=512 N1=512
0xffffc90018c00000-0xffffc90019001000 4198400 alloc_large_system_hash+0x189/0x253 pages=1024 vmalloc vpages N0=512 N1=512
0xffffc9001b249000-0xffffc9001b34a000 1052672 alloc_large_system_hash+0x189/0x253 pages=256 vmalloc N0=128 N1=128
0xffffc9001b400000-0xffffc9001b801000 4198400 alloc_large_system_hash+0x189/0x253 pages=1024 vmalloc vpages N0=512 N1=512
0xffffc9001ba00000-0xffffc9001bc01000 2101248 alloc_large_system_hash+0x189/0x253 pages=512 vmalloc N0=256 N1=256
0xffffc9001bc01000-0xffffc9001bd02000 1052672 alloc_large_system_hash+0x189/0x253 pages=256 vmalloc N0=128 N1=128
0xffffc9001be00000-0xffffc9001c001000 2101248 alloc_large_system_hash+0x189/0x253 pages=512 vmalloc N0=256 N1=256


I won't be able to split this patch into 3 parts before January 6th, after
my vacation. I am showing the WIP in case anyone is interested in seeing it.

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a5584384eabc..055b027ee659 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -21,6 +21,7 @@
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
+#include <linux/mempolicy.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
@@ -154,6 +155,18 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+ if (next - addr == PMD_SIZE) {
+ struct page *page = pages[*nr];
+
+ if (compound_order(page) == PMD_SHIFT - PAGE_SHIFT) {
+ if (pmd_set_huge(pmd, page_to_phys(page), prot)) {
+ (*nr) += 1 << (PMD_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ }
+ }
+#endif
if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
@@ -1349,7 +1362,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, get_count_order_long(size),
PAGE_SHIFT, IOREMAP_MAX_ORDER);
-
+ else if (size >= PMD_SIZE)
+ align = PMD_SIZE;
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!area))
return NULL;
@@ -1482,11 +1496,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
if (deallocate_pages) {
int i;

- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages;) {
struct page *page = area->pages[i];
+ unsigned int order;

BUG_ON(!page);
- __free_pages(page, 0);
+ order = compound_order(page);
+ __free_pages(page, order);
+ i += 1 << order;
}

kvfree(area->pages);
@@ -1613,16 +1630,39 @@ EXPORT_SYMBOL(vmap);
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller);
+
+static int vmalloc_max_order(int node, int nr_pages)
+{
+ int max_node_order = min(PMD_SHIFT - PAGE_SHIFT, MAX_ORDER - 1);
+
+#if defined(CONFIG_NUMA)
+ if (nr_online_nodes > 1 && node == NUMA_NO_NODE) {
+ struct mempolicy *pol = current->mempolicy;
+ int pages_per_node, nr_nodes;
+
+ if (pol && pol->mode == MPOL_INTERLEAVE) {
+ nr_nodes = nodes_weight(pol->v.nodes);
+ pages_per_node = DIV_ROUND_UP(nr_pages, nr_nodes);
+ max_node_order = min(max_node_order,
+ ilog2(pages_per_node));
+ }
+ }
+#endif
+ return max_node_order;
+}
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
struct page **pages;
- unsigned int nr_pages, array_size, i;
+ unsigned int nr_pages, array_size, i, j;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+ int max_node_order;

nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
+ max_node_order = vmalloc_max_order(node, nr_pages);

area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
@@ -1639,20 +1679,31 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return NULL;
}

- for (i = 0; i < area->nr_pages; i++) {
- struct page *page;

- if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask);
- else
- page = alloc_pages_node(node, alloc_mask, 0);
+ for (i = 0; i < area->nr_pages;) {
+ int order = min(ilog2(area->nr_pages - i), max_node_order);
+ struct page *page;

- if (unlikely(!page)) {
- /* Successfully allocated i pages, free them in __vunmap() */
- area->nr_pages = i;
- goto fail;
+ for (;;) {
+ gfp_t gfp = alloc_mask;
+
+ if (order > 0)
+ gfp = (gfp & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_NORETRY | __GFP_COMP;
+ if (node == NUMA_NO_NODE)
+ page = alloc_pages(gfp, order);
+ else
+ page = alloc_pages_node(node, gfp, order);
+ if (page)
+ break;
+ if (unlikely(--order < 0)) {
+ /* Successfully allocated i pages, free them in __vunmap() */
+ area->nr_pages = i;
+ goto fail;
+ }
}
- area->pages[i] = page;
+ for (j = 0; j < (1U << order); j++)
+ area->pages[i++] = page++;
if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
@@ -2619,9 +2670,13 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)

memset(counters, 0, nr_node_ids * sizeof(unsigned int));

- for (nr = 0; nr < v->nr_pages; nr++)
- counters[page_to_nid(v->pages[nr])]++;
+ for (nr = 0; nr < v->nr_pages;) {
+ struct page *page = v->pages[nr];
+ int npages = 1 << compound_order(page);

+ counters[page_to_nid(page)] += npages;
+ nr += npages;
+ }
for_each_node_state(nr, N_HIGH_MEMORY)
if (counters[nr])
seq_printf(m, " N%u=%u", nr, counters[nr]);