Re: [RFC PATCH 3/3] mm: increase scalability of global memory commitment accounting

From: Andrey Ryabinin
Date: Thu Feb 11 2016 - 08:35:20 EST


On 02/10/2016 08:46 PM, Konstantin Khlebnikov wrote:
> On Wed, Feb 10, 2016 at 5:52 PM, Andrey Ryabinin
> <aryabinin@xxxxxxxxxxxxx> wrote:
>> Currently we use percpu_counter for accounting committed memory. Changing
>> committed memory by more than vm_committed_as_batch pages requires
>> grabbing the counter's spinlock. The batch size is quite small - from 32 pages
>> up to 0.4% of the memory/cpu (usually several MBs even on large machines).
>>
>> So map/munmap of several MBs anonymous memory in multiple processes leads
>> to high contention on that spinlock.
>>
>> Instead of percpu_counter we could use ordinary per-cpu variables.
>> A dumb test case (8 processes running map/munmap of 4MB,
>> vm_committed_as_batch = 2MB on test setup) showed 2.5x performance
>> improvement.
>>
>> The downside of this approach is slowdown of vm_memory_committed().
>> However, it doesn't matter much since it usually is not in a hot path.
>> The only exception is __vm_enough_memory() with overcommit set to
>> OVERCOMMIT_NEVER. In that case brk1 test from will-it-scale benchmark
>> shows 1.1x - 1.3x performance regression.
>>
>> So I think it's a good tradeoff. We've got significantly increased
>> scalability for the price of some overhead in vm_memory_committed().
>
> I think that's a no go. 30% regression for your not-so-big machine.
> For 4096 cores the regression will be enormous. Link: https://xkcd.com/619/
>

Bayan. Linux already supports 8192 cpus. So I set possible_cpus=8192 to see how bad it is.
brk1 test with disabled overcommit (OVERCOMMIT_NEVER) showed ~500x regression. I guess that's too much.

I've tried another approach - convert 'vm_committed_as' to atomic_t variable.
With 8 processes doing map/munmap of 4K this shows only a 2%-3% regression (compared to mainline).
And for 4MB map/munmap this gives a 125% improvement.

So, to me, this sounds like a good way to go, although it is worth checking for regressions
with small allocations on bigger machines.

---

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index df4661a..f30e387 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -41,7 +41,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#define K(x) ((x) << (PAGE_SHIFT - 10))
si_meminfo(&i);
si_swapinfo(&i);
- committed = percpu_counter_read_positive(&vm_committed_as);
+ committed = vm_memory_committed();

cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 979bc83..82dac6e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1881,7 +1881,11 @@ extern void memmap_init_zone(unsigned long, int, unsigned long,
extern void setup_per_zone_wmarks(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
+#ifdef CONFIG_MMU
+static inline void mmap_init(void) {}
+#else
extern void __init mmap_init(void);
+#endif
extern void show_mem(unsigned int flags);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 16373c8..21b68e8 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -2,7 +2,7 @@
#define _LINUX_MMAN_H

#include <linux/mm.h>
-#include <linux/percpu_counter.h>
+#include <linux/percpu.h>

#include <linux/atomic.h>
#include <uapi/linux/mman.h>
@@ -10,19 +10,12 @@
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
-extern struct percpu_counter vm_committed_as;
-
-#ifdef CONFIG_SMP
-extern s32 vm_committed_as_batch;
-#else
-#define vm_committed_as_batch 0
-#endif
-
unsigned long vm_memory_committed(void);
+extern atomic_t vm_committed_as;

static inline void vm_acct_memory(long pages)
{
- __percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
+ atomic_add(pages, &vm_committed_as);
}

static inline void vm_unacct_memory(long pages)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index fdadf91..d96c71f 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -142,51 +142,6 @@ early_param("mminit_loglevel", set_mminit_loglevel);
struct kobject *mm_kobj;
EXPORT_SYMBOL_GPL(mm_kobj);

-#ifdef CONFIG_SMP
-s32 vm_committed_as_batch = 32;
-
-static void __meminit mm_compute_batch(void)
-{
- u64 memsized_batch;
- s32 nr = num_present_cpus();
- s32 batch = max_t(s32, nr*2, 32);
-
- /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
- memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
-
- vm_committed_as_batch = max_t(s32, memsized_batch, batch);
-}
-
-static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
- unsigned long action, void *arg)
-{
- switch (action) {
- case MEM_ONLINE:
- case MEM_OFFLINE:
- mm_compute_batch();
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block compute_batch_nb __meminitdata = {
- .notifier_call = mm_compute_batch_notifier,
- .priority = IPC_CALLBACK_PRI, /* use lowest priority */
-};
-
-static int __init mm_compute_batch_init(void)
-{
- mm_compute_batch();
- register_hotmemory_notifier(&compute_batch_nb);
-
- return 0;
-}
-
-__initcall(mm_compute_batch_init);
-
-#endif
-
static int __init mm_sysfs_init(void)
{
mm_kobj = kobject_create_and_add("mm", kernel_kobj);
diff --git a/mm/mmap.c b/mm/mmap.c
index f088c60..c796d73 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3184,17 +3184,6 @@ void mm_drop_all_locks(struct mm_struct *mm)
}

/*
- * initialise the VMA slab
- */
-void __init mmap_init(void)
-{
- int ret;
-
- ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
- VM_BUG_ON(ret);
-}
-
-/*
* Initialise sysctl_user_reserve_kbytes.
*
* This is intended to prevent a user from starting a single memory hogging
diff --git a/mm/nommu.c b/mm/nommu.c
index 6402f27..2d52dbc 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -533,10 +533,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
*/
void __init mmap_init(void)
{
- int ret;
-
- ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
- VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
}

diff --git a/mm/util.c b/mm/util.c
index 47a57e5..9130983 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -402,6 +402,7 @@ unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
+atomic_t vm_committed_as;

int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -445,12 +446,6 @@ unsigned long vm_commit_limit(void)
}

/*
- * Make sure vm_committed_as in one cacheline and not cacheline shared with
- * other variables. It can be updated by several CPUs frequently.
- */
-struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
-
-/*
* The global memory commitment made in the system can be a metric
* that can be used to drive ballooning decisions when Linux is hosted
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
@@ -460,7 +455,7 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
*/
unsigned long vm_memory_committed(void)
{
- return percpu_counter_read_positive(&vm_committed_as);
+ return atomic_read(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

@@ -484,8 +479,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
long free, allowed, reserve;

- VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
- -(s64)vm_committed_as_batch * num_online_cpus(),
+ VM_WARN_ONCE(atomic_read(&vm_committed_as) < 0,
"memory commitment underflow");

vm_acct_memory(pages);
@@ -553,7 +547,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
allowed -= min_t(long, mm->total_vm / 32, reserve);
}

- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
+ if (vm_memory_committed() < allowed)
return 0;
error:
vm_unacct_memory(pages);
--
2.4.10