[RFC PATCH 2/5] overcommit accounting and handling functions

From: Andrea Righi
Date: Mon Jun 09 2008 - 19:42:00 EST


Split the different __vm_enough_memory() policies into inline functions so
that they can be easily reused by the memory controller overcommit handling
routines.

The accounting functions vm_acct_memory() and vm_unacct_memory() are rewritten
as well, to introduce per-cgroup committed VM accounting.

Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx>
---
include/linux/mman.h | 148 ++++++++++++++++++++++++++++++++++++++++++++++++--
mm/memcontrol.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++-
mm/mmap.c | 85 ++++-------------------------
mm/nommu.c | 84 ++++-------------------------
mm/swap.c | 3 +-
5 files changed, 306 insertions(+), 153 deletions(-)

diff --git a/include/linux/mman.h b/include/linux/mman.h
index dab8892..37f695f 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -12,25 +12,165 @@

#ifdef __KERNEL__
#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/mmzone.h>
+#include <linux/mm_types.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>

#include <asm/atomic.h>

extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern atomic_long_t vm_committed_space;
+extern unsigned long totalreserve_pages;
+extern unsigned long totalram_pages;
+
+struct vm_acct_values {
+ int overcommit_memory; /* overcommit policy, compared against OVERCOMMIT_* */
+ int overcommit_ratio; /* percentage used by the strict policy check */
+ atomic_long_t vm_committed_space; /* committed VM, counted in pages */
+};
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/* Memory controller enabled: both helpers live in mm/memcontrol.c. */
+extern void vm_acct_get_config(const struct mm_struct *mm,
+			       struct vm_acct_values *v);
+extern void mem_cgroup_vm_acct_memory(struct mm_struct *mm, long pages);
+#else
+/*
+ * vm_acct_get_config - fill @v with the system-wide overcommit settings.
+ * @mm: address space being accounted (unused without the memory controller)
+ * @v:  output structure; every member is written
+ *
+ * The committed-space member is set to a snapshot of the global counter so
+ * that no field of @v is handed back uninitialized, matching what the
+ * memory controller implementation does for its per-cgroup counter.
+ */
+static inline void vm_acct_get_config(const struct mm_struct *mm,
+				      struct vm_acct_values *v)
+{
+	v->overcommit_memory = sysctl_overcommit_memory;
+	v->overcommit_ratio = sysctl_overcommit_ratio;
+	atomic_long_set(&v->vm_committed_space,
+			atomic_long_read(&vm_committed_space));
+}
+
+/* Nothing to account per cgroup when the memory controller is disabled. */
+static inline void mem_cgroup_vm_acct_memory(struct mm_struct *mm, long pages)
+{
+}
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
+
+/*
+ * __vm_enough_memory_guess - heuristic (OVERCOMMIT_GUESS) policy check.
+ * @mm:            address space of the requester (not used by this policy)
+ * @pages:         number of pages being requested
+ * @cap_sys_admin: non-zero if the requester may use the last 3% of memory
+ *
+ * Returns 0 when the estimate of obtainable pages exceeds @pages,
+ * -ENOMEM otherwise.
+ */
+static inline int __vm_enough_memory_guess(struct mm_struct *mm,
+					   long pages,
+					   int cap_sys_admin)
+{
+	unsigned long avail, nr_free;
+
+	/* Start from the page cache plus the free swap. */
+	avail = global_page_state(NR_FILE_PAGES);
+	avail += nr_swap_pages;
+
+	/*
+	 * Slabs created with SLAB_RECLAIM_ACCOUNT claim that their
+	 * contents can be reclaimed under pressure (the dentry cache and
+	 * most inode caches should be of this kind), so count them too.
+	 */
+	avail += global_page_state(NR_SLAB_RECLAIMABLE);
+
+	/* The last 3% is kept back for root. */
+	if (!cap_sys_admin)
+		avail -= avail / 32;
+
+	if (avail > pages)
+		return 0;
+
+	/*
+	 * nr_free_pages() is very expensive on large systems, so it is
+	 * only consulted once the cheap estimate has come up short.
+	 */
+	nr_free = nr_free_pages();
+
+	/* Reserved pages are not available for anonymous memory. */
+	if (nr_free <= totalreserve_pages)
+		return -ENOMEM;
+	nr_free -= totalreserve_pages;
+
+	/* The last 3% is kept back for root. */
+	if (!cap_sys_admin)
+		nr_free -= nr_free / 32;
+	avail += nr_free;
+
+	return avail > pages ? 0 : -ENOMEM;
+}
+
+/*
+ * __vm_enough_memory_never - strict (no overcommit) policy check.
+ * @mm:            address space of the requester
+ * @pages:         number of pages being requested (not used by this check)
+ * @cap_sys_admin: non-zero if the requester may use the last 3% of memory
+ *
+ * Builds the commit limit from non-hugetlb RAM scaled by the overcommit
+ * ratio obtained through vm_acct_get_config(), plus total swap, and
+ * compares the global vm_committed_space against it.
+ *
+ * Returns 0 while the committed space is below the limit, -ENOMEM otherwise.
+ */
+static inline int __vm_enough_memory_never(struct mm_struct *mm,
+					   long pages,
+					   int cap_sys_admin)
+{
+	struct vm_acct_values cfg;
+	unsigned long limit;
+
+	vm_acct_get_config(mm, &cfg);
+
+	limit = (totalram_pages - hugetlb_total_pages())
+			* cfg.overcommit_ratio / 100;
+
+	/* The last 3% is kept back for root. */
+	if (!cap_sys_admin)
+		limit -= limit / 32;
+	limit += total_swap_pages;
+
+	/*
+	 * Do not let a single process grow too big: leave 3% of the size
+	 * of this process for other processes.
+	 */
+	limit -= mm->total_vm / 32;
+
+	/*
+	 * vm_committed_space is sometimes negative, hence the comparison
+	 * is done with the limit cast to a signed long.
+	 */
+	if (atomic_long_read(&vm_committed_space) >= (long)limit)
+		return -ENOMEM;
+
+	return 0;
+}
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/* Cgroup-aware policy checks, implemented in mm/memcontrol.c. */
+extern int mem_cgroup_vm_enough_memory_guess(struct mm_struct *mm, long pages,
+					     int cap_sys_admin);
+extern int mem_cgroup_vm_enough_memory_never(struct mm_struct *mm, long pages,
+					     int cap_sys_admin);
+#else /* CONFIG_CGROUP_MEM_RES_CTLR */
+/* Without the memory controller the checks map onto the global policies. */
+static inline int mem_cgroup_vm_enough_memory_guess(struct mm_struct *mm,
+					long pages, int cap_sys_admin)
+{
+	return __vm_enough_memory_guess(mm, pages, cap_sys_admin);
+}
+
+static inline int mem_cgroup_vm_enough_memory_never(struct mm_struct *mm,
+					long pages, int cap_sys_admin)
+{
+	return __vm_enough_memory_never(mm, pages, cap_sys_admin);
+}
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR */
+

#ifdef CONFIG_SMP
-extern void vm_acct_memory(long pages);
+extern void vm_acct_memory(struct mm_struct *mm, long pages);
#else
-static inline void vm_acct_memory(long pages)
+static inline void vm_acct_memory(struct mm_struct *mm, long pages)
{
atomic_long_add(pages, &vm_committed_space);
+ mem_cgroup_vm_acct_memory(mm, pages); /* keep per-cgroup accounting in step */
}
#endif

-static inline void vm_unacct_memory(long pages)
+static inline void vm_unacct_memory(struct mm_struct *mm, long pages)
{
- vm_acct_memory(-pages);
+ vm_acct_memory(mm, -pages); /* uncharging is a charge of -pages */
}

/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e..4100e24 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,7 @@
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
+#include <linux/mman.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
@@ -141,6 +142,10 @@ struct mem_cgroup {
* statistics.
*/
struct mem_cgroup_stat stat;
+ /*
+ * VM overcommit settings
+ */
+ struct vm_acct_values vmacct;
};
static struct mem_cgroup init_mem_cgroup;

@@ -187,6 +192,130 @@ enum charge_type {
MEM_CGROUP_CHARGE_TYPE_MAPPED,
};

+/*
+ * vm_acct_get_config - copy the overcommit settings of @mm's cgroup into @v.
+ * @mm: address space whose owner task selects the memory cgroup; must not
+ *      be NULL (enforced with BUG_ON)
+ * @v:  output; receives the cgroup's overcommit mode and ratio together
+ *      with a snapshot of its committed-space counter
+ *
+ * The cgroup is looked up and read inside an RCU read-side section.
+ */
+void vm_acct_get_config(const struct mm_struct *mm, struct vm_acct_values *v)
+{
+	struct mem_cgroup *memcg;
+	long committed;
+
+	BUG_ON(!mm);
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	v->overcommit_memory = memcg->vmacct.overcommit_memory;
+	v->overcommit_ratio = memcg->vmacct.overcommit_ratio;
+	committed = atomic_long_read(&memcg->vmacct.vm_committed_space);
+	atomic_long_set(&v->vm_committed_space, committed);
+	rcu_read_unlock();
+}
+
+/*
+ * mem_cgroup_vm_acct_memory - account @pages of committed VM for @mm.
+ * @mm:    address space being charged; a NULL @mm is silently ignored
+ * @pages: signed page count (negative values uncharge)
+ *
+ * Adds @pages both to the committed-space counter of the memory cgroup
+ * that @mm's owner task belongs to and to the owner task's own counter.
+ */
+void mem_cgroup_vm_acct_memory(struct mm_struct *mm, long pages)
+{
+	struct task_struct *owner;
+	struct mem_cgroup *memcg;
+
+	if (!mm)
+		return;
+
+	rcu_read_lock();
+	owner = rcu_dereference(mm->owner);
+	memcg = mem_cgroup_from_task(owner);
+
+	/* Per-cgroup statistic first, then the per-task one. */
+	atomic_long_add(pages, &memcg->vmacct.vm_committed_space);
+	atomic_long_add(pages, &owner->vm_committed_space);
+	rcu_read_unlock();
+}
+
+/*
+ * mem_cgroup_vm_enough_memory_guess - heuristic policy check per cgroup.
+ * @mm:            address space whose owner task selects the memory cgroup
+ * @pages:         number of pages being requested
+ * @cap_sys_admin: non-zero if the requester may use the last 3% of memory
+ *
+ * When the cgroup limit (in pages) is larger than the non-hugetlb RAM,
+ * the decision is delegated to the global __vm_enough_memory_guess().
+ * Otherwise the estimate is built from the cgroup's cache pages, the free
+ * swap, and the room left between the limit and the cgroup's RSS.
+ *
+ * Returns 0 if the estimate exceeds @pages, -ENOMEM otherwise.
+ */
+int mem_cgroup_vm_enough_memory_guess(struct mm_struct *mm,
+				      long pages,
+				      int cap_sys_admin)
+{
+	unsigned long avail, headroom;
+	struct mem_cgroup *memcg;
+	long limit_pages, rss, cache;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	limit_pages = (long) (memcg->res.limit >> PAGE_SHIFT) + 1L;
+	if (limit_pages > (totalram_pages - hugetlb_total_pages())) {
+		rcu_read_unlock();
+		return __vm_enough_memory_guess(mm, pages, cap_sys_admin);
+	}
+	cache = (long)mem_cgroup_read_stat(&memcg->stat,
+					   MEM_CGROUP_STAT_CACHE);
+	rss = (long)mem_cgroup_read_stat(&memcg->stat,
+					 MEM_CGROUP_STAT_RSS);
+	rcu_read_unlock();
+
+	avail = cache;
+	avail += nr_swap_pages;
+
+	/* The last 3% is kept back for root. */
+	if (!cap_sys_admin)
+		avail -= avail / 32;
+
+	if (avail > pages)
+		return 0;
+
+	/* Room left below the cgroup limit. */
+	headroom = limit_pages - rss;
+
+	/* The last 3% is kept back for root. */
+	if (!cap_sys_admin)
+		headroom -= headroom / 32;
+	avail += headroom;
+
+	return avail > pages ? 0 : -ENOMEM;
+}
+
+/*
+ * mem_cgroup_vm_enough_memory_never - strict policy check per cgroup.
+ * @mm:            address space whose owner task selects the memory cgroup
+ * @pages:         number of pages being requested (not used by this check)
+ * @cap_sys_admin: non-zero if the requester may use the last 3% of memory
+ *
+ * When the cgroup limit (in pages) is larger than the non-hugetlb RAM,
+ * the decision is delegated to the global __vm_enough_memory_never().
+ * Otherwise the commit limit is derived from the cgroup limit scaled by
+ * the cgroup's overcommit ratio plus total swap, and the cgroup's
+ * committed-space snapshot is compared against it.
+ *
+ * Returns 0 while the committed space is below the limit, -ENOMEM otherwise.
+ */
+int mem_cgroup_vm_enough_memory_never(struct mm_struct *mm,
+				      long pages,
+				      int cap_sys_admin)
+{
+	struct vm_acct_values cfg;
+	struct mem_cgroup *memcg;
+	unsigned long commit_limit;
+	long limit_pages;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	limit_pages = (long)(memcg->res.limit >> PAGE_SHIFT) + 1L;
+	if (limit_pages > (totalram_pages - hugetlb_total_pages())) {
+		rcu_read_unlock();
+		return __vm_enough_memory_never(mm, pages, cap_sys_admin);
+	}
+	rcu_read_unlock();
+
+	vm_acct_get_config(mm, &cfg);
+
+	commit_limit = limit_pages * cfg.overcommit_ratio / 100;
+
+	/* The last 3% is kept back for root. */
+	if (!cap_sys_admin)
+		commit_limit -= commit_limit / 32;
+	commit_limit += total_swap_pages;
+
+	/*
+	 * Do not let a single process grow too big: leave 3% of the size
+	 * of this process for other processes.
+	 */
+	commit_limit -= mm->total_vm / 32;
+
+	/*
+	 * The committed-space counter is sometimes negative, hence the
+	 * comparison is done with the limit cast to a signed long.
+	 */
+	if (atomic_long_read(&cfg.vm_committed_space) >= (long)commit_limit)
+		return -ENOMEM;
+
+	return 0;
+}
+
/*
* Always modified under lru lock. Then, not necessary to preempt_disable()
*/
@@ -1022,17 +1151,25 @@ static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
struct mem_cgroup *mem;
+ struct cgroup *p = cont->parent;
int node;

- if (unlikely((cont->parent) == NULL)) {
+ if (unlikely((p) == NULL)) {
mem = &init_mem_cgroup;
page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
+ mem->vmacct.overcommit_memory = sysctl_overcommit_memory;
+ mem->vmacct.overcommit_ratio = sysctl_overcommit_ratio;
} else {
mem = mem_cgroup_alloc();
if (!mem)
return ERR_PTR(-ENOMEM);
+ mem->vmacct.overcommit_memory =
+ mem_cgroup_from_cont(p)->vmacct.overcommit_memory;
+ mem->vmacct.overcommit_ratio =
+ mem_cgroup_from_cont(p)->vmacct.overcommit_ratio;
}

+ atomic_long_set(&mem->vmacct.vm_committed_space, 0);
res_counter_init(&mem->res);

for_each_node_state(node, N_POSSIBLE)
diff --git a/mm/mmap.c b/mm/mmap.c
index 3354fdd..256599e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -25,6 +25,7 @@
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
+#include <linux/memcontrol.h>
#include <linux/rmap.h>

#include <asm/uaccess.h>
@@ -100,87 +101,23 @@ atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ struct vm_acct_values v;

- vm_acct_memory(pages);
+ vm_acct_get_config(mm, &v);
+ vm_acct_memory(mm, pages);

/*
* Sometimes we want to use more memory than we have
*/
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+ if (v.overcommit_memory == OVERCOMMIT_ALWAYS)
return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
-
- free = global_page_state(NR_FILE_PAGES);
- free += nr_swap_pages;
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (n <= totalreserve_pages)
- goto error;
- else
- n -= totalreserve_pages;
-
- /*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- n -= n / 32;
- free += n;
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = (totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100;
- /*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- allowed -= allowed / 32;
- allowed += total_swap_pages;
-
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- allowed -= mm->total_vm / 32;
-
- /*
- * cast `allowed' as a signed long because vm_committed_space
- * sometimes has a negative value
- */
- if (atomic_long_read(&vm_committed_space) < (long)allowed)
+ if ((v.overcommit_memory == OVERCOMMIT_GUESS) &&
+ (!mem_cgroup_vm_enough_memory_guess(mm, pages, cap_sys_admin)))
+ return 0;
+ else if (!mem_cgroup_vm_enough_memory_never(mm, pages, cap_sys_admin))
return 0;
-error:
- vm_unacct_memory(pages);
+
+ vm_unacct_memory(mm, pages);

return -ENOMEM;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 3abd084..b194a44 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -20,6 +20,7 @@
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/memcontrol.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ptrace.h>
@@ -1356,86 +1357,23 @@ EXPORT_SYMBOL(get_unmapped_area);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ struct vm_acct_values v;

- vm_acct_memory(pages);
+ vm_acct_get_config(mm, &v);
+ vm_acct_memory(mm, pages);

/*
* Sometimes we want to use more memory than we have
*/
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+ if (v.overcommit_memory == OVERCOMMIT_ALWAYS)
return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
-
- free = global_page_state(NR_FILE_PAGES);
- free += nr_swap_pages;
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (n <= totalreserve_pages)
- goto error;
- else
- n -= totalreserve_pages;
-
- /*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- n -= n / 32;
- free += n;
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = totalram_pages * sysctl_overcommit_ratio / 100;
- /*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- allowed -= allowed / 32;
- allowed += total_swap_pages;
-
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- allowed -= current->mm->total_vm / 32;
-
- /*
- * cast `allowed' as a signed long because vm_committed_space
- * sometimes has a negative value
- */
- if (atomic_long_read(&vm_committed_space) < (long)allowed)
+ if ((v.overcommit_memory == OVERCOMMIT_GUESS) &&
+ (!mem_cgroup_vm_enough_memory_guess(mm, pages, cap_sys_admin)))
+ return 0;
+ else if (!mem_cgroup_vm_enough_memory_never(mm, pages, cap_sys_admin))
return 0;
-error:
- vm_unacct_memory(pages);
+
+ vm_unacct_memory(mm, pages);

return -ENOMEM;
}
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25..f7676db 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -495,7 +495,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);

static DEFINE_PER_CPU(long, committed_space) = 0;

-void vm_acct_memory(long pages)
+void vm_acct_memory(struct mm_struct *mm, long pages)
{
long *local;

@@ -507,6 +507,7 @@ void vm_acct_memory(long pages)
*local = 0;
}
preempt_enable();
+ mem_cgroup_vm_acct_memory(mm, pages);
}

#ifdef CONFIG_HOTPLUG_CPU
--
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/