Re: [PATCH] meminfo: show /proc/meminfo base on container's memcg
From: Zhu Yanhai
Date: Thu Jun 07 2012 - 19:18:44 EST
2012/5/29 Gao feng <gaofeng@xxxxxxxxxxxxxx>:
> cgroup and namespaces are used for creating containers but some of
> information is not isolated/virtualized. This patch is for isolating /proc/meminfo
> information per container, which uses memory cgroup. By this, top,free
> and other tools under container can work as expected(show container's
> usage) without changes.
>
> This patch is a trial to show memcg's info in /proc/meminfo if 'current'
> is under a memcg other than root.
>
> we show /proc/meminfo base on container's memory cgroup.
> because there are lots of info can't be provide by memcg, and
> the cmds such as top, free just use some entries of /proc/meminfo,
> we replace those entries by memory cgroup.
>
> if container has no memcg, we will show host's /proc/meminfo
> as before.
>
> there is no idea how to deal with Buffers,I just set it zero,
> It's strange if Buffers bigger than MemTotal.
>
> Signed-off-by: Gao feng <gaofeng@xxxxxxxxxxxxxx>
> ---
> Âfs/proc/meminfo.c     Â|  11 +++++---
> Âinclude/linux/memcontrol.h | Â 15 +++++++++++
> Âmm/memcontrol.c      Â|  56 ++++++++++++++++++++++++++++++++++++++++++++
> Â3 files changed, 78 insertions(+), 4 deletions(-)
>
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 80e4645..29a1fcd 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -13,6 +13,7 @@
> Â#include <linux/atomic.h>
> Â#include <asm/page.h>
> Â#include <asm/pgtable.h>
> +#include <linux/memcontrol.h>
> Â#include "internal.h"
>
> Âvoid __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
> @@ -27,7 +28,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> Â Â Â Âstruct vmalloc_info vmi;
> Â Â Â Âlong cached;
> Â Â Â Âunsigned long pages[NR_LRU_LISTS];
> - Â Â Â int lru;
>
> Â/*
> Â* display in kilobytes.
> @@ -39,16 +39,19 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> Â Â Â Âallowed = ((totalram_pages - hugetlb_total_pages())
> Â Â Â Â Â Â Â Â* sysctl_overcommit_ratio / 100) + total_swap_pages;
>
> + Â Â Â memcg_meminfo(&i);
> Â Â Â Âcached = global_page_state(NR_FILE_PAGES) -
> Â Â Â Â Â Â Â Â Â Â Â Âtotal_swapcache_pages - i.bufferram;
> + Â Â Â /*
> + Â Â Â Â* If 'current' is in root memory cgroup, returns global status.
> + Â Â Â Â* If not, returns the status of memcg under which current runs.
> + Â Â Â Â*/
> + Â Â Â sys_page_state(pages, &cached);
> Â Â Â Âif (cached < 0)
> Â Â Â Â Â Â Â Âcached = 0;
>
> Â Â Â Âget_vmalloc_info(&vmi);
>
> - Â Â Â for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> - Â Â Â Â Â Â Â pages[lru] = global_page_state(NR_LRU_BASE + lru);
> -
> Â Â Â Â/*
> Â Â Â Â * Tagged format, for easy grepping and expansion.
> Â Â Â Â */
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 0316197..6220764 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -21,6 +21,7 @@
> Â#define _LINUX_MEMCONTROL_H
> Â#include <linux/cgroup.h>
> Â#include <linux/vm_event_item.h>
> +#include <linux/mm.h>
>
> Âstruct mem_cgroup;
> Âstruct page_cgroup;
> @@ -116,6 +117,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
> Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â struct mem_cgroup_reclaim_cookie *);
> Âvoid mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
>
> +extern void memcg_meminfo(struct sysinfo *si);
> +extern void sys_page_state(unsigned long *page, long *cached);
> +
> Â/*
> Â* For memory reclaim.
> Â*/
> @@ -323,6 +327,17 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
> Â{
> Â}
>
> +static inline void memcg_meminfo(struct sysinfo *si)
> +{
> +}
> +
> +static inline void sys_page_state(unsigned long *pages, long *cached)
> +{
> + Â Â Â int lru;
> + Â Â Â for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> + Â Â Â Â Â Â Â pages[lru] = global_page_state(NR_LRU_BASE + lru);
> +}
> +
> Âstatic inline bool mem_cgroup_disabled(void)
> Â{
> Â Â Â Âreturn true;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f142ea9..c25e160 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -52,6 +52,7 @@
> Â#include "internal.h"
> Â#include <net/sock.h>
> Â#include <net/tcp_memcontrol.h>
> +#include <linux/pid_namespace.h>
>
> Â#include <asm/uaccess.h>
>
> @@ -4345,6 +4346,61 @@ mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
> Â Â Â Â Â Â Â Âmem_cgroup_get_local_stat(iter, s);
> Â}
>
> +void memcg_meminfo(struct sysinfo *val)
> +{
> + Â Â Â struct mem_cgroup *memcg = mem_cgroup_from_task(current);
> + Â Â Â __kernel_ulong_t totalram, totalswap;
> + Â Â Â if (current->nsproxy->pid_ns == &init_pid_ns ||
> + Â Â Â Â Â memcg == NULL || mem_cgroup_is_root(memcg))
> + Â Â Â Â Â Â Â return;
This is somehow not exactly the same with your description "if 'current'
> is under a memcg other than root." since you are checking both namespace and cgroup constraint.
> +
> + Â Â Â totalram = res_counter_read_u64(&memcg->res,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â RES_LIMIT) >> PAGE_SHIFT;
> + Â Â Â if (totalram < val->totalram) {
> + Â Â Â Â Â Â Â __kernel_ulong_t usageram;
> + Â Â Â Â Â Â Â usageram = res_counter_read_u64(&memcg->res,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â RES_USAGE) >> PAGE_SHIFT;
> + Â Â Â Â Â Â Â val->totalram = totalram;
> + Â Â Â Â Â Â Â val->freeram = totalram - usageram;
> + Â Â Â Â Â Â Â val->bufferram = 0;
> + Â Â Â Â Â Â Â val->totalhigh = 0;
> + Â Â Â Â Â Â Â val->freehigh = 0;
> + Â Â Â } else
> + Â Â Â Â Â Â Â return;
You don't want to show the local swap usage and limit (below code) if
mem.hard_limit_in_bytes is larger than the physical memory capacity?
Why?
> +
> + Â Â Â totalswap = res_counter_read_u64(&memcg->memsw,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â ÂRES_LIMIT) >> PAGE_SHIFT;
> + Â Â Â if (totalswap < val->totalswap) {
here, do you need also to check memcg->memsw_is_minimum? Local
swapping is disabled if memsw_is_minium is true.
> + Â Â Â Â Â Â Â __kernel_ulong_t usageswap;
> + Â Â Â Â Â Â Â usageswap = res_counter_read_u64(&memcg->memsw,
> + Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â ÂRES_USAGE) >> PAGE_SHIFT;
> + Â Â Â Â Â Â Â val->totalswap = totalswap - val->totalram;
> + Â Â Â Â Â Â Â val->freeswap = totalswap - usageswap - val->freeram;
This is seriously broken, memsw.limit_in_bytes means the max swap
space you can get, and under global memory pressure all physical
memory belongs to a memcg might get dump into disk, until it reaches
the limit set by memsw.limit_in_bytes. So you should not subtract the
physical memory usage.
> + Â Â Â }
And you need a 'else' statement here, although memsw.limit_in_bytes
might be larger than val->totalswap, it's still wrong to show the user
a global swap usage other than your local one, right?
> +}
> +
> +void sys_page_state(unsigned long *pages, long *cached)
> +{
> + Â Â Â struct mem_cgroup *memcg = mem_cgroup_from_task(current);
> +
> + Â Â Â if (current->nsproxy->pid_ns == &init_pid_ns ||
> + Â Â Â Â Â memcg == NULL || mem_cgroup_is_root(memcg)) {
> + Â Â Â Â Â Â Â int lru;
> + Â Â Â Â Â Â Â for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> + Â Â Â Â Â Â Â Â Â Â Â pages[lru] = global_page_state(NR_LRU_BASE + lru);
> + Â Â Â } else {
> + Â Â Â Â Â Â Â struct mcs_total_stat s;
> + Â Â Â Â Â Â Â memset(&s, 0, sizeof(s));
> + Â Â Â Â Â Â Â mem_cgroup_get_total_stat(memcg, &s);
> + Â Â Â Â Â Â Â *cached = s.stat[MCS_CACHE] >> PAGE_SHIFT;
> + Â Â Â Â Â Â Â pages[LRU_ACTIVE_ANON] = s.stat[MCS_ACTIVE_ANON] >> PAGE_SHIFT;
> + Â Â Â Â Â Â Â pages[LRU_ACTIVE_FILE] = s.stat[MCS_ACTIVE_FILE] >> PAGE_SHIFT;
> + Â Â Â Â Â Â Â pages[LRU_INACTIVE_ANON] = s.stat[MCS_INACTIVE_ANON] >> PAGE_SHIFT;
> + Â Â Â Â Â Â Â pages[LRU_INACTIVE_FILE] = s.stat[MCS_INACTIVE_FILE] >> PAGE_SHIFT;
> + Â Â Â Â Â Â Â pages[LRU_UNEVICTABLE] = s.stat[MCS_UNEVICTABLE] >> PAGE_SHIFT;
> + Â Â Â }
> +}
> +
> Â#ifdef CONFIG_NUMA
> Âstatic int mem_control_numa_stat_show(struct seq_file *m, void *arg)
> Â{
> --
> 1.7.7.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe cgroups" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at Âhttp://vger.kernel.org/majordomo-info.html
We have a highly similar patch with yours here on a large production
system, AFAICS it's absolutely unaccepted to require correcting the
userland tools to read the stat file exported by the various cgroups,
which have different formats and locations and names with the legacy
ones i.e. /proc/stat, /proc/meminfo and /proc/loadavg. There are tons
of the tools and it's impossible to do so (just think about,
top/sar/vmstat/mpstat/free/who, and various userland library read the
number directly from above files, setup their thread pool or memory
pool according to the available resource).
--
Thanks,
Zhu Yanhai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/