Re: [PATCH v1] proc: Implement /proc/self/meminfo
From: Michal Hocko
Date: Thu Jun 03 2021 - 07:33:36 EST
[Cc linux-api]
On Thu 03-06-21 12:43:07, legion@xxxxxxxxxx wrote:
> From: Alexey Gladkov <legion@xxxxxxxxxx>
>
> The /proc/meminfo file reports system-wide information regardless of
> any cgroup restrictions. This file is still widely used [1], which means
> that the programs relying on it will not work correctly inside a
> container [2][3][4]. Some programs try to respect the cgroup limits, but
> not all of them implement support for all cgroup versions [5].
>
> Correct information can be obtained from cgroups, but this requires the
> cgroups to be available inside the container and the correct version of
> cgroups to be supported.
>
> There is lxcfs [6], which emulates /proc/meminfo using FUSE to provide
> cgroup-aware information. This patch can help such projects.
>
> This patch adds /proc/self/meminfo, which contains a subset of the
> /proc/meminfo fields and respects cgroup restrictions.
>
> We cannot just create /proc/self/meminfo and turn the old location into
> a symlink to it, because this would break existing AppArmor rules [7].
> Therefore, the patch adds a separate file with the same format.
>
> [1] https://codesearch.debian.net/search?q=%2Fproc%2Fmeminfo
> [2] https://sources.debian.org/src/erlang/1:23.2.6+dfsg-1/lib/os_mon/c_src/memsup.c#L300
> [3] https://sources.debian.org/src/p7zip/16.02+dfsg-8/CPP/Windows/System.cpp/#L103
> [4] https://sources.debian.org/src/systemd/247.3-5/src/oom/oomd.c/#L138
> [5] https://sources.debian.org/src/nodejs/12.21.0%7Edfsg-4/deps/uv/src/unix/linux-core.c/#L1059
> [6] https://linuxcontainers.org/lxcfs/
> [7] https://gitlab.com/apparmor/apparmor/-/blob/master/profiles/apparmor.d/abstractions/base#L98
>
> Signed-off-by: Alexey Gladkov <legion@xxxxxxxxxx>
> ---
> fs/proc/base.c | 2 +
> fs/proc/internal.h | 6 ++
> fs/proc/meminfo.c | 160 +++++++++++++++++++++++--------------
> include/linux/memcontrol.h | 2 +
> include/linux/mm.h | 15 ++++
> mm/memcontrol.c | 80 +++++++++++++++++++
> mm/page_alloc.c | 28 ++++---
> 7 files changed, 222 insertions(+), 71 deletions(-)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 58bbf334265b..e95837cf713f 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -3269,6 +3269,7 @@ static const struct pid_entry tgid_base_stuff[] = {
> #ifdef CONFIG_SECCOMP_CACHE_DEBUG
> ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
> #endif
> + ONE("meminfo", S_IRUGO, proc_meminfo_show),
> };
>
> static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
> @@ -3602,6 +3603,7 @@ static const struct pid_entry tid_base_stuff[] = {
> #ifdef CONFIG_SECCOMP_CACHE_DEBUG
> ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
> #endif
> + ONE("meminfo", S_IRUGO, proc_meminfo_show),
> };
>
> static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 03415f3fb3a8..a6e8540afbd3 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -241,6 +241,12 @@ extern int proc_net_init(void);
> static inline int proc_net_init(void) { return 0; }
> #endif
>
> +/*
> + * meminfo.c
> + */
> +extern int proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns,
> + struct pid *pid, struct task_struct *tsk);
> +
> /*
> * proc_self.c
> */
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 6fa761c9cc78..3587a79d4b96 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -16,6 +16,9 @@
> #ifdef CONFIG_CMA
> #include <linux/cma.h>
> #endif
> +#ifdef CONFIG_MEMCG
> +#include <linux/memcontrol.h>
> +#endif
> #include <asm/page.h>
> #include "internal.h"
>
> @@ -23,91 +26,112 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
> {
> }
>
> +static void proc_fill_meminfo(struct meminfo *mi)
> +{
> + int lru;
> + long cached;
> +
> + si_meminfo(&mi->si);
> + si_swapinfo(&mi->si);
> +
> + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> + mi->pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
> +
> + cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - mi->si.bufferram;
> + if (cached < 0)
> + cached = 0;
> +
> + mi->cached = cached;
> + mi->swapcached = total_swapcache_pages();
> + mi->slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
> + mi->slab_unreclaimable = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
> + mi->anon_pages = global_node_page_state(NR_ANON_MAPPED);
> + mi->mapped = global_node_page_state(NR_FILE_MAPPED);
> + mi->nr_pagetable = global_node_page_state(NR_PAGETABLE);
> + mi->dirty_pages = global_node_page_state(NR_FILE_DIRTY);
> + mi->writeback_pages = global_node_page_state(NR_WRITEBACK);
> +}
> +
> +#ifdef CONFIG_MEMCG
> +static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task)
> +{
> + mem_fill_meminfo(mi, task);
> +}
> +#else
> +static inline void fill_meminfo(struct meminfo *mi, struct task_struct *task)
> +{
> + proc_fill_meminfo(mi);
> +}
> +#endif
> +
> static void show_val_kb(struct seq_file *m, const char *s, unsigned long num)
> {
> seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8);
> seq_write(m, " kB\n", 4);
> }
>
> +static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi)
> +{
> + show_val_kb(m, "MemTotal: ", mi->si.totalram);
> + show_val_kb(m, "MemFree: ", mi->si.freeram);
> + show_val_kb(m, "MemAvailable: ", si_mem_available_mi(mi));
> + show_val_kb(m, "Buffers: ", mi->si.bufferram);
> + show_val_kb(m, "Cached: ", mi->cached);
> + show_val_kb(m, "SwapCached: ", mi->swapcached);
> + show_val_kb(m, "Active: ", mi->pages[LRU_ACTIVE_ANON] + mi->pages[LRU_ACTIVE_FILE]);
> + show_val_kb(m, "Inactive: ", mi->pages[LRU_INACTIVE_ANON] + mi->pages[LRU_INACTIVE_FILE]);
> + show_val_kb(m, "Active(anon): ", mi->pages[LRU_ACTIVE_ANON]);
> + show_val_kb(m, "Inactive(anon): ", mi->pages[LRU_INACTIVE_ANON]);
> + show_val_kb(m, "Active(file): ", mi->pages[LRU_ACTIVE_FILE]);
> + show_val_kb(m, "Inactive(file): ", mi->pages[LRU_INACTIVE_FILE]);
> + show_val_kb(m, "Unevictable: ", mi->pages[LRU_UNEVICTABLE]);
> +
> +#ifdef CONFIG_HIGHMEM
> + show_val_kb(m, "HighTotal: ", mi->si.totalhigh);
> + show_val_kb(m, "HighFree: ", mi->si.freehigh);
> + show_val_kb(m, "LowTotal: ", mi->si.totalram - mi->si.totalhigh);
> + show_val_kb(m, "LowFree: ", mi->si.freeram - mi->si.freehigh);
> +#endif
> +
> + show_val_kb(m, "SwapTotal: ", mi->si.totalswap);
> + show_val_kb(m, "SwapFree: ", mi->si.freeswap);
> + show_val_kb(m, "Dirty: ", mi->dirty_pages);
> + show_val_kb(m, "Writeback: ", mi->writeback_pages);
> +
> + show_val_kb(m, "AnonPages: ", mi->anon_pages);
> + show_val_kb(m, "Mapped: ", mi->mapped);
> + show_val_kb(m, "Shmem: ", mi->si.sharedram);
> + show_val_kb(m, "Slab: ", mi->slab_reclaimable + mi->slab_unreclaimable);
> + show_val_kb(m, "SReclaimable: ", mi->slab_reclaimable);
> + show_val_kb(m, "SUnreclaim: ", mi->slab_unreclaimable);
> + show_val_kb(m, "PageTables: ", mi->nr_pagetable);
> +
> + return 0;
> +}
> +
> static int meminfo_proc_show(struct seq_file *m, void *v)
> {
> - struct sysinfo i;
> - unsigned long committed;
> - long cached;
> - long available;
> - unsigned long pages[NR_LRU_LISTS];
> - unsigned long sreclaimable, sunreclaim;
> - int lru;
>
> - si_meminfo(&i);
> - si_swapinfo(&i);
> - committed = vm_memory_committed();
> + struct meminfo mi;
>
> - cached = global_node_page_state(NR_FILE_PAGES) -
> - total_swapcache_pages() - i.bufferram;
> - if (cached < 0)
> - cached = 0;
> + proc_fill_meminfo(&mi);
> + meminfo_proc_show_mi(m, &mi);
>
> - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> - pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
> -
> - available = si_mem_available();
> - sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
> - sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
> -
> - show_val_kb(m, "MemTotal: ", i.totalram);
> - show_val_kb(m, "MemFree: ", i.freeram);
> - show_val_kb(m, "MemAvailable: ", available);
> - show_val_kb(m, "Buffers: ", i.bufferram);
> - show_val_kb(m, "Cached: ", cached);
> - show_val_kb(m, "SwapCached: ", total_swapcache_pages());
> - show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] +
> - pages[LRU_ACTIVE_FILE]);
> - show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] +
> - pages[LRU_INACTIVE_FILE]);
> - show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]);
> - show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
> - show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
> - show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
> - show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
> show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
>
> -#ifdef CONFIG_HIGHMEM
> - show_val_kb(m, "HighTotal: ", i.totalhigh);
> - show_val_kb(m, "HighFree: ", i.freehigh);
> - show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh);
> - show_val_kb(m, "LowFree: ", i.freeram - i.freehigh);
> -#endif
> -
> #ifndef CONFIG_MMU
> show_val_kb(m, "MmapCopy: ",
> (unsigned long)atomic_long_read(&mmap_pages_allocated));
> #endif
>
> - show_val_kb(m, "SwapTotal: ", i.totalswap);
> - show_val_kb(m, "SwapFree: ", i.freeswap);
> - show_val_kb(m, "Dirty: ",
> - global_node_page_state(NR_FILE_DIRTY));
> - show_val_kb(m, "Writeback: ",
> - global_node_page_state(NR_WRITEBACK));
> - show_val_kb(m, "AnonPages: ",
> - global_node_page_state(NR_ANON_MAPPED));
> - show_val_kb(m, "Mapped: ",
> - global_node_page_state(NR_FILE_MAPPED));
> - show_val_kb(m, "Shmem: ", i.sharedram);
> - show_val_kb(m, "KReclaimable: ", sreclaimable +
> + show_val_kb(m, "KReclaimable: ", mi.slab_reclaimable +
> global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE));
> - show_val_kb(m, "Slab: ", sreclaimable + sunreclaim);
> - show_val_kb(m, "SReclaimable: ", sreclaimable);
> - show_val_kb(m, "SUnreclaim: ", sunreclaim);
> seq_printf(m, "KernelStack: %8lu kB\n",
> global_node_page_state(NR_KERNEL_STACK_KB));
> #ifdef CONFIG_SHADOW_CALL_STACK
> seq_printf(m, "ShadowCallStack:%8lu kB\n",
> global_node_page_state(NR_KERNEL_SCS_KB));
> #endif
> - show_val_kb(m, "PageTables: ",
> - global_node_page_state(NR_PAGETABLE));
>
> show_val_kb(m, "NFS_Unstable: ", 0);
> show_val_kb(m, "Bounce: ",
> @@ -115,7 +139,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "WritebackTmp: ",
> global_node_page_state(NR_WRITEBACK_TEMP));
> show_val_kb(m, "CommitLimit: ", vm_commit_limit());
> - show_val_kb(m, "Committed_AS: ", committed);
> + show_val_kb(m, "Committed_AS: ", vm_memory_committed());
> seq_printf(m, "VmallocTotal: %8lu kB\n",
> (unsigned long)VMALLOC_TOTAL >> 10);
> show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages());
> @@ -153,6 +177,20 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> return 0;
> }
>
> +int proc_meminfo_show(struct seq_file *m, struct pid_namespace *ns,
> + struct pid *pid, struct task_struct *task)
> +{
> + struct meminfo mi;
> +
> + fill_meminfo(&mi, task);
> +
> + meminfo_proc_show_mi(m, &mi);
> + hugetlb_report_meminfo(m);
> + arch_report_meminfo(m);
> +
> + return 0;
> +}
> +
> static int __init proc_meminfo_init(void)
> {
> proc_create_single("meminfo", 0, NULL, meminfo_proc_show);
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index c193be760709..4a7e2894954f 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1119,6 +1119,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
> gfp_t gfp_mask,
> unsigned long *total_scanned);
>
> +void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task);
> +
> #else /* CONFIG_MEMCG */
>
> #define MEM_CGROUP_ID_SHIFT 0
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c274f75efcf9..7faeaddd5b88 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2467,6 +2467,20 @@ static inline int early_pfn_to_nid(unsigned long pfn)
> extern int __meminit early_pfn_to_nid(unsigned long pfn);
> #endif
>
> +struct meminfo {
> + struct sysinfo si;
> + unsigned long pages[NR_LRU_LISTS];
> + unsigned long cached;
> + unsigned long swapcached;
> + unsigned long anon_pages;
> + unsigned long mapped;
> + unsigned long nr_pagetable;
> + unsigned long dirty_pages;
> + unsigned long writeback_pages;
> + unsigned long slab_reclaimable;
> + unsigned long slab_unreclaimable;
> +};
> +
> extern void set_dma_reserve(unsigned long new_dma_reserve);
> extern void memmap_init_range(unsigned long, int, unsigned long,
> unsigned long, unsigned long, enum meminit_context,
> @@ -2477,6 +2491,7 @@ extern int __meminit init_per_zone_wmark_min(void);
> extern void mem_init(void);
> extern void __init mmap_init(void);
> extern void show_mem(unsigned int flags, nodemask_t *nodemask);
> +extern long si_mem_available_mi(struct meminfo *mi);
> extern long si_mem_available(void);
> extern void si_meminfo(struct sysinfo * val);
> extern void si_meminfo_node(struct sysinfo *val, int nid);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 64ada9e650a5..344b546f9e25 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3750,6 +3750,86 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
> return nr;
> }
>
> +static void mem_cgroup_nr_pages(struct mem_cgroup *memcg, int nid, unsigned long *pages)
> +{
> + struct mem_cgroup *iter;
> + int i;
> +
> + for_each_mem_cgroup_tree(iter, memcg) {
> + for (i = 0; i < NR_LRU_LISTS; i++)
> + pages[i] += mem_cgroup_node_nr_lru_pages(iter, nid, BIT(i), false);
> + }
> +}
> +
> +static void mem_cgroup_si_meminfo(struct sysinfo *si, struct task_struct *task)
> +{
> + unsigned long memtotal, memused, swapsize;
> + struct mem_cgroup *memcg;
> + struct cgroup_subsys_state *css;
> +
> + css = task_css(task, memory_cgrp_id);
> + memcg = mem_cgroup_from_css(css);
> +
> + memtotal = READ_ONCE(memcg->memory.max);
> +
> + if (memtotal != PAGE_COUNTER_MAX) {
> + memused = page_counter_read(&memcg->memory);
> +
> + si->totalram = memtotal;
> + si->freeram = (memtotal > memused ? memtotal - memused : 0);
> + si->sharedram = memcg_page_state(memcg, NR_SHMEM);
> +
> + si->bufferram = nr_blockdev_pages();
> + si->totalhigh = totalhigh_pages();
> + si->freehigh = nr_free_highpages();
> + si->mem_unit = PAGE_SIZE;
> + } else {
> + si_meminfo(si);
> + memused = si->totalram - si->freeram;
> + }
> +
> + swapsize = READ_ONCE(memcg->memsw.max);
> +
> + if (swapsize != PAGE_COUNTER_MAX) {
> + unsigned long swaptotal, swapused;
> +
> + swaptotal = swapsize - memtotal;
> + swapused = page_counter_read(&memcg->memsw) - memused;
> + si->totalswap = swaptotal;
> + /* Due to global reclaim, memory.memsw.usage can be greater than
> + * (memory.memsw.max - memory.max). */
> + si->freeswap = (swaptotal > swapused ? swaptotal - swapused : 0);
> + } else {
> + si_swapinfo(si);
> + }
> +
> + css_put(css);
> +}
> +
> +void mem_fill_meminfo(struct meminfo *mi, struct task_struct *task)
> +{
> + struct cgroup_subsys_state *memcg_css = task_css(task, memory_cgrp_id);
> + struct mem_cgroup *memcg = mem_cgroup_from_css(memcg_css);
> + int nid;
> +
> + memset(&mi->pages, 0, sizeof(mi->pages));
> +
> + mem_cgroup_si_meminfo(&mi->si, task);
> +
> + for_each_online_node(nid)
> + mem_cgroup_nr_pages(memcg, nid, mi->pages);
> +
> + mi->slab_reclaimable = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B);
> + mi->slab_unreclaimable = memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
> + mi->cached = memcg_page_state(memcg, NR_FILE_PAGES);
> + mi->swapcached = memcg_page_state(memcg, NR_SWAPCACHE);
> + mi->anon_pages = memcg_page_state(memcg, NR_ANON_MAPPED);
> + mi->mapped = memcg_page_state(memcg, NR_FILE_MAPPED);
> + mi->nr_pagetable = memcg_page_state(memcg, NR_PAGETABLE);
> + mi->dirty_pages = memcg_page_state(memcg, NR_FILE_DIRTY);
> + mi->writeback_pages = memcg_page_state(memcg, NR_WRITEBACK);
> +}
> +
> static int memcg_numa_stat_show(struct seq_file *m, void *v)
> {
> struct numa_stat {
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index aaa1655cf682..0a3c9dcd2c13 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5551,18 +5551,13 @@ static inline void show_node(struct zone *zone)
> printk("Node %d ", zone_to_nid(zone));
> }
>
> -long si_mem_available(void)
> +long si_mem_available_mi(struct meminfo *mi)
> {
> long available;
> unsigned long pagecache;
> unsigned long wmark_low = 0;
> - unsigned long pages[NR_LRU_LISTS];
> unsigned long reclaimable;
> struct zone *zone;
> - int lru;
> -
> - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> - pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
>
> for_each_zone(zone)
> wmark_low += low_wmark_pages(zone);
> @@ -5571,14 +5566,14 @@ long si_mem_available(void)
> * Estimate the amount of memory available for userspace allocations,
> * without causing swapping.
> */
> - available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
> + available = mi->si.freeram - totalreserve_pages;
>
> /*
> * Not all the page cache can be freed, otherwise the system will
> * start swapping. Assume at least half of the page cache, or the
> * low watermark worth of cache, needs to stay.
> */
> - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
> + pagecache = mi->pages[LRU_ACTIVE_FILE] + mi->pages[LRU_INACTIVE_FILE];
> pagecache -= min(pagecache / 2, wmark_low);
> available += pagecache;
>
> @@ -5587,14 +5582,27 @@ long si_mem_available(void)
> * items that are in use, and cannot be freed. Cap this estimate at the
> * low watermark.
> */
> - reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
> - global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
> + reclaimable = mi->slab_reclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
> available += reclaimable - min(reclaimable / 2, wmark_low);
>
> if (available < 0)
> available = 0;
> return available;
> }
> +
> +long si_mem_available(void)
> +{
> + struct meminfo mi;
> + int lru;
> +
> + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
> + mi.pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
> +
> + mi.si.freeram = global_zone_page_state(NR_FREE_PAGES);
> + mi.slab_reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B);
> +
> + return si_mem_available_mi(&mi);
> +}
> EXPORT_SYMBOL_GPL(si_mem_available);
>
> void si_meminfo(struct sysinfo *val)
> --
> 2.29.3
--
Michal Hocko
SUSE Labs