Re: [RFC PATCH] mm, kvm: account kvm_vcpu_mmap to kmemcg

From: Michal Hocko
Date: Fri Mar 29 2019 - 03:52:13 EST


On Thu 28-03-19 18:28:36, Shakeel Butt wrote:
> A VCPU of a VM can allocate upto three pages which can be mmap'ed by the
> user space application. At the moment this memory is not charged. On a
> large machine running large number of VMs (or small number of VMs having
> large number of VCPUs), this unaccounted memory can be very significant.

Is this really the case. How many machines are we talking about? Say I
have a virtual machines with 1K cpus, this will result in 12MB. Is this
significant to the overal size of the virtual machine to even care?

> So, this memory should be charged to a kmemcg. However that is not
> possible as these pages are mmapped to the userspace and PageKmemcg()
> was designed with the assumption that such pages will never be mmapped
> to the userspace.
>
> One way to solve this problem is by introducing an additional memcg
> charging API similar to mem_cgroup_[un]charge_skmem(). However skmem
> charging API usage is contained and shared and no new users are
> expected but the pages which can be mmapped and should be charged to
> kmemcg can and will increase. So, requiring the usage for such API will
> increase the maintenance burden. The simplest solution is to remove the
> assumption of no mmapping PageKmemcg() pages to user space.

IIRC the only purpose of PageKmemcg is to keep accounting in the legacy
memcg right. Spending a page flag for that is just no-go. If PageKmemcg
cannot reuse mapping then we have to find a better place for it (e.g.
bottom bit in the page->memcg pointer or rethink the whole PageKmemcg.

> Signed-off-by: Shakeel Butt <shakeelb@xxxxxxxxxx>
> ---
> arch/s390/kvm/kvm-s390.c | 2 +-
> arch/x86/kvm/x86.c | 2 +-
> include/linux/page-flags.h | 26 ++++++++++++++++++--------
> include/trace/events/mmflags.h | 9 ++++++++-
> virt/kvm/coalesced_mmio.c | 2 +-
> virt/kvm/kvm_main.c | 2 +-
> 6 files changed, 30 insertions(+), 13 deletions(-)
>
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 4638303ba6a8..1a9e337ed5da 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -2953,7 +2953,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
> goto out;
>
> BUILD_BUG_ON(sizeof(struct sie_page) != 4096);
> - sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL);
> + sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT);
> if (!sie_page)
> goto out_free_cpu;
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 65e4559eef2f..05c0c7eaa5c6 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -9019,7 +9019,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> else
> vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
>
> - page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> if (!page) {
> r = -ENOMEM;
> goto fail;
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 9f8712a4b1a5..b47a6a327d6a 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -78,6 +78,10 @@
> * PG_hwpoison indicates that a page got corrupted in hardware and contains
> * data with incorrect ECC bits that triggered a machine check. Accessing is
> * not safe since it may cause another machine check. Don't touch!
> + *
> + * PG_kmemcg indicates that a kmem page is charged to a memcg. If kmemcg is
> + * enabled, the page allocator will set PageKmemcg() on pages allocated with
> + * __GFP_ACCOUNT. It gets cleared on page free.
> */
>
> /*
> @@ -130,6 +134,9 @@ enum pageflags {
> #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
> PG_young,
> PG_idle,
> +#endif
> +#ifdef CONFIG_MEMCG_KMEM
> + PG_kmemcg,
> #endif
> __NR_PAGEFLAGS,
>
> @@ -289,6 +296,9 @@ static inline int Page##uname(const struct page *page) { return 0; }
> #define SETPAGEFLAG_NOOP(uname) \
> static inline void SetPage##uname(struct page *page) { }
>
> +#define __SETPAGEFLAG_NOOP(uname) \
> +static inline void __SetPage##uname(struct page *page) { }
> +
> #define CLEARPAGEFLAG_NOOP(uname) \
> static inline void ClearPage##uname(struct page *page) { }
>
> @@ -427,6 +437,13 @@ TESTCLEARFLAG(Young, young, PF_ANY)
> PAGEFLAG(Idle, idle, PF_ANY)
> #endif
>
> +#ifdef CONFIG_MEMCG_KMEM
> +__PAGEFLAG(Kmemcg, kmemcg, PF_NO_TAIL)
> +#else
> +TESTPAGEFLAG_FALSE(kmemcg)
> +__SETPAGEFLAG_NOOP(kmemcg)
> +__CLEARPAGEFLAG_NOOP(kmemcg)
> +#endif
> /*
> * On an anonymous page mapped into a user virtual memory area,
> * page->mapping points to its anon_vma, not to a struct address_space;
> @@ -701,8 +718,7 @@ PAGEFLAG_FALSE(DoubleMap)
> #define PAGE_MAPCOUNT_RESERVE -128
> #define PG_buddy 0x00000080
> #define PG_offline 0x00000100
> -#define PG_kmemcg 0x00000200
> -#define PG_table 0x00000400
> +#define PG_table 0x00000200
>
> #define PageType(page, flag) \
> ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
> @@ -743,12 +759,6 @@ PAGE_TYPE_OPS(Buddy, buddy)
> */
> PAGE_TYPE_OPS(Offline, offline)
>
> -/*
> - * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
> - * pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
> - */
> -PAGE_TYPE_OPS(Kmemcg, kmemcg)
> -
> /*
> * Marks pages in use as page tables.
> */
> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
> index a1675d43777e..d93b78eac5b9 100644
> --- a/include/trace/events/mmflags.h
> +++ b/include/trace/events/mmflags.h
> @@ -79,6 +79,12 @@
> #define IF_HAVE_PG_IDLE(flag,string)
> #endif
>
> +#ifdef CONFIG_MEMCG_KMEM
> +#define IF_HAVE_PG_KMEMCG(flag,string) ,{1UL << flag, string}
> +#else
> +#define IF_HAVE_PG_KMEMCG(flag,string)
> +#endif
> +
> #define __def_pageflag_names \
> {1UL << PG_locked, "locked" }, \
> {1UL << PG_waiters, "waiters" }, \
> @@ -105,7 +111,8 @@ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \
> IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
> IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
> IF_HAVE_PG_IDLE(PG_young, "young" ) \
> -IF_HAVE_PG_IDLE(PG_idle, "idle" )
> +IF_HAVE_PG_IDLE(PG_idle, "idle" ) \
> +IF_HAVE_PG_KMEMCG(PG_kmemcg, "kmemcg" )
>
> #define show_page_flags(flags) \
> (flags) ? __print_flags(flags, "|", \
> diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
> index 5294abb3f178..ebf1601de2a5 100644
> --- a/virt/kvm/coalesced_mmio.c
> +++ b/virt/kvm/coalesced_mmio.c
> @@ -110,7 +110,7 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
> int ret;
>
> ret = -ENOMEM;
> - page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> if (!page)
> goto out_err;
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index f25aa98a94df..de6328dff251 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -306,7 +306,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
> vcpu->pre_pcpu = -1;
> INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
>
> - page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> if (!page) {
> r = -ENOMEM;
> goto fail;
> --
> 2.21.0.392.gf8f6787159e-goog
>

--
Michal Hocko
SUSE Labs