Re: [PATCH] memcg, hugetlb: pages allocated for hugetlb's overcommit will be charged to memcg

From: Mike Kravetz
Date: Wed May 02 2018 - 00:42:23 EST


On 05/01/2018 06:19 PM, TSUKADA Koutaro wrote:
> If nr_overcommit_hugepages is assumed to be infinite, allocating pages for
> hugetlb's overcommit from buddy pool is all unlimited even if being limited
> by memcg. The purpose of this patch is that if we allocate the hugetlb page
> from the boddy pool, that means we should charge it to memcg.
>
> A straightforward way for user applications to use hugetlb pages is to
> create the pool(nr_hugepages), but root privileges is required. For example,
> assuming the field of HPC, it can be said that giving root privs to general
> users is difficult. Also, the way to the creating pool is that we need to
> estimate exactly how much use hugetlb, and it feels a bit troublesome.
>
> In such a case, using hugetlb's overcommit feature, considered to let the
> user use hugetlb page only with overcommit without creating the any pool.
> However, as mentioned earlier, the page can be allocated limitelessly in
> overcommit in the current implementation. Therefore, by introducing memcg
> charging, I wanted to be able to manage the memory resources used by the
> user application only with memcg's limitation.
>
> This patch targets RHELSA(kernel-alt-4.11.0-45.6.1.el7a.src.rpm).

It would be very helpful to rebase this patch on a recent mainline kernel.
The code to allocate surplus huge pages has been significantly changed in
recent kernels.

I have no doubt that this is a real issue and we are not correctly charging
surplus to a memcg. But your patch will be hard to evaluate when based on
an older distro kernel.

> The patch
> does the following things.
>
> When allocating the page from buddy at __hugetlb_alloc_buddy_huge_page,
> set the flag of HUGETLB_OVERCOMMIT on that page[1].private. When actually
> use the page which HUGETLB_OVERCOMMIT is set(at hugepage_add_new_anon_rmap
> or huge_add_to_page_cache), it tries to charge to memcg. If the charge is
> successful, add the flag of HUGETLB_OVERCOMMIT_CHARGED on that page[1].

What is the reason for not charging pages at allocation/reserve time? I am
not an expert in memcg accounting, but I would think the pages should be
charged at allocation time. Otherwise, a task could allocate a large number
of (reserved) pages that are not charged to a memcg. memcg charges in other
code paths seem to happen at huge page allocation time.

--
Mike Kravetz

>
> The page charged to memcg will finally be uncharged at free_huge_page.
>
> Modification of memcontrol.c is for updating of statistical information
> when the task moves cgroup. The hpage_nr_pages works correctly for thp,
> but it is not suitable for such as hugetlb which uses the contiguous bit
> of aarch64, so need to modify it.
>
> Signed-off-by: TSUKADA Koutaro <tsukada@xxxxxxxxxxxx>
> ---
> include/linux/hugetlb.h | 45 ++++++++++++++++++++++
> mm/hugetlb.c | 80 +++++++++++++++++++++++++++++++++++++++
> mm/memcontrol.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++--
> 3 files changed, 219 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index d67675e..67991cb 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -511,6 +511,51 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
> set_huge_pte_at(mm, addr, ptep, pte);
> }
> #endif
> +
> +#define HUGETLB_OVERCOMMIT 1UL
> +#define HUGETLB_OVERCOMMIT_CHARGED 2UL
> +
> +static inline void add_hugetlb_compound_private(struct page *page,
> + unsigned long val)
> +{
> + page[1].private |= val;
> +}
> +
> +static inline unsigned long get_hugetlb_compound_private(struct page *page)
> +{
> + return page_private(&page[1]);
> +}
> +
> +static inline void add_hugetlb_overcommit(struct page *page)
> +{
> + add_hugetlb_compound_private(page, HUGETLB_OVERCOMMIT);
> +}
> +
> +static inline void del_hugetlb_overcommit(struct page *page)
> +{
> + add_hugetlb_compound_private(page, ~(HUGETLB_OVERCOMMIT));
> +}
> +
> +static inline int is_hugetlb_overcommit(struct page *page)
> +{
> + return (get_hugetlb_compound_private(page) & HUGETLB_OVERCOMMIT);
> +}
> +
> +static inline void add_hugetlb_overcommit_charged(struct page *page)
> +{
> + add_hugetlb_compound_private(page, HUGETLB_OVERCOMMIT_CHARGED);
> +}
> +
> +static inline void del_hugetlb_overcommit_charged(struct page *page)
> +{
> + add_hugetlb_compound_private(page, ~(HUGETLB_OVERCOMMIT_CHARGED));
> +}
> +
> +static inline int is_hugetlb_overcommit_charged(struct page *page)
> +{
> + return (get_hugetlb_compound_private(page) &
> + HUGETLB_OVERCOMMIT_CHARGED);
> +}
> #else /* CONFIG_HUGETLB_PAGE */
> struct hstate {};
> #define alloc_huge_page(v, a, r) NULL
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6191fb9..2cd91d9 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -24,6 +24,7 @@
> #include <linux/swapops.h>
> #include <linux/page-isolation.h>
> #include <linux/jhash.h>
> +#include <linux/memcontrol.h>
>
> #include <asm/page.h>
> #include <asm/pgtable.h>
> @@ -1227,6 +1228,17 @@ static void clear_page_huge_active(struct page *page)
> ClearPagePrivate(&page[1]);
> }
>
> +static void hugetlb_overcommit_finalize(struct page *page)
> +{
> + if (is_hugetlb_overcommit_charged(page)) {
> + del_hugetlb_overcommit_charged(page);
> + mem_cgroup_uncharge(page);
> + }
> + if (is_hugetlb_overcommit(page)) {
> + del_hugetlb_overcommit(page);
> + }
> +}
> +
> void free_huge_page(struct page *page)
> {
> /*
> @@ -1239,6 +1251,8 @@ void free_huge_page(struct page *page)
> (struct hugepage_subpool *)page_private(page);
> bool restore_reserve;
>
> + hugetlb_overcommit_finalize(page);
> +
> set_page_private(page, 0);
> page->mapping = NULL;
> VM_BUG_ON_PAGE(page_count(page), page);
> @@ -1620,6 +1634,13 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
> spin_unlock(&hugetlb_lock);
>
> page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
> + if (page) {
> + /*
> + * At this point it is impossible to judge whether it is
> + * mapped or just reserved, so only mark it.
> + */
> + add_hugetlb_overcommit(page);
> + }
>
> spin_lock(&hugetlb_lock);
> if (page) {
> @@ -3486,6 +3507,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> int ret = 0, outside_reserve = 0;
> unsigned long mmun_start; /* For mmu_notifiers */
> unsigned long mmun_end; /* For mmu_notifiers */
> + struct mem_cgroup *memcg;
> + int memcg_charged = 0;
>
> pte = huge_ptep_get(ptep);
> old_page = pte_page(pte);
> @@ -3552,6 +3575,15 @@ retry_avoidcopy:
> goto out_release_old;
> }
>
> + if (unlikely(is_hugetlb_overcommit(new_page))) {
> + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL,
> + &memcg, true)) {
> + ret = VM_FAULT_OOM;
> + goto out_release_all;
> + }
> + memcg_charged = 1;
> + }
> +
> /*
> * When the original hugepage is shared one, it does not have
> * anon_vma prepared.
> @@ -3587,12 +3619,18 @@ retry_avoidcopy:
> make_huge_pte(vma, new_page, 1));
> page_remove_rmap(old_page, true);
> hugepage_add_new_anon_rmap(new_page, vma, address);
> + if (memcg_charged) {
> + mem_cgroup_commit_charge(new_page, memcg, false, true);
> + add_hugetlb_overcommit_charged(new_page);
> + }
> /* Make the old page be freed below */
> new_page = old_page;
> }
> spin_unlock(ptl);
> mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> out_release_all:
> + if (memcg_charged)
> + mem_cgroup_cancel_charge(new_page, memcg, true);
> restore_reserve_on_error(h, vma, address, new_page);
> put_page(new_page);
> out_release_old:
> @@ -3641,9 +3679,18 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
> struct inode *inode = mapping->host;
> struct hstate *h = hstate_inode(inode);
> int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
> + struct mem_cgroup *memcg;
>
> if (err)
> return err;
> + if (page && is_hugetlb_overcommit(page)) {
> + err = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL,
> + &memcg, true);
> + if (err)
> + return err;
> + mem_cgroup_commit_charge(page, memcg, false, true);
> + add_hugetlb_overcommit_charged(page);
> + }
> ClearPagePrivate(page);
>
> spin_lock(&inode->i_lock);
> @@ -3663,6 +3710,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
> struct page *page;
> pte_t new_pte;
> spinlock_t *ptl;
> + struct mem_cgroup *memcg;
> + int memcg_charged = 0;
>
> /*
> * Currently, we are forced to kill the process in the event the
> @@ -3740,6 +3789,14 @@ retry:
> }
> } else {
> lock_page(page);
> + if (unlikely(is_hugetlb_overcommit(page))) {
> + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL,
> + &memcg, true)) {
> + ret = VM_FAULT_OOM;
> + goto backout_unlocked;
> + }
> + memcg_charged = 1;
> + }
> if (unlikely(anon_vma_prepare(vma))) {
> ret = VM_FAULT_OOM;
> goto backout_unlocked;
> @@ -3786,6 +3843,10 @@ retry:
> if (anon_rmap) {
> ClearPagePrivate(page);
> hugepage_add_new_anon_rmap(page, vma, address);
> + if (memcg_charged) {
> + mem_cgroup_commit_charge(page, memcg, false, true);
> + add_hugetlb_overcommit_charged(page);
> + }
> } else
> page_dup_rmap(page, true);
> new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
> @@ -3806,6 +3867,8 @@ out:
> backout:
> spin_unlock(ptl);
> backout_unlocked:
> + if (memcg_charged)
> + mem_cgroup_cancel_charge(page, memcg, true);
> unlock_page(page);
> restore_reserve_on_error(h, vma, address, page);
> put_page(page);
> @@ -4002,6 +4065,9 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> spinlock_t *ptl;
> int ret;
> struct page *page;
> + struct mem_cgroup *memcg;
> + int memcg_charged = 0;
> +
>
> if (!*pagep) {
> ret = -ENOMEM;
> @@ -4045,6 +4111,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> goto out_release_nounlock;
> }
>
> + if (!vm_shared && is_hugetlb_overcommit(page)) {
> + ret = -ENOMEM;
> + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL,
> + &memcg, true))
> + goto out_release_nounlock;
> + memcg_charged = 1;
> + }
> +
> ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
> spin_lock(ptl);
>
> @@ -4057,6 +4131,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
> } else {
> ClearPagePrivate(page);
> hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
> + if (memcg_charged) {
> + mem_cgroup_commit_charge(page, memcg, false, true);
> + add_hugetlb_overcommit_charged(page);
> + }
> }
>
> _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
> @@ -4082,6 +4160,8 @@ out:
> out_release_unlock:
> spin_unlock(ptl);
> out_release_nounlock:
> + if (memcg_charged)
> + mem_cgroup_cancel_charge(page, memcg, true);
> if (vm_shared)
> unlock_page(page);
> put_page(page);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 02cfcd9..1842693 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4531,7 +4531,7 @@ static int mem_cgroup_move_account(struct page *page,
> struct mem_cgroup *to)
> {
> unsigned long flags;
> - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
> + unsigned int nr_pages = compound ? 1 << compound_order(page) : 1;
> int ret;
> bool anon;
>
> @@ -4744,12 +4744,64 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
> return 0;
> }
>
> +#ifdef CONFIG_HUGETLB_PAGE
> +static enum mc_target_type get_mctgt_type_hugetlb(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *pte, union mc_target *target)
> +{
> + struct page *page = NULL;
> + pte_t entry;
> + enum mc_target_type ret = MC_TARGET_NONE;
> +
> + if (!(mc.flags & MOVE_ANON))
> + return ret;
> +
> + entry = huge_ptep_get(pte);
> + if (!pte_present(entry))
> + return ret;
> + page = pte_page(entry);
> + VM_BUG_ON_PAGE(!page || !PageHead(page), page);
> + if (!is_hugetlb_overcommit_charged(page))
> + return ret;
> + if (page->mem_cgroup == mc.from) {
> + ret = MC_TARGET_PAGE;
> + if (target) {
> + get_page(page);
> + target->page = page;
> + }
> + }
> +
> + return ret;
> +}
> +
> +static int hugetlb_count_precharge_pte_range(pte_t *pte, unsigned long hmask,
> + unsigned long addr, unsigned long end,
> + struct mm_walk *walk)
> +{
> + struct vm_area_struct *vma = walk->vma;
> + struct mm_struct *mm = walk->mm;
> + spinlock_t *ptl;
> + union mc_target target;
> +
> + ptl = huge_pte_lock(hstate_vma(vma), mm, pte);
> + if (get_mctgt_type_hugetlb(vma, addr, pte, &target) == MC_TARGET_PAGE) {
> + mc.precharge += (1 << compound_order(target.page));
> + put_page(target.page);
> + }
> + spin_unlock(ptl);
> +
> + return 0;
> +}
> +#endif
> +
> static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
> {
> unsigned long precharge;
>
> struct mm_walk mem_cgroup_count_precharge_walk = {
> .pmd_entry = mem_cgroup_count_precharge_pte_range,
> +#ifdef CONFIG_HUGETLB_PAGE
> + .hugetlb_entry = hugetlb_count_precharge_pte_range,
> +#endif
> .mm = mm,
> };
> down_read(&mm->mmap_sem);
> @@ -5023,10 +5075,48 @@ put: /* get_mctgt_type() gets the page */
> return ret;
> }
>
> +#ifdef CONFIG_HUGETLB_PAGE
> +static int hugetlb_move_charge_pte_range(pte_t *pte, unsigned long hmask,
> + unsigned long addr, unsigned long end,
> + struct mm_walk *walk)
> +{
> + struct vm_area_struct *vma = walk->vma;
> + struct mm_struct *mm = walk->mm;
> + spinlock_t *ptl;
> + enum mc_target_type target_type;
> + union mc_target target;
> + struct page *page;
> + unsigned long nr_pages;
> +
> + ptl = huge_pte_lock(hstate_vma(vma), mm, pte);
> + target_type = get_mctgt_type_hugetlb(vma, addr, pte, &target);
> + if (target_type == MC_TARGET_PAGE) {
> + page = target.page;
> + nr_pages = (1 << compound_order(page));
> + if (mc.precharge < nr_pages) {
> + put_page(page);
> + goto unlock;
> + }
> + if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) {
> + mc.precharge -= nr_pages;
> + mc.moved_charge += nr_pages;
> + }
> + put_page(page);
> + }
> +unlock:
> + spin_unlock(ptl);
> +
> + return 0;
> +}
> +#endif
> +
> static void mem_cgroup_move_charge(void)
> {
> struct mm_walk mem_cgroup_move_charge_walk = {
> .pmd_entry = mem_cgroup_move_charge_pte_range,
> +#ifdef CONFIG_HUGETLB_PAGE
> + .hugetlb_entry = hugetlb_move_charge_pte_range,
> +#endif
> .mm = mc.mm,
> };
>
> @@ -5427,7 +5517,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> bool compound)
> {
> struct mem_cgroup *memcg = NULL;
> - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
> + unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;
> int ret = 0;
>
> if (mem_cgroup_disabled())
> @@ -5488,7 +5578,7 @@ out:
> void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
> bool lrucare, bool compound)
> {
> - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
> + unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;
>
> VM_BUG_ON_PAGE(!page->mapping, page);
> VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
> @@ -5532,7 +5622,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
> void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
> bool compound)
> {
> - unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
> + unsigned int nr_pages = compound ? (1 << compound_order(page)) : 1;
>
> if (mem_cgroup_disabled())
> return;
>