Re: [PATCH 4/7] mm/hugetlb: Clean up map/global resv accounting when allocate

From: Ackerley Tng
Date: Fri Dec 27 2024 - 19:06:43 EST


Peter Xu <peterx@xxxxxxxxxx> writes:

> <snip>
>
> Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
> ---
> mm/hugetlb.c | 116 +++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 80 insertions(+), 36 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index dfd479a857b6..14cfe0bb01e4 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2956,6 +2956,25 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
> return ret;
> }
>
> +typedef enum {
> + /*
> + * For either 0/1: we checked the per-vma resv map, and one resv
> + * count either can be reused (0), or an extra needed (1).
> + */
> + MAP_CHG_REUSE = 0,
> + MAP_CHG_NEEDED = 1,
> + /*
> + * Cannot use per-vma resv count can be used, hence a new resv
> + * count is enforced.
> + *
> + * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
> + * that currently vma_needs_reservation() has an unwanted side
> + * effect to either use end() or commit() to complete the
> + * transaction. Hence it needs to differenciate from NEEDED.
> + */
> + MAP_CHG_ENFORCED = 2,
> +} map_chg_state;
> +
> /*
> * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
> * faults of hugetlb private mappings on top of a non-page-cache folio (in
> @@ -2969,12 +2988,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> struct hugepage_subpool *spool = subpool_vma(vma);
> struct hstate *h = hstate_vma(vma);
> struct folio *folio;
> - long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
> - long gbl_chg;
> + long retval, gbl_chg, nr_pages = pages_per_huge_page(h);
> + map_chg_state map_chg;
> int memcg_charge_ret, ret, idx;
> struct hugetlb_cgroup *h_cg = NULL;
> struct mem_cgroup *memcg;
> - bool deferred_reserve;
> gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
>
> memcg = get_mem_cgroup_from_current();
> @@ -2985,36 +3003,56 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> }
>
> idx = hstate_index(h);
> - /*
> - * Examine the region/reserve map to determine if the process
> - * has a reservation for the page to be allocated. A return
> - * code of zero indicates a reservation exists (no change).
> - */
> - map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
> - if (map_chg < 0) {
> - if (!memcg_charge_ret)
> - mem_cgroup_cancel_charge(memcg, nr_pages);
> - mem_cgroup_put(memcg);
> - return ERR_PTR(-ENOMEM);
> +
> + /* Whether we need a separate per-vma reservation? */
> + if (cow_from_owner) {
> + /*
> + * Special case! Since it's a CoW on top of a reserved
> + * page, the private resv map doesn't count. So it cannot
> + * consume the per-vma resv map even if it's reserved.
> + */
> + map_chg = MAP_CHG_ENFORCED;
> + } else {
> + /*
> + * Examine the region/reserve map to determine if the process
> + * has a reservation for the page to be allocated. A return
> + * code of zero indicates a reservation exists (no change).
> + */
> + retval = vma_needs_reservation(h, vma, addr);
> + if (retval < 0) {
> + if (!memcg_charge_ret)
> + mem_cgroup_cancel_charge(memcg, nr_pages);
> + mem_cgroup_put(memcg);
> + return ERR_PTR(-ENOMEM);
> + }
> + map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
> }
>
> /*
> + * Whether we need a separate global reservation?
> + *
> * Processes that did not create the mapping will have no
> * reserves as indicated by the region/reserve map. Check
> * that the allocation will not exceed the subpool limit.
> - * Allocations for MAP_NORESERVE mappings also need to be
> - * checked against any subpool limit.
> + * Or if it can get one from the pool reservation directly.
> */
> - if (map_chg || cow_from_owner) {
> + if (map_chg) {
> gbl_chg = hugepage_subpool_get_pages(spool, 1);
> if (gbl_chg < 0)
> goto out_end_reservation;
> + } else {
> + /*
> + * If we have the vma reservation ready, no need for extra
> + * global reservation.
> + */
> + gbl_chg = 0;
> }
>
> - /* If this allocation is not consuming a reservation, charge it now.
> + /*
> + * If this allocation is not consuming a per-vma reservation,
> + * charge the hugetlb cgroup now.
> */
> - deferred_reserve = map_chg || cow_from_owner;
> - if (deferred_reserve) {
> + if (map_chg) {
> ret = hugetlb_cgroup_charge_cgroup_rsvd(
> idx, pages_per_huge_page(h), &h_cg);

Should hugetlb_cgroup_charge_cgroup_rsvd() be called when map_chg == MAP_CHG_ENFORCED?

> if (ret)
> @@ -3038,7 +3076,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> if (!folio)
> goto out_uncharge_cgroup;
> spin_lock_irq(&hugetlb_lock);
> - if (!cow_from_owner && vma_has_reserves(vma, gbl_chg)) {
> + if (vma_has_reserves(vma, gbl_chg)) {
> folio_set_hugetlb_restore_reserve(folio);
> h->resv_huge_pages--;
> }
> @@ -3051,7 +3089,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> /* If allocation is not consuming a reservation, also store the
> * hugetlb_cgroup pointer on the page.
> */
> - if (deferred_reserve) {
> + if (map_chg) {
> hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
> h_cg, folio);
> }

same for this,

> @@ -3060,26 +3098,31 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>
> hugetlb_set_folio_subpool(folio, spool);
>
> - map_commit = vma_commit_reservation(h, vma, addr);
> - if (unlikely(map_chg > map_commit)) {
> + if (map_chg != MAP_CHG_ENFORCED) {
> + /* commit() is only needed if the map_chg is not enforced */
> + retval = vma_commit_reservation(h, vma, addr);
> /*
> + * Check for possible race conditions. When it happens..
> * The page was added to the reservation map between
> * vma_needs_reservation and vma_commit_reservation.
> * This indicates a race with hugetlb_reserve_pages.
> * Adjust for the subpool count incremented above AND
> - * in hugetlb_reserve_pages for the same page. Also,
> + * in hugetlb_reserve_pages for the same page. Also,
> * the reservation count added in hugetlb_reserve_pages
> * no longer applies.
> */
> - long rsv_adjust;
> + if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
> + long rsv_adjust;
>
> - rsv_adjust = hugepage_subpool_put_pages(spool, 1);
> - hugetlb_acct_memory(h, -rsv_adjust);
> - if (deferred_reserve) {
> - spin_lock_irq(&hugetlb_lock);
> - hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
> - pages_per_huge_page(h), folio);
> - spin_unlock_irq(&hugetlb_lock);
> + rsv_adjust = hugepage_subpool_put_pages(spool, 1);
> + hugetlb_acct_memory(h, -rsv_adjust);
> + if (map_chg) {
> + spin_lock_irq(&hugetlb_lock);
> + hugetlb_cgroup_uncharge_folio_rsvd(
> + hstate_index(h), pages_per_huge_page(h),
> + folio);
> + spin_unlock_irq(&hugetlb_lock);
> + }
> }
> }
>
> @@ -3093,14 +3136,15 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
> out_uncharge_cgroup:
> hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
> out_uncharge_cgroup_reservation:
> - if (deferred_reserve)
> + if (map_chg)
> hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
> h_cg);

and same for this.

> out_subpool_put:
> - if (map_chg || cow_from_owner)
> + if (map_chg)
> hugepage_subpool_put_pages(spool, 1);
> out_end_reservation:
> - vma_end_reservation(h, vma, addr);
> + if (map_chg != MAP_CHG_ENFORCED)
> + vma_end_reservation(h, vma, addr);
> if (!memcg_charge_ret)
> mem_cgroup_cancel_charge(memcg, nr_pages);
> mem_cgroup_put(memcg);