Re: [External] RE: [PATCH v4 09/21] mm/hugetlb: Free the vmemmap pages associated with each hugetlb page

From: Muchun Song
Date: Tue Nov 17 2020 - 05:27:39 EST


On Tue, Nov 17, 2020 at 5:55 PM Song Bao Hua (Barry Song)
<song.bao.hua@xxxxxxxxxxxxx> wrote:
>
>
>
> > -----Original Message-----
> > From: owner-linux-mm@xxxxxxxxx [mailto:owner-linux-mm@xxxxxxxxx] On
> > Behalf Of Muchun Song
> > Sent: Saturday, November 14, 2020 12:00 AM
> > To: corbet@xxxxxxx; mike.kravetz@xxxxxxxxxx; tglx@xxxxxxxxxxxxx;
> > mingo@xxxxxxxxxx; bp@xxxxxxxxx; x86@xxxxxxxxxx; hpa@xxxxxxxxx;
> > dave.hansen@xxxxxxxxxxxxxxx; luto@xxxxxxxxxx; peterz@xxxxxxxxxxxxx;
> > viro@xxxxxxxxxxxxxxxxxx; akpm@xxxxxxxxxxxxxxxxxxxx; paulmck@xxxxxxxxxx;
> > mchehab+huawei@xxxxxxxxxx; pawan.kumar.gupta@xxxxxxxxxxxxxxx;
> > rdunlap@xxxxxxxxxxxxx; oneukum@xxxxxxxx; anshuman.khandual@xxxxxxx;
> > jroedel@xxxxxxx; almasrymina@xxxxxxxxxx; rientjes@xxxxxxxxxx;
> > willy@xxxxxxxxxxxxx; osalvador@xxxxxxx; mhocko@xxxxxxxx
> > Cc: duanxiongchun@xxxxxxxxxxxxx; linux-doc@xxxxxxxxxxxxxxx;
> > linux-kernel@xxxxxxxxxxxxxxx; linux-mm@xxxxxxxxx;
> > linux-fsdevel@xxxxxxxxxxxxxxx; Muchun Song <songmuchun@xxxxxxxxxxxxx>
> > Subject: [PATCH v4 09/21] mm/hugetlb: Free the vmemmap pages associated
> > with each hugetlb page
> >
> > When we allocate a hugetlb page from the buddy, we should free the
> > unused vmemmap pages associated with it. We can do that in the
> > prep_new_huge_page().
> >
> > Signed-off-by: Muchun Song <songmuchun@xxxxxxxxxxxxx>
> > ---
> > arch/x86/include/asm/hugetlb.h | 9 ++
> > arch/x86/include/asm/pgtable_64_types.h | 8 ++
> > mm/hugetlb.c | 16 +++
> > mm/hugetlb_vmemmap.c | 188
> > ++++++++++++++++++++++++++++++++
> > mm/hugetlb_vmemmap.h | 5 +
> > 5 files changed, 226 insertions(+)
> >
> > diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
> > index 1721b1aadeb1..c601fe042832 100644
> > --- a/arch/x86/include/asm/hugetlb.h
> > +++ b/arch/x86/include/asm/hugetlb.h
> > @@ -4,6 +4,15 @@
> >
> > #include <asm/page.h>
> > #include <asm-generic/hugetlb.h>
> > +#include <asm/pgtable.h>
> > +
> > +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
> > +#define vmemmap_pmd_huge vmemmap_pmd_huge
> > +static inline bool vmemmap_pmd_huge(pmd_t *pmd)
> > +{
> > + return pmd_large(*pmd);
> > +}
> > +#endif
> >
> > #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
> >
> > diff --git a/arch/x86/include/asm/pgtable_64_types.h
> > b/arch/x86/include/asm/pgtable_64_types.h
> > index 52e5f5f2240d..bedbd2e7d06c 100644
> > --- a/arch/x86/include/asm/pgtable_64_types.h
> > +++ b/arch/x86/include/asm/pgtable_64_types.h
> > @@ -139,6 +139,14 @@ extern unsigned int ptrs_per_p4d;
> > # define VMEMMAP_START __VMEMMAP_BASE_L4
> > #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
> >
> > +/*
> > + * VMEMMAP_SIZE - allows the whole linear region to be covered by
> > + * a struct page array.
> > + */
> > +#define VMEMMAP_SIZE (1UL << (__VIRTUAL_MASK_SHIFT -
> > PAGE_SHIFT - \
> > + 1 + ilog2(sizeof(struct page))))
> > +#define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE)
> > +
> > #define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB <<
> > 40) - 1)
> >
> > #define MODULES_VADDR (__START_KERNEL_map +
> > KERNEL_IMAGE_SIZE)
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index f88032c24667..a0ce6f33a717 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1499,6 +1499,14 @@ void free_huge_page(struct page *page)
> >
> > static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
> > {
> > + free_huge_page_vmemmap(h, page);
> > + /*
> > + * Because we store preallocated pages on @page->lru,
> > + * vmemmap_pgtable_free() must be called before the
> > + * initialization of @page->lru in INIT_LIST_HEAD().
> > + */
> > + vmemmap_pgtable_free(page);
> > +
> > INIT_LIST_HEAD(&page->lru);
> > set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
> > set_hugetlb_cgroup(page, NULL);
> > @@ -1751,6 +1759,14 @@ static struct page *alloc_fresh_huge_page(struct
> > hstate *h,
> > if (!page)
> > return NULL;
> >
> > + if (vmemmap_pgtable_prealloc(h, page)) {
> > + if (hstate_is_gigantic(h))
> > + free_gigantic_page(page, huge_page_order(h));
> > + else
> > + put_page(page);
> > + return NULL;
> > + }
> > +
> > if (hstate_is_gigantic(h))
> > prep_compound_gigantic_page(page, huge_page_order(h));
> > prep_new_huge_page(h, page, page_to_nid(page));
> > diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> > index 332c131c01a8..937562a15f1e 100644
> > --- a/mm/hugetlb_vmemmap.c
> > +++ b/mm/hugetlb_vmemmap.c
> > @@ -74,6 +74,7 @@
> > #include <linux/pagewalk.h>
> > #include <linux/mmzone.h>
> > #include <linux/list.h>
> > +#include <linux/bootmem_info.h>
> > #include <asm/pgalloc.h>
> > #include "hugetlb_vmemmap.h"
> >
> > @@ -86,6 +87,8 @@
> > * reserve at least 2 pages as vmemmap areas.
> > */
> > #define RESERVE_VMEMMAP_NR 2U
> > +#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR <<
> > PAGE_SHIFT)
> > +#define TAIL_PAGE_REUSE -1
> >
> > #ifndef VMEMMAP_HPAGE_SHIFT
> > #define VMEMMAP_HPAGE_SHIFT HPAGE_SHIFT
> > @@ -97,6 +100,21 @@
> >
> > #define page_huge_pte(page) ((page)->pmd_huge_pte)
> >
> > +#define vmemmap_hpage_addr_end(addr, end) \
> > +({ \
> > + unsigned long __boundary; \
> > + __boundary = ((addr) + VMEMMAP_HPAGE_SIZE) &
> > VMEMMAP_HPAGE_MASK; \
> > + (__boundary - 1 < (end) - 1) ? __boundary : (end); \
> > +})
> > +
> > +#ifndef vmemmap_pmd_huge
> > +#define vmemmap_pmd_huge vmemmap_pmd_huge
> > +static inline bool vmemmap_pmd_huge(pmd_t *pmd)
> > +{
> > + return pmd_huge(*pmd);
> > +}
> > +#endif
> > +
> > static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
> > {
> > return h->nr_free_vmemmap_pages;
> > @@ -158,6 +176,176 @@ int vmemmap_pgtable_prealloc(struct hstate *h,
> > struct page *page)
> > return -ENOMEM;
> > }
> >
> > +/*
> > + * Walk a vmemmap address to the pmd it maps.
> > + */
> > +static pmd_t *vmemmap_to_pmd(unsigned long page)
> > +{
> > + pgd_t *pgd;
> > + p4d_t *p4d;
> > + pud_t *pud;
> > + pmd_t *pmd;
> > +
> > + if (page < VMEMMAP_START || page >= VMEMMAP_END)
> > + return NULL;
> > +
> > + pgd = pgd_offset_k(page);
> > + if (pgd_none(*pgd))
> > + return NULL;
> > + p4d = p4d_offset(pgd, page);
> > + if (p4d_none(*p4d))
> > + return NULL;
> > + pud = pud_offset(p4d, page);
> > +
> > + if (pud_none(*pud) || pud_bad(*pud))
> > + return NULL;
> > + pmd = pmd_offset(pud, page);
> > +
> > + return pmd;
> > +}
> > +
> > +static inline spinlock_t *vmemmap_pmd_lock(pmd_t *pmd)
> > +{
> > + return pmd_lock(&init_mm, pmd);
> > +}
> > +
> > +static inline int freed_vmemmap_hpage(struct page *page)
> > +{
> > + return atomic_read(&page->_mapcount) + 1;
> > +}
> > +
> > +static inline int freed_vmemmap_hpage_inc(struct page *page)
> > +{
> > + return atomic_inc_return_relaxed(&page->_mapcount) + 1;
> > +}
> > +
> > +static inline int freed_vmemmap_hpage_dec(struct page *page)
> > +{
> > + return atomic_dec_return_relaxed(&page->_mapcount) + 1;
> > +}
> > +
> > +static inline void free_vmemmap_page_list(struct list_head *list)
> > +{
> > + struct page *page, *next;
> > +
> > + list_for_each_entry_safe(page, next, list, lru) {
> > + list_del(&page->lru);
> > + free_vmemmap_page(page);
> > + }
> > +}
> > +
> > +static void __free_huge_page_pte_vmemmap(struct page *reuse, pte_t *ptep,
> > + unsigned long start,
> > + unsigned long end,
> > + struct list_head *free_pages)
> > +{
> > + /* Make the tail pages are mapped read-only. */
> > + pgprot_t pgprot = PAGE_KERNEL_RO;
> > + pte_t entry = mk_pte(reuse, pgprot);
> > + unsigned long addr;
> > +
> > + for (addr = start; addr < end; addr += PAGE_SIZE, ptep++) {
> > + struct page *page;
> > + pte_t old = *ptep;
> > +
> > + VM_WARN_ON(!pte_present(old));
> > + page = pte_page(old);
> > + list_add(&page->lru, free_pages);
> > +
> > + set_pte_at(&init_mm, addr, ptep, entry);
> > + }
> > +}
> > +
> > +static void __free_huge_page_pmd_vmemmap(struct hstate *h, pmd_t *pmd,
> > + unsigned long addr,
> > + struct list_head *free_pages)
> > +{
> > + unsigned long next;
> > + unsigned long start = addr + RESERVE_VMEMMAP_SIZE;
> > + unsigned long end = addr + vmemmap_pages_size_per_hpage(h);
> > + struct page *reuse = NULL;
> > +
> > + addr = start;
> > + do {
> > + pte_t *ptep;
> > +
> > + ptep = pte_offset_kernel(pmd, addr);
> > + if (!reuse)
> > + reuse = pte_page(ptep[TAIL_PAGE_REUSE]);
> > +
> > + next = vmemmap_hpage_addr_end(addr, end);
> > + __free_huge_page_pte_vmemmap(reuse, ptep, addr, next,
> > + free_pages);
> > + } while (pmd++, addr = next, addr != end);
> > +
> > + flush_tlb_kernel_range(start, end);
> > +}
> > +
> > +static void split_vmemmap_pmd(pmd_t *pmd, pte_t *pte_p, unsigned long
> > addr)
>
> Hi Muchun,
>
> Are you going to restore the pmd mapping after you free the hugetlb? I mean,
> when you free a contiguous 128MB of hugetlb pages with 2MB size, will you
> redo the PMD vmemmap, since a 2MB PMD contains exactly the page structs of
> 128MB of memory?

Currently we only restore the pmd mapping for the 1GB HugeTLB page. For the
2MB HugeTLB page, we do not (I haven't figured out how to handle it gracefully).

>
> If not, wouldn't it be simpler to only use base pages while populating vmemmap?
> I mean, once we enable the Kconfig option you add for VMEMMAP_FREE, we
> only use base pages to place "page struct", rather than splitting the PMD into
> base pages afterwards.
>
> One negative side effect might be that base pages are also used for those pages
> which won't be hugetlb later. But if most pages of the host will be hugetlb for
> guests and SPDK, it shouldn't hurt too much.

Yeah, I agree with you. If the user uses a lot of HugeTLB pages (e.g.
SPDK/Guest), it shouldn't hurt too much. And using base pages while populating
vmemmap can also decrease the overhead (of splitting the PMD). In the end, if
we don't come up with a more suitable solution for the 2MB HugeTLB page case
mentioned above, maybe this is also an idea.

Thanks.

>
> Or at least this can be done for hugetlb reserved by cmdline?
>
> > +{
> > + int i;
> > + pgprot_t pgprot = PAGE_KERNEL;
> > + struct mm_struct *mm = &init_mm;
> > + struct page *page;
> > + pmd_t old_pmd, _pmd;
> > +
> > + old_pmd = READ_ONCE(*pmd);
> > + page = pmd_page(old_pmd);
> > + pmd_populate_kernel(mm, &_pmd, pte_p);
> > +
> > + for (i = 0; i < VMEMMAP_HPAGE_NR; i++, addr += PAGE_SIZE) {
> > + pte_t entry, *pte;
> > +
> > + entry = mk_pte(page + i, pgprot);
> > + pte = pte_offset_kernel(&_pmd, addr);
> > + VM_BUG_ON(!pte_none(*pte));
> > + set_pte_at(mm, addr, pte, entry);
> > + }
> > +
> > + /* make pte visible before pmd */
> > + smp_wmb();
> > + pmd_populate_kernel(mm, pmd, pte_p);
> > +}
> > +
> > +static void split_vmemmap_huge_page(struct page *head, pmd_t *pmd)
> > +{
> > + struct page *pte_page, *t_page;
> > + unsigned long start = (unsigned long)head & VMEMMAP_HPAGE_MASK;
> > + unsigned long addr = start;
> > +
> > + list_for_each_entry_safe(pte_page, t_page, &head->lru, lru) {
> > + list_del(&pte_page->lru);
> > + VM_BUG_ON(freed_vmemmap_hpage(pte_page));
> > + split_vmemmap_pmd(pmd++, page_to_virt(pte_page), addr);
> > + addr += VMEMMAP_HPAGE_SIZE;
> > + }
> > +
> > + flush_tlb_kernel_range(start, addr);
> > +}
> > +
> > +void free_huge_page_vmemmap(struct hstate *h, struct page *head)
> > +{
> > + pmd_t *pmd;
> > + spinlock_t *ptl;
> > + LIST_HEAD(free_pages);
> > +
> > + if (!free_vmemmap_pages_per_hpage(h))
> > + return;
> > +
> > + pmd = vmemmap_to_pmd((unsigned long)head);
> > + BUG_ON(!pmd);
> > +
> > + ptl = vmemmap_pmd_lock(pmd);
> > + if (vmemmap_pmd_huge(pmd))
> > + split_vmemmap_huge_page(head, pmd);
> > +
> > + __free_huge_page_pmd_vmemmap(h, pmd, (unsigned long)head,
> > &free_pages);
> > + freed_vmemmap_hpage_inc(pmd_page(*pmd));
> > + spin_unlock(ptl);
> > +
> > + free_vmemmap_page_list(&free_pages);
> > +}
> > +
> > void __init hugetlb_vmemmap_init(struct hstate *h)
> > {
> > unsigned int order = huge_page_order(h);
> > diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> > index 2a72d2f62411..fb8b77659ed5 100644
> > --- a/mm/hugetlb_vmemmap.h
> > +++ b/mm/hugetlb_vmemmap.h
> > @@ -15,6 +15,7 @@
> > void __init hugetlb_vmemmap_init(struct hstate *h);
> > int vmemmap_pgtable_prealloc(struct hstate *h, struct page *page);
> > void vmemmap_pgtable_free(struct page *page);
> > +void free_huge_page_vmemmap(struct hstate *h, struct page *head);
> > #else
> > static inline void hugetlb_vmemmap_init(struct hstate *h)
> > {
> > @@ -28,5 +29,9 @@ static inline int vmemmap_pgtable_prealloc(struct hstate
> > *h, struct page *page)
> > static inline void vmemmap_pgtable_free(struct page *page)
> > {
> > }
> > +
> > +static inline void free_huge_page_vmemmap(struct hstate *h, struct page
> > *head)
> > +{
> > +}
> > #endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
> > #endif /* _LINUX_HUGETLB_VMEMMAP_H */
> > --
> > 2.11.0
> >
>
> Thanks
> Barry
>


--
Yours,
Muchun