[PATCH v3 11/12] mm/gup: Handle hugepd for follow_page()
From: peterx
Date: Thu Mar 21 2024 - 18:10:46 EST
From: Peter Xu <peterx@xxxxxxxxxx>
Hugepd is only used in PowerPC so far on 4K page size kernels where hash
mmu is used. follow_page_mask() used to leverage hugetlb APIs to access
hugepd entries. Teach follow_page_mask() itself on hugepd.
With previous refactors on fast-gup gup_huge_pd(), most of the code can be
easily leveraged. There's something not needed for follow page, for
example, gup_hugepte() tries to detect pgtable entry change which will
never happen with slow gup (which has the pgtable lock held), but that's
not a problem to check.
Since follow_page() always only fetch one page, set the end to "address +
PAGE_SIZE" should suffice. We will still do the pgtable walk once for each
hugetlb page by setting ctx->page_mask properly.
One thing worth mentioning is that some level of pgtable's _bad() helper
will report is_hugepd() entries as TRUE on Power8 hash MMUs. I think it at
least applies to PUD on Power8 with 4K pgsize. It means feeding a hugepd
entry to pud_bad() will report a false positive. Let's leave that for now
because it can be arch-specific where I am a bit declined to touch. In
this patch it's not a problem as long as hugepd is detected before any bad
pgtable entries.
Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
mm/gup.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 66 insertions(+), 7 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
index 00cdf4cb0cd4..43a2e0a203cd 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -30,6 +30,11 @@ struct follow_page_context {
unsigned int page_mask;
};
+static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
+ unsigned long addr, unsigned int pdshift,
+ unsigned int flags,
+ struct follow_page_context *ctx);
+
static inline void sanity_check_pinned_pages(struct page **pages,
unsigned long npages)
{
@@ -871,6 +876,9 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags, address);
if (!pmd_present(pmdval))
return no_page_table(vma, flags, address);
+ if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval)))))
+ return follow_hugepd(vma, __hugepd(pmd_val(pmdval)),
+ address, PMD_SHIFT, flags, ctx);
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
@@ -921,6 +929,9 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = READ_ONCE(*pudp);
if (!pud_present(pud))
return no_page_table(vma, flags, address);
+ if (unlikely(is_hugepd(__hugepd(pud_val(pud)))))
+ return follow_hugepd(vma, __hugepd(pud_val(pud)),
+ address, PUD_SHIFT, flags, ctx);
if (pud_leaf(pud)) {
ptl = pud_lock(mm, pudp);
page = follow_huge_pud(vma, address, pudp, flags, ctx);
@@ -944,10 +955,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
p4dp = p4d_offset(pgdp, address);
p4d = READ_ONCE(*p4dp);
- if (!p4d_present(p4d))
- return no_page_table(vma, flags, address);
BUILD_BUG_ON(p4d_leaf(p4d));
- if (unlikely(p4d_bad(p4d)))
+
+ if (unlikely(is_hugepd(__hugepd(p4d_val(p4d)))))
+ return follow_hugepd(vma, __hugepd(p4d_val(p4d)),
+ address, P4D_SHIFT, flags, ctx);
+
+ if (!p4d_present(p4d) || p4d_bad(p4d))
return no_page_table(vma, flags, address);
return follow_pud_mask(vma, address, p4dp, flags, ctx);
@@ -981,7 +995,7 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
struct follow_page_context *ctx)
{
- pgd_t *pgd;
+ pgd_t *pgd, pgdval;
struct mm_struct *mm = vma->vm_mm;
ctx->page_mask = 0;
@@ -996,11 +1010,17 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
&ctx->page_mask);
pgd = pgd_offset(mm, address);
+ pgdval = *pgd;
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- return no_page_table(vma, flags, address);
+ if (unlikely(is_hugepd(__hugepd(pgd_val(pgdval)))))
+ page = follow_hugepd(vma, __hugepd(pgd_val(pgdval)),
+ address, PGDIR_SHIFT, flags, ctx);
+ else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ page = no_page_table(vma, flags, address);
+ else
+ page = follow_p4d_mask(vma, address, pgd, flags, ctx);
- return follow_p4d_mask(vma, address, pgd, flags, ctx);
+ return page;
}
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
@@ -3037,6 +3057,37 @@ static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
return 1;
}
+
+static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
+ unsigned long addr, unsigned int pdshift,
+ unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ struct page *page;
+ struct hstate *h;
+ spinlock_t *ptl;
+ int nr = 0, ret;
+ pte_t *ptep;
+
+ /* Only hugetlb supports hugepd */
+ if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma)))
+ return ERR_PTR(-EFAULT);
+
+ h = hstate_vma(vma);
+ ptep = hugepte_offset(hugepd, addr, pdshift);
+ ptl = huge_pte_lock(h, vma->vm_mm, ptep);
+ ret = gup_huge_pd(hugepd, addr, pdshift, addr + PAGE_SIZE,
+ flags, &page, &nr);
+ spin_unlock(ptl);
+
+ if (ret) {
+ WARN_ON_ONCE(nr != 1);
+ ctx->page_mask = (1U << huge_page_order(h)) - 1;
+ return page;
+ }
+
+ return NULL;
+}
#else
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
unsigned int pdshift, unsigned long end, unsigned int flags,
@@ -3044,6 +3095,14 @@ static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
{
return 0;
}
+
+static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
+ unsigned long addr, unsigned int pdshift,
+ unsigned int flags,
+ struct follow_page_context *ctx)
+{
+ return NULL;
+}
#endif /* CONFIG_ARCH_HAS_HUGEPD */
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
--
2.44.0