Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

From: manish.mishra
Date: Mon Jun 27 2022 - 08:47:47 EST



On 24/06/22 11:06 pm, James Houghton wrote:
After high-granularity mapping, page table entries for HugeTLB pages can
be of any size/type. (For example, we can have a 1G page mapped with a
mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
PTE after we have done a page table walk.

Without this, we'd have to pass around the "size" of the PTE everywhere.
We effectively did this before; it could be fetched from the hstate,
which we pass around pretty much everywhere.

This commit includes definitions for some basic helper functions that
are used later. These helper functions wrap existing PTE
inspection/modification functions, where the correct version is picked
depending on if the HugeTLB PTE is actually "huge" or not. (Previously,
all HugeTLB PTEs were "huge").

For example, hugetlb_ptep_get wraps huge_ptep_get and ptep_get, where
ptep_get is used when the HugeTLB PTE is PAGE_SIZE, and huge_ptep_get is
used in all other cases.

Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
---
include/linux/hugetlb.h | 84 +++++++++++++++++++++++++++++++++++++++++
mm/hugetlb.c | 57 ++++++++++++++++++++++++++++
2 files changed, 141 insertions(+)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5fe1db46d8c9..1d4ec9dfdebf 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -46,6 +46,68 @@ enum {
__NR_USED_SUBPAGE,
};
+struct hugetlb_pte {
+ pte_t *ptep;
+ unsigned int shift;
+};
+
+static inline
+void hugetlb_pte_init(struct hugetlb_pte *hpte)
+{
+ hpte->ptep = NULL;
I agree it does not matter but still will hpte->shift = 0 too be better?
+}
+
+static inline
+void hugetlb_pte_populate(struct hugetlb_pte *hpte, pte_t *ptep,
+ unsigned int shift)
+{
+ BUG_ON(!ptep);
+ hpte->ptep = ptep;
+ hpte->shift = shift;
+}
+
+static inline
+unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
+{
+ BUG_ON(!hpte->ptep);
+ return 1UL << hpte->shift;
+}
+
+static inline
+unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
+{
+ BUG_ON(!hpte->ptep);
+ return ~(hugetlb_pte_size(hpte) - 1);
+}
+
+static inline
+unsigned int hugetlb_pte_shift(const struct hugetlb_pte *hpte)
+{
+ BUG_ON(!hpte->ptep);
+ return hpte->shift;
+}
+
+static inline
+bool hugetlb_pte_huge(const struct hugetlb_pte *hpte)
+{
+ return !IS_ENABLED(CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING) ||
+ hugetlb_pte_shift(hpte) > PAGE_SHIFT;
+}
+
+static inline
+void hugetlb_pte_copy(struct hugetlb_pte *dest, const struct hugetlb_pte *src)
+{
+ dest->ptep = src->ptep;
+ dest->shift = src->shift;
+}
+
+bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte);
+bool hugetlb_pte_none(const struct hugetlb_pte *hpte);
+bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte);
+pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte);
+void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
+ unsigned long address);
+
struct hugepage_subpool {
spinlock_t lock;
long count;
@@ -1130,6 +1192,28 @@ static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
return ptl;
}
+static inline
+spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
+{
+
+ BUG_ON(!hpte->ptep);
+ // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
+ // the regular page table lock.
+ if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
+ return huge_pte_lockptr(hugetlb_pte_shift(hpte),
+ mm, hpte->ptep);
+ return &mm->page_table_lock;
+}
+
+static inline
+spinlock_t *hugetlb_pte_lock(struct mm_struct *mm, struct hugetlb_pte *hpte)
+{
+ spinlock_t *ptl = hugetlb_pte_lockptr(mm, hpte);
+
+ spin_lock(ptl);
+ return ptl;
+}
+
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
extern void __init hugetlb_cma_check(void);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d6d0d4c03def..1a1434e29740 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1120,6 +1120,63 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
return false;
}
+bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte)
+{
+ pgd_t pgd;
+ p4d_t p4d;
+ pud_t pud;
+ pmd_t pmd;
+
+ BUG_ON(!hpte->ptep);
+ if (hugetlb_pte_size(hpte) >= PGDIR_SIZE) {
+ pgd = *(pgd_t *)hpte->ptep;

sorry did not understand in these conditions why

hugetlb_pte_size(hpte) >= PGDIR_SIZE. I mean why >= check

and not just == check?

+ return pgd_present(pgd) && pgd_leaf(pgd);
+ } else if (hugetlb_pte_size(hpte) >= P4D_SIZE) {
+ p4d = *(p4d_t *)hpte->ptep;
+ return p4d_present(p4d) && p4d_leaf(p4d);
+ } else if (hugetlb_pte_size(hpte) >= PUD_SIZE) {
+ pud = *(pud_t *)hpte->ptep;
+ return pud_present(pud) && pud_leaf(pud);
+ } else if (hugetlb_pte_size(hpte) >= PMD_SIZE) {
+ pmd = *(pmd_t *)hpte->ptep;
+ return pmd_present(pmd) && pmd_leaf(pmd);
+ } else if (hugetlb_pte_size(hpte) >= PAGE_SIZE)
+ return pte_present(*hpte->ptep);
+ BUG();
+}
+
+bool hugetlb_pte_none(const struct hugetlb_pte *hpte)
+{
+ if (hugetlb_pte_huge(hpte))
+ return huge_pte_none(huge_ptep_get(hpte->ptep));
+ return pte_none(ptep_get(hpte->ptep));
+}
+
+bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte)
+{
+ if (hugetlb_pte_huge(hpte))
+ return huge_pte_none_mostly(huge_ptep_get(hpte->ptep));
+ return pte_none_mostly(ptep_get(hpte->ptep));
+}
+
+pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte)
+{
+ if (hugetlb_pte_huge(hpte))
+ return huge_ptep_get(hpte->ptep);
+ return ptep_get(hpte->ptep);
+}
+
+void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
+ unsigned long address)
+{
+ BUG_ON(!hpte->ptep);
+ unsigned long sz = hugetlb_pte_size(hpte);
+
+ if (sz > PAGE_SIZE)
+ return huge_pte_clear(mm, address, hpte->ptep, sz);

just for cosistency something like above?

if (hugetlb_pte_huge(hpte))
+ return huge_pte_clear
;

+ return pte_clear(mm, address, hpte->ptep);
+}
+
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);