Re: [PATCH v4] arm64: Add support for PTE contiguous bit.

From: Steve Capper
Date: Wed Dec 16 2015 - 07:50:33 EST


On 11 December 2015 at 21:02, David Woods <dwoods@xxxxxxxxxx> wrote:
> The arm64 MMU supports a Contiguous bit which is a hint that the TTE
> is one of a set of contiguous entries which can be cached in a single
> TLB entry. Supporting this bit adds new intermediate huge page sizes.
>
> The set of huge page sizes available depends on the base page size.
> Without using contiguous pages the huge page sizes are as follows.
>
> 4KB: 2MB 1GB
> 64KB: 512MB
>
> With a 4KB granule, the contiguous bit groups together sets of 16 pages
> and with a 64KB granule it groups sets of 32 pages. This enables two new
> huge page sizes in each case, so that the full set of available sizes
> is as follows.
>
> 4KB: 64KB 2MB 32MB 1GB
> 64KB: 2MB 512MB 16GB
>
> If a 16KB granule is used then the contiguous bit groups 128 pages
> at the PTE level and 32 pages at the PMD level.
>
> If the base page size is set to 64KB then 2MB pages are enabled by
> default. It is possible in the future to make 2MB the default huge
> page size for both 4KB and 64KB granules.
>
> Signed-off-by: David Woods <dwoods@xxxxxxxxxx>
> Reviewed-by: Chris Metcalf <cmetcalf@xxxxxxxxxx>
> ---
>
> This version of the patch addresses all the comments I've received
> to date and is passing the libhugetlbfs tests. Catalin, assuming
> there are no further comments, can this be considered for the arm64
> next tree?

Hi David,
Thanks for this revised series.

I have a few comments below. Most arose when I enabled STRICT_MM_TYPECHECKS.

I have tested this on my arm64 system with PAGE_SIZE==64KB, and it ran well.

Cheers,
--
Steve

>
> arch/arm64/Kconfig | 3 -
> arch/arm64/include/asm/hugetlb.h | 44 ++----
> arch/arm64/include/asm/pgtable-hwdef.h | 18 ++-
> arch/arm64/include/asm/pgtable.h | 10 +-
> arch/arm64/mm/hugetlbpage.c | 267 ++++++++++++++++++++++++++++++++-
> include/linux/hugetlb.h | 2 -
> 6 files changed, 306 insertions(+), 38 deletions(-)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 4876459..ffa3c54 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -530,9 +530,6 @@ config HW_PERF_EVENTS
> config SYS_SUPPORTS_HUGETLBFS
> def_bool y
>
> -config ARCH_WANT_GENERAL_HUGETLB
> - def_bool y
> -
> config ARCH_WANT_HUGE_PMD_SHARE
> def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
>
> diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
> index bb4052e..bbc1e35 100644
> --- a/arch/arm64/include/asm/hugetlb.h
> +++ b/arch/arm64/include/asm/hugetlb.h
> @@ -26,36 +26,7 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
> return *ptep;
> }
>
> -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
> - pte_t *ptep, pte_t pte)
> -{
> - set_pte_at(mm, addr, ptep, pte);
> -}
> -
> -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
> - unsigned long addr, pte_t *ptep)
> -{
> - ptep_clear_flush(vma, addr, ptep);
> -}
> -
> -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
> - unsigned long addr, pte_t *ptep)
> -{
> - ptep_set_wrprotect(mm, addr, ptep);
> -}
>
> -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> - unsigned long addr, pte_t *ptep)
> -{
> - return ptep_get_and_clear(mm, addr, ptep);
> -}
> -
> -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> - unsigned long addr, pte_t *ptep,
> - pte_t pte, int dirty)
> -{
> - return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
> -}
>
> static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
> unsigned long addr, unsigned long end,
> @@ -97,4 +68,19 @@ static inline void arch_clear_hugepage_flags(struct page *page)
> clear_bit(PG_dcache_clean, &page->flags);
> }
>
> +extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
> + struct page *page, int writable);
> +#define arch_make_huge_pte arch_make_huge_pte
> +extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, pte_t pte);
> +extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep,
> + pte_t pte, int dirty);
> +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> + unsigned long addr, pte_t *ptep);
> +extern void huge_ptep_set_wrprotect(struct mm_struct *mm,
> + unsigned long addr, pte_t *ptep);
> +extern void huge_ptep_clear_flush(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep);
> +
> #endif /* __ASM_HUGETLB_H */
> diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
> index d6739e8..5c25b83 100644
> --- a/arch/arm64/include/asm/pgtable-hwdef.h
> +++ b/arch/arm64/include/asm/pgtable-hwdef.h
> @@ -90,7 +90,23 @@
> /*
> * Contiguous page definitions.
> */
> -#define CONT_PTES (_AC(1, UL) << CONT_SHIFT)
> +#ifdef CONFIG_ARM64_64K_PAGES
> +#define CONT_PTE_SHIFT 5
> +#define CONT_PMD_SHIFT 5
> +#elif defined(CONFIG_ARM64_16K_PAGES)
> +#define CONT_PTE_SHIFT 7
> +#define CONT_PMD_SHIFT 5
> +#else
> +#define CONT_PTE_SHIFT 4
> +#define CONT_PMD_SHIFT 4
> +#endif
> +
> +#define CONT_PTES (1 << CONT_PTE_SHIFT)
> +#define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE)
> +#define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1))
> +#define CONT_PMDS (1 << CONT_PMD_SHIFT)
> +#define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE)
> +#define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1))
> /* the the numerical offset of the PTE within a range of CONT_PTES */
> #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1))
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 450b355..35a318c 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -227,7 +227,8 @@ static inline pte_t pte_mkspecial(pte_t pte)
>
> static inline pte_t pte_mkcont(pte_t pte)
> {
> - return set_pte_bit(pte, __pgprot(PTE_CONT));
> + pte = set_pte_bit(pte, __pgprot(PTE_CONT));
> + return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE));
> }
>
> static inline pte_t pte_mknoncont(pte_t pte)
> @@ -235,6 +236,11 @@ static inline pte_t pte_mknoncont(pte_t pte)
> return clear_pte_bit(pte, __pgprot(PTE_CONT));
> }
>
> +static inline pmd_t pmd_mkcont(pmd_t pmd)
> +{
> + return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
> +}
> +
> static inline void set_pte(pte_t *ptep, pte_t pte)
> {
> *ptep = pte;
> @@ -304,7 +310,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
> /*
> * Hugetlb definitions.
> */
> -#define HUGE_MAX_HSTATE 2
> +#define HUGE_MAX_HSTATE 4
> #define HPAGE_SHIFT PMD_SHIFT
> #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT)
> #define HPAGE_MASK (~(HPAGE_SIZE - 1))
> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> index 383b03f..39a5e67 100644
> --- a/arch/arm64/mm/hugetlbpage.c
> +++ b/arch/arm64/mm/hugetlbpage.c
> @@ -41,17 +41,282 @@ int pud_huge(pud_t pud)
> #endif
> }
>
> +static int find_num_contig(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, pte_t pte, size_t *pgsize)
> +{
> + pgd_t *pgd = pgd_offset(mm, addr);
> + pud_t *pud;
> + pmd_t *pmd;
> +

nit: We should probably set *pgsize = PAGE_SIZE here that way it's
defined on early return.
(Not a problem now, but may help if this code needs to be tweaked in future).

> + if (!pte_cont(pte))
> + return 1;
> + if (!pgd_present(*pgd)) {
> + VM_BUG_ON(!pgd_present(*pgd));
> + return 1;
> + }
> + pud = pud_offset(pgd, addr);
> + if (!pud_present(*pud)) {
> + VM_BUG_ON(!pud_present(*pud));
> + return 1;
> + }
> + pmd = pmd_offset(pud, addr);
> + if (!pmd_present(*pmd)) {
> + VM_BUG_ON(!pmd_present(*pmd));
> + return 1;
> + }
> + if ((pte_t *)pmd == ptep) {
> + *pgsize = PMD_SIZE;
> + return CONT_PMDS;
> + }
> + *pgsize = PAGE_SIZE;
> + return CONT_PTES;
> +}
> +
> +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, pte_t pte)
> +{
> + size_t pgsize;
> + int i;
> + int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize);
> + unsigned long pfn;
> + pgprot_t hugeprot;
> +
> + if (ncontig == 1) {
> + set_pte_at(mm, addr, ptep, pte);
> + return;
> + }
> +
> + pfn = pte_pfn(pte);
> + hugeprot = __pgprot(pte_val(pfn_pte(pfn, 0)) ^ pte_val(pte));

For pfn_pte, we need the following to satisfy the strict mm checks:
pfn_pte(pfn, __pgprot(0))


> + for (i = 0; i < ncontig; i++) {
> + pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep,
> + pfn_pte(pfn, hugeprot));

We need to wrap the last argument with pte_val(.);

> + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
> + ptep++;
> + pfn += pgsize >> PAGE_SHIFT;
> + addr += pgsize;
> + }
> +}
> +
> +pte_t *huge_pte_alloc(struct mm_struct *mm,
> + unsigned long addr, unsigned long sz)
> +{
> + pgd_t *pgd;
> + pud_t *pud;
> + pte_t *pte = NULL;
> +
> + pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz);
> + pgd = pgd_offset(mm, addr);
> + pud = pud_alloc(mm, pgd, addr);
> + if (!pud)
> + return NULL;
> +
> + if (sz == PUD_SIZE) {
> + pte = (pte_t *)pud;
> + } else if (sz == (PAGE_SIZE * CONT_PTES)) {
> + pmd_t *pmd = pmd_alloc(mm, pud, addr);
> +
> + WARN_ON(addr & (sz - 1));
> + pte = pte_alloc_map(mm, NULL, pmd, addr);

We get away with this on arm64 because we don't have high memory.

If one were to port this to 32-bit ARM, then this would break as it
would leave a mapping open.
(i.e. there's no corresponding pte_unmap(.) to line up with the
pte_offset_map from pte_alloc_map).

Maybe worth a quick comment for potential porters that they need to
either disable CONFIG_HIGHPTE or rework the page table page allocation
for contiguous ptes (tricky as one may already have non-contiguous
ptes present).

> + } else if (sz == PMD_SIZE) {
> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> + pud_none(*pud))
> + pte = huge_pmd_share(mm, addr, pud);
> + else
> + pte = (pte_t *)pmd_alloc(mm, pud, addr);
> + } else if (sz == (PMD_SIZE * CONT_PMDS)) {
> + pmd_t *pmd;
> +
> + pmd = pmd_alloc(mm, pud, addr);
> + WARN_ON(addr & (sz - 1));
> + return (pte_t *)pmd;
> + }
> +
> + pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr,
> + sz, pte, pte_val(*pte));
> + return pte;
> +}
> +
> +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
> +{
> + pgd_t *pgd;
> + pud_t *pud;
> + pmd_t *pmd = NULL;
> + pte_t *pte = NULL;
> +
> + pgd = pgd_offset(mm, addr);
> + pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
> + if (!pgd_present(*pgd))
> + return NULL;
> + pud = pud_offset(pgd, addr);
> + if (!pud_present(*pud))
> + return NULL;
> +
> + if (pud_huge(*pud))
> + return (pte_t *)pud;
> + pmd = pmd_offset(pud, addr);
> + if (!pmd_present(*pmd))
> + return NULL;
> +
> + if (pte_cont(pmd_pte(*pmd))) {
> + pmd = pmd_offset(
> + pud, (addr & CONT_PMD_MASK));
> + return (pte_t *)pmd;
> + }
> + if (pmd_huge(*pmd))
> + return (pte_t *)pmd;
> + pte = pte_offset_kernel(pmd, addr);
> + if (pte_present(*pte) && pte_cont(*pte)) {
> + pte = pte_offset_kernel(
> + pmd, (addr & CONT_PTE_MASK));

Probably best return pte here.


> + }
> + return pte;

and NULL here to signify an error (as we get to the point where we
know that addr isn't referring to a huge page).

> +}
> +
> +pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
> + struct page *page, int writable)
> +{
> + size_t pagesize = huge_page_size(hstate_vma(vma));
> +
> + if (pagesize == CONT_PTE_SIZE) {
> + entry = pte_mkcont(entry);
> + } else if (pagesize == CONT_PMD_SIZE) {
> + entry = pmd_pte(pmd_mkcont(pte_pmd(entry)));
> + } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) {
> + pr_warn("%s: unrecognized huge page size 0x%lx\n",
> + __func__, pagesize);
> + }
> + return entry;
> +}
> +
> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> + unsigned long addr, pte_t *ptep)
> +{
> + pte_t pte;
> +
> + if (pte_cont(*ptep)) {
> + int ncontig, i;
> + size_t pgsize;
> + pte_t *cpte;
> + bool is_dirty = false;
> +
> + cpte = huge_pte_offset(mm, addr);
> + ncontig = find_num_contig(mm, addr, cpte,
> + pte_val(*cpte), &pgsize);

The call to pte_val is spurious.

> + /* save the 1st pte to return */
> + pte = ptep_get_and_clear(mm, addr, cpte);
> + for (i = 1; i < ncontig; ++i) {
> + /*
> + * If HW_AFDBM is enabled, then the HW could
> + * turn on the dirty bit for any of the page
> + * in the set, so check them all.
> + */
> + ++cpte;
> + if (pte_dirty(ptep_get_and_clear(mm, addr, cpte)))
> + is_dirty = true;

Okay, ptep_get_and_clear uses the exclusive monitor to guard against
updates from AFDBM. The above looks good to me.

> + }
> + if (is_dirty)
> + return pte_mkdirty(pte);
> + else
> + return pte;
> + } else {
> + return ptep_get_and_clear(mm, addr, ptep);
> + }
> +}
> +
> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep,
> + pte_t pte, int dirty)
> +{
> + pte_t *cpte;
> +
> + if (pte_cont(pte)) {
> + int ncontig, i, changed = 0;
> + size_t pgsize = 0;
> + unsigned long pfn = pte_pfn(pte);
> + /* Select all bits except the pfn */
> + pgprot_t hugeprot =
> + __pgprot(pte_val(pfn_pte(pfn, 0) ^ pte_val(pte)));

pfn_pte needs the second argument wrapping with __pgprot(.)

> +
> + cpte = huge_pte_offset(vma->vm_mm, addr);
> + pfn = pte_pfn(*cpte);
> + ncontig = find_num_contig(vma->vm_mm, addr, cpte,
> + pte_val(*cpte), &pgsize);

The call to pte_val(.) is spurious.

> + for (i = 0; i < ncontig; ++i, ++cpte) {
> + changed = ptep_set_access_flags(vma, addr, cpte,
> + pfn_pte(pfn,
> + hugeprot),
> + dirty);

ptep_set_access_flags calls through to set_pte_at which will warn if
we are in a scenario where we can lose dirty information. So this code
looks okay for AFDBM to me.

> + pfn += pgsize >> PAGE_SHIFT;
> + }
> + return changed;
> + } else {
> + return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
> + }
> +}
> +
> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
> + unsigned long addr, pte_t *ptep)
> +{
> + if (pte_cont(*ptep)) {
> + int ncontig, i;
> + pte_t *cpte;
> + size_t pgsize = 0;
> +
> + cpte = huge_pte_offset(mm, addr);
> + ncontig = find_num_contig(mm, addr, cpte,
> + pte_val(*cpte), &pgsize);

The call to pte_val is spurious(.).

> + for (i = 0; i < ncontig; ++i, ++cpte)
> + ptep_set_wrprotect(mm, addr, cpte);
> + } else {
> + ptep_set_wrprotect(mm, addr, ptep);
> + }

ptep_set_wrprotect uses the exclusive monitor, thus is protected from
AFDBM. This looks good to me.

> +}
> +
> +void huge_ptep_clear_flush(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep)
> +{
> + if (pte_cont(*ptep)) {
> + int ncontig, i;
> + pte_t *cpte;
> + size_t pgsize = 0;
> +
> + cpte = huge_pte_offset(vma->vm_mm, addr);
> + ncontig = find_num_contig(vma->vm_mm, addr, cpte,
> + pte_val(*cpte), &pgsize);

Call to pte_val is spurious.

> + for (i = 0; i < ncontig; ++i, ++cpte)
> + ptep_clear_flush(vma, addr, cpte);
> + } else {
> + ptep_clear_flush(vma, addr, ptep);
> + }
> +}
> +
> static __init int setup_hugepagesz(char *opt)
> {
> unsigned long ps = memparse(opt, &opt);
> +
> if (ps == PMD_SIZE) {
> hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
> } else if (ps == PUD_SIZE) {
> hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
> + } else if (ps == (PAGE_SIZE * CONT_PTES)) {
> + hugetlb_add_hstate(CONT_PTE_SHIFT);
> + } else if (ps == (PMD_SIZE * CONT_PMDS)) {
> + hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT);
> } else {
> - pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
> + pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
> return 0;
> }
> return 1;
> }
> __setup("hugepagesz=", setup_hugepagesz);
> +
> +#ifdef CONFIG_ARM64_64K_PAGES
> +static __init int add_default_hugepagesz(void)
> +{
> + if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL)
> + hugetlb_add_hstate(CONT_PMD_SHIFT);
> + return 0;
> +}
> +arch_initcall(add_default_hugepagesz);
> +#endif
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 685c262..b0eb064 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -96,9 +96,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
> struct address_space *mapping,
> pgoff_t idx, unsigned long address);
>
> -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
> pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
> -#endif
>
> extern int hugepages_treat_as_movable;
> extern int sysctl_hugetlb_shm_group;
> --
> 2.1.2
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/