Re: [RFC PATCH v2 07/20] powerpc/8xx: Rework support for 8M pages using contiguous PTE entries

From: Oscar Salvador
Date: Fri May 24 2024 - 06:02:33 EST


On Fri, May 17, 2024 at 09:00:01PM +0200, Christophe Leroy wrote:
> In order to fit better with standard Linux page tables layout, add
> support for 8M pages using contiguous PTE entries in a standard
> page table. Page tables will then be populated with 1024 similar
> entries and two PMD entries will point to that page table.
>
> The PMD entries also get a flag to tell it is addressing an 8M page,
> this is required for the HW tablewalk assistance.
>
> Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxxxxxx>

I guess that this will slightly change if you remove patch#1 and patch#2
as you said you will.
So I will not comment on the overall design because I do not know how it will
look afterwards, but just some things that caught my eye

> --- a/arch/powerpc/include/asm/hugetlb.h
> +++ b/arch/powerpc/include/asm/hugetlb.h
> @@ -41,7 +41,16 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
> static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> unsigned long addr, pte_t *ptep)
> {
> - return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
> + pmd_t *pmdp = (pmd_t *)ptep;
> + pte_t pte;
> +
> + if (IS_ENABLED(CONFIG_PPC_8xx) && pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {

There are quite some places where you do the "pmd_off" to check whether that
is a 8MB entry.
I think it would make some sense to have some kind of macro/function to make
more clear what we are checking against.
e.g:

#define pmd_is_SZ_8M(mm, addr, pmdp) (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))
(or whatever name you see fit)

then you would just need

if (IS_ENABLED(CONFIG_PPC_8xx) && pmd_is_SZ_8M(mm, addr, pmdp))

Because I see that is also scattered in 8xx code.


> + pte = __pte(pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1));
> + pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 1);

I have this fresh one because I recently read about 8xx pagetables, but not sure
how my memory will survive this, so maybe throw a little comment in there that
we are pointing the two pmds to the area.

Also, the way we pass the parameters here to pte_update() is a bit awkward.
Ideally we should be using some meaningful names?

clr_all_bits = ~0UL
set_bits = 0
bool is_huge = true

pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr_all_bits, set_bits, is_huge)

or something along those lines

> -static inline int check_and_get_huge_psize(int shift)
> -{
> - return shift_to_mmu_psize(shift);
> + if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M)))

Here you could also use the pmd_is_SZ_8M()

> + ptep = pte_offset_kernel(pmdp, 0);
> + return ptep_get(ptep);
> }
>
> #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
> @@ -53,7 +33,14 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
> static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
> pte_t *ptep, unsigned long sz)
> {
> - pte_update(mm, addr, ptep, ~0UL, 0, 1);
> + pmd_t *pmdp = (pmd_t *)ptep;
> +
> + if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
> + pte_update(mm, addr, pte_offset_kernel(pmdp, 0), ~0UL, 0, 1);
> + pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), ~0UL, 0, 1);
> + } else {
> + pte_update(mm, addr, ptep, ~0UL, 0, 1);
> + }

Could we not leverage this in huge_ptep_get_and_clear()?
AFAICS,

huge_ptep_get_and_clear(mm, addr, pte_t *p)
{
pte_t pte = pte_val(*p);

huge_pte_clear(mm, addr, p);
return pte;
}

Or maybe it is not that easy if different powerpc platforms provide their own.
It might be worth checking though.

> }
>
> #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
> @@ -63,7 +50,14 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
> unsigned long clr = ~pte_val(pte_wrprotect(__pte(~0)));
> unsigned long set = pte_val(pte_wrprotect(__pte(0)));
>
> - pte_update(mm, addr, ptep, clr, set, 1);
> + pmd_t *pmdp = (pmd_t *)ptep;
> +
> + if (pmdp == pmd_off(mm, ALIGN_DOWN(addr, SZ_8M))) {
> + pte_update(mm, addr, pte_offset_kernel(pmdp, 0), clr, set, 1);
> + pte_update(mm, addr, pte_offset_kernel(pmdp + 1, 0), clr, set, 1);
> + } else {
> + pte_update(mm, addr, ptep, clr, set, 1);

I would replace the "1" with "is_huge" or "huge", as being done in
__ptep_set_access_flags , something that makes it more clear without the need
to check pte_update().


> #endif /* _ASM_POWERPC_PGALLOC_32_H */
> diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> index 07df6b664861..b05cc4f87713 100644
> --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
..
> - * For other page sizes, we have a single entry in the table.
> + * For 8M pages, we have 1024 entries as if it was
> + * 4M pages, but they are flagged as 8M pages for the hardware.

Maybe drop a comment that a single PMD entry is worth 4MB, so

> + * For 4k pages, we have a single entry in the table.
> */
> -static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr);
> -static int hugepd_ok(hugepd_t hpd);
> -
> static inline int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge)
> {
> if (!huge)
> return PAGE_SIZE / SZ_4K;
> - else if (hugepd_ok(*((hugepd_t *)pmd)))
> - return 1;
> + else if ((pmd_val(*pmd) & _PMD_PAGE_MASK) == _PMD_PAGE_8M)
> + return SZ_4M / SZ_4K;

this becomes more intuitive.


> +static inline void pmd_populate_kernel_size(struct mm_struct *mm, pmd_t *pmdp,
> + pte_t *pte, unsigned long sz)
> +{
> + if (sz == SZ_8M)
> + *pmdp = __pmd(__pa(pte) | _PMD_PRESENT | _PMD_PAGE_8M);
> + else
> + *pmdp = __pmd(__pa(pte) | _PMD_PRESENT);
> +}
> +
> +static inline void pmd_populate_size(struct mm_struct *mm, pmd_t *pmdp,
> + pgtable_t pte_page, unsigned long sz)
> +{
> + if (sz == SZ_8M)
> + *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT | _PMD_PAGE_8M);
> + else
> + *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT);
> +}

In patch#1 you mentioned this will change with the removal of patch#1
and patch#2.

> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -183,9 +183,6 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> if (!hpdp)
> return NULL;
>
> - if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
> - return pte_alloc_huge(mm, (pmd_t *)hpdp, addr, sz);
> -
> BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
>
> if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
> @@ -198,10 +195,18 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long addr, unsigned long sz)
> {
> + pmd_t *pmd = pmd_off(mm, addr);
> +
> if (sz < PMD_SIZE)
> - return pte_alloc_huge(mm, pmd_off(mm, addr), addr, sz);
> + return pte_alloc_huge(mm, pmd, addr, sz);
>
> - return NULL;
> + if (sz != SZ_8M)
> + return NULL;
> + if (!pte_alloc_huge(mm, pmd, addr, sz))
> + return NULL;
> + if (!pte_alloc_huge(mm, pmd + 1, addr, sz))
> + return NULL;
> + return (pte_t *)pmd;

I think that having the check for invalid huge page sizes upfront would
make more sense, maybe just a matter of taste.

/* Unsupported size */
if (sz > PMD_SIZE && sz != SZ_8M)
return NULL;

if (sz < PMD_SIZE)
...
/* 8MB huge pages */
...

return (pte_t *) pmd;

Also, I am not a big fan of the two separate pte_alloc_huge() for pmd#0+pmd#1,
and I am thinking we might want to hide that within a function and drop a
comment in there explaining why we are updating both pmds.



> diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
> index d93433e26ded..99f656b3f9f3 100644
> --- a/arch/powerpc/mm/nohash/8xx.c
> +++ b/arch/powerpc/mm/nohash/8xx.c
> @@ -48,20 +48,6 @@ unsigned long p_block_mapped(phys_addr_t pa)
> return 0;
> }
>
> -static pte_t __init *early_hugepd_alloc_kernel(hugepd_t *pmdp, unsigned long va)
> -{
> - if (hpd_val(*pmdp) == 0) {
> - pte_t *ptep = memblock_alloc(sizeof(pte_basic_t), SZ_4K);
> -
> - if (!ptep)
> - return NULL;
> -
> - hugepd_populate_kernel((hugepd_t *)pmdp, ptep, PAGE_SHIFT_8M);
> - hugepd_populate_kernel((hugepd_t *)pmdp + 1, ptep, PAGE_SHIFT_8M);
> - }
> - return hugepte_offset(*(hugepd_t *)pmdp, va, PGDIR_SHIFT);
> -}
> -
> static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa,
> pgprot_t prot, int psize, bool new)

Am I blind or do we never use the 'new' parameter?
I checked the tree and it seems we always pass it 'true'.

arch/powerpc/mm/nohash/8xx.c: err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
arch/powerpc/mm/nohash/8xx.c: err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new);
arch/powerpc/mm/nohash/8xx.c: err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new);
arch/powerpc/mm/nohash/8xx.c:
__early_map_kernel_hugepage(VIRT_IMMR_BASE, PHYS_IMMR_BASE, PAGE_KERNEL_NCG, MMU_PAGE_512K, true);

I think we can drop the 'new' and the block code that tries to handle
it?

> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index acdf64c9b93e..59f0d7706d2f 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c

> +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
> + pte_t pte, unsigned long sz)
> +{
> + pmd_t *pmdp = pmd_off(mm, addr);
> +
> + pte = set_pte_filter(pte, addr);
> +
> + if (sz == SZ_8M) {
> + __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp, 0), pte_val(pte));
> + __set_huge_pte_at(pmdp, pte_offset_kernel(pmdp + 1, 0), pte_val(pte) + SZ_4M);

You also mentioned that this would slightly change after you drop
patch#1 and patch#2.
The only comment I have right now would be to add a little comment
explaining the layout (the replication of 1024 entries), or just
something like "see comment from number_of_cells_per_pte".



--
Oscar Salvador
SUSE Labs