Re: [RFC PATCH v2 18/20] powerpc/64s: Use contiguous PMD/PUD instead of HUGEPD

From: Christophe Leroy
Date: Mon May 20 2024 - 12:43:29 EST




Le 20/05/2024 à 14:54, Nicholas Piggin a écrit :
> On Sat May 18, 2024 at 5:00 AM AEST, Christophe Leroy wrote:
>> On book3s/64, the only user of hugepd is hash in 4k mode.
>>
>> All other setups (hash-64, radix-4, radix-64) use leaf PMD/PUD.
>>
>> Rework hash-4k to use contiguous PMD and PUD instead.
>>
>> In that setup there are only two huge page sizes: 16M and 16G.
>>
>> 16M sits at PMD level and 16G at PUD level.
>>
>> pte_update doesn't know page size, lets use the same trick as
>> hpte_need_flush() to get page size from segment properties. That's
>> not the most efficient way but let's do that until callers of
>> pte_update() provide page size instead of just a huge flag.
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxxxxxx>
>> ---
>> arch/powerpc/include/asm/book3s/64/hash-4k.h | 15 --------
>> arch/powerpc/include/asm/book3s/64/hash.h | 38 +++++++++++++++----
>> arch/powerpc/include/asm/book3s/64/hugetlb.h | 38 -------------------
>> .../include/asm/book3s/64/pgtable-4k.h | 34 -----------------
>> .../include/asm/book3s/64/pgtable-64k.h | 20 ----------
>> arch/powerpc/include/asm/hugetlb.h | 4 ++
>> .../include/asm/nohash/32/hugetlb-8xx.h | 4 --
>> .../powerpc/include/asm/nohash/hugetlb-e500.h | 4 --
>> arch/powerpc/include/asm/page.h | 8 ----
>> arch/powerpc/mm/book3s64/hash_utils.c | 11 ++++--
>> arch/powerpc/mm/book3s64/pgtable.c | 12 ------
>> arch/powerpc/mm/hugetlbpage.c | 19 ----------
>> arch/powerpc/mm/pgtable.c | 2 +-
>> arch/powerpc/platforms/Kconfig.cputype | 1 -
>> 14 files changed, 43 insertions(+), 167 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
>> index 6472b08fa1b0..c654c376ef8b 100644
>> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
>> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
>> @@ -74,21 +74,6 @@
>> #define remap_4k_pfn(vma, addr, pfn, prot) \
>> remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
>>
>> -#ifdef CONFIG_HUGETLB_PAGE
>> -static inline int hash__hugepd_ok(hugepd_t hpd)
>> -{
>> - unsigned long hpdval = hpd_val(hpd);
>> - /*
>> - * if it is not a pte and have hugepd shift mask
>> - * set, then it is a hugepd directory pointer
>> - */
>> - if (!(hpdval & _PAGE_PTE) && (hpdval & _PAGE_PRESENT) &&
>> - ((hpdval & HUGEPD_SHIFT_MASK) != 0))
>> - return true;
>> - return false;
>> -}
>> -#endif
>> -
>> /*
>> * 4K PTE format is different from 64K PTE format. Saving the hash_slot is just
>> * a matter of returning the PTE bits that need to be modified. On 64K PTE,
>> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
>> index faf3e3b4e4b2..509811ca7695 100644
>> --- a/arch/powerpc/include/asm/book3s/64/hash.h
>> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
>> @@ -4,6 +4,7 @@
>> #ifdef __KERNEL__
>>
>> #include <asm/asm-const.h>
>> +#include <asm/book3s/64/slice.h>
>>
>> /*
>> * Common bits between 4K and 64K pages in a linux-style PTE.
>> @@ -161,14 +162,10 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
>> pte_t *ptep, unsigned long pte, int huge);
>> unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags);
>> /* Atomic PTE updates */
>> -static inline unsigned long hash__pte_update(struct mm_struct *mm,
>> - unsigned long addr,
>> - pte_t *ptep, unsigned long clr,
>> - unsigned long set,
>> - int huge)
>> +static inline unsigned long hash__pte_update_one(pte_t *ptep, unsigned long clr,
>> + unsigned long set)
>> {
>> __be64 old_be, tmp_be;
>> - unsigned long old;
>>
>> __asm__ __volatile__(
>> "1: ldarx %0,0,%3 # pte_update\n\
>> @@ -182,11 +179,38 @@ static inline unsigned long hash__pte_update(struct mm_struct *mm,
>> : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep),
>> "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
>> : "cc" );
>> +
>> + return be64_to_cpu(old_be);
>> +}
>> +
>> +static inline unsigned long hash__pte_update(struct mm_struct *mm,
>> + unsigned long addr,
>> + pte_t *ptep, unsigned long clr,
>> + unsigned long set,
>> + int huge)
>> +{
>> + unsigned long old;
>> +
>> + old = hash__pte_update_one(ptep, clr, set);
>> +
>> + if (huge && IS_ENABLED(CONFIG_PPC_4K_PAGES)) {
>> + unsigned int psize = get_slice_psize(mm, addr);
>> + int nb, i;
>> +
>> + if (psize == MMU_PAGE_16M)
>> + nb = SZ_16M / PMD_SIZE;
>> + else if (psize == MMU_PAGE_16G)
>> + nb = SZ_16G / PUD_SIZE;
>> + else
>> + nb = 1;
>> +
>> + for (i = 1; i < nb; i++)
>> + hash__pte_update_one(ptep + i, clr, set);
>> + }
>> /* huge pages use the old page table lock */
>> if (!huge)
>> assert_pte_locked(mm, addr);
>>
>> - old = be64_to_cpu(old_be);
>> if (old & H_PAGE_HASHPTE)
>> hpte_need_flush(mm, addr, ptep, old, huge);
>>
>
> Nice series, I don't know this hugepd code very well but I'll try.
> Why do you have to replicate the PTE entry here? The hash table refill
> should always be working on the first PTE of the page otherwise we have
> bigger problems.

I don't know how book3s/64 works exactly, but on nohash, when you get a
TLB miss exception the only thing you have is the address and you don't
know yes it is a hugepage so you get the PTE as if it was a 4k page and
it is only when you read that PTE that you know it is a hugepage.

Ok, on book3s/64 the page size seems to be encoded inside the segment so
maybe it is a bit different but anyway the TLB miss exception (or DSI ?)
can happen at any address.

>
> What paths look at the N > 0 PTEs of a contiguous page entry?
>

pte_offset_kernel() or pte_offset_map_lock() will land on any contiguous
PTE based on the address handed to pte_index(), as if it was a standard
(4k or 64k) page.

pte_index() doesn't know it is a hugepage, that's the reason why we need
to duplicate the entry.