Re: [PATCH v3 09/20] kvm: arm64: Make stage2 page table layout dynamic

From: Auger Eric
Date: Mon Jul 02 2018 - 08:14:42 EST


Hi Suzuki,

On 06/29/2018 01:15 PM, Suzuki K Poulose wrote:
> So far we had a static stage2 page table handling code, based on a
> fixed IPA of 40bits. As we prepare for a configurable IPA size per
> VM, make our stage2 page table code dynamic, to do the right thing
> for a given VM. We ensure the existing condition is always true even
> when we lift the limit on the IPA. i.e,
>
> page table levels in stage1 >= page table levels in stage2
>
> Support for the IPA size configuration needs other changes in the way
> we configure the EL2 registers (VTTBR and VTCR). So, the IPA is still
> fixed to 40bits. The patch also moves the kvm_page_empty() in asm/kvm_mmu.h
> to the top, before including the asm/stage2_pgtable.h to avoid a forward
> declaration.
>
> Cc: Marc Zyngier <marc.zyngier@xxxxxxx>
> Cc: Christoffer Dall <cdall@xxxxxxxxxx>
> Signed-off-by: Suzuki K Poulose <suzuki.poulose@xxxxxxx>
> ---
> Changes since V2
> - Restrict the stage2 page table to allow reusing the host page table
> helpers for now, until we get stage1 independent page table helpers.
I would move this up in the commit msg to motivate the fact we enforce
the able condition.
> ---
> arch/arm64/include/asm/kvm_mmu.h | 14 +-
> arch/arm64/include/asm/stage2_pgtable-nopmd.h | 42 ------
> arch/arm64/include/asm/stage2_pgtable-nopud.h | 39 -----
> arch/arm64/include/asm/stage2_pgtable.h | 207 +++++++++++++++++++-------
> 4 files changed, 159 insertions(+), 143 deletions(-)
> delete mode 100644 arch/arm64/include/asm/stage2_pgtable-nopmd.h
> delete mode 100644 arch/arm64/include/asm/stage2_pgtable-nopud.h

with my very limited knowledge of S2 page table walkers I fail to
understand why we now can get rid of stage2_pgtable-nopmd.h and
stage2_pgtable-nopud.h and associated FOLDED config. Please could you
explain it in the commit message?
>
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index dbaf513..a351722 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -21,6 +21,7 @@
> #include <asm/page.h>
> #include <asm/memory.h>
> #include <asm/cpufeature.h>
> +#include <asm/kvm_arm.h>
>
> /*
> * As ARMv8.0 only has the TTBR0_EL2 register, we cannot express
> @@ -147,6 +148,13 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
> #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL))
> #define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK
>
> +static inline bool kvm_page_empty(void *ptr)
> +{
> + struct page *ptr_page = virt_to_page(ptr);
> +
> + return page_count(ptr_page) == 1;
> +}
> +
> #include <asm/stage2_pgtable.h>
>
> int create_hyp_mappings(void *from, void *to, pgprot_t prot);
> @@ -237,12 +245,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
> return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
> }
>
> -static inline bool kvm_page_empty(void *ptr)
> -{
> - struct page *ptr_page = virt_to_page(ptr);
> - return page_count(ptr_page) == 1;
> -}
> -
> #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
>
> #ifdef __PAGETABLE_PMD_FOLDED
> diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h
> deleted file mode 100644
> index 0280ded..0000000
> --- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h
> +++ /dev/null
> @@ -1,42 +0,0 @@
> -/*
> - * Copyright (C) 2016 - ARM Ltd
> - *
> - * This program is free software; you can redistribute it and/or modify
> - * it under the terms of the GNU General Public License version 2 as
> - * published by the Free Software Foundation.
> - *
> - * This program is distributed in the hope that it will be useful,
> - * but WITHOUT ANY WARRANTY; without even the implied warranty of
> - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> - * GNU General Public License for more details.
> - *
> - * You should have received a copy of the GNU General Public License
> - * along with this program. If not, see <http://www.gnu.org/licenses/>.
> - */
> -
> -#ifndef __ARM64_S2_PGTABLE_NOPMD_H_
> -#define __ARM64_S2_PGTABLE_NOPMD_H_
> -
> -#include <asm/stage2_pgtable-nopud.h>
> -
> -#define __S2_PGTABLE_PMD_FOLDED
> -
> -#define S2_PMD_SHIFT S2_PUD_SHIFT
> -#define S2_PTRS_PER_PMD 1
> -#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
> -#define S2_PMD_MASK (~(S2_PMD_SIZE-1))
> -
> -#define stage2_pud_none(kvm, pud) (0)
> -#define stage2_pud_present(kvm, pud) (1)
> -#define stage2_pud_clear(kvm, pud) do { } while (0)
> -#define stage2_pud_populate(kvm, pud, pmd) do { } while (0)
> -#define stage2_pmd_offset(kvm, pud, address) ((pmd_t *)(pud))
> -
> -#define stage2_pmd_free(kvm, pmd) do { } while (0)
> -
> -#define stage2_pmd_addr_end(kvm, addr, end) (end)
> -
> -#define stage2_pud_huge(kvm, pud) (0)
> -#define stage2_pmd_table_empty(kvm, pmdp) (0)
> -
> -#endif
> diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h
> deleted file mode 100644
> index cd6304e..0000000
> --- a/arch/arm64/include/asm/stage2_pgtable-nopud.h
> +++ /dev/null
> @@ -1,39 +0,0 @@
> -/*
> - * Copyright (C) 2016 - ARM Ltd
> - *
> - * This program is free software; you can redistribute it and/or modify
> - * it under the terms of the GNU General Public License version 2 as
> - * published by the Free Software Foundation.
> - *
> - * This program is distributed in the hope that it will be useful,
> - * but WITHOUT ANY WARRANTY; without even the implied warranty of
> - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> - * GNU General Public License for more details.
> - *
> - * You should have received a copy of the GNU General Public License
> - * along with this program. If not, see <http://www.gnu.org/licenses/>.
> - */
> -
> -#ifndef __ARM64_S2_PGTABLE_NOPUD_H_
> -#define __ARM64_S2_PGTABLE_NOPUD_H_
> -
> -#define __S2_PGTABLE_PUD_FOLDED
> -
> -#define S2_PUD_SHIFT S2_PGDIR_SHIFT
> -#define S2_PTRS_PER_PUD 1
> -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT)
> -#define S2_PUD_MASK (~(S2_PUD_SIZE-1))
> -
> -#define stage2_pgd_none(kvm, pgd) (0)
> -#define stage2_pgd_present(kvm, pgd) (1)
> -#define stage2_pgd_clear(kvm, pgd) do { } while (0)
> -#define stage2_pgd_populate(kvm, pgd, pud) do { } while (0)
> -
> -#define stage2_pud_offset(kvm, pgd, address) ((pud_t *)(pgd))
> -
> -#define stage2_pud_free(kvm, x) do { } while (0)
> -
> -#define stage2_pud_addr_end(kvm, addr, end) (end)
> -#define stage2_pud_table_empty(kvm, pmdp) (0)
> -
> -#endif
> diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
> index 057a405..ffc37cc 100644
> --- a/arch/arm64/include/asm/stage2_pgtable.h
> +++ b/arch/arm64/include/asm/stage2_pgtable.h
> @@ -19,8 +19,12 @@
> #ifndef __ARM64_S2_PGTABLE_H_
> #define __ARM64_S2_PGTABLE_H_
>
> +#include <linux/hugetlb.h>
> #include <asm/pgtable.h>
>
> +/* The PGDIR shift for a given page table with "n" levels. */
> +#define pt_levels_pgdir_shift(n) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (n))
> +
> /*
> * The hardware supports concatenation of up to 16 tables at stage2 entry level
> * and we use the feature whenever possible.
> @@ -29,118 +33,209 @@
> * On arm64, the smallest PAGE_SIZE supported is 4k, which means
> * (PAGE_SHIFT - 3) > 4 holds for all page sizes.
Trying to understand that comment. Why do we compare to 4?
> * This implies, the total number of page table levels at stage2 expected
> - * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4)
> + * by the hardware is actually the number of levels required for (IPA_SHIFT - 4)
although understandable, is IPA_SHIFT defined somewhere?
> * in normal translations(e.g, stage1), since we cannot have another level in
> - * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4).
> + * the range (IPA_SHIFT, IPA_SHIFT - 4).
I fail to understand the above comment. Could you give a pointer to the
spec?
> */
> -#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)
> +#define stage2_pt_levels(ipa_shift) ARM64_HW_PGTABLE_LEVELS((ipa_shift) - 4)
>
> /*
> - * With all the supported VA_BITs and 40bit guest IPA, the following condition
> - * is always true:
> + * With all the supported VA_BITs and guest IPA, the following condition
> + * must be always true:
> *
> - * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS
> + * stage2_pt_levels <= CONFIG_PGTABLE_LEVELS
> *
> * We base our stage-2 page table walker helpers on this assumption and
> * fall back to using the host version of the helper wherever possible.
> * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back
> * to using the host version, since it is guaranteed it is not folded at host.
> *
> - * If the condition breaks in the future, we can rearrange the host level
> - * definitions and reuse them for stage2. Till then...
> + * If the condition breaks in the future, we need completely independent
> + * page table helpers. Till then...
> */
> -#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS
> +
> +#if stage2_pt_levels(KVM_PHYS_SHIFT) > CONFIG_PGTABLE_LEVELS
> #error "Unsupported combination of guest IPA and host VA_BITS."
> #endif
>
> -/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */
> -#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS)
> -#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT)
> -#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1))
> -
> /*
> * The number of PTRS across all concatenated stage2 tables given by the
> * number of bits resolved at the initial level.
> */
> -#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT))
> +#define __s2_pgd_ptrs(pa, lvls) (1 << ((pa) - pt_levels_pgdir_shift((lvls))))
> +#define __s2_pgd_size(pa, lvls) (__s2_pgd_ptrs((pa), (lvls)) * sizeof(pgd_t))
> +
> +#define kvm_stage2_levels(kvm) stage2_pt_levels(kvm_phys_shift(kvm))
> +#define stage2_pgdir_shift(kvm) \
> + pt_levels_pgdir_shift(kvm_stage2_levels(kvm))
> +#define stage2_pgdir_size(kvm) (_AC(1, UL) << stage2_pgdir_shift((kvm)))
> +#define stage2_pgdir_mask(kvm) (~(stage2_pgdir_size((kvm)) - 1))
> +#define stage2_pgd_ptrs(kvm) \
> + __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
> +
> +#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
>
> /*
> * kvm_mmmu_cache_min_pages is the number of stage2 page table translation
> * levels in addition to the PGD.
> */
> -#define kvm_mmu_cache_min_pages(kvm) (STAGE2_PGTABLE_LEVELS - 1)
> +#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)
>
>
> -#if STAGE2_PGTABLE_LEVELS > 3
> +/* PUD/PMD definitions if present */
> +#define __S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
> +#define __S2_PUD_SIZE (_AC(1, UL) << __S2_PUD_SHIFT)
> +#define __S2_PUD_MASK (~(__S2_PUD_SIZE - 1))
>
> -#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
> -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT)
> -#define S2_PUD_MASK (~(S2_PUD_SIZE - 1))
> +#define __S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
> +#define __S2_PMD_SIZE (_AC(1, UL) << __S2_PMD_SHIFT)
> +#define __S2_PMD_MASK (~(__S2_PMD_SIZE - 1))
Is this renaming mandatory?
>
> -#define stage2_pgd_none(kvm, pgd) pgd_none(pgd)
> -#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd)
> -#define stage2_pgd_present(kvm, pgd) pgd_present(pgd)
> -#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud)
> -#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address)
> -#define stage2_pud_free(kvm, pud) pud_free(NULL, pud)
> +#define __s2_pud_index(addr) \
> + (((addr) >> __S2_PUD_SHIFT) & (PTRS_PER_PTE - 1))
> +#define __s2_pmd_index(addr) \
> + (((addr) >> __S2_PMD_SHIFT) & (PTRS_PER_PTE - 1))
>
> -#define stage2_pud_table_empty(kvm, pudp) kvm_page_empty(pudp)
> +#define __kvm_has_stage2_levels(kvm, min_levels) \
> + ((CONFIG_PGTABLE_LEVELS >= min_levels) && (kvm_stage2_levels(kvm) >= min_levels))
kvm_stage2_levels <= CONFIG_PGTABLE_LEVELS so you should just need to
check kvm_stage2_levels?
> +
> +#define kvm_stage2_has_pgd(kvm) __kvm_has_stage2_levels(kvm, 4)
> +#define kvm_stage2_has_pud(kvm) __kvm_has_stage2_levels(kvm, 3)
> +
> +static inline int stage2_pgd_none(struct kvm *kvm, pgd_t pgd)
> +{
> + return kvm_stage2_has_pgd(kvm) ? pgd_none(pgd) : 0;
> +}
> +
> +static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp)
> +{
> + if (kvm_stage2_has_pgd(kvm))
> + pgd_clear(pgdp);
> +}
> +
> +static inline int stage2_pgd_present(struct kvm *kvm, pgd_t pgd)
> +{
> + return kvm_stage2_has_pgd(kvm) ? pgd_present(pgd) : 1;
> +}
> +
> +static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgdp, pud_t *pud)
> +{
> + if (kvm_stage2_has_pgd(kvm))
> + pgd_populate(NULL, pgdp, pud);
> + else
> + BUG();
> +}
> +
> +static inline pud_t *stage2_pud_offset(struct kvm *kvm,
> + pgd_t *pgd, unsigned long address)
> +{
> + if (kvm_stage2_has_pgd(kvm)) {
> + phys_addr_t pud_phys = pgd_page_paddr(*pgd);
> +
> + pud_phys += __s2_pud_index(address) * sizeof(pud_t);
> + return __va(pud_phys);
> + }
> + return (pud_t *)pgd;
> +}
> +
> +static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
> +{
> + if (kvm_stage2_has_pgd(kvm))
> + pud_free(NULL, pud);
> +}
> +
> +static inline int stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
> +{
> + return kvm_stage2_has_pgd(kvm) && kvm_page_empty(pudp);
> +}
>
> static inline phys_addr_t
> stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
> {
> - phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
> + if (kvm_stage2_has_pgd(kvm)) {
> + phys_addr_t boundary = (addr + __S2_PUD_SIZE) & __S2_PUD_MASK;
>
> - return (boundary - 1 < end - 1) ? boundary : end;
> + return (boundary - 1 < end - 1) ? boundary : end;
> + }
> + return end;
> }
>
> -#endif /* STAGE2_PGTABLE_LEVELS > 3 */
> +static inline int stage2_pud_none(struct kvm *kvm, pud_t pud)
> +{
> + return kvm_stage2_has_pud(kvm) ? pud_none(pud) : 0;
> +}
>
> +static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pudp)
> +{
> + if (kvm_stage2_has_pud(kvm))
> + pud_clear(pudp);
> +}
>
> -#if STAGE2_PGTABLE_LEVELS > 2
> +static inline int stage2_pud_present(struct kvm *kvm, pud_t pud)
> +{
> + return kvm_stage2_has_pud(kvm) ? pud_present(pud) : 1;
> +}
>
> -#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
> -#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT)
> -#define S2_PMD_MASK (~(S2_PMD_SIZE - 1))
> +static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pudp, pmd_t *pmd)
> +{
> + if (kvm_stage2_has_pud(kvm))
> + pud_populate(NULL, pudp, pmd);
> + else
> + BUG();
> +}
>
> -#define stage2_pud_none(kvm, pud) pud_none(pud)
> -#define stage2_pud_clear(kvm, pud) pud_clear(pud)
> -#define stage2_pud_present(kvm, pud) pud_present(pud)
> -#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd)
> -#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address)
> -#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd)
> +static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
> + pud_t *pud, unsigned long address)
> +{
> + if (kvm_stage2_has_pud(kvm)) {
> + phys_addr_t pmd_phys = pud_page_paddr(*pud);
>
> -#define stage2_pud_huge(kvm, pud) pud_huge(pud)
> -#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp)
> + pmd_phys += __s2_pmd_index(address) * sizeof(pmd_t);
> + return __va(pmd_phys);
> + }
> + return (pmd_t *)pud;
> +}
> +
> +static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
> +{
> + if (kvm_stage2_has_pud(kvm))
> + pmd_free(NULL, pmd);
> +}
> +
> +static inline int stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
> +{
> + return kvm_stage2_has_pud(kvm) && kvm_page_empty(pmdp);
> +}
>
> static inline phys_addr_t
> stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
> {
> - phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
> + if (kvm_stage2_has_pud(kvm)) {
> + phys_addr_t boundary = (addr + __S2_PMD_SIZE) & __S2_PMD_MASK;
>
> - return (boundary - 1 < end - 1) ? boundary : end;
> + return (boundary - 1 < end - 1) ? boundary : end;
> + }
> + return end;
> }
>
> -#endif /* STAGE2_PGTABLE_LEVELS > 2 */
> +static inline int stage2_pud_huge(struct kvm *kvm, pud_t pud)
> +{
> + return kvm_stage2_has_pud(kvm) ? pud_huge(pud) : 0;
> +}
>
> #define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
>
> -#if STAGE2_PGTABLE_LEVELS == 2
> -#include <asm/stage2_pgtable-nopmd.h>
> -#elif STAGE2_PGTABLE_LEVELS == 3
> -#include <asm/stage2_pgtable-nopud.h>
> -#endif
> -
> -#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t))
> -
> -#define stage2_pgd_index(kvm, addr) \
> - (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
> +static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
> +{
> + return (addr >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1);
> +}
>
> static inline phys_addr_t
> stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
> {
> - phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK;
> + phys_addr_t boundary;
>
> + boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm);
> return (boundary - 1 < end - 1) ? boundary : end;
> }
>
>

Globally this patch is pretty hard to review. I don't know if it is
possible to split into 2. 1) Addition of some helper macros. 2) removal
of nopud and nopmd and implementation of the corresponding macros?

Thanks

Eric