[PATCH v3 1/6] kvm: determine memory type from VMA

From: ankita
Date: Wed Apr 05 2023 - 14:01:58 EST


From: Ankit Agrawal <ankita@xxxxxxxxxx>

Each VM stores the requires pgprots for its mappings in the
vma->pgprot. Based on this we can determine the desired MT_DEVICE_*
for the VMA directly, and do not have to guess based on heuristics
based on pfn_is_map_memory().

There are the following kinds of pgprot available to userspace and their
corresponding type:
pgprot_noncached -> MT_DEVICE_nGnRnE
pgprot_writecombine -> MT_NORMAL_NC
pgprot_device -> MT_DEVICE_nGnRE
pgprot_tagged -> MT_NORMAL_TAGGED

Decode the relevant MT_* types in use and translate them into the
corresponding KVM_PGTABLEPROT_*:

- MT_DEVICE_nGnRE -> KVM_PGTABLE_PROT_DEVICE_nGnRE (device)
- MT_DEVICE_nGnRnE -> KVM_PGTABLE_PROT_DEVICE_nGnRnE (noncached)
- MT_NORMAL/_TAGGED/_NC -> 0

The selection of 0 for the S2 KVM_PGTABLE_PROT_DEVICE_nGnRnE is based
on [2].

Also worth noting is the result of the stage-1 and stage-2. Ref [3]
If FWB not set, then the combination is the one that is more restrictive.
The sequence from lowest restriction to the highest:
DEVICE_nGnRnE -> DEVICE_nGnRE -> NORMAL/_TAGGED/_NC
If FWB is set, then stage-2 mapping type overrides the stage-1 [1].

This solves a problem where KVM cannot preserve the MT_NORMAL memory
type for non-struct page backed memory into the S2 mapping. Instead
the VMA creator determines the MT type and the S2 will follow it.

[1] https://developer.arm.com/documentation/102376/0100/Combining-Stage-1-and-Stage-2-attributes
[2] ARMv8 reference manual: https://developer.arm.com/documentation/ddi0487/gb/ Section D5.5.3, Table D5-38
[3] ARMv8 reference manual: https://developer.arm.com/documentation/ddi0487/gb/ Table G5-20 on page G5-6330

Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
---
arch/arm64/include/asm/kvm_pgtable.h | 8 +++++---
arch/arm64/include/asm/memory.h | 6 ++++--
arch/arm64/kvm/hyp/pgtable.c | 16 +++++++++++-----
arch/arm64/kvm/mmu.c | 27 ++++++++++++++++++++++-----
4 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 4cd6762bda80..d3166b6e6329 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -150,7 +150,8 @@ enum kvm_pgtable_stage2_flags {
* @KVM_PGTABLE_PROT_X: Execute permission.
* @KVM_PGTABLE_PROT_W: Write permission.
* @KVM_PGTABLE_PROT_R: Read permission.
- * @KVM_PGTABLE_PROT_DEVICE: Device attributes.
+ * @KVM_PGTABLE_PROT_DEVICE_nGnRE: Device nGnRE attributes.
+ * @KVM_PGTABLE_PROT_DEVICE_nGnRnE: Device nGnRnE attributes.
* @KVM_PGTABLE_PROT_SW0: Software bit 0.
* @KVM_PGTABLE_PROT_SW1: Software bit 1.
* @KVM_PGTABLE_PROT_SW2: Software bit 2.
@@ -161,7 +162,8 @@ enum kvm_pgtable_prot {
KVM_PGTABLE_PROT_W = BIT(1),
KVM_PGTABLE_PROT_R = BIT(2),

- KVM_PGTABLE_PROT_DEVICE = BIT(3),
+ KVM_PGTABLE_PROT_DEVICE_nGnRE = BIT(3),
+ KVM_PGTABLE_PROT_DEVICE_nGnRnE = BIT(4),

KVM_PGTABLE_PROT_SW0 = BIT(55),
KVM_PGTABLE_PROT_SW1 = BIT(56),
@@ -178,7 +180,7 @@ enum kvm_pgtable_prot {
#define PAGE_HYP KVM_PGTABLE_PROT_RW
#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R)
-#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
+#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE_nGnRE)

typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
enum kvm_pgtable_prot prot);
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 78e5163836a0..4ebbc4b1ba4d 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -147,14 +147,16 @@
* Memory types for Stage-2 translation
*/
#define MT_S2_NORMAL 0xf
+#define MT_S2_DEVICE_nGnRnE 0x0
#define MT_S2_DEVICE_nGnRE 0x1

/*
* Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001
* Stage-2 enforces Normal-WB and Device-nGnRE
*/
-#define MT_S2_FWB_NORMAL 6
-#define MT_S2_FWB_DEVICE_nGnRE 1
+#define MT_S2_FWB_NORMAL 0x6
+#define MT_S2_FWB_DEVICE_nGnRnE 0x0
+#define MT_S2_FWB_DEVICE_nGnRE 0x1

#ifdef CONFIG_ARM64_4K_PAGES
#define IOREMAP_MAX_ORDER (PUD_SHIFT)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 3d61bd3e591d..7a8238b41590 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -355,7 +355,7 @@ struct hyp_map_data {

static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
{
- bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+ bool device = prot & KVM_PGTABLE_PROT_DEVICE_nGnRE;
u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
@@ -636,14 +636,20 @@ static bool stage2_has_fwb(struct kvm_pgtable *pgt)
static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
kvm_pte_t *ptep)
{
- bool device = prot & KVM_PGTABLE_PROT_DEVICE;
- kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
- KVM_S2_MEMATTR(pgt, NORMAL);
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+ kvm_pte_t attr;
+
+ if (prot & KVM_PGTABLE_PROT_DEVICE_nGnRE)
+ attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE);
+ else if (prot & KVM_PGTABLE_PROT_DEVICE_nGnRnE)
+ attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRnE);
+ else
+ attr = KVM_S2_MEMATTR(pgt, NORMAL);

if (!(prot & KVM_PGTABLE_PROT_X))
attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
- else if (device)
+ else if (prot & KVM_PGTABLE_PROT_DEVICE_nGnRE ||
+ prot & KVM_PGTABLE_PROT_DEVICE_nGnRnE)
return -EINVAL;

if (prot & KVM_PGTABLE_PROT_R)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7113587222ff..8d63aa951c33 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -897,7 +897,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
int ret = 0;
struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
- enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE_nGnRE |
KVM_PGTABLE_PROT_R |
(writable ? KVM_PGTABLE_PROT_W : 0);

@@ -1186,6 +1186,15 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
return vma->vm_flags & VM_MTE_ALLOWED;
}

+/*
+ * Determine the memory region cacheability from VMA's pgprot. This
+ * is used to set the stage 2 PTEs.
+ */
+static unsigned long mapping_type(pgprot_t page_prot)
+{
+ return ((pgprot_val(page_prot) & PTE_ATTRINDX_MASK) >> 2);
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
@@ -1368,10 +1377,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault)
prot |= KVM_PGTABLE_PROT_X;

- if (device)
- prot |= KVM_PGTABLE_PROT_DEVICE;
- else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
- prot |= KVM_PGTABLE_PROT_X;
+ switch (mapping_type(vma->vm_page_prot)) {
+ case MT_DEVICE_nGnRE:
+ prot |= KVM_PGTABLE_PROT_DEVICE_nGnRE;
+ break;
+ case MT_DEVICE_nGnRnE:
+ prot |= KVM_PGTABLE_PROT_DEVICE_nGnRnE;
+ break;
+ /* MT_NORMAL/_TAGGED/_NC */
+ default:
+ if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+ prot |= KVM_PGTABLE_PROT_X;
+ }

/*
* Under the premise of getting a FSC_PERM fault, we just need to relax
--
2.17.1