[RFC PATCH v2 12/31] KVM: arm/arm64: Handle shadow stage 2 page faults

From: Jintack Lim
Date: Mon Oct 02 2017 - 23:17:57 EST


From: Christoffer Dall <christoffer.dall@xxxxxxxxxx>

If we are faulting on a shadow stage 2 translation, we first walk the
guest hypervisor's stage 2 page table to see if it has a mapping. If
not, we inject a stage 2 page fault to the virtual EL2. Otherwise, we
create a mapping in the shadow stage 2 page table.

Note that we have to deal with two IPAs when we got a showdow stage 2
page fault. One is the address we faulted on, and is in the L2 guest
phys space. The other is from the guest stage-2 page table walk, and is
in the L1 guest phys space. To differentiate them, we rename variable
names so that fault_ipa is used for the former and ipa is used for the
latter.

Signed-off-by: Christoffer Dall <christoffer.dall@xxxxxxxxxx>
Signed-off-by: Jintack Lim <jintack.lim@xxxxxxxxxx>
---

Notes:
v1-->v2:
- Added a common function to inject s2 faults.
- Align L1 IPA as well as L2 IPA in transparent_hugepage_adjust(). This will
come in handy when creating a rmap entry with both IPAs.

arch/arm/include/asm/kvm_emulate.h | 7 ++++
arch/arm/include/asm/kvm_mmu.h | 4 ++
arch/arm64/include/asm/kvm_emulate.h | 5 +++
arch/arm64/include/asm/kvm_mmu.h | 1 +
arch/arm64/kvm/mmu-nested.c | 8 ++++
virt/kvm/arm/mmio.c | 12 +++---
virt/kvm/arm/mmu.c | 75 +++++++++++++++++++++++++++++-------
7 files changed, 92 insertions(+), 20 deletions(-)

diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 24a3fbf..8136464 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -297,4 +297,11 @@ static inline struct kvm_s2_vmid *vcpu_get_active_vmid(struct kvm_vcpu *vcpu)
{
return &vcpu->kvm->arch.mmu.vmid;
}
+
+/* arm architecture doesn't support the nesting */
+static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
#endif /* __ARM_KVM_EMULATE_H__ */
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 5fab21a..6a22846 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -242,6 +242,10 @@ static inline void kvm_nested_s2_free(struct kvm *kvm) { }
static inline void kvm_nested_s2_wp(struct kvm *kvm) { }
static inline void kvm_nested_s2_clear(struct kvm *kvm) { }
static inline void kvm_nested_s2_flush(struct kvm *kvm) { }
+static inline int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+ return 0;
+}

static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index f476576..c66554b 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -390,4 +390,9 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
return data; /* Leave LE untouched */
}

+static inline bool kvm_is_shadow_s2_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu_nested_stage2_enabled(vcpu) && !is_hyp_ctxt(vcpu);
+}
+
#endif /* __ARM64_KVM_EMULATE_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index c4efcd5..425e4a2 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -342,6 +342,7 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
void kvm_nested_s2_wp(struct kvm *kvm);
void kvm_nested_s2_clear(struct kvm *kvm);
void kvm_nested_s2_flush(struct kvm *kvm);
+int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);

static inline u64 kvm_get_vttbr(struct kvm_s2_vmid *vmid,
struct kvm_s2_mmu *mmu)
diff --git a/arch/arm64/kvm/mmu-nested.c b/arch/arm64/kvm/mmu-nested.c
index fb694b7..75570cc 100644
--- a/arch/arm64/kvm/mmu-nested.c
+++ b/arch/arm64/kvm/mmu-nested.c
@@ -60,6 +60,14 @@ static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
return esr;
}

+int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+ vcpu->arch.ctxt.sys_regs[FAR_EL2] = vcpu->arch.fault.far_el2;
+ vcpu->arch.ctxt.sys_regs[HPFAR_EL2] = vcpu->arch.fault.hpfar_el2;
+
+ return kvm_inject_nested_sync(vcpu, esr_el2);
+}
+
static int check_base_s2_limits(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
int level, int input_size, int stride)
{
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index b6e715f..a1009c2 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -153,7 +153,7 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
}

int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
- phys_addr_t fault_ipa)
+ phys_addr_t ipa)
{
unsigned long data;
unsigned long rt;
@@ -182,22 +182,22 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
len);

- trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data);
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, ipa, data);
kvm_mmio_write_buf(data_buf, len, data);

- ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+ ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, ipa, len,
data_buf);
} else {
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
- fault_ipa, 0);
+ ipa, 0);

- ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+ ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, ipa, len,
data_buf);
}

/* Now prepare kvm_run for the potential return to userland. */
run->mmio.is_write = is_write;
- run->mmio.phys_addr = fault_ipa;
+ run->mmio.phys_addr = ipa;
run->mmio.len = len;

if (!ret) {
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 3143f81..25d3d73 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1098,7 +1098,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
return ret;
}

-static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
+static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap,
+ phys_addr_t *fault_ipap)
{
kvm_pfn_t pfn = *pfnp;
gfn_t gfn = *ipap >> PAGE_SHIFT;
@@ -1126,6 +1127,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
mask = PTRS_PER_PMD - 1;
VM_BUG_ON((gfn & mask) != (pfn & mask));
if (pfn & mask) {
+ *fault_ipap &= PMD_MASK;
*ipap &= PMD_MASK;
kvm_release_pfn_clean(pfn);
pfn &= ~mask;
@@ -1337,13 +1339,15 @@ static void kvm_send_hwpoison_signal(unsigned long address,
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_memory_slot *memslot, unsigned long hva,
- unsigned long fault_status)
+ struct kvm_s2_trans *nested,
+ struct kvm_memory_slot *memslot,
+ unsigned long hva, unsigned long fault_status)
{
int ret;
bool write_fault, writable, hugetlb = false, force_pte = false;
unsigned long mmu_seq;
- gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+ phys_addr_t ipa = fault_ipa;
+ gfn_t gfn;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
@@ -1368,9 +1372,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}

- if (is_vm_hugetlb_page(vma) && !logging_active) {
+ if (kvm_is_shadow_s2_fault(vcpu)) {
+ ipa = nested->output;
+
+ /*
+ * If we're about to create a shadow stage 2 entry, then we
+ * can only create huge mappings if the guest hypervisor also
+ * uses a huge mapping.
+ */
+ if (nested->block_size != PMD_SIZE)
+ force_pte = true;
+ }
+ gfn = ipa >> PAGE_SHIFT;
+
+
+ if (!force_pte && is_vm_hugetlb_page(vma) && !logging_active) {
hugetlb = true;
- gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
+ gfn = (ipa & PMD_MASK) >> PAGE_SHIFT;
} else {
/*
* Pages belonging to memslots that don't have the same
@@ -1438,7 +1456,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
goto out_unlock;

if (!hugetlb && !force_pte)
- hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
+ hugetlb = transparent_hugepage_adjust(&pfn, &ipa, &fault_ipa);

if (hugetlb) {
pmd_t new_pmd = pfn_pmd(pfn, mem_type);
@@ -1525,8 +1543,10 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
unsigned long fault_status;
- phys_addr_t fault_ipa;
+ phys_addr_t fault_ipa; /* The address we faulted on */
+ phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
struct kvm_memory_slot *memslot;
+ struct kvm_s2_trans nested_trans;
unsigned long hva;
bool is_iabt, write_fault, writable;
gfn_t gfn;
@@ -1538,7 +1558,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
return 1;
}

- fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);

trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
kvm_vcpu_get_hfar(vcpu), fault_ipa);
@@ -1547,6 +1567,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
fault_status != FSC_ACCESS) {
+ /*
+ * We must never see an address size fault on shadow stage 2
+ * page table walk, because we would have injected an addr
+ * size fault when we walked the nested s2 page and not
+ * create the shadow entry.
+ */
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
kvm_vcpu_trap_get_class(vcpu),
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
@@ -1556,7 +1582,27 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)

idx = srcu_read_lock(&vcpu->kvm->srcu);

- gfn = fault_ipa >> PAGE_SHIFT;
+ /*
+ * We may have faulted on a shadow stage 2 page table if we are
+ * running a nested guest. In this case, we have to resovle the L2
+ * IPA to the L1 IPA first, before knowing what kind of memory should
+ * back the L1 IPA.
+ *
+ * If the shadow stage 2 page table walk faults, then we simply inject
+ * this to the guest and carry on.
+ */
+ if (kvm_is_shadow_s2_fault(vcpu)) {
+ nested_trans.esr = 0;
+ ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (nested_trans.esr)
+ kvm_inject_s2_fault(vcpu, nested_trans.esr);
+ if (ret)
+ goto out_unlock;
+
+ ipa = nested_trans.output;
+ }
+
+ gfn = ipa >> PAGE_SHIFT;
memslot = gfn_to_memslot(vcpu->kvm, gfn);
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
write_fault = kvm_is_write_fault(vcpu);
@@ -1590,13 +1636,13 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
* faulting VA. This is always 12 bits, irrespective
* of the page size.
*/
- fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
- ret = io_mem_abort(vcpu, run, fault_ipa);
+ ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
+ ret = io_mem_abort(vcpu, run, ipa);
goto out_unlock;
}

/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
+ VM_BUG_ON(ipa >= KVM_PHYS_SIZE);

if (fault_status == FSC_ACCESS) {
handle_access_fault(vcpu, fault_ipa);
@@ -1604,7 +1650,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
goto out_unlock;
}

- ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+ ret = user_mem_abort(vcpu, fault_ipa, &nested_trans,
+ memslot, hva, fault_status);
if (ret == 0)
ret = 1;
out_unlock:
--
1.9.1