[PATCH v1 1/3] kvm: svm: Add support for additional SVM NPF error codes

From: Brijesh Singh
Date: Mon Nov 14 2016 - 17:37:40 EST


From: Tom Lendacky <thomas.lendacky@xxxxxxx>

AMD hardware adds two additional bits to aid in nested page fault handling.

Bit 32 - NPF occurred while translating the guest's final physical address
Bit 33 - NPF occurred while translating the guest page tables

The guest page tables fault indicator can be used as an aid for nested
virtualization. Using V0 for the host, V1 for the first level guest and
V2 for the second level guest, when both V1 and V2 are using nested paging
there are currently a number of unnecessary instruction emulations. When
V2 is launched shadow paging is used in V1 for the nested tables of V2. As
a result, KVM marks these pages as RO in the host nested page tables. When
V2 exits and we resume V1, these pages are still marked RO.

Every nested walk for a guest page table is treated as a user-level write
access and this causes a lot of NPFs because the V1 page tables are marked
RO in the V0 nested tables. While executing V1, when these NPFs occur KVM
sees a write to a read-only page, emulates the V1 instruction and unprotects
the page (marking it RW). This patch looks for cases where we get a NPF due
to a guest page table walk where the page was marked RO. It immediately
unprotects the page and resumes the guest, leading to far fewer instruction
emulations when nested virtualization is used.

Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
Reviewed-by: Borislav Petkov <bp@xxxxxxx>
Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 11 ++++++++++-
arch/x86/kvm/mmu.c | 20 ++++++++++++++++++--
arch/x86/kvm/svm.c | 2 +-
3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bdde807..da07e17 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -191,6 +191,8 @@ enum {
#define PFERR_RSVD_BIT 3
#define PFERR_FETCH_BIT 4
#define PFERR_PK_BIT 5
+#define PFERR_GUEST_FINAL_BIT 32
+#define PFERR_GUEST_PAGE_BIT 33

#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
@@ -198,6 +200,13 @@ enum {
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
+#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
+
+#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
+ PFERR_USER_MASK | \
+ PFERR_WRITE_MASK | \
+ PFERR_PRESENT_MASK)

/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
@@ -1203,7 +1212,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);

int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);

-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d9c7e98..f633d29 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4508,7 +4508,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}

-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
void *insn, int insn_len)
{
int r, emulation_type = EMULTYPE_RETRY;
@@ -4527,12 +4527,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
return r;
}

- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+ false);
if (r < 0)
return r;
if (!r)
return 1;

+ /*
+ * Before emulating the instruction, check if the error code
+ * was due to a RO violation while translating the guest page.
+ * This can occur when using nested virtualization with nested
+ * paging in both guests. If true, we simply unprotect the page
+ * and resume the guest.
+ *
+ * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used
+ * in PFERR_NEXT_GUEST_PAGE)
+ */
+ if (error_code == PFERR_NESTED_GUEST_PAGE) {
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+ return 1;
+ }
+
if (mmio_info_in_cache(vcpu, cr2, direct))
emulation_type = 0;
emulate:
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ca1eca..4e462bb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2074,7 +2074,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
static int pf_interception(struct vcpu_svm *svm)
{
u64 fault_address = svm->vmcb->control.exit_info_2;
- u32 error_code;
+ u64 error_code;
int r = 1;

switch (svm->apf_reason) {