[PATCH v8 06/10] vmx: spp: Set up SPP paging table at vmentry/vmexit

From: Yang Weijiang
Date: Fri Nov 29 2019 - 12:26:27 EST


If a write to a sub-page is not allowed, an EPT violation is generated
and it is handled in fast_page_fault().

In the current implementation, SPPT setup is handled only in the handle_spp()
vmexit handler, which is triggered when the SPP bit is set in an EPT leaf
entry while the SPPT entries are not ready.

An SPP-specific bit (11) is added to exit_qualification and a new
exit reason (66) is introduced for SPP.

Suggested-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
Co-developed-by: He Chen <he.chen@xxxxxxxxxxxxxxx>
Signed-off-by: He Chen <he.chen@xxxxxxxxxxxxxxx>
Co-developed-by: Zhang Yi <yi.z.zhang@xxxxxxxxxxxxxxx>
Signed-off-by: Zhang Yi <yi.z.zhang@xxxxxxxxxxxxxxx>
Signed-off-by: Yang Weijiang <weijiang.yang@xxxxxxxxx>
---
arch/x86/include/asm/vmx.h | 9 ++++
arch/x86/include/uapi/asm/vmx.h | 2 +
arch/x86/kvm/mmu/mmu.c | 47 +++++++++++++++++++-
arch/x86/kvm/mmu/spp.c | 12 +++++
arch/x86/kvm/mmu/spp.h | 2 +
arch/x86/kvm/vmx/vmx.c | 78 +++++++++++++++++++++++++++++++++
arch/x86/kvm/x86.c | 2 +
include/uapi/linux/kvm.h | 6 +++
8 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index e1137807affc..f41989eae5e2 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -211,6 +211,8 @@ enum vmcs_field {
XSS_EXIT_BITMAP_HIGH = 0x0000202D,
ENCLS_EXITING_BITMAP = 0x0000202E,
ENCLS_EXITING_BITMAP_HIGH = 0x0000202F,
+ SPPT_POINTER = 0x00002030,
+ SPPT_POINTER_HIGH = 0x00002031,
TSC_MULTIPLIER = 0x00002032,
TSC_MULTIPLIER_HIGH = 0x00002033,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
@@ -532,6 +534,13 @@ struct vmx_msr_entry {
#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT)
#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)

+/*
+ * Exit qualification bits for SPP-induced VM exits (bit 11 set: SPPT miss)
+ */
+#define SPPT_INDUCED_EXIT_TYPE_BIT 11
+#define SPPT_INDUCED_EXIT_TYPE (1 << SPPT_INDUCED_EXIT_TYPE_BIT)
+#define SPPT_INTR_INFO_UNBLOCK_NMI INTR_INFO_UNBLOCK_NMI
+
/*
* VM-instruction error numbers
*/
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index f0b0c90dd398..ac67622bac5a 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -85,6 +85,7 @@
#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
+#define EXIT_REASON_SPP 66

#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -141,6 +142,7 @@
{ EXIT_REASON_ENCLS, "ENCLS" }, \
{ EXIT_REASON_RDSEED, "RDSEED" }, \
{ EXIT_REASON_PML_FULL, "PML_FULL" }, \
+ { EXIT_REASON_SPP, "SPP" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a63964e7cec7..7c1118b81911 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -20,6 +20,7 @@
#include "x86.h"
#include "kvm_cache_regs.h"
#include "cpuid.h"
+#include "spp.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -137,6 +138,7 @@ module_param(dbg, bool, 0644);
/* The mask for the R/X bits in EPT PTEs */
#define PT64_EPT_READABLE_MASK 0x1ull
#define PT64_EPT_EXECUTABLE_MASK 0x4ull
+#define PT64_SPP_SAVED_BIT (1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT + 1))

#include <trace/events/kvm.h>

@@ -160,6 +162,7 @@ enum {
RET_PF_RETRY = 0,
RET_PF_EMULATE = 1,
RET_PF_INVALID = 2,
+ RET_PF_USERSPACE = 3,
};

struct pte_list_desc {
@@ -918,6 +921,11 @@ static u64 mark_spte_for_access_track(u64 spte)
shadow_acc_track_saved_bits_shift;
spte &= ~shadow_acc_track_mask;

+ if (spte & PT_SPP_MASK) {
+ spte &= ~PT_SPP_MASK;
+ spte |= PT64_SPP_SAVED_BIT;
+ }
+
return spte;
}

@@ -1598,9 +1606,14 @@ static bool wrprot_ad_disabled_spte(u64 *sptep)
{
bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
+ bool was_spp_armed = test_and_clear_bit(PT_SPP_SHIFT,
+ (unsigned long *)sptep);
if (was_writable)
kvm_set_pfn_dirty(spte_to_pfn(*sptep));

+ if (was_spp_armed)
+ *sptep |= PT64_SPP_SAVED_BIT;
+
return was_writable;
}

@@ -3453,7 +3466,24 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
if ((error_code & PFERR_WRITE_MASK) &&
spte_can_locklessly_be_made_writable(spte))
{
- new_spte |= PT_WRITABLE_MASK;
+ /*
+ * Record write protect fault caused by
+ * Sub-page Protection, let VMI decide
+ * the next step.
+ */
+ if (spte & PT_SPP_MASK) {
+ fault_handled = true;
+ vcpu->run->exit_reason = KVM_EXIT_SPP;
+ vcpu->run->spp.addr = gva;
+ vcpu->run->spp.ins_len =
+ kvm_x86_ops->get_inst_len(vcpu);
+ break;
+ }
+
+ if (was_spp_armed(new_spte))
+ restore_spp_bit(&new_spte);
+ else
+ new_spte |= PT_WRITABLE_MASK;

/*
* Do not fix write-permission on the large spte. Since
@@ -3604,6 +3634,10 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
(mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
&invalid_list);
+ if (vcpu->kvm->arch.spp_active)
+ mmu_free_root_page(vcpu->kvm,
+ &vcpu->kvm->arch.sppt_root,
+ &invalid_list);
} else {
for (i = 0; i < 4; ++i)
if (mmu->pae_root[i] != 0)
@@ -5083,6 +5117,8 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
uint i;

vcpu->arch.mmu->root_hpa = INVALID_PAGE;
+ if (!vcpu->kvm->arch.spp_active)
+ vcpu->kvm->arch.sppt_root = INVALID_PAGE;

for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -5400,6 +5436,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
r = vcpu->arch.mmu->page_fault(vcpu, cr2,
lower_32_bits(error_code),
false);
+
+ if (vcpu->run->exit_reason == KVM_EXIT_SPP)
+ r = RET_PF_USERSPACE;
+
WARN_ON(r == RET_PF_INVALID);
}

@@ -5407,7 +5447,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
return 1;
if (r < 0)
return r;
-
+ if (r == RET_PF_USERSPACE)
+ return 0;
/*
* Before emulating the instruction, check if the error code
* was due to a RO violation while translating the guest page.
@@ -6165,6 +6206,8 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
return nr_mmu_pages;
}

+#include "spp.c"
+
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
diff --git a/arch/x86/kvm/mmu/spp.c b/arch/x86/kvm/mmu/spp.c
index 578ca56e493d..0c72bb56ecb9 100644
--- a/arch/x86/kvm/mmu/spp.c
+++ b/arch/x86/kvm/mmu/spp.c
@@ -17,6 +17,18 @@ static void shadow_spp_walk_init(struct kvm_shadow_walk_iterator *iterator,
iterator->level = PT64_ROOT_4LEVEL;
}

+/* Re-arm SPP on an spte: clear PT64_SPP_SAVED_BIT and set PT_SPP_MASK */
+void restore_spp_bit(u64 *spte)
+{
+ *spte &= ~PT64_SPP_SAVED_BIT;
+ *spte |= PT_SPP_MASK;
+}
+
+bool was_spp_armed(u64 spte)
+{
+ return !!(spte & PT64_SPP_SAVED_BIT); /* SPP bit was stashed earlier */
+}
+
u32 *gfn_to_subpage_wp_info(struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long idx;
diff --git a/arch/x86/kvm/mmu/spp.h b/arch/x86/kvm/mmu/spp.h
index 370a6b71e143..3a2a71cea276 100644
--- a/arch/x86/kvm/mmu/spp.h
+++ b/arch/x86/kvm/mmu/spp.h
@@ -11,6 +11,8 @@ int kvm_spp_set_permission(struct kvm *kvm, u64 gfn, u32 npages,
u32 *access_map);
int kvm_spp_mark_protection(struct kvm *kvm, u64 gfn, u32 access);
bool is_spp_spte(struct kvm_mmu_page *sp);
+void restore_spp_bit(u64 *spte);
+bool was_spp_armed(u64 spte);
inline u64 construct_spptp(unsigned long root_hpa);
int kvm_vm_ioctl_get_subpages(struct kvm *kvm,
u64 gfn,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 6214d76e0426..a4ace223a385 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2896,6 +2896,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
struct kvm *kvm = vcpu->kvm;
unsigned long guest_cr3;
u64 eptp;
+ u64 spptp;

guest_cr3 = cr3;
if (enable_ept) {
@@ -2918,6 +2919,12 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
ept_load_pdptrs(vcpu);
}

+ if (kvm->arch.spp_active && VALID_PAGE(vcpu->kvm->arch.sppt_root)) {
+ spptp = construct_spptp(vcpu->kvm->arch.sppt_root);
+ vmcs_write64(SPPT_POINTER, spptp);
+ vmx_flush_tlb(vcpu, true);
+ }
+
vmcs_writel(GUEST_CR3, guest_cr3);
}

@@ -5338,6 +5345,73 @@ static int handle_monitor_trap(struct kvm_vcpu *vcpu)
return 1;
}

+int handle_spp(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ struct kvm_memory_slot *slot;
+ gpa_t gpa;
+ gfn_t gfn;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ /*
+ * If the SPP VM exit happened while executing IRET from an NMI,
+ * the "blocked by NMI" bit has to be set before the next VM entry.
+ * There are errata that may cause this bit to not be set:
+ * AAK134, BY25.
+ */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ (exit_qualification & SPPT_INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ vcpu->arch.exit_qualification = exit_qualification;
+ if (exit_qualification & SPPT_INDUCED_EXIT_TYPE) {
+ int page_num = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL);
+ u32 *access;
+ gfn_t gfn_max;
+
+ /*
+ * SPPT miss: the SPPT entries covering the faulting GPA
+ * have not been set up yet, so construct them here for
+ * every sub-page protected gfn in the aligned
+ * PT_DIRECTORY_LEVEL range around that GPA.
+ */
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ gfn = gpa >> PAGE_SHIFT;
+ /*
+ * Level 1 of the SPPT has no PRESENT bit; all of its data is
+ * regarded as a permission vector, so the check has to start from
+ * level 2 to set up the vector if the target page is protected.
+ */
+ spin_lock(&vcpu->kvm->mmu_lock);
+ gfn &= ~(page_num - 1);
+ gfn_max = gfn + page_num - 1;
+ for (; gfn <= gfn_max; gfn++) {
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot)
+ continue;
+ access = gfn_to_subpage_wp_info(slot, gfn);
+ if (access && *access != FULL_SPP_ACCESS)
+ kvm_spp_setup_structure(vcpu,
+ *access,
+ gfn);
+ }
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 1;
+ }
+ /*
+ * SPPT misconfiguration: probably caused by bad SPPT entries.
+ * It cannot be handled here, so warn and report the exit
+ * through vcpu->run as KVM_EXIT_UNKNOWN with the hardware
+ * exit reason set to EXIT_REASON_SPP.
+ */
+ WARN_ON(1);
+ vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
+ vcpu->run->hw.hardware_exit_reason = EXIT_REASON_SPP;
+ return 0;
+}
+
static int handle_monitor(struct kvm_vcpu *vcpu)
{
printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
@@ -5554,6 +5628,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_RDSEED] = handle_invalid_op,
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
+ [EXIT_REASON_SPP] = handle_spp,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
@@ -5786,6 +5861,9 @@ void dump_vmcs(void)
pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_SPP))
+ pr_err("SPPT pointer = 0x%016llx\n", vmcs_read64(SPPT_POINTER));
+
n = vmcs_read32(CR3_TARGET_COUNT);
for (i = 0; i + 1 < n; i += 4)
pr_err("CR3 target%u=%016lx target%u=%016lx\n",
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 270f0ea83f08..f38571d09d5e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9567,6 +9567,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
}

kvm_page_track_free_memslot(free, dont);
+ if (kvm->arch.spp_active)
+ kvm_spp_free_memslot(free, dont);
}

int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 36832c7eba9b..805b9a2ec9e2 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -244,6 +244,7 @@ struct kvm_hyperv_exit {
#define KVM_EXIT_S390_STSI 25
#define KVM_EXIT_IOAPIC_EOI 26
#define KVM_EXIT_HYPERV 27
+#define KVM_EXIT_SPP 28

/* For KVM_EXIT_INTERNAL_ERROR */
/* Emulate instruction failed. */
@@ -399,6 +400,11 @@ struct kvm_run {
struct {
__u8 vector;
} eoi;
+ /* KVM_EXIT_SPP */
+ struct {
+ __u64 addr;
+ __u8 ins_len;
+ } spp;
/* KVM_EXIT_HYPERV */
struct kvm_hyperv_exit hyperv;
/* Fix the size of the union. */
--
2.17.2