[PATCH] KVM: x86/mmu: Allow non-zero init value for shadow PTE

From: Sean Christopherson
Date: Mon Jul 29 2019 - 22:23:46 EST


Add support for using a non-zero "init" value for shadow PTEs, which is
required to enable EPT-violation #VE in hardware.  When EPT-violation
#VE is enabled, hardware morphs an EPT violation into a #VE that is
delivered directly to the guest unless the "suppress #VE" bit (bit 63)
is set in the relevant EPT entry.  An all-zeroes SPTE leaves bit 63
clear, i.e. KVM must initialize not-present SPTEs to a value with the
suppress-#VE bit set in order to avoid unintentionally reflecting
not-present EPT violations into the guest.

Treat SPTEs that hold the init value as not-present, and fill new
shadow pages with the init value via an X86_64-only "rep stosq" helper
instead of zeroing them.  A non-zero init value is restricted to 64-bit
KVM, which can write an SPTE with a single atomic store.
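
For illustration, a VMX-side caller might wire this up as follows once
EPT-violation #VE support is added; "enable_ept_ve" and the bit-63
define are hypothetical placeholders, not part of this patch:

	/* Bit 63 of an EPT paging-structure entry is "suppress #VE". */
	#define EPT_SUPPRESS_VE_BIT	BIT_ULL(63)

	/* Keep suppress-#VE set in all not-present SPTEs. */
	kvm_mmu_set_spte_init_value(enable_ept_ve ?
				    EPT_SUPPRESS_VE_BIT : 0ull);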

Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
---
 arch/x86/kvm/mmu.h             |  1 +
 arch/x86/kvm/mmu/mmu.c         | 43 ++++++++++++++++++++++++++++------
 arch/x86/kvm/mmu/paging_tmpl.h |  2 +-
 3 files changed, 38 insertions(+), 8 deletions(-)
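
Note for reviewers, not part of the change: the "rep stosq" helper
added below is equivalent to the following C loop, which fills all 512
64-bit SPTEs of a 4KiB shadow page with the init value (the function
name is purely illustrative):

	static inline void kvm_clear_ptes_c(void *page)
	{
		u64 *sptep = page;
		int i;

		/* 4096-byte page / 8 bytes per SPTE = 512 entries. */
		for (i = 0; i < 4096 / 8; i++)
			sptep[i] = shadow_init_value;
	}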

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 8a3b1bce722a..139db8a125d6 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,7 @@ static inline u64 rsvd_bits(int s, int e)
 }
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_spte_init_value(u64 init_value);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8071952e9cf2..742ea9c254c4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -250,6 +250,8 @@ static u64 __read_mostly shadow_mmio_access_mask;
 static u64 __read_mostly shadow_present_mask;
 static u64 __read_mostly shadow_me_mask;
 
+static u64 __read_mostly shadow_init_value;
+
 /*
  * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
@@ -538,6 +540,13 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+void kvm_mmu_set_spte_init_value(u64 init_value)
+{
+	WARN_ON(!IS_ENABLED(CONFIG_X86_64) && init_value);
+	shadow_init_value = init_value;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_spte_init_value);
+
 static u8 kvm_get_shadow_phys_bits(void)
 {
 	/*
@@ -569,6 +578,7 @@ static void kvm_mmu_reset_all_pte_masks(void)
 	shadow_mmio_mask = 0;
 	shadow_present_mask = 0;
 	shadow_acc_track_mask = 0;
+	shadow_init_value = 0;
 
 	shadow_phys_bits = kvm_get_shadow_phys_bits();
 
@@ -610,7 +620,7 @@ static int is_nx(struct kvm_vcpu *vcpu)

 static int is_shadow_present_pte(u64 pte)
 {
-	return (pte != 0) && !is_mmio_spte(pte);
+	return (pte != 0 && pte != shadow_init_value && !is_mmio_spte(pte));
 }
 
 static int is_large_pte(u64 pte)
@@ -921,9 +931,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	u64 old_spte = *sptep;
 
 	if (!spte_has_volatile_bits(old_spte))
-		__update_clear_spte_fast(sptep, 0ull);
+		__update_clear_spte_fast(sptep, shadow_init_value);
 	else
-		old_spte = __update_clear_spte_slow(sptep, 0ull);
+		old_spte = __update_clear_spte_slow(sptep, shadow_init_value);
 
 	if (!is_shadow_present_pte(old_spte))
 		return 0;
@@ -953,7 +963,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
  */
 static void mmu_spte_clear_no_track(u64 *sptep)
 {
-	__update_clear_spte_fast(sptep, 0ull);
+	__update_clear_spte_fast(sptep, shadow_init_value);
 }
 
 static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -2473,6 +2483,20 @@ static void clear_sp_write_flooding_count(u64 *spte)
 	__clear_sp_write_flooding_count(sp);
 }
 
+#ifdef CONFIG_X86_64
+static inline void kvm_clear_ptes(void *page)
+{
+	int ign;
+
+	asm volatile (
+		"rep stosq\n\t"
+		: "=c"(ign), "=D"(page)
+		: "a"(shadow_init_value), "c"(4096/8), "D"(page)
+		: "memory"
+	);
+}
+#endif
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
					      gfn_t gfn,
					      gva_t gaddr,
@@ -2553,7 +2577,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
-	clear_page(sp->spt);
+#ifdef CONFIG_X86_64
+	if (shadow_init_value)
+		kvm_clear_ptes(sp->spt);
+	else
+#endif
+		clear_page(sp->spt);
 	trace_kvm_mmu_get_page(sp, true);
 
 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
@@ -3515,7 +3544,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
 	bool fault_handled = false;
-	u64 spte = 0ull;
+	u64 spte = shadow_init_value;
 	uint retry_count = 0;
 
 	if (!page_fault_can_be_fast(error_code))
@@ -3951,7 +3980,7 @@ static bool
 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 {
 	struct kvm_shadow_walk_iterator iterator;
-	u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
+	u64 sptes[PT64_ROOT_MAX_LEVEL], spte = shadow_init_value;
 	struct rsvd_bits_validate *rsvd_check;
 	int root, leaf;
 	bool reserved = false;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 9bdf9b7d9a96..949deed15933 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -1025,7 +1025,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gpa_t pte_gpa;
 		gfn_t gfn;
 
-		if (!sp->spt[i])
+		if (!sp->spt[i] || sp->spt[i] == shadow_init_value)
 			continue;
 
 		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
--
2.24.1

