[RFC V2 2/9] Introduce page table population function for direct build EPT feature

From: yulei.kernel
Date: Tue Sep 01 2020 - 10:13:55 EST


From: Yulei Zhang <yulei.kernel@xxxxxxxxx>

The page table population function pins the memory and pre-constructs
the EPT based on the input memory slot configuration, so that it does
not rely on the page fault path to set up the page table.

Signed-off-by: Yulei Zhang <yuleixzhang@xxxxxxxxxxx>
---
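(Not part of this patch: a minimal caller sketch to illustrate the
intended use. The hook point and the KVM_MR_CREATE check are assumptions
here; the actual wiring is introduced by later patches in the series.)

/* Hypothetical caller: pre-build the EPT when a new memslot appears. */
void kvm_arch_commit_memory_region(struct kvm *kvm,
				   const struct kvm_userspace_memory_region *mem,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/* ... existing commit handling ... */

	if (change == KVM_MR_CREATE)
		/* const cast only for this sketch */
		kvm_direct_tdp_populate_page_table(kvm,
				(struct kvm_memory_slot *)new);
}
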
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/mmu/mmu.c | 212 +++++++++++++++++++++++++++++++-
arch/x86/kvm/svm/svm.c | 2 +-
arch/x86/kvm/vmx/vmx.c | 7 +-
include/linux/kvm_host.h | 4 +-
virt/kvm/kvm_main.c | 30 ++++-
6 files changed, 244 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 485b1239ad39..ab3cbef8c1aa 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1138,7 +1138,7 @@ struct kvm_x86_ops {
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
- u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+ u64 (*get_mt_mask)(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);

void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd,
int pgd_level);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4e03841f053d..bfe4d2b3e809 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -241,6 +241,11 @@ struct kvm_shadow_walk_iterator {
({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
__shadow_walk_next(&(_walker), spte))

+#define for_each_direct_build_shadow_entry(_walker, shadow_addr, _addr, level) \
+ for (__shadow_walk_init(&(_walker), shadow_addr, _addr, level); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
@@ -2506,13 +2511,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
return sp;
}

+static void __shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ hpa_t shadow_addr, u64 addr, int level)
+{
+ iterator->addr = addr;
+ iterator->shadow_addr = shadow_addr;
+ iterator->level = level;
+ iterator->sptep = NULL;
+}
+
static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
struct kvm_vcpu *vcpu, hpa_t root,
u64 addr)
{
- iterator->addr = addr;
- iterator->shadow_addr = root;
- iterator->level = vcpu->arch.mmu->shadow_root_level;
+ __shadow_walk_init(iterator, root, addr, vcpu->arch.mmu->shadow_root_level);

if (iterator->level == PT64_ROOT_4LEVEL &&
vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
@@ -3014,7 +3026,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
- spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
+ spte |= kvm_x86_ops.get_mt_mask(vcpu->kvm, vcpu, gfn,
kvm_is_mmio_pfn(pfn));

if (host_writable)
@@ -6278,6 +6290,198 @@ int kvm_mmu_module_init(void)
return ret;
}

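+/*
+ * Build and install a leaf SPTE for the pre-constructed (direct build)
+ * TDP table.  Mirrors set_spte(), but works on a struct kvm since no
+ * vCPU context is available while populating a memslot.
+ */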
+static int direct_build_tdp_set_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
+ u64 *sptep, unsigned pte_access, int level,
+ gfn_t gfn, kvm_pfn_t pfn, bool speculative,
+ bool dirty, bool host_writable)
+{
+ u64 spte = 0;
+ int ret = 0;
+ /*
+ * For the EPT case, shadow_present_mask is 0 if hardware
+ * supports exec-only page table entries. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ spte |= shadow_present_mask;
+ if (!speculative)
+ spte |= shadow_accessed_mask;
+
+ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_x_mask;
+ else
+ spte |= shadow_nx_mask;
+
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+
+ if (level > PG_LEVEL_4K)
+ spte |= PT_PAGE_SIZE_MASK;
+
+ if (tdp_enabled)
+ spte |= kvm_x86_ops.get_mt_mask(kvm, NULL, gfn, kvm_is_mmio_pfn(pfn));
+
+ if (host_writable)
+ spte |= SPTE_HOST_WRITEABLE;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
+
+ spte |= (u64)pfn << PAGE_SHIFT;
+
+ if (pte_access & ACC_WRITE_MASK) {
+
+ spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+
+ if (dirty) {
+ mark_page_dirty_in_slot(slot, gfn);
+ spte |= shadow_dirty_mask;
+ }
+ }
+
+ if (mmu_spte_update(sptep, spte))
+ kvm_flush_remote_tlbs(kvm);
+
+ return ret;
+}
+
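+/*
+ * Recursively walk a direct-build table page, release the pinned pfns
+ * referenced by its leaf entries and then free the table page itself.
+ */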
+static void __kvm_walk_global_page(struct kvm *kvm, u64 addr, int level)
+{
+ int i;
+ kvm_pfn_t pfn;
+ u64 *sptep = (u64 *)__va(addr);
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (is_shadow_present_pte(sptep[i])) {
+ if (!is_last_spte(sptep[i], level)) {
+ __kvm_walk_global_page(kvm, sptep[i] & PT64_BASE_ADDR_MASK, level - 1);
+ } else {
+ pfn = spte_to_pfn(sptep[i]);
+ mmu_spte_clear_track_bits(&sptep[i]);
+ kvm_release_pfn_clean(pfn);
+ }
+ }
+ }
+ put_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
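+/*
+ * Walk from the global root, allocate any missing intermediate table
+ * pages and install a leaf mapping for @gfn -> @pfn at @level, tearing
+ * down whatever mapping was present at that entry before.
+ */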
+static int direct_build_tdp_map(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t pfn, int level)
+{
+ int ret = 0;
+
+ struct kvm_shadow_walk_iterator iterator;
+ kvm_pfn_t old_pfn;
+ u64 spte;
+
+ for_each_direct_build_shadow_entry(iterator, kvm->arch.global_root_hpa,
+ gfn << PAGE_SHIFT, max_tdp_level) {
+ if (iterator.level == level)
+ break;
+
+ if (!is_shadow_present_pte(*iterator.sptep)) {
+ struct page *page;
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ spte = page_to_phys(page) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+ mmu_spte_set(iterator.sptep, spte);
+ }
+ }
+ /* If a PTE is already present at this entry, release the pfn it maps. */
+ if (is_shadow_present_pte(*iterator.sptep)) {
+ if (level > PG_LEVEL_4K)
+ __kvm_walk_global_page(kvm, (*iterator.sptep) & PT64_BASE_ADDR_MASK, level - 1);
+ else {
+ old_pfn = spte_to_pfn(*iterator.sptep);
+ mmu_spte_clear_track_bits(iterator.sptep);
+ kvm_release_pfn_clean(old_pfn);
+ }
+ }
+ direct_build_tdp_set_spte(kvm, slot, iterator.sptep, ACC_ALL, level, gfn, pfn, false, true, true);
+
+ return ret;
+}
+
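+/* Return the largest page level backed by the host mapping of @gfn. */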
+static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
+{
+ unsigned long page_size;
+ int i, ret = 0;
+
+ page_size = kvm_host_page_size(kvm, NULL, gfn);
+
+ for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ if (page_size >= KVM_HPAGE_SIZE(i))
+ ret = i;
+ else
+ break;
+ }
+
+ return ret;
+}
+
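+/*
+ * Pick the mapping level for @gfn: the host-backed level, capped by
+ * max_huge_page_level and the slot's disallow_lpage tracking.
+ */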
+int direct_build_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ int host_level, max_level, level;
+ struct kvm_lpage_info *linfo;
+
+ host_level = host_mapping_level(kvm, gfn);
+ if (host_level != PG_LEVEL_4K) {
+ max_level = min(max_huge_page_level, host_level);
+ for (level = PG_LEVEL_4K; level <= max_level; ++level) {
+ linfo = lpage_info_slot(gfn, slot, level);
+ if (linfo->disallow_lpage)
+ break;
+ }
+ host_level = level - 1;
+ }
+ return host_level;
+}
+
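+/*
+ * Pin all pages of @slot and pre-populate the global direct-build EPT
+ * with mappings at the largest usable page size, allocating the global
+ * root on first use.
+ */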
+int kvm_direct_tdp_populate_page_table(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ gfn_t gfn;
+ kvm_pfn_t pfn;
+ int host_level;
+
+ if (!kvm->arch.global_root_hpa) {
+ struct page *page;
+ WARN_ON(!tdp_enabled);
+ WARN_ON(max_tdp_level != PT64_ROOT_4LEVEL);
+
+ /* init global root hpa */
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ kvm->arch.global_root_hpa = page_to_phys(page);
+ }
+
+ /* setup page table for the slot */
+ for (gfn = slot->base_gfn;
+ gfn < slot->base_gfn + slot->npages;
+ gfn += KVM_PAGES_PER_HPAGE(host_level)) {
+ pfn = gfn_to_pfn_try_write(slot, gfn);
+ if (is_error_noslot_pfn(pfn))
+ return -ENOMEM;
+
+ host_level = direct_build_mapping_level(kvm, slot, gfn);
+
+ if (host_level > PG_LEVEL_4K)
+ MMU_WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(host_level) - 1));
+ direct_build_tdp_map(kvm, slot, gfn, pfn, host_level);
+ }
+
+ return 0;
+}
+
/*
* Calculate mmu pages needed for kvm.
*/
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 03dd7bac8034..3b7ee65cd941 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3607,7 +3607,7 @@ static bool svm_has_emulated_msr(u32 index)
return true;
}

-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static u64 svm_get_mt_mask(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
return 0;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 46ba2e03a892..6f79343ed40e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7106,7 +7106,7 @@ static int __init vmx_check_processor_compat(void)
return 0;
}

-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static u64 vmx_get_mt_mask(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
u64 ipat = 0;
@@ -7134,12 +7134,15 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
goto exit;
}

- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ if (!kvm_arch_has_noncoherent_dma(kvm)) {
ipat = VMX_EPT_IPAT_BIT;
cache = MTRR_TYPE_WRBACK;
goto exit;
}

+ if (!vcpu)
+ vcpu = kvm->vcpus[0];
+
if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
ipat = VMX_EPT_IPAT_BIT;
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a23076765b4c..8901862ba2a3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -694,6 +694,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change);
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
/* flush all memory translations */
void kvm_arch_flush_shadow_all(struct kvm *kvm);
/* flush memory translations pointing to 'slot' */
@@ -721,6 +722,7 @@ kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool *async, bool write_fault,
bool *writable);
+kvm_pfn_t gfn_to_pfn_try_write(struct kvm_memory_slot *slot, gfn_t gfn);

void kvm_release_pfn_clean(kvm_pfn_t pfn);
void kvm_release_pfn_dirty(kvm_pfn_t pfn);
@@ -775,7 +777,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
-unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
+unsigned long kvm_host_page_size(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);

struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 737666db02de..47fc18b05c53 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -143,7 +143,7 @@ static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -1689,14 +1689,17 @@ bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);

-unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
+unsigned long kvm_host_page_size(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn)
{
struct vm_area_struct *vma;
unsigned long addr, size;

size = PAGE_SIZE;

- addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
+ if (vcpu)
+ addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
+ else
+ addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
return PAGE_SIZE;

@@ -1989,6 +1992,25 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
return pfn;
}

+/*
+ * Map the pfn for direct EPT mode.  If the mapping fails and the memslot
+ * is read-only, retry without requesting write access.
+ */
+kvm_pfn_t gfn_to_pfn_try_write(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ kvm_pfn_t pfn;
+ unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, !memslot_is_readonly(slot));
+
+ if (kvm_is_error_hva(addr))
+ return KVM_PFN_NOSLOT;
+
+ pfn = hva_to_pfn(addr, false, NULL, true, NULL);
+ if (pfn & KVM_PFN_ERR_FAULT) {
+ if (memslot_is_readonly(slot))
+ pfn = hva_to_pfn(addr, false, NULL, false, NULL);
+ }
+ return pfn;
+}
+
kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
bool atomic, bool *async, bool write_fault,
bool *writable)
@@ -2638,7 +2660,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
gfn_t gfn)
{
if (memslot && memslot->dirty_bitmap) {
--
2.17.1