Re: [RFC PATCH v5 049/104] KVM: x86/tdp_mmu: Ignore unsupported mmu operation on private GFNs

From: Paolo Bonzini
Date: Tue Apr 05 2022 - 16:17:27 EST


On 3/4/22 20:49, isaku.yamahata@xxxxxxxxx wrote:
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
+ bool private_spte,
u64 old_spte, u64 new_spte, int level)
{
bool pfn_changed;
struct kvm_memory_slot *slot;
+ /*
+ * TDX doesn't support live migration. Never mark private page as
+ * dirty in log-dirty bitmap, since it's not possible for userspace
+ * hypervisor to live migrate private page anyway.
+ */
+ if (private_spte)
+ return;

This should not be needed; patch 35 will block it anyway, because kvm_slot_dirty_track_enabled() will return false.
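For reference, the dirty-log path already gates on that; roughly (abridged sketch, not a verbatim quote of virt/kvm/kvm_main.c):

void mark_page_dirty_in_slot(struct kvm *kvm,
                             const struct kvm_memory_slot *memslot, gfn_t gfn)
{
        ...
        /* Nothing is logged unless dirty tracking is enabled for the slot. */
        if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
                /* push to the dirty ring or set the bit in dirty_bitmap */
                ...
        }
}

so a slot that never enables dirty logging for private memory is already filtered here, without an explicit private_spte check.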

@@ -1215,7 +1227,23 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
* into this helper allow blocking; it'd be dead, wasteful code.
*/
for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
- tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
+ /*
+ * For TDX shared mapping, set GFN shared bit to the range,
+ * so the handler() doesn't need to set it, to avoid duplicated
+ * code in multiple handler()s.
+ */
+ gfn_t start;
+ gfn_t end;
+
+ if (is_private_sp(root)) {
+ start = kvm_gfn_private(kvm, range->start);
+ end = kvm_gfn_private(kvm, range->end);
+ } else {
+ start = kvm_gfn_shared(kvm, range->start);
+ end = kvm_gfn_shared(kvm, range->end);
+ }

I think this could be a separate function kvm_gfn_for_root(kvm, root, ...).
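For example (sketch only, assuming the kvm_gfn_private()/kvm_gfn_shared() helpers introduced earlier in the series):

/* Return the GFN alias used by this root's mappings. */
static gfn_t kvm_gfn_for_root(struct kvm *kvm, struct kvm_mmu_page *root,
                              gfn_t gfn)
{
        return is_private_sp(root) ? kvm_gfn_private(kvm, gfn) :
                                     kvm_gfn_shared(kvm, gfn);
}

Then the if/else above collapses to

        start = kvm_gfn_for_root(kvm, root, range->start);
        end = kvm_gfn_for_root(kvm, root, range->end);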

@@ -1237,6 +1265,15 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
if (!is_accessed_spte(iter->old_spte))
return false;
+ /*
+ * First TDX generation doesn't support clearing A bit for private
+ * mapping, since there's no secure EPT API to support it. However
+ * it's a legitimate request for TDX guest, so just return w/o a
+ * WARN().
+ */
+ if (is_private_spte(iter->sptep))
+ return false;

Please add instead a "bool only_shared" argument to kvm_tdp_mmu_handle_gfn, since you can check "only_shared && is_private_sp(root)" just once (instead of checking it once per PTE).
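Roughly, and combined with the kvm_gfn_for_root() idea above (sketch; only_shared would be set by each caller of kvm_tdp_mmu_handle_gfn according to whether the operation can apply to private mappings):

        for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
                /* Skip private roots once here instead of once per PTE. */
                if (only_shared && is_private_sp(root))
                        continue;

                start = kvm_gfn_for_root(kvm, root, range->start);
                end = kvm_gfn_for_root(kvm, root, range->end);

                tdp_root_for_each_leaf_pte(iter, root, start, end)
                        ret |= handler(kvm, &iter, range);
        }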

new_spte = iter->old_spte;
if (spte_ad_enabled(new_spte)) {
@@ -1281,6 +1318,13 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
/* Huge pages aren't expected to be modified without first being zapped. */
WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
+ /*
+ * .change_pte() callback should not happen for private page, because
+ * for now TDX private pages are pinned during VM's life time.
+ */
+ if (WARN_ON(is_private_spte(iter->sptep)))
+ return false;
+
if (iter->level != PG_LEVEL_4K ||
!is_shadow_present_pte(iter->old_spte))
return false;
@@ -1332,6 +1376,16 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ /*
+ * First TDX generation doesn't support write protecting private
+ * mappings, since there's no secure EPT API to support it. It
+ * is a bug to reach here for TDX guest.
+ */
+ if (WARN_ON(is_private_sp(root)))
+ return spte_set;

Isn't this function unreachable even for the shared address range? If so, this WARN should be in kvm_tdp_mmu_wrprot_slot, and it should also check !kvm_arch_dirty_log_supported(kvm).
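One way to do that (sketch only, using the kvm_arch_dirty_log_supported() check suggested above):

bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
                             const struct kvm_memory_slot *slot, int min_level)
{
        ...
        /* Write protection here is only used for dirty logging. */
        if (WARN_ON(!kvm_arch_dirty_log_supported(kvm)))
                return false;
        ...
}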

+ start = kvm_gfn_shared(kvm, start);
+ end = kvm_gfn_shared(kvm, end);

... and then these two lines are unnecessary.

rcu_read_lock();
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
@@ -1398,6 +1452,16 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ /*
+ * First TDX generation doesn't support clearing dirty bit,
+ * since there's no secure EPT API to support it. It is a
+ * bug to reach here for TDX guest.
+ */
+ if (WARN_ON(is_private_sp(root)))
+ return spte_set;

Same here, can you check it in kvm_tdp_mmu_clear_dirty_slot?

+ start = kvm_gfn_shared(kvm, start);
+ end = kvm_gfn_shared(kvm, end);

Same here.

rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, start, end) {
@@ -1467,6 +1531,15 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
struct tdp_iter iter;
u64 new_spte;
+ /*
+ * First TDX generation doesn't support clearing dirty bit,
+ * since there's no secure EPT API to support it. It is a
+ * bug to reach here for TDX guest.
+ */
+ if (WARN_ON(is_private_sp(root)))
+ return;

IIRC this is reachable from userspace, so a WARN_ON is not possible. But, again, please check

if (!kvm_arch_dirty_log_supported(kvm))
return;

in kvm_tdp_mmu_clear_dirty_pt_masked.
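I.e. something like this (sketch):

void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot)
{
        struct kvm_mmu_page *root;

        /* Reachable from userspace; silently ignore instead of warning. */
        if (!kvm_arch_dirty_log_supported(kvm))
                return;

        lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_tdp_mmu_root(kvm, root, slot->as_id)
                clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}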

+ gfn = kvm_gfn_shared(kvm, gfn);

Also unnecessary, I think.

rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
@@ -1530,6 +1603,16 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
struct tdp_iter iter;
kvm_pfn_t pfn;
+ /*
+ * This should only be reachable in case of log-dirty, which TD
+ * private mapping doesn't support so far. Give a WARN() if it
+ * hits private mapping.
+ */
+ if (WARN_ON(is_private_sp(root)))
+ return;
+ start = kvm_gfn_shared(kvm, start);
+ end = kvm_gfn_shared(kvm, end);

I think this is not a big deal and you can leave it as is. Alternatively, please move the WARN to kvm_tdp_mmu_zap_collapsible_sptes(). It is also only reachable if you can set KVM_MEM_LOG_DIRTY_PAGES in the first place.
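If you do go the second way, the root loop in kvm_tdp_mmu_zap_collapsible_sptes() could look something like this (sketch only; the exact iteration macro and its arguments depend on your base tree):

        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) {
                /* Collapsing only matters after dirty logging, which private memory never has. */
                if (WARN_ON(is_private_sp(root)))
                        continue;
                zap_collapsible_spte_range(kvm, root, slot);
        }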

Paolo

rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
@@ -1543,8 +1626,9 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
pfn = spte_to_pfn(iter.old_spte);
if (kvm_is_reserved_pfn(pfn) ||
- iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
- pfn, PG_LEVEL_NUM))
+ iter.level >= kvm_mmu_max_mapping_level(kvm, slot,
+ tdp_iter_gfn_unalias(kvm, &iter), pfn,
+ PG_LEVEL_NUM))
continue;
/* Note, a successful atomic zap also does a remote TLB flush. */
@@ -1590,6 +1674,14 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+ /*
+ * First TDX generation doesn't support write protecting private
+ * mappings, since there's no secure EPT API to support it. It
+ * is a bug to reach here for TDX guest.
+ */
+ if (WARN_ON(is_private_sp(root)))
+ return spte_set;
+
rcu_read_lock();
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,