Re: [PATCH 21/22] KVM: x86/mmu: Support rmap walks without holding mmu_lock when aging gfns

From: James Houghton
Date: Tue Sep 03 2024 - 16:39:46 EST


On Fri, Aug 9, 2024 at 12:44 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
>
> DO NOT MERGE, yet...
>
> Cc: James Houghton <jthoughton@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> ---
> arch/x86/kvm/mmu/mmu.c | 63 +++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 59 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 48e8608c2738..9df6b465de06 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -995,13 +995,11 @@ static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
> * locking is the same, but the caller is disallowed from modifying the rmap,
> * and so the unlock flow is a nop if the rmap is/was empty.
> */
> -__maybe_unused
> static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
> {
> return __kvm_rmap_lock(rmap_head);
> }
>
> -__maybe_unused
> static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
> unsigned long old_val)
> {
> @@ -1743,8 +1741,53 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
> __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
> }
>
> -static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
> - struct kvm_gfn_range *range, bool test_only)
> +static bool kvm_rmap_age_gfn_range_lockless(struct kvm *kvm,
> + struct kvm_gfn_range *range,
> + bool test_only)
> +{
> + struct kvm_rmap_head *rmap_head;
> + struct rmap_iterator iter;
> + unsigned long rmap_val;
> + bool young = false;
> + u64 *sptep;
> + gfn_t gfn;
> + int level;
> + u64 spte;
> +
> + for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
> + for (gfn = range->start; gfn < range->end;
> + gfn += KVM_PAGES_PER_HPAGE(level)) {
> + rmap_head = gfn_to_rmap(gfn, level, range->slot);
> + rmap_val = kvm_rmap_lock_readonly(rmap_head);
> +
> + for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) {
> + if (!is_accessed_spte(spte))
> + continue;
> +
> + if (test_only) {
> + kvm_rmap_unlock_readonly(rmap_head, rmap_val);
> + return true;
> + }
> +
> + /*
> + * Marking SPTEs for access tracking outside of
> + * mmu_lock is unsupported. Report the page as
> + * young, but otherwise leave it as-is.

Just for my own understanding, what's the main reason why it's unsafe
to mark PTEs for access tracking outside the mmu_lock?

> + */
> + if (spte_ad_enabled(spte))
> + clear_bit((ffs(shadow_accessed_mask) - 1),
> + (unsigned long *)sptep);

I feel like it'd be kinda nice to de-duplicate this clear_bit() piece
with the one in kvm_rmap_age_gfn_range().

> + young = true;
> + }
> +
> + kvm_rmap_unlock_readonly(rmap_head, rmap_val);
> + }
> + }
> + return young;
> +}
> +
> +static bool __kvm_rmap_age_gfn_range(struct kvm *kvm,
> + struct kvm_gfn_range *range, bool test_only)
> {
> struct slot_rmap_walk_iterator iterator;
> struct rmap_iterator iter;
> @@ -1783,6 +1826,18 @@ static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
> return young;
> }
>
> +
> +static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
> + struct kvm_gfn_range *range, bool test_only)
> +{
> + /* FIXME: This also needs to be guarded with something like range->fast_only. */
> + if (kvm_ad_enabled())

I expect this to be something like `if (kvm_ad_enabled() ||
range->fast_only)`. With MGLRU, that means the pages will always be
the last candidates for eviction, though it is still possible for them
to be evicted (though I think this would basically never happen). I
think this is fine.

I think the only other possible choice is to record/return 'not
young'/false instead of 'young'/true if the spte is young but
!spte_ad_enabled(). That doesn't seem to be obviously better, though
we *will* get correct age information at eviction time, when
!range->fast_only, at which point the page will not be evicted, and
Accessed will be cleared.


> + return kvm_rmap_age_gfn_range_lockless(kvm, range, test_only);
> +
> + lockdep_assert_held_write(&kvm->mmu_lock);
> + return __kvm_rmap_age_gfn_range(kvm, range, test_only);
> +}
> +
> bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> {
> bool young = false;
> --
> 2.46.0.76.ge559c4bf1a-goog
>