[PATCH 3/5] KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry()

From: David Woodhouse
Date: Tue Oct 15 2024 - 15:49:06 EST


The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If any invalidations are running concurrently, it is effectively just a
complex busy-wait loop, because its local mmu_notifier_retry_cache()
helper will always return true. Even worse, caches are forced to wait
even if the in-flight invalidations do not affect the cache's virtual
address range.

Since multiple invalidations can run in parallel, hva_to_pfn_retry()
can end up backing off and retrying forever without making any
progress.
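
For reference, the pre-patch check boils down to the following
(simplified from the mmu_notifier_retry_cache() helper that this patch
removes); any elevated mn_active_invalidate_count forces a retry, no
matter which hva range is actually being invalidated:

  /*
   * Pre-patch behavior, roughly: retry if *any* invalidation is in
   * flight, regardless of whether it overlaps this cache's uhva.
   */
  static bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
  {
          if (kvm->mn_active_invalidate_count)
                  return true;

          smp_rmb();
          return kvm->mmu_invalidate_seq != mmu_seq;
  }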

Mitigate the badness by being a bit more selective about when to retry.
Specifically, retry only if an in-progress invalidation range affects the
cache, i.e. implement range-based invalidation for caches, similar to how
KVM implements range-based invalidation for page faults.
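
Concretely, the "is an invalidation in progress?" check used by
hva_to_pfn_retry() becomes an overlap test against the in-progress
range, roughly (simplified from the new gpc_invalidate_in_progress_hva()
helper below):

  /* Post-patch, roughly: retry only if the in-flight range covers gpc->uhva. */
  return READ_ONCE(kvm->mn_active_invalidate_count) &&
         READ_ONCE(kvm->mmu_gpc_invalidate_range_start) <= gpc->uhva &&
         READ_ONCE(kvm->mmu_gpc_invalidate_range_end) > gpc->uhva;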

Note, like the existing range-based invalidation, this approach doesn't
completely prevent false positives since the in-progress range never
shrinks. E.g. if userspace is spamming invalidations *and* simultaneously
invalidates a cache, the cache will still get stuck until the spam stops.
However, that problem already exists with page faults, and precisely
tracking in-progress ranges would add significant complexity. Defer
trying to address that issue until it actually becomes a problem, which in
all likelihood will never happen.
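
To keep the bookkeeping trivial, concurrent invalidations are simply
accumulated into a single union range in
gfn_to_pfn_cache_invalidate_start(), along the lines of:

  /* Track the union of all in-flight ranges; the union only ever grows. */
  kvm->mmu_gpc_invalidate_range_start =
          min(kvm->mmu_gpc_invalidate_range_start, start);
  kvm->mmu_gpc_invalidate_range_end =
          max(kvm->mmu_gpc_invalidate_range_end, end);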

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
Co-developed-by: Sean Christopherson <seanjc@xxxxxxxxxx>
Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      |  7 ++--
 virt/kvm/pfncache.c      | 75 ++++++++++++++++++++++++++++------------
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index db567d26f7b9..2c0ed735f0f4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -785,6 +785,8 @@ struct kvm {
 	/* For management / invalidation of gfn_to_pfn_caches */
 	spinlock_t gpc_lock;
 	struct list_head gpc_list;
+	u64 mmu_gpc_invalidate_range_start;
+	u64 mmu_gpc_invalidate_range_end;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 05cbb2548d99..b9223ecab2ca 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,7 +775,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 */
 	spin_lock(&kvm->mn_invalidate_lock);
 	kvm->mn_active_invalidate_count++;
-	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
 	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
@@ -784,10 +783,10 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * any given time, and the caches themselves can check for hva overlap,
 	 * i.e. don't need to rely on memslot overlap checks for performance.
 	 * Because this runs without holding mmu_lock, the pfn caches must use
-	 * mn_active_invalidate_count (see above) instead of
-	 * mmu_invalidate_in_progress.
+	 * mn_active_invalidate_count instead of mmu_invalidate_in_progress.
 	 */
 	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
+	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
 	 * If one or more memslots were found and thus zapped, notify arch code
@@ -1164,6 +1163,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
+	kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+	kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
 
 	INIT_LIST_HEAD(&kvm->devices);
 	kvm->max_vcpus = KVM_MAX_VCPUS;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 957f739227ab..3d5bc9bab3d9 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -27,6 +27,27 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
 {
 	struct gfn_to_pfn_cache *gpc;
 
+	lockdep_assert_held(&kvm->mn_invalidate_lock);
+
+	if (likely(kvm->mn_active_invalidate_count == 1)) {
+		kvm->mmu_gpc_invalidate_range_start = start;
+		kvm->mmu_gpc_invalidate_range_end = end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_gpc_invalidate_range_start =
+			min(kvm->mmu_gpc_invalidate_range_start, start);
+		kvm->mmu_gpc_invalidate_range_end =
+			max(kvm->mmu_gpc_invalidate_range_end, end);
+	}
+
 	spin_lock(&kvm->gpc_lock);
 	list_for_each_entry(gpc, &kvm->gpc_list, list) {
 		read_lock_irq(&gpc->lock);
@@ -74,6 +95,8 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
 {
 	struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
 
+	lockdep_assert_held(&gpc->lock);
+
 	if (!gpc->active)
 		return false;
 
@@ -124,21 +147,26 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static bool gpc_invalidate_in_progress_hva(struct gfn_to_pfn_cache *gpc)
 {
+	struct kvm *kvm = gpc->kvm;
+
 	/*
-	 * mn_active_invalidate_count acts for all intents and purposes
-	 * like mmu_invalidate_in_progress here; but the latter cannot
-	 * be used here because the invalidation of caches in the
-	 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
-	 * is elevated.
-	 *
-	 * Note, it does not matter that mn_active_invalidate_count
-	 * is not protected by gpc->lock. It is guaranteed to
-	 * be elevated before the mmu_notifier acquires gpc->lock, and
-	 * isn't dropped until after mmu_invalidate_seq is updated.
+	 * Ensure the count and range are always read from memory so that
+	 * checking for retry in a loop won't get false negatives on the range,
+	 * and won't result in an infinite retry loop. Note, the range never
+	 * shrinks, only grows, and so the key to avoiding infinite retry loops
+	 * is observing the 1=>0 transition of the count.
 	 */
-	if (kvm->mn_active_invalidate_count)
+	return unlikely(READ_ONCE(kvm->mn_active_invalidate_count)) &&
+	       READ_ONCE(kvm->mmu_gpc_invalidate_range_start) <= gpc->uhva &&
+	       READ_ONCE(kvm->mmu_gpc_invalidate_range_end) > gpc->uhva;
+}
+
+static bool gpc_invalidate_retry_hva(struct gfn_to_pfn_cache *gpc,
+				     unsigned long mmu_seq)
+{
+	if (gpc_invalidate_in_progress_hva(gpc))
 		return true;
 
 	/*
@@ -149,7 +177,7 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
 	 * new (incremented) value of mmu_invalidate_seq is observed.
 	 */
 	smp_rmb();
-	return kvm->mmu_invalidate_seq != mmu_seq;
+	return gpc->kvm->mmu_invalidate_seq != mmu_seq;
 }
 
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
@@ -164,14 +192,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 
 	lockdep_assert_held_write(&gpc->lock);
 
-	/*
-	 * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva
-	 * assets have already been updated and so a concurrent check() from a
-	 * different task may not fail the gpa/uhva/generation checks.
-	 */
-	gpc->valid = false;
-
 	do {
+		/*
+		 * Invalidate the cache prior to dropping gpc->lock, the
+		 * gpa=>uhva assets have already been updated and so a check()
+		 * from a different task may not fail the gpa/uhva/generation
+		 * checks, i.e. must observe valid==false.
+		 */
+		gpc->valid = false;
+
 		write_unlock_irq(&gpc->lock);
 
 		/*
@@ -201,7 +230,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		 * trying to race ahead in the hope that a different task makes
 		 * the cache valid.
 		 */
-		while (READ_ONCE(gpc->kvm->mn_active_invalidate_count)) {
+		while (gpc_invalidate_in_progress_hva(gpc)) {
 			if (!cond_resched())
 				cpu_relax();
 		}
@@ -236,9 +265,9 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		 * attempting to refresh.
 		 */
 		WARN_ON_ONCE(gpc->valid);
-	} while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+		gpc->valid = true;
+	} while (gpc_invalidate_retry_hva(gpc, mmu_seq));
 
-	gpc->valid = true;
 	gpc->pfn = new_pfn;
 	gpc->khva = new_khva + offset_in_page(gpc->uhva);
 
--
2.47.0.rc1.288.g06298d1525-goog

