Re: [PATCH] KVM: Move gfn_to_pfn_cache invalidation to invalidate_range_end hook

From: David Woodhouse
Date: Tue Aug 06 2024 - 12:41:14 EST


On Tue, 2024-08-06 at 08:57 -0700, Sean Christopherson wrote:
> The last one is key: the pages have already been freed when invalidate_range_end()
> is called, and so unmapping at that time would be too late.

OK, thanks.
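
(Writing it down so I don't forget the ordering contract again: the primary
MMU is free to drop the pages in [range->start, range->end) any time after
range_start() returns, so by the time range_end() runs they may already be
gone. That's why the zap has to happen in the first hook and only the
bookkeeping can live in the second. Sketch only, paraphrasing the pair of
hooks KVM already registers in kvm_main.c:)

	static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
		/*
		 * Called while the old pages are still present; the secondary
		 * MMU must stop using its translations before returning.
		 */
		.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
		/*
		 * Called after the primary MMU may have freed the pages;
		 * too late to unmap, but fine for bookkeeping such as the
		 * GPC range tracking below.
		 */
		.invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
		/* .clear_flush_young, .release, etc. elided */
	};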

> The obvious downside is what you've run into, where the start+end approach forces
> KVM to wait for all in-flight invalidations to go away.  But again, in practice
> the rudimentary range tracking suffices for all known use cases.

Makes sense.

> I suspect you're trying to solve a problem that doesn't exist in practice.
> hva_to_pfn_retry() already has a cond_resched(), so getting stuck for a long
> duration isn't fatal, just suboptimal.  And similar to the range-based tracking,
> _if_ there's a problem in practice, then it also affects guest page faults.  KVM
> simply resumes the vCPU and keeps re-faulting until the in-flight invalidation(s)
> has gone away.

Yeah. When it's looping on actual page faults it's easy to pretend it
isn't a busy-loop :)
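
(Concretely, the existing hva_to_pfn_retry() loop boils down to something
like the below, with re-entering the guest standing in for any real
back-off on the page-fault side. Sketch from memory rather than the
literal code, so take the names as approximate:)

	unsigned long mmu_seq;
	kvm_pfn_t new_pfn;

	do {
		/* Snapshot the invalidation sequence before resolving the HVA. */
		mmu_seq = kvm->mmu_invalidate_seq;
		smp_rmb();

		/* May sleep and fault the page in. */
		new_pfn = hva_to_pfn(...);

		/* Don't monopolise the CPU while an invalidation is in flight. */
		cond_resched();

		/*
		 * If an invalidation is still running, or one completed while
		 * the HVA was being resolved, the result is stale: retry.
		 */
	} while (kvm->mn_active_invalidate_count ||
		 kvm->mmu_invalidate_seq != mmu_seq);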

Making it wait is fairly simple, although it would be just as easy to
have it cond_resched() and retry instead (rough sketch below the link).
Here's what I have so far (incremental).

https://git.infradead.org/?p=users/dwmw2/linux.git;a=commitdiff;h=pfncache-2
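
(For comparison, the cond_resched() variant would just poll politely until
range_end() has run, relying on the existing retry check under gpc->lock to
backstop any race with a new invalidation. Using the gpc_invalidations_pending()
helper added below, it would be roughly:)

	/* Poll-and-yield alternative to gpc_wait_for_invalidations(). */
	while (gpc_invalidations_pending(gpc))
		cond_resched();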

I need to revive the torture test you had at the end of your earlier series.

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 689e8be873a7..9b5261e11802 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,9 @@ struct kvm {
/* For management / invalidation of gfn_to_pfn_caches */
spinlock_t gpc_lock;
struct list_head gpc_list;
+ u64 mmu_gpc_invalidate_range_start;
+ u64 mmu_gpc_invalidate_range_end;
+ wait_queue_head_t gpc_invalidate_wq;

/*
* created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ffd6ab4c2a16..a243f9f8a9c0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
*/
spin_lock(&kvm->mn_invalidate_lock);
kvm->mn_active_invalidate_count++;
+ if (likely(kvm->mn_active_invalidate_count == 1)) {
+ kvm->mmu_gpc_invalidate_range_start = range->start;
+ kvm->mmu_gpc_invalidate_range_end = range->end;
+ } else {
+ /*
+ * Fully tracking multiple concurrent ranges has diminishing
+ * returns. Keep things simple and just find the minimal range
+ * which includes the current and new ranges. As there won't be
+ * enough information to subtract a range after its invalidate
+ * completes, any ranges invalidated concurrently will
+ * accumulate and persist until all outstanding invalidates
+ * complete.
+ */
+ kvm->mmu_gpc_invalidate_range_start =
+ min(kvm->mmu_gpc_invalidate_range_start, range->start);
+ kvm->mmu_gpc_invalidate_range_end =
+ max(kvm->mmu_gpc_invalidate_range_end, range->end);
+ }
spin_unlock(&kvm->mn_invalidate_lock);

/*
@@ -830,21 +848,27 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,

__kvm_handle_hva_range(kvm, &hva_range);

+ gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
/* Pairs with the increment in range_start(). */
spin_lock(&kvm->mn_invalidate_lock);
if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
--kvm->mn_active_invalidate_count;
wake = !kvm->mn_active_invalidate_count;
+ if (wake) {
+ kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+ kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
+ }
spin_unlock(&kvm->mn_invalidate_lock);

- gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
-
/*
* There can only be one waiter, since the wait happens under
* slots_lock.
*/
- if (wake)
+ if (wake) {
+ wake_up(&kvm->gpc_invalidate_wq);
rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+ }
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1154,7 +1178,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)

INIT_LIST_HEAD(&kvm->gpc_list);
spin_lock_init(&kvm->gpc_lock);
-
+ init_waitqueue_head(&kvm->gpc_invalidate_wq);
+ kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+ kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
INIT_LIST_HEAD(&kvm->devices);
kvm->max_vcpus = KVM_MAX_VCPUS;

diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 187b58150ef7..dad32ef5ac87 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -133,6 +133,39 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
#endif
}

+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
+{
+ /*
+ * No need to take gpc->lock here: gpc->uhva is stable because the caller
+ * holds gpc->refresh_lock, and the kvm-wide invalidation range fields are
+ * read under kvm->mn_invalidate_lock.
+ */
+ return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+ (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+ (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
+}
+
+static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+{
+ spin_lock(&gpc->kvm->mn_invalidate_lock);
+ if (gpc_invalidations_pending(gpc)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ for (;;) {
+ prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (!gpc_invalidations_pending(gpc))
+ break;
+
+ spin_unlock(&gpc->kvm->mn_invalidate_lock);
+ schedule();
+ spin_lock(&gpc->kvm->mn_invalidate_lock);
+ }
+ finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
+ }
+ spin_unlock(&gpc->kvm->mn_invalidate_lock);
+}
+
static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
{
/* Note, the new page offset may be different than the old! */
@@ -205,6 +238,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
goto out_error;
}

+ /*
+ * Avoid populating a GPC with a user address which is already
+ * being invalidated, if the invalidate_range_start() notifier
+ * has already been called. The actual invalidation happens in
+ * the invalidate_range_end() callback, so wait until there are
+ * no active invalidations (for the relevant HVA).
+ */
+ gpc_wait_for_invalidations(gpc);
+
write_lock_irq(&gpc->lock);

/*
