[RFC PATCH v2 09/10] kvm: pfncache: hook up to gmem invalidation

From: Patrick Roy
Date: Tue Sep 10 2024 - 12:35:32 EST


Invalidate gfn_to_pfn_caches that hold gmem pfns whenever gmem
invalidations occur (fallocate(FALLOC_FL_PUNCH_HOLE),
error_remove_folio)..

gmem invalidations are difficult to handle for gpcs. The unmap path for
gmem pfns in gpc tries to decrement the sharing ref count, and
potentially modifies the direct map. However, these are not operations
we can do after the gmem folio that used to sit in the pfn has been
freed (and after we drop gpc->lock in
gfn_to_pfn_cache_invalidate_gfns_start we are racing against the freeing
of the folio, and we cannot do direct map manipulations before dropping
the lock). Thus, in these cases (punch hole and error_remove_folio), we
must "leak" the sharing reference (which is fine because either the
folio has already been freed, or it is about to be freed by
->invalidate_folio, which only reinserts into the direct map. So if the
folio already is in the direct map, no harm is done). So in these cases,
we simply store a flag that tells gpc to skip unmapping of these pfns
when the time comes to refresh the cache.

A slightly different case are if just the memory attributes on a memslot
change. If we switch from private to shared, the gmem pfn will still be
there, it will simply no longer be mapped into the guest. In this
scenario, we must unmap to decrement the sharing count, and reinsert
into the direct map. Otherwise, if for example the gpc gets deactivated
while the gfn is set to shared, and after that the gfn is flipped to
private, something else might use the pfn, but it is still present in
the direct map (which violates the security goal of direct map removal).

However, there is one edge case we need to deal with: It could happen
that a gpc gets invalidated by a memory attribute change (e.g.
gpc->needs_unmap = true), then refreshed, and after the refresh loop has
exited and the gpc->lock is dropped, but before we get to gpc_unmap, the
gmem folio that occupies the invalidated pfn of the cache is fallocated
away. Now needs_unmap will be true, but we are once again racing against
the freeing of the folio. For this case, take a reference to the folio
before we drop the gpc->lock, and only drop the reference after
gpc_unmap returned, to avoid the folio being freed.

For similar reasons, gfn_to_pfn_cache_invalidate_gfns_start needs to not
ignore already invalidated caches, as a cache that was invalidated due
to a memory attribute change will have needs_unmap=true. If a
fallocate(FALLOC_FL_PUNCH_HOLE) operation happens on the same range,
this will need to get updated to needs_unmap=false, even if the cache is
already invalidated.

Signed-off-by: Patrick Roy <roypat@xxxxxxxxxxxx>
---
include/linux/kvm_host.h | 3 +++
include/linux/kvm_types.h | 1 +
virt/kvm/guest_memfd.c | 19 +++++++++++++++-
virt/kvm/kvm_main.c | 5 ++++-
virt/kvm/kvm_mm.h | 6 +++--
virt/kvm/pfncache.c | 46 +++++++++++++++++++++++++++++++++------
6 files changed, 69 insertions(+), 11 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d36164a2cee5..62e45a4ab810e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -843,6 +843,9 @@ struct kvm {
bool attribute_change_in_progress;
#endif
char stats_id[KVM_STATS_NAME_SIZE];
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ atomic_t gmem_active_invalidate_count;
+#endif
};

#define kvm_err(fmt, ...) \
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 8903b8f46cf6c..a2df9623b17ce 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -71,6 +71,7 @@ struct gfn_to_pfn_cache {
bool active;
bool valid;
bool private;
+ bool needs_unmap;
};

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 742eba36d2371..ac502f9b220c3 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -231,6 +231,15 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
struct kvm *kvm = gmem->kvm;
unsigned long index;

+ atomic_inc(&kvm->gmem_active_invalidate_count);
+
+ xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+ pgoff_t pgoff = slot->gmem.pgoff;
+
+ gfn_to_pfn_cache_invalidate_gfns_start(kvm, slot->base_gfn + start - pgoff,
+ slot->base_gfn + end - pgoff, true);
+ }
+
xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
pgoff_t pgoff = slot->gmem.pgoff;

@@ -268,6 +277,8 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
kvm_mmu_invalidate_end(kvm);
KVM_MMU_UNLOCK(kvm);
}
+
+ atomic_dec(&kvm->gmem_active_invalidate_count);
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
@@ -478,7 +489,13 @@ static void kvm_gmem_invalidate_folio(struct folio *folio, size_t start, size_t
if (start == 0 && end == folio_size(folio)) {
refcount_t *sharing_count = folio_get_private(folio);

- kvm_gmem_folio_clear_private(folio);
+ /*
+ * gfn_to_pfn_caches do not decrement the refcount if they
+ * get invalidated due to the gmem pfn going away (fallocate,
+ * or error_remove_folio)
+ */
+ if (refcount_read(sharing_count) == 1)
+ kvm_gmem_folio_clear_private(folio);
kfree(sharing_count);
}
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 183f7ce57a428..6d0818c723d73 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1161,6 +1161,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
xa_init(&kvm->mem_attr_array);
#endif
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ atomic_set(&kvm->gmem_active_invalidate_count, 0);
+#endif

INIT_LIST_HEAD(&kvm->gpc_list);
spin_lock_init(&kvm->gpc_lock);
@@ -2549,7 +2552,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
}

kvm->attribute_change_in_progress = true;
- gfn_to_pfn_cache_invalidate_gfns_start(kvm, start, end);
+ gfn_to_pfn_cache_invalidate_gfns_start(kvm, start, end, false);

kvm_handle_gfn_range(kvm, &pre_set_range);

diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 5a53d888e4b18..f4d0ced4a8f57 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -30,7 +30,8 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,

void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm,
gfn_t start,
- gfn_t end);
+ gfn_t end,
+ bool needs_unmap);
#else
static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
unsigned long start,
@@ -40,7 +41,8 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,

static inline void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm,
gfn_t start,
- gfn_t end)
+ gfn_t end,
+ bool needs_unmap)
{
}
#endif /* HAVE_KVM_PFNCACHE */
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index a4f935e80f545..828ba8ad8f20d 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -61,8 +61,15 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
/*
* Identical to `gfn_to_pfn_cache_invalidate_start`, except based on gfns
* instead of uhvas.
+ *
+ * needs_unmap indicates whether this invalidation is because a gmem range went
+ * away (fallocate(FALLOC_FL_PUNCH_HOLE), error_remove_folio), in which case
+ * we must not call kvm_gmem_put_shared_pfn for it, or because of a memory
+ * attribute change, in which case the gmem pfn still exists, but simply
+ * is no longer mapped into the guest.
*/
-void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm, gfn_t start, gfn_t end)
+void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool needs_unmap)
{
struct gfn_to_pfn_cache *gpc;

@@ -78,14 +85,16 @@ void gfn_to_pfn_cache_invalidate_gfns_start(struct kvm *kvm, gfn_t start, gfn_t
continue;
}

- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+ if (!is_error_noslot_pfn(gpc->pfn) &&
gpa_to_gfn(gpc->gpa) >= start && gpa_to_gfn(gpc->gpa) < end) {
read_unlock_irq(&gpc->lock);

write_lock_irq(&gpc->lock);
- if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
- gpa_to_gfn(gpc->gpa) >= start && gpa_to_gfn(gpc->gpa) < end)
+ if (!is_error_noslot_pfn(gpc->pfn) &&
+ gpa_to_gfn(gpc->gpa) >= start && gpa_to_gfn(gpc->gpa) < end) {
gpc->valid = false;
+ gpc->needs_unmap = needs_unmap && gpc->private;
+ }
write_unlock_irq(&gpc->lock);
continue;
}
@@ -194,6 +203,9 @@ static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_s
*/
if (kvm->attribute_change_in_progress)
return true;
+
+ if (atomic_read_acquire(&kvm->gmem_active_invalidate_count))
+ return true;
/*
* Ensure mn_active_invalidate_count is read before
* mmu_invalidate_seq. This pairs with the smp_wmb() in
@@ -425,20 +437,28 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
* Some/all of the uhva, gpa, and memslot generation info may still be
* valid, leave it as is.
*/
+ unmap_old = gpc->needs_unmap;
if (ret) {
gpc->valid = false;
gpc->pfn = KVM_PFN_ERR_FAULT;
gpc->khva = NULL;
+ gpc->needs_unmap = false;
+ } else {
+ gpc->needs_unmap = true;
}

/* Detect a pfn change before dropping the lock! */
- unmap_old = (old_pfn != gpc->pfn);
+ unmap_old &= (old_pfn != gpc->pfn);

out_unlock:
+ if (unmap_old)
+ folio_get(pfn_folio(old_pfn));
write_unlock_irq(&gpc->lock);

- if (unmap_old)
+ if (unmap_old) {
gpc_unmap(old_pfn, old_khva, old_private);
+ folio_put(pfn_folio(old_pfn));
+ }

return ret;
}
@@ -530,6 +550,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
kvm_pfn_t old_pfn;
void *old_khva;
bool old_private;
+ bool old_needs_unmap;

guard(mutex)(&gpc->refresh_lock);

@@ -555,14 +576,25 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
old_private = gpc->private;
gpc->private = false;

+ old_needs_unmap = gpc->needs_unmap;
+ gpc->needs_unmap = false;
+
old_pfn = gpc->pfn;
gpc->pfn = KVM_PFN_ERR_FAULT;
+
+ if (old_needs_unmap && old_private)
+ folio_get(pfn_folio(old_pfn));
+
write_unlock_irq(&gpc->lock);

spin_lock(&kvm->gpc_lock);
list_del(&gpc->list);
spin_unlock(&kvm->gpc_lock);

- gpc_unmap(old_pfn, old_khva, old_private);
+ if (old_needs_unmap) {
+ gpc_unmap(old_pfn, old_khva, old_private);
+ if (old_private)
+ folio_put(pfn_folio(old_pfn));
+ }
}
}
--
2.46.0