[RFC PATCH 29/39] KVM: Handle conversions in the SET_MEMORY_ATTRIBUTES ioctl

From: Ackerley Tng
Date: Tue Sep 10 2024 - 19:55:11 EST


The key steps for a private to shared conversion are:

1. Unmap from guest page tables
2. Set pages associated with requested range in memslot to be
faultable
3. Update kvm->mem_attr_array

The key steps for a shared to private conversion are:

1. Check and disallow set_memory_attributes if any page in the range
is still mapped or pinned, by
a. Updating guest_memfd's faultability to prevent future faulting
b. Returning -EINVAL (after restoring faultability) if any pages are
still mapped or pinned.
2. Update kvm->mem_attr_array

Userspace VMM must ensure shared pages are not in use, since any
faults racing with this call will get a SIGBUS.

Co-developed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
Co-developed-by: Vishal Annapurve <vannapurve@xxxxxxxxxx>
Signed-off-by: Vishal Annapurve <vannapurve@xxxxxxxxxx>

---
include/linux/kvm_host.h | 1 +
virt/kvm/guest_memfd.c | 207 +++++++++++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 15 +++
virt/kvm/kvm_mm.h | 9 ++
4 files changed, 232 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79a6b1a63027..10993cd33e34 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2476,6 +2476,7 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,

long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque);
+
#endif

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 1d4dfe0660ad..110c4bbb004b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1592,4 +1592,211 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
+
+/**
+ * kvm_gmem_no_mappings_range() - Check a page-offset range for mapped or
+ *				  possibly pinned folios.
+ * @inode: inode whose page cache (i_mapping) is inspected.
+ * @start: first page offset to check (inclusive).
+ * @end: page offset at which to stop (exclusive).
+ *
+ * Walks [@start, @end) with the mapping's invalidate lock held in shared
+ * mode, locking each folio while it is examined.  Offsets for which
+ * filemap_lock_folio() returns an error (e.g. nothing cached there) are
+ * skipped one index at a time.  The walk stops at the first folio that is
+ * mapped or may be DMA-pinned.
+ *
+ * Return: true if no folio found in the range is mapped
+ * (folio_mapped()) or possibly pinned (folio_maybe_dma_pinned()), false
+ * otherwise.
+ */
+static bool kvm_gmem_no_mappings_range(struct inode *inode, pgoff_t start,
+				       pgoff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index = start;
+	bool unmapped = true;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	/* TODO: replace iteration with filemap_get_folios() for efficiency. */
+	while (unmapped && index < end) {
+		struct folio *folio;
+
+		/* Look up only; kvm_gmem_get_folio would allocate. */
+		folio = filemap_lock_folio(mapping, index);
+		if (IS_ERR(folio)) {
+			index++;
+			continue;
+		}
+
+		if (folio_mapped(folio) || folio_maybe_dma_pinned(folio))
+			unmapped = false;
+		else
+			index = folio_next_index(folio);
+
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	filemap_invalidate_unlock_shared(mapping);
+	return unmapped;
+}
+
+/**
+ * kvm_gmem_no_mappings_slot() - Check a gfn range of one memslot for mapped
+ *				 or possibly pinned pages.
+ * @slot: memslot that the gfn range belongs to.
+ * @start: first gfn to check (inclusive).
+ * @end: gfn at which to stop (exclusive).
+ *
+ * Converts the gfn range into page offsets of the file returned by
+ * kvm_gmem_get_file() for @slot and defers to kvm_gmem_no_mappings_range().
+ *
+ * Return: result of kvm_gmem_no_mappings_range() for the translated range,
+ * or false if no file could be obtained for @slot.
+ */
+static bool kvm_gmem_no_mappings_slot(struct kvm_memory_slot *slot,
+				      gfn_t start, gfn_t end)
+{
+	struct file *file = kvm_gmem_get_file(slot);
+	pgoff_t first;
+	pgoff_t last;
+	bool unmapped;
+
+	if (!file)
+		return false;
+
+	/* gfn -> page offset: rebase on the slot, then add the slot's pgoff. */
+	first = slot->gmem.pgoff + (start - slot->base_gfn);
+	last = slot->gmem.pgoff + (end - slot->base_gfn);
+
+	unmapped = kvm_gmem_no_mappings_range(file_inode(file), first, last);
+
+	fput(file);
+	return unmapped;
+}
+
+/**
+ * kvm_gmem_no_mappings() - Check a gfn range of a VM for mapped or possibly
+ *			    pinned guest_memfd pages.
+ * @kvm: VM whose memslots are searched.
+ * @start: first gfn to check (inclusive).
+ * @end: gfn at which to stop (exclusive).
+ *
+ * Visits, in every memslot address space, each memslot overlapping
+ * [@start, @end).  Memslots without KVM_MEM_GUEST_MEMFD are ignored; for the
+ * others the requested range is clamped to the memslot before it is checked.
+ *
+ * Context: the caller must hold kvm->slots_lock.
+ *
+ * Return: true if kvm_gmem_no_mappings_slot() succeeds for every overlapping
+ * guest_memfd memslot, false as soon as one of them fails.
+ */
+static bool kvm_gmem_no_mappings(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	int as_id;
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	for (as_id = 0; as_id < kvm_arch_nr_memslot_as_ids(kvm); as_id++) {
+		struct kvm_memslots *slots = __kvm_memslots(kvm, as_id);
+		struct kvm_memslot_iter iter;
+
+		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+			struct kvm_memory_slot *slot = iter.slot;
+			gfn_t clamped_start;
+			gfn_t clamped_end;
+
+			if (!(slot->flags & KVM_MEM_GUEST_MEMFD))
+				continue;
+
+			clamped_start = max(start, slot->base_gfn);
+			clamped_end = min(end, slot->base_gfn + slot->npages);
+
+			if (!kvm_gmem_no_mappings_slot(slot, clamped_start,
+						       clamped_end))
+				return false;
+		}
+	}
+
+	return true;
+}
+
+/**
+ * kvm_gmem_set_faultable_slot() - Set faultability for a gfn range of one
+ *				   memslot.
+ * @slot: memslot that the gfn range belongs to.
+ * @start: first gfn to update (inclusive).
+ * @end: gfn at which to stop (exclusive).
+ * @faultable: faultability value passed on to kvm_gmem_set_faultable().
+ *
+ * Converts the gfn range into page offsets of the file returned by
+ * kvm_gmem_get_file() for @slot and applies @faultable to that range.  Does
+ * nothing if no file could be obtained.  A non-zero result from
+ * kvm_gmem_set_faultable() triggers a WARN and is otherwise not reported to
+ * the caller.
+ */
+static void kvm_gmem_set_faultable_slot(struct kvm_memory_slot *slot, gfn_t start,
+					gfn_t end, bool faultable)
+{
+	struct file *file = kvm_gmem_get_file(slot);
+	pgoff_t first;
+	pgoff_t last;
+
+	if (!file)
+		return;
+
+	/* gfn -> page offset: rebase on the slot, then add the slot's pgoff. */
+	first = slot->gmem.pgoff + (start - slot->base_gfn);
+	last = slot->gmem.pgoff + (end - slot->base_gfn);
+
+	WARN_ON(kvm_gmem_set_faultable(file_inode(file), first, last,
+				       faultable));
+
+	fput(file);
+}
+
+/**
+ * kvm_gmem_set_faultable_vm() - Set faultability for a gfn range across all
+ *				 guest_memfd memslots of a VM.
+ * @kvm: VM whose memslots are updated.
+ * @start: first gfn to update (inclusive).
+ * @end: gfn at which to stop (exclusive).
+ * @faultable: faultability value to apply.
+ *
+ * Visits, in every memslot address space, each memslot overlapping
+ * [@start, @end).  Memslots without KVM_MEM_GUEST_MEMFD are ignored; for the
+ * others the requested range is clamped to the memslot and handed to
+ * kvm_gmem_set_faultable_slot().
+ *
+ * Context: the caller must hold kvm->slots_lock.
+ */
+static void kvm_gmem_set_faultable_vm(struct kvm *kvm, gfn_t start, gfn_t end,
+				      bool faultable)
+{
+	int as_id;
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	for (as_id = 0; as_id < kvm_arch_nr_memslot_as_ids(kvm); as_id++) {
+		struct kvm_memslots *slots = __kvm_memslots(kvm, as_id);
+		struct kvm_memslot_iter iter;
+
+		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+			struct kvm_memory_slot *slot = iter.slot;
+			gfn_t clamped_start;
+			gfn_t clamped_end;
+
+			if (!(slot->flags & KVM_MEM_GUEST_MEMFD))
+				continue;
+
+			clamped_start = max(start, slot->base_gfn);
+			clamped_end = min(end, slot->base_gfn + slot->npages);
+
+			kvm_gmem_set_faultable_slot(slot, clamped_start,
+						    clamped_end, faultable);
+		}
+	}
+}
+
+/**
+ * kvm_gmem_should_set_attributes_private() - Check whether a gfn range may be
+ *					      converted to PRIVATE.
+ * @kvm: VM that owns the range.
+ * @start: first gfn of the range (inclusive).
+ * @end: end of the range (exclusive).
+ *
+ * Marks the range unfaultable and then verifies that none of its guest_memfd
+ * pages is still mapped or possibly pinned.  If the check fails, the range is
+ * marked faultable again before the error is returned.
+ *
+ * Return: 0 if the conversion is permitted (the range is left unfaultable),
+ * -EINVAL if any page in the range is still mapped or possibly pinned.
+ */
+static int kvm_gmem_should_set_attributes_private(struct kvm *kvm, gfn_t start,
+						  gfn_t end)
+{
+	/* Stop future faults into the range before looking for mappings. */
+	kvm_gmem_set_faultable_vm(kvm, start, end, false);
+
+	if (!kvm_gmem_no_mappings(kvm, start, end)) {
+		/*
+		 * Conversion refused: make the range faultable again.
+		 * NOTE(review): this marks the whole range faultable
+		 * regardless of its state before the call - confirm this is
+		 * intended for ranges that were already partly private.
+		 */
+		kvm_gmem_set_faultable_vm(kvm, start, end, true);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * kvm_gmem_should_set_attributes_shared() - Prepare a gfn range for
+ * conversion to SHARED.
+ * @kvm: VM that owns the range.
+ * @start: first gfn of the range (inclusive).
+ * @end: end of the range (exclusive).
+ *
+ * Marks the range faultable.  Because this allows pages to be faulted in to
+ * userspace, this must only be called after the pages have been invalidated
+ * from guest page tables.
+ *
+ * Return: always 0; a conversion to SHARED is never refused here.
+ */
+static int kvm_gmem_should_set_attributes_shared(struct kvm *kvm, gfn_t start,
+ gfn_t end)
+{
+ /* Always okay to set shared, hence set range faultable here. */
+ kvm_gmem_set_faultable_vm(kvm, start, end, true);
+
+ return 0;
+}
+
+/**
+ * kvm_gmem_should_set_attributes() - Ask guest_memfd whether memory
+ *				      attributes may be set on a gfn range.
+ * @kvm: VM that owns the range.
+ * @start: first gfn of the range (inclusive).
+ * @end: end of the range (exclusive).
+ * @attrs: requested attributes; only KVM_MEMORY_ATTRIBUTE_PRIVATE is
+ *	   examined here.
+ *
+ * A request with KVM_MEMORY_ATTRIBUTE_PRIVATE set is treated as a conversion
+ * to private and is refused if any page in the range is still mapped or
+ * possibly pinned.  Any other request is treated as a conversion to shared,
+ * which makes the range faultable; hence this must only be called after the
+ * pages have been invalidated from guest page tables.
+ *
+ * Return: 0 if guest_memfd permits setting @attrs on [@start, @end), a
+ * negative error code otherwise.
+ */
+int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+				   unsigned long attrs)
+{
+	bool to_private = attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+	if (!to_private)
+		return kvm_gmem_should_set_attributes_shared(kvm, start, end);
+
+	return kvm_gmem_should_set_attributes_private(kvm, start, end);
+}
+
#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 92901656a0d4..1a7bbcc31b7e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2524,6 +2524,13 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
.on_lock = kvm_mmu_invalidate_end,
.may_block = true,
};
+ struct kvm_mmu_notifier_range error_set_range = {
+ .start = start,
+ .end = end,
+ .handler = (void *)kvm_null_fn,
+ .on_lock = kvm_mmu_invalidate_end,
+ .may_block = true,
+ };
unsigned long i;
void *entry;
int r = 0;
@@ -2548,6 +2555,10 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,

kvm_handle_gfn_range(kvm, &pre_set_range);

+ r = kvm_gmem_should_set_attributes(kvm, start, end, attributes);
+ if (r)
+ goto err;
+
for (i = start; i < end; i++) {
r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
GFP_KERNEL_ACCOUNT));
@@ -2560,6 +2571,10 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
mutex_unlock(&kvm->slots_lock);

return r;
+
+err:
+ kvm_handle_gfn_range(kvm, &error_set_range);
+ goto out_unlock;
}
static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
struct kvm_memory_attributes *attrs)
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01..d8ff2b380d0e 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -41,6 +41,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned int fd, loff_t offset);
void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+ unsigned long attrs);
#else
static inline void kvm_gmem_init(struct module *module)
{
@@ -59,6 +61,13 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
WARN_ON_ONCE(1);
}
+/*
+ * Stub for builds without CONFIG_KVM_PRIVATE_MEM: always returns 0, so the
+ * caller never sees an objection to the requested attributes.
+ */
+static inline int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start,
+ gfn_t end, unsigned long attrs)
+{
+ return 0;
+}
+
#endif /* CONFIG_KVM_PRIVATE_MEM */

#endif /* __KVM_MM_H__ */
--
2.46.0.598.g6f2099f65c-goog