[RFC PATCH 26/39] KVM: guest_memfd: Track faultability within a struct kvm_gmem_private

From: Ackerley Tng
Date: Tue Sep 10 2024 - 19:54:02 EST


The faultability xarray is stored on the inode since faultability is a
property of the guest_memfd's memory contents.

In this RFC, presence of an entry in the xarray indicates faultable,
but this could be flipped so that presence indicates unfaultable. For
flexibility, a special value "FAULT" is used instead of a simple
boolean.

However, at some stages of a VM's lifecycle there could be more
private pages, and at other stages there could be more shared pages.

This is likely to be replaced by a better data structure in a future
revision to better support ranges.

Also, struct kvm_gmem_hugetlb is now stored as a pointer within struct
kvm_gmem_inode_private, which is kept at inode->i_mapping->i_private_data.

Co-developed-by: Fuad Tabba <tabba@xxxxxxxxxx>
Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
Co-developed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
Co-developed-by: Vishal Annapurve <vannapurve@xxxxxxxxxx>
Signed-off-by: Vishal Annapurve <vannapurve@xxxxxxxxxx>

---
virt/kvm/guest_memfd.c | 105 ++++++++++++++++++++++++++++++++++++-----
1 file changed, 94 insertions(+), 11 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 8151df2c03e5..b603518f7b62 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -26,11 +26,21 @@ struct kvm_gmem_hugetlb {
struct hugepage_subpool *spool;
};

-static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
+struct kvm_gmem_inode_private {
+ struct xarray faultability;
+ struct kvm_gmem_hugetlb *hgmem;
+};
+
+static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
{
return inode->i_mapping->i_private_data;
}

+static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
+{
+ return kvm_gmem_private(inode)->hgmem;
+}
+
static bool is_kvm_gmem_hugetlb(struct inode *inode)
{
u64 flags = (u64)inode->i_private;
@@ -38,6 +48,57 @@ static bool is_kvm_gmem_hugetlb(struct inode *inode)
return flags & KVM_GUEST_MEMFD_HUGETLB;
}

+#define KVM_GMEM_FAULTABILITY_VALUE 0x4641554c54 /* FAULT */
+
+/**
+ * Set faultability of given range of inode indices [@start, @end) to
+ * @faultable. Return 0 if attributes were successfully updated or negative
+ * errno on error.
+ */
+static int kvm_gmem_set_faultable(struct inode *inode, pgoff_t start, pgoff_t end,
+ bool faultable)
+{
+ struct xarray *faultability;
+ void *val;
+ pgoff_t i;
+
+ /*
+ * The expectation is that fewer pages are faultable, hence to save
+ * memory, entries are created for faultable pages as opposed to
+ * creating entries for non-faultable pages.
+ */
+ val = faultable ? xa_mk_value(KVM_GMEM_FAULTABILITY_VALUE) : NULL;
+ faultability = &kvm_gmem_private(inode)->faultability;
+
+ /*
+ * TODO replace this with something else (maybe interval
+ * tree?). store_range doesn't quite do what we expect if overlapping
+ * ranges are specified: if we store_range(5, 10, val) and then
+ * store_range(7, 12, NULL), the entire range [5, 12] will be NULL. For
+ * now, use the slower xa_store() to store individual entries on indices
+ * to avoid this.
+ */
+ for (i = start; i < end; i++) {
+ int r;
+
+ r = xa_err(xa_store(faultability, i, val, GFP_KERNEL_ACCOUNT));
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+/**
+ * Return true if the page at @index is allowed to be faulted in.
+ */
+static bool kvm_gmem_is_faultable(struct inode *inode, pgoff_t index)
+{
+ struct xarray *faultability = &kvm_gmem_private(inode)->faultability;
+
+ return xa_to_value(xa_load(faultability, index)) == KVM_GMEM_FAULTABILITY_VALUE;
+}
+
/**
* folio_file_pfn - like folio_file_page, but return a pfn.
* @folio: The folio which contains this index.
@@ -895,11 +956,21 @@ static void kvm_gmem_hugetlb_teardown(struct inode *inode)

static void kvm_gmem_evict_inode(struct inode *inode)
{
+ struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
+
+ /*
+ * .evict_inode can be called before faultability is set up if there are
+ * issues during inode creation.
+ */
+ if (private)
+ xa_destroy(&private->faultability);
+
if (is_kvm_gmem_hugetlb(inode))
kvm_gmem_hugetlb_teardown(inode);
else
truncate_inode_pages_final(inode->i_mapping);

+ kfree(private);
clear_inode(inode);
}

@@ -1028,7 +1099,9 @@ static const struct inode_operations kvm_gmem_iops = {
.setattr = kvm_gmem_setattr,
};

-static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
+static int kvm_gmem_hugetlb_setup(struct inode *inode,
+ struct kvm_gmem_inode_private *private,
+ loff_t size, u64 flags)
{
struct kvm_gmem_hugetlb *hgmem;
struct hugepage_subpool *spool;
@@ -1036,6 +1109,10 @@ static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
struct hstate *h;
long hpages;

+ hgmem = kzalloc(sizeof(*hgmem), GFP_KERNEL);
+ if (!hgmem)
+ return -ENOMEM;
+
page_size_log = (flags >> KVM_GUEST_MEMFD_HUGE_SHIFT) & KVM_GUEST_MEMFD_HUGE_MASK;
h = hstate_sizelog(page_size_log);

@@ -1046,21 +1123,16 @@ static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
if (!spool)
goto err;

- hgmem = kzalloc(sizeof(*hgmem), GFP_KERNEL);
- if (!hgmem)
- goto err_subpool;
-
inode->i_blkbits = huge_page_shift(h);

hgmem->h = h;
hgmem->spool = spool;
- inode->i_mapping->i_private_data = hgmem;

+ private->hgmem = hgmem;
return 0;

-err_subpool:
- kfree(spool);
err:
+ kfree(hgmem);
return -ENOMEM;
}

@@ -1068,6 +1140,7 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
loff_t size, u64 flags)
{
const struct qstr qname = QSTR_INIT(name, strlen(name));
+ struct kvm_gmem_inode_private *private;
struct inode *inode;
int err;

@@ -1079,12 +1152,20 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
if (err)
goto out;

+ err = -ENOMEM;
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
+ if (!private)
+ goto out;
+
if (flags & KVM_GUEST_MEMFD_HUGETLB) {
- err = kvm_gmem_hugetlb_setup(inode, size, flags);
+ err = kvm_gmem_hugetlb_setup(inode, private, size, flags);
if (err)
- goto out;
+ goto free_private;
}

+ xa_init(&private->faultability);
+ inode->i_mapping->i_private_data = private;
+
inode->i_private = (void *)(unsigned long)flags;
inode->i_op = &kvm_gmem_iops;
inode->i_mapping->a_ops = &kvm_gmem_aops;
@@ -1097,6 +1178,8 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,

return inode;

+free_private:
+ kfree(private);
out:
iput(inode);

--
2.46.0.598.g6f2099f65c-goog