[RFC PATCH 39/39] KVM: guest_memfd: Dynamically split/reconstruct HugeTLB page

From: Ackerley Tng
Date: Tue Sep 10 2024 - 19:58:29 EST


From: Vishal Annapurve <vannapurve@xxxxxxxxxx>

The faultability of a page is used to determine whether to split or
reconstruct a HugeTLB page.

If any page in a folio is faultable, split the folio. If none of the
pages in a folio are faultable, reconstruct the folio.

On truncation, always reconstruct and free regardless of
faultability (as long as a HugeTLB page's worth of pages is
truncated).

Co-developed-by: Vishal Annapurve <vannapurve@xxxxxxxxxx>
Signed-off-by: Vishal Annapurve <vannapurve@xxxxxxxxxx>
Co-developed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>

---
virt/kvm/guest_memfd.c | 678 +++++++++++++++++++++++++++--------------
1 file changed, 456 insertions(+), 222 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fb292e542381..0afc111099c0 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -99,6 +99,23 @@ static bool kvm_gmem_is_faultable(struct inode *inode, pgoff_t index)
return xa_to_value(xa_load(faultability, index)) == KVM_GMEM_FAULTABILITY_VALUE;
}

+/**
+ * Return true if any of the @nr_pages pages beginning at @index is allowed to
+ * be faulted in.
+ */
+static bool kvm_gmem_is_any_faultable(struct inode *inode, pgoff_t index,
+ int nr_pages)
+{
+ pgoff_t i;
+
+ for (i = index; i < index + nr_pages; ++i) {
+ if (kvm_gmem_is_faultable(inode, i))
+ return true;
+ }
+
+ return false;
+}
+
/**
* folio_file_pfn - like folio_file_page, but return a pfn.
* @folio: The folio which contains this index.
@@ -312,6 +329,40 @@ static int kvm_gmem_hugetlb_filemap_add_folio(struct address_space *mapping,
return 0;
}

+static inline void kvm_gmem_hugetlb_filemap_remove_folio(struct folio *folio)
+{
+ folio_lock(folio);
+
+ folio_clear_dirty(folio);
+ folio_clear_uptodate(folio);
+ filemap_remove_folio(folio);
+
+ folio_unlock(folio);
+}
+
+/*
+ * Locks a block of nr_pages (1 << huge_page_order(h)) pages within @mapping
+ * beginning at @index. Take either this or filemap_invalidate_lock() whenever
+ * the filemap is accessed.
+ */
+static u32 hugetlb_fault_mutex_lock(struct address_space *mapping, pgoff_t index)
+{
+ pgoff_t hindex;
+ u32 hash;
+
+ hindex = index >> huge_page_order(kvm_gmem_hgmem(mapping->host)->h);
+ hash = hugetlb_fault_mutex_hash(mapping, hindex);
+
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ return hash;
+}
+
+static void hugetlb_fault_mutex_unlock(u32 hash)
+{
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+}
+
struct kvm_gmem_split_stash {
struct {
unsigned long _flags_2;
@@ -394,15 +445,136 @@ static int kvm_gmem_hugetlb_reconstruct_folio(struct hstate *h, struct folio *fo
}

__folio_set_hugetlb(folio);
-
- folio_set_count(folio, 1);
+ hugetlb_folio_list_add(folio, &h->hugepage_activelist);

hugetlb_vmemmap_optimize_folio(h, folio);

+ folio_set_count(folio, 1);
+
return 0;
}

-/* Basically folio_set_order(folio, 1) without the checks. */
+/**
+ * Reconstruct a HugeTLB folio out of the 1 << huge_page_order(@h) pages
+ * beginning at @first_folio. Will clean up the subfolios from the filemap and
+ * add back the reconstructed folio. Folios to be reconstructed must not be
+ * locked, and the reconstructed folio will not be locked. Return 0 on success
+ * or negative error otherwise.
+ *
+ * hugetlb_fault_mutex_lock() has to be held when calling this function.
+ *
+ * Expects that before this call, the filemap's refcounts are the only refcounts
+ * for the folios in the filemap. After this function returns, the filemap's
+ * refcount will be the only refcount on the reconstructed folio.
+ */
+static int kvm_gmem_reconstruct_folio_in_filemap(struct hstate *h,
+ struct folio *first_folio)
+{
+ struct address_space *mapping;
+ struct folio_batch fbatch;
+ unsigned long end;
+ pgoff_t index;
+ pgoff_t next;
+ int ret;
+ int i;
+
+ if (folio_order(first_folio) == huge_page_order(h))
+ return 0;
+
+ index = first_folio->index;
+ mapping = first_folio->mapping;
+
+ next = index;
+ end = index + (1UL << huge_page_order(h));
+ folio_batch_init(&fbatch);
+ while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+ struct folio *folio;
+
+ folio = fbatch.folios[i];
+
+ /*
+ * Take a reference so the subfolio is not freed when it
+ * is removed from the filemap.
+ */
+ folio_get(folio);
+
+ kvm_gmem_hugetlb_filemap_remove_folio(folio);
+ }
+ folio_batch_release(&fbatch);
+ }
+
+ ret = kvm_gmem_hugetlb_reconstruct_folio(h, first_folio);
+ if (ret) {
+ /* TODO: handle cleanup properly. */
+ WARN_ON(ret);
+ return ret;
+ }
+
+ kvm_gmem_hugetlb_filemap_add_folio(mapping, first_folio, index,
+ htlb_alloc_mask(h));
+
+ folio_unlock(first_folio);
+ folio_put(first_folio);
+
+ return ret;
+}
+
+/**
+ * Reconstruct any HugeTLB folios in range [@start, @end), if none of the
+ * subfolios are faultable. Return 0 on success or negative error otherwise.
+ *
+ * Will skip any folios that are already reconstructed.
+ */
+static int kvm_gmem_try_reconstruct_folios_range(struct inode *inode,
+ pgoff_t start, pgoff_t end)
+{
+ unsigned int nr_pages;
+ pgoff_t aligned_start;
+ pgoff_t aligned_end;
+ struct hstate *h;
+ pgoff_t index;
+ int ret;
+
+ if (!is_kvm_gmem_hugetlb(inode))
+ return 0;
+
+ h = kvm_gmem_hgmem(inode)->h;
+ nr_pages = 1UL << huge_page_order(h);
+
+ aligned_start = round_up(start, nr_pages);
+ aligned_end = round_down(end, nr_pages);
+
+ ret = 0;
+ for (index = aligned_start; !ret && index < aligned_end; index += nr_pages) {
+ struct folio *folio;
+ u32 hash;
+
+ hash = hugetlb_fault_mutex_lock(inode->i_mapping, index);
+
+ folio = filemap_get_folio(inode->i_mapping, index);
+ if (!IS_ERR(folio)) {
+ /*
+ * Drop the refcount just taken: reconstruction expects the
+ * filemap's refcounts to be the only references on all the
+ * subfolios.
+ */
+ folio_put(folio);
+
+ /* Merge only when the entire block of nr_pages is not faultable. */
+ if (!kvm_gmem_is_any_faultable(inode, index, nr_pages)) {
+ ret = kvm_gmem_reconstruct_folio_in_filemap(h, folio);
+ WARN_ON(ret);
+ }
+ }
+
+ hugetlb_fault_mutex_unlock(hash);
+ }
+
+ return ret;
+}
+
+/* Basically folio_set_order() without the checks. */
static inline void kvm_gmem_folio_set_order(struct folio *folio, unsigned int order)
{
folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
@@ -414,8 +586,8 @@ static inline void kvm_gmem_folio_set_order(struct folio *folio, unsigned int or
/**
* Split a HugeTLB @folio of size huge_page_size(@h).
*
- * After splitting, each split folio has a refcount of 1. There are no checks on
- * refcounts before splitting.
+ * Folio must have refcount of 1 when this function is called. After splitting,
+ * each split folio has a refcount of 1.
*
* Return 0 on success and negative error otherwise.
*/
@@ -423,14 +595,18 @@ static int kvm_gmem_hugetlb_split_folio(struct hstate *h, struct folio *folio)
{
int ret;

+ VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio) != 1, folio);
+
+ folio_set_count(folio, 0);
+
ret = hugetlb_vmemmap_restore_folio(h, folio);
if (ret)
- return ret;
+ goto out;

ret = kvm_gmem_hugetlb_stash_metadata(folio);
if (ret) {
hugetlb_vmemmap_optimize_folio(h, folio);
- return ret;
+ goto out;
}

kvm_gmem_folio_set_order(folio, 0);
@@ -439,109 +615,183 @@ static int kvm_gmem_hugetlb_split_folio(struct hstate *h, struct folio *folio)
__folio_clear_hugetlb(folio);

/*
- * Remove the first folio from h->hugepage_activelist since it is no
+ * Remove the original folio from h->hugepage_activelist since it is no
* longer a HugeTLB page. The other split pages should not be on any
* lists.
*/
hugetlb_folio_list_del(folio);

- return 0;
+ ret = 0;
+out:
+ folio_set_count(folio, 1);
+ return ret;
}

-static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
- pgoff_t index)
+/**
+ * Split a HugeTLB folio into folio_nr_pages(@folio) pages. Will clean up
+ * @folio from the filemap and add back the split folios. @folio must not be
+ * locked, and none of the split folios will be locked. Return 0 on success or
+ * negative error otherwise.
+ *
+ * hugetlb_fault_mutex_lock() has to be held when calling this function.
+ *
+ * Expects that before this call, the filemap's refcounts are the only refcounts
+ * for the folio. After this function returns, the filemap's refcounts will be
+ * the only refcounts on the split folios.
+ */
+static int kvm_gmem_split_folio_in_filemap(struct hstate *h, struct folio *folio)
{
- struct folio *allocated_hugetlb_folio;
- pgoff_t hugetlb_first_subpage_index;
- struct page *hugetlb_first_subpage;
- struct kvm_gmem_hugetlb *hgmem;
- struct page *requested_page;
+ struct address_space *mapping;
+ struct page *first_subpage;
+ pgoff_t index;
int ret;
int i;

- hgmem = kvm_gmem_hgmem(inode);
- allocated_hugetlb_folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
- if (IS_ERR(allocated_hugetlb_folio))
- return allocated_hugetlb_folio;
+ if (folio_order(folio) == 0)
+ return 0;

- requested_page = folio_file_page(allocated_hugetlb_folio, index);
- hugetlb_first_subpage = folio_file_page(allocated_hugetlb_folio, 0);
- hugetlb_first_subpage_index = index & (huge_page_mask(hgmem->h) >> PAGE_SHIFT);
+ index = folio->index;
+ mapping = folio->mapping;

- ret = kvm_gmem_hugetlb_split_folio(hgmem->h, allocated_hugetlb_folio);
+ first_subpage = folio_page(folio, 0);
+
+ /*
+ * Take reference so that folio will not be released when removed from
+ * filemap.
+ */
+ folio_get(folio);
+
+ kvm_gmem_hugetlb_filemap_remove_folio(folio);
+
+ ret = kvm_gmem_hugetlb_split_folio(h, folio);
if (ret) {
- folio_put(allocated_hugetlb_folio);
- return ERR_PTR(ret);
+ WARN_ON(ret);
+ kvm_gmem_hugetlb_filemap_add_folio(mapping, folio, index,
+ htlb_alloc_mask(h));
+ folio_put(folio);
+ return ret;
}

- for (i = 0; i < pages_per_huge_page(hgmem->h); ++i) {
- struct folio *folio = page_folio(nth_page(hugetlb_first_subpage, i));
+ for (i = 0; i < pages_per_huge_page(h); ++i) {
+ struct folio *folio = page_folio(nth_page(first_subpage, i));

- ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping,
- folio,
- hugetlb_first_subpage_index + i,
- htlb_alloc_mask(hgmem->h));
+ ret = kvm_gmem_hugetlb_filemap_add_folio(mapping, folio,
+ index + i,
+ htlb_alloc_mask(h));
if (ret) {
/* TODO: handle cleanup properly. */
- pr_err("Handle cleanup properly index=%lx, ret=%d\n",
- hugetlb_first_subpage_index + i, ret);
- dump_page(nth_page(hugetlb_first_subpage, i), "check");
- return ERR_PTR(ret);
+ WARN_ON(ret);
+ return ret;
}

+ folio_unlock(folio);
+
/*
- * Skip unlocking for the requested index since
- * kvm_gmem_get_folio() returns a locked folio.
- *
- * Do folio_put() to drop the refcount that came with the folio,
- * from splitting the folio. Splitting the folio has a refcount
- * to be in line with hugetlb_alloc_folio(), which returns a
- * folio with refcount 1.
- *
- * Skip folio_put() for requested index since
- * kvm_gmem_get_folio() returns a folio with refcount 1.
+ * Drop reference so that the only remaining reference is the
+ * one held by the filemap.
*/
- if (hugetlb_first_subpage_index + i != index) {
- folio_unlock(folio);
- folio_put(folio);
- }
+ folio_put(folio);
}

+ return ret;
+}
+
+/*
+ * Allocates and then caches a folio in the filemap. Returns a folio with
+ * refcount of 2: 1 after allocation, and 1 taken by the filemap.
+ */
+static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
+ pgoff_t index)
+{
+ struct kvm_gmem_hugetlb *hgmem;
+ pgoff_t aligned_index;
+ struct folio *folio;
+ int nr_pages;
+ int ret;
+
+ hgmem = kvm_gmem_hgmem(inode);
+ folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
+ if (IS_ERR(folio))
+ return folio;
+
+ nr_pages = 1UL << huge_page_order(hgmem->h);
+ aligned_index = round_down(index, nr_pages);
+
+ ret = kvm_gmem_hugetlb_filemap_add_folio(inode->i_mapping, folio,
+ aligned_index,
+ htlb_alloc_mask(hgmem->h));
+ WARN_ON(ret);
+
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(hgmem->h);
spin_unlock(&inode->i_lock);

- return page_folio(requested_page);
+ return folio;
+}
+
+/**
+ * Split @folio if any of the subfolios are faultable. Returns the split
+ * (locked, refcount=2) folio at @index.
+ *
+ * Expects a locked folio with 1 refcount in addition to filemap's refcounts.
+ *
+ * After splitting, the subfolios in the filemap will be unlocked and have
+ * refcount 1 (other than the returned folio, which will be locked and have
+ * refcount 2).
+ */
+static struct folio *kvm_gmem_maybe_split_folio(struct folio *folio, pgoff_t index)
+{
+ pgoff_t aligned_index;
+ struct inode *inode;
+ struct hstate *h;
+ int nr_pages;
+ int ret;
+
+ inode = folio->mapping->host;
+ h = kvm_gmem_hgmem(inode)->h;
+ nr_pages = 1UL << huge_page_order(h);
+ aligned_index = round_down(index, nr_pages);
+
+ if (!kvm_gmem_is_any_faultable(inode, aligned_index, nr_pages))
+ return folio;
+
+ /* Drop lock and refcount in preparation for splitting. */
+ folio_unlock(folio);
+ folio_put(folio);
+
+ ret = kvm_gmem_split_folio_in_filemap(h, folio);
+ if (ret) {
+ kvm_gmem_hugetlb_filemap_remove_folio(folio);
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * At this point, the filemap has the only reference on the folio. Take
+ * lock and refcount on folio to align with kvm_gmem_get_folio().
+ */
+ return filemap_lock_folio(inode->i_mapping, index);
}

static struct folio *kvm_gmem_get_hugetlb_folio(struct inode *inode,
pgoff_t index)
{
- struct address_space *mapping;
struct folio *folio;
- struct hstate *h;
- pgoff_t hindex;
u32 hash;

- h = kvm_gmem_hgmem(inode)->h;
- hindex = index >> huge_page_order(h);
- mapping = inode->i_mapping;
-
- /* To lock, we calculate the hash using the hindex and not index. */
- hash = hugetlb_fault_mutex_hash(mapping, hindex);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hash = hugetlb_fault_mutex_lock(inode->i_mapping, index);

/*
- * The filemap is indexed with index and not hindex. Taking lock on
- * folio to align with kvm_gmem_get_regular_folio()
+ * The filemap is indexed with index and not hindex. Take lock on folio
+ * to align with kvm_gmem_get_regular_folio()
*/
- folio = filemap_lock_folio(mapping, index);
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (IS_ERR(folio))
+ folio = kvm_gmem_hugetlb_alloc_and_cache_folio(inode, index);
+
if (!IS_ERR(folio))
- goto out;
+ folio = kvm_gmem_maybe_split_folio(folio, index);

- folio = kvm_gmem_hugetlb_alloc_and_cache_folio(inode, index);
-out:
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_fault_mutex_unlock(hash);

return folio;
}
@@ -610,17 +860,6 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
}
}

-static inline void kvm_gmem_hugetlb_filemap_remove_folio(struct folio *folio)
-{
- folio_lock(folio);
-
- folio_clear_dirty(folio);
- folio_clear_uptodate(folio);
- filemap_remove_folio(folio);
-
- folio_unlock(folio);
-}
-
/**
* Removes folios in range [@lstart, @lend) from page cache/filemap (@mapping),
* returning the number of HugeTLB pages freed.
@@ -631,61 +870,30 @@ static int kvm_gmem_hugetlb_filemap_remove_folios(struct address_space *mapping,
struct hstate *h,
loff_t lstart, loff_t lend)
{
- const pgoff_t end = lend >> PAGE_SHIFT;
- pgoff_t next = lstart >> PAGE_SHIFT;
- LIST_HEAD(folios_to_reconstruct);
- struct folio_batch fbatch;
- struct folio *folio, *tmp;
- int num_freed = 0;
- int i;
-
- /*
- * TODO: Iterate over huge_page_size(h) blocks to avoid taking and
- * releasing hugetlb_fault_mutex_table[hash] lock so often. When
- * truncating, lstart and lend should be clipped to the size of this
- * guest_memfd file, otherwise there would be too many iterations.
- */
- folio_batch_init(&fbatch);
- while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); ++i) {
- struct folio *folio;
- pgoff_t hindex;
- u32 hash;
-
- folio = fbatch.folios[i];
+ loff_t offset;
+ int num_freed;

- hindex = folio->index >> huge_page_order(h);
- hash = hugetlb_fault_mutex_hash(mapping, hindex);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ num_freed = 0;
+ for (offset = lstart; offset < lend; offset += huge_page_size(h)) {
+ struct folio *folio;
+ pgoff_t index;
+ u32 hash;

- /*
- * Collect first pages of HugeTLB folios for
- * reconstruction later.
- */
- if ((folio->index & ~(huge_page_mask(h) >> PAGE_SHIFT)) == 0)
- list_add(&folio->lru, &folios_to_reconstruct);
+ index = offset >> PAGE_SHIFT;
+ hash = hugetlb_fault_mutex_lock(mapping, index);

- /*
- * Before removing from filemap, take a reference so
- * sub-folios don't get freed. Don't free the sub-folios
- * until after reconstruction.
- */
- folio_get(folio);
+ folio = filemap_get_folio(mapping, index);
+ if (!IS_ERR(folio)) {
+ /* Drop refcount so that the filemap holds the only reference. */
+ folio_put(folio);

+ kvm_gmem_reconstruct_folio_in_filemap(h, folio);
kvm_gmem_hugetlb_filemap_remove_folio(folio);

- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ num_freed++;
}
- folio_batch_release(&fbatch);
- cond_resched();
- }
-
- list_for_each_entry_safe(folio, tmp, &folios_to_reconstruct, lru) {
- kvm_gmem_hugetlb_reconstruct_folio(h, folio);
- hugetlb_folio_list_move(folio, &h->hugepage_activelist);

- folio_put(folio);
- num_freed++;
+ hugetlb_fault_mutex_unlock(hash);
}

return num_freed;
@@ -705,6 +913,10 @@ static void kvm_gmem_hugetlb_truncate_folios_range(struct inode *inode,
int gbl_reserve;
int num_freed;

+ /* No point in truncating beyond the inode size. */
+ lstart = min(lstart, inode->i_size);
+ lend = min(lend, inode->i_size);
+
hgmem = kvm_gmem_hgmem(inode);
h = hgmem->h;

@@ -1042,13 +1254,27 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
bool is_prepared;

inode = file_inode(vmf->vma->vm_file);
- if (!kvm_gmem_is_faultable(inode, vmf->pgoff))
+
+ /*
+ * Use filemap_invalidate_lock_shared() to make sure
+ * kvm_gmem_get_folio() doesn't race with faultability updates.
+ */
+ filemap_invalidate_lock_shared(inode->i_mapping);
+
+ if (!kvm_gmem_is_faultable(inode, vmf->pgoff)) {
+ filemap_invalidate_unlock_shared(inode->i_mapping);
return VM_FAULT_SIGBUS;
+ }

folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+
if (!folio)
return VM_FAULT_SIGBUS;

+ WARN(folio_test_hugetlb(folio), "should not be faulting in hugetlb folio=%p\n", folio);
+
is_prepared = folio_test_uptodate(folio);
if (!is_prepared) {
unsigned long nr_pages;
@@ -1731,8 +1957,6 @@ static bool kvm_gmem_no_mappings_range(struct inode *inode, pgoff_t start, pgoff
pgoff_t index;
bool checked_indices_unmapped;

- filemap_invalidate_lock_shared(inode->i_mapping);
-
/* TODO: replace iteration with filemap_get_folios() for efficiency. */
checked_indices_unmapped = true;
for (index = start; checked_indices_unmapped && index < end;) {
@@ -1754,98 +1978,130 @@ static bool kvm_gmem_no_mappings_range(struct inode *inode, pgoff_t start, pgoff
folio_put(folio);
}

- filemap_invalidate_unlock_shared(inode->i_mapping);
return checked_indices_unmapped;
}

/**
- * Returns true if pages in range [@start, @end) in memslot @slot have no
- * userspace mappings.
+ * Split any HugeTLB folios in range [@start, @end), if any of the offsets in
+ * the folio are faultable. Return 0 on success or negative error otherwise.
+ *
+ * Will skip any folios that are already split.
*/
-static bool kvm_gmem_no_mappings_slot(struct kvm_memory_slot *slot,
- gfn_t start, gfn_t end)
+static int kvm_gmem_try_split_folios_range(struct inode *inode,
+ pgoff_t start, pgoff_t end)
{
- pgoff_t offset_start;
- pgoff_t offset_end;
- struct file *file;
- bool ret;
-
- offset_start = start - slot->base_gfn + slot->gmem.pgoff;
- offset_end = end - slot->base_gfn + slot->gmem.pgoff;
-
- file = kvm_gmem_get_file(slot);
- if (!file)
- return false;
-
- ret = kvm_gmem_no_mappings_range(file_inode(file), offset_start, offset_end);
+ unsigned int nr_pages;
+ pgoff_t aligned_start;
+ pgoff_t aligned_end;
+ struct hstate *h;
+ pgoff_t index;
+ int ret;

- fput(file);
+ if (!is_kvm_gmem_hugetlb(inode))
+ return 0;

- return ret;
-}
+ h = kvm_gmem_hgmem(inode)->h;
+ nr_pages = 1UL << huge_page_order(h);

-/**
- * Returns true if pages in range [@start, @end) have no host userspace mappings.
- */
-static bool kvm_gmem_no_mappings(struct kvm *kvm, gfn_t start, gfn_t end)
-{
- int i;
+ aligned_start = round_down(start, nr_pages);
+ aligned_end = round_up(end, nr_pages);

- lockdep_assert_held(&kvm->slots_lock);
+ ret = 0;
+ for (index = aligned_start; !ret && index < aligned_end; index += nr_pages) {
+ struct folio *folio;
+ u32 hash;

- for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
- struct kvm_memslot_iter iter;
- struct kvm_memslots *slots;
+ hash = hugetlb_fault_mutex_lock(inode->i_mapping, index);

- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
- struct kvm_memory_slot *slot;
- gfn_t gfn_start;
- gfn_t gfn_end;
-
- slot = iter.slot;
- gfn_start = max(start, slot->base_gfn);
- gfn_end = min(end, slot->base_gfn + slot->npages);
+ folio = filemap_get_folio(inode->i_mapping, index);
+ if (!IS_ERR(folio)) {
+ /*
+ * Drop refcount so that the only references held are refcounts
+ * from the filemap.
+ */
+ folio_put(folio);

- if (iter.slot->flags & KVM_MEM_GUEST_MEMFD &&
- !kvm_gmem_no_mappings_slot(iter.slot, gfn_start, gfn_end))
- return false;
+ if (kvm_gmem_is_any_faultable(inode, index, nr_pages)) {
+ ret = kvm_gmem_split_folio_in_filemap(h, folio);
+ if (ret) {
+ /* TODO cleanup properly. */
+ WARN_ON(ret);
+ }
+ }
}
+
+ hugetlb_fault_mutex_unlock(hash);
}

- return true;
+ return ret;
}

/**
- * Set faultability of given range of gfns [@start, @end) in memslot @slot to
- * @faultable.
+ * Returns 0 if guest_memfd permits setting range [@start, @end) with
+ * faultability @faultable within memslot @slot, or negative error otherwise.
+ *
+ * If a request was made to set the memory to PRIVATE (not faultable), the pages
+ * in the range must not be pinned or mapped for the request to be permitted.
+ *
+ * Because this may allow pages to be faulted in to userspace when requested to
+ * set attributes to shared, this must only be called after the pages have been
+ * invalidated from guest page tables.
*/
-static void kvm_gmem_set_faultable_slot(struct kvm_memory_slot *slot, gfn_t start,
- gfn_t end, bool faultable)
+static int kvm_gmem_try_set_faultable_slot(struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end,
+ bool faultable)
{
pgoff_t start_offset;
+ struct inode *inode;
pgoff_t end_offset;
struct file *file;
+ int ret;

file = kvm_gmem_get_file(slot);
if (!file)
- return;
+ return 0;

start_offset = start - slot->base_gfn + slot->gmem.pgoff;
end_offset = end - slot->base_gfn + slot->gmem.pgoff;

- WARN_ON(kvm_gmem_set_faultable(file_inode(file), start_offset, end_offset,
- faultable));
+ inode = file_inode(file);
+
+ /*
+ * Take filemap_invalidate_lock() to make sure splitting/reconstruction
+ * doesn't race with faultability updates.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+
+ kvm_gmem_set_faultable(inode, start_offset, end_offset, faultable);
+
+ if (faultable) {
+ ret = kvm_gmem_try_split_folios_range(inode, start_offset,
+ end_offset);
+ } else {
+ if (kvm_gmem_no_mappings_range(inode, start_offset, end_offset)) {
+ ret = kvm_gmem_try_reconstruct_folios_range(inode,
+ start_offset,
+ end_offset);
+ } else {
+ ret = -EINVAL;
+ }
+ }
+
+ filemap_invalidate_unlock(inode->i_mapping);

fput(file);
+
+ return ret;
}

/**
- * Set faultability of given range of gfns [@start, @end) in memslot @slot to
- * @faultable.
+ * Returns 0 if guest_memfd permits setting range [@start, @end) with
+ * faultability @faultable within VM @kvm, or negative error otherwise.
+ *
+ * See kvm_gmem_try_set_faultable_slot() for details.
*/
-static void kvm_gmem_set_faultable_vm(struct kvm *kvm, gfn_t start, gfn_t end,
- bool faultable)
+static int kvm_gmem_try_set_faultable_vm(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool faultable)
{
int i;

@@ -1866,43 +2122,15 @@ static void kvm_gmem_set_faultable_vm(struct kvm *kvm, gfn_t start, gfn_t end,
gfn_end = min(end, slot->base_gfn + slot->npages);

if (iter.slot->flags & KVM_MEM_GUEST_MEMFD) {
- kvm_gmem_set_faultable_slot(slot, gfn_start,
- gfn_end, faultable);
+ int ret;
+
+ ret = kvm_gmem_try_set_faultable_slot(slot, gfn_start,
+ gfn_end, faultable);
+ if (ret)
+ return ret;
}
}
}
-}
-
-/**
- * Returns true if guest_memfd permits setting range [@start, @end) to PRIVATE.
- *
- * If memory is faulted in to host userspace and a request was made to set the
- * memory to PRIVATE, the faulted in pages must not be pinned for the request to
- * be permitted.
- */
-static int kvm_gmem_should_set_attributes_private(struct kvm *kvm, gfn_t start,
- gfn_t end)
-{
- kvm_gmem_set_faultable_vm(kvm, start, end, false);
-
- if (kvm_gmem_no_mappings(kvm, start, end))
- return 0;
-
- kvm_gmem_set_faultable_vm(kvm, start, end, true);
- return -EINVAL;
-}
-
-/**
- * Returns true if guest_memfd permits setting range [@start, @end) to SHARED.
- *
- * Because this allows pages to be faulted in to userspace, this must only be
- * called after the pages have been invalidated from guest page tables.
- */
-static int kvm_gmem_should_set_attributes_shared(struct kvm *kvm, gfn_t start,
- gfn_t end)
-{
- /* Always okay to set shared, hence set range faultable here. */
- kvm_gmem_set_faultable_vm(kvm, start, end, true);

return 0;
}
@@ -1922,10 +2150,16 @@ static int kvm_gmem_should_set_attributes_shared(struct kvm *kvm, gfn_t start,
int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
unsigned long attrs)
{
- if (attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE)
- return kvm_gmem_should_set_attributes_private(kvm, start, end);
- else
- return kvm_gmem_should_set_attributes_shared(kvm, start, end);
+ bool faultable;
+ int ret;
+
+ faultable = !(attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE);
+
+ ret = kvm_gmem_try_set_faultable_vm(kvm, start, end, faultable);
+ if (ret)
+ WARN_ON(kvm_gmem_try_set_faultable_vm(kvm, start, end, !faultable));
+
+ return ret;
}

#endif
--
2.46.0.598.g6f2099f65c-goog