[RFC PATCH kernel] iommufd: Allow mapping from KVM's guest_memfd
From: Alexey Kardashevskiy
Date: Wed Feb 25 2026 - 02:53:56 EST
CoCo VMs get their private memory allocated from guest_memfd
("gmemfd") which is a KVM facility similar to memfd.
A guest_memfd does not allow mapping private memory into userspace,
so the IOMMU_IOAS_MAP ioctl does not work.
Use the existing IOMMU_IOAS_MAP_FILE ioctl to allow mapping from
fd + offset. Detect the gmemfd case in pfn_reader_user_pin().
For the new guest_memfd type, no additional reference is taken as
pinning is guaranteed by the KVM guest_memfd library.
There is no KVM-GMEMFD->IOMMUFD direct notification mechanism as
the assumption is that:
1) page state change events will be handled by the VMM, which is going
to call IOMMUFD to remap pages;
2) shrinking a guest_memfd is equivalent to VM memory unplug, and the
VMM is going to handle it.
Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxx>
---
This is for Trusted IO == TEE-IO == PCIe TDISP, etc.
Previously posted here:
https://lore.kernel.org/r/20250218111017.491719-13-aik@xxxxxxx
The main comment was "what is the lifetime of those folios?", and
guest_memfd + QEMU should take care of it.
And horrendous stuff like this is not really useful:
https://github.com/AMDESE/linux-kvm/commit/7d73fd2cccb8489b1
---
include/linux/kvm_host.h | 4 +
drivers/iommu/iommufd/pages.c | 80 ++++++++++++++++++--
virt/kvm/guest_memfd.c | 36 +++++++++
3 files changed, 113 insertions(+), 7 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 995db7a7ba57..9369cf22b24e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2673,4 +2673,8 @@ unsigned long kvm_get_vm_memory_attributes(struct kvm *kvm, gfn_t gfn);
int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
struct kvm_memory_attributes2 *attrs);
+bool kvm_is_gmemfd(struct file *file);
+struct folio *kvm_gmemfd_get_pfn(struct file *file, unsigned long index,
+ unsigned long *pfn, int *max_order);
+
#endif
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index dbe51ecb9a20..4c07e39e17d0 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -56,6 +56,9 @@
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/vfio_pci_core.h>
+#include <linux/pagemap.h>
+#include <linux/memcontrol.h>
+#include <linux/kvm_host.h>
#include "double_span.h"
#include "io_pagetable.h"
@@ -660,7 +663,8 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
}
static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
- unsigned long *offset_p, unsigned long npages)
+ unsigned long *offset_p, unsigned long npages,
+ bool do_pin)
{
int rc = 0;
struct folio **folios = *folios_p;
@@ -676,7 +680,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY))
break;
- if (nr > 1) {
+ if (nr > 1 && do_pin) {
rc = folio_add_pins(folio, nr - 1);
if (rc) {
batch_remove_pfn_num(batch, nr);
@@ -697,6 +701,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p,
static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
unsigned int first_page_off, size_t npages)
{
+ bool do_unpin = !kvm_is_gmemfd(pages->file);
unsigned int cur = 0;
while (first_page_off) {
@@ -710,9 +715,12 @@ static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
size_t to_unpin = min_t(size_t, npages,
batch->npfns[cur] - first_page_off);
- unpin_user_page_range_dirty_lock(
- pfn_to_page(batch->pfns[cur] + first_page_off),
- to_unpin, pages->writable);
+ /* Do nothing for guest_memfd */
+ if (do_unpin)
+ unpin_user_page_range_dirty_lock(
+ pfn_to_page(batch->pfns[cur] + first_page_off),
+ to_unpin, pages->writable);
+
iopt_pages_sub_npinned(pages, to_unpin);
cur++;
first_page_off = 0;
@@ -872,6 +880,57 @@ static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start,
return npages_out;
}
+/*
+ * Pin @npages pages of the guest_memfd behind user->file, starting at
+ * byte offset @start.
+ *
+ * Fills user->ufolios[] with one entry per folio covered and, when
+ * user->upages is set, user->upages[] with one entry per page.  No
+ * extra reference is taken on the folios: the guest_memfd inode keeps
+ * them alive (see kvm_gmemfd_get_pfn()).
+ *
+ * Returns @npages on success or a negative errno on failure.
+ */
+static long pin_guest_memfd_pages(struct pfn_reader_user *user, loff_t start,
+				  unsigned long npages)
+{
+	struct page **upages = user->upages;
+	unsigned long first_pgoff = 0;
+	loff_t uptr = start;
+	long rc = 0;
+
+	for (unsigned long i = 0; (uptr - start) < (npages << PAGE_SHIFT); ++i) {
+		unsigned long pfn = 0, pgoff, npg;
+		int max_order = 0;
+		struct folio *folio;
+
+		folio = kvm_gmemfd_get_pfn(user->file, uptr >> PAGE_SHIFT,
+					   &pfn, &max_order);
+		/* Also catch a NULL return: IS_ERR(NULL) is false */
+		if (IS_ERR_OR_NULL(folio)) {
+			rc = folio ? PTR_ERR(folio) : -ENOENT;
+			break;
+		}
+
+		/* Page (not byte) offset of uptr within this folio */
+		pgoff = offset_in_folio(folio, uptr) >> PAGE_SHIFT;
+		if (i == 0)
+			first_pgoff = pgoff;
+
+		user->ufolios[i] = folio;
+
+		/*
+		 * Pages this folio contributes, clamped so that a large
+		 * trailing folio cannot overrun the upages[] array.
+		 */
+		npg = (1UL << max_order) - pgoff;
+		npg = min_t(unsigned long, npg,
+			    npages - ((uptr - start) >> PAGE_SHIFT));
+
+		if (upages) {
+			for (unsigned long j = 0; j < npg; ++j)
+				*upages++ = folio_page(folio, pgoff + j);
+		}
+
+		uptr += npg << PAGE_SHIFT;
+	}
+
+	if (!rc) {
+		rc = npages;
+		user->ufolios_next = user->ufolios;
+		user->ufolios_offset = first_pgoff;
+	}
+
+	return rc;
+}
+
static int pfn_reader_user_pin(struct pfn_reader_user *user,
struct iopt_pages *pages,
unsigned long start_index,
@@ -925,7 +984,13 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
if (user->file) {
start = pages->start + (start_index * PAGE_SIZE);
- rc = pin_memfd_pages(user, start, npages);
+ if (kvm_is_gmemfd(pages->file)) {
+ rc = pin_guest_memfd_pages(user, start, npages);
+ } else {
+ pr_err("UNEXP PINFD start=%lx sz=%lx file=%lx",
+ start, npages << PAGE_SHIFT, (ulong) pages->file);
+ rc = pin_memfd_pages(user, start, npages);
+ }
} else if (!remote_mm) {
uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
@@ -1221,7 +1286,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns)
npages);
else
rc = batch_from_folios(&pfns->batch, &user->ufolios_next,
- &user->ufolios_offset, npages);
+ &user->ufolios_offset, npages,
+ !kvm_is_gmemfd(pfns->pages->file));
return rc;
}
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index e4e21068cf2a..2a313888c21b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1794,3 +1794,39 @@ void kvm_gmem_exit(void)
rcu_barrier();
kmem_cache_destroy(kvm_gmem_inode_cachep);
}
+
+/* Return true if @file is backed by a KVM guest_memfd. */
+bool kvm_is_gmemfd(struct file *file)
+{
+	return file && file->f_op == &kvm_gmem_fops;
+}
+EXPORT_SYMBOL_GPL(kvm_is_gmemfd);
+
+/*
+ * Look up the folio backing page @index of guest_memfd @file and report
+ * its first PFN and order via @pfn and @max_order.
+ *
+ * Returns the folio on success or an ERR_PTR() on failure, so callers
+ * checking IS_ERR() work as expected (a bare NULL would slip past such
+ * checks).  No reference is held on return: the guest_memfd inode keeps
+ * the folio alive while the memory is mapped — callers must not assume
+ * a pin beyond that.
+ */
+struct folio *kvm_gmemfd_get_pfn(struct file *file, unsigned long index,
+				 unsigned long *pfn, int *max_order)
+{
+	struct folio *folio;
+
+	if (!kvm_is_gmemfd(file))
+		return ERR_PTR(-EINVAL);
+
+	folio = kvm_gmem_get_folio(file_inode(file), index);
+	if (!folio)
+		return ERR_PTR(-ENOENT);
+
+	/* Assumes the folio is naturally aligned in the mapping */
+	*pfn = folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
+	*max_order = folio_order(folio);
+
+	/* Unlock before dropping the ref: folio_put() may free the folio */
+	folio_unlock(folio);
+	folio_put(folio);
+
+	return folio;
+}
+EXPORT_SYMBOL_GPL(kvm_gmemfd_get_pfn);
--
2.52.0