Re: [PATCH RFC v2 12/15] vfio/nvgrace-egm: Introduce ioctl to share retired pages
From: Alex Williamson
Date: Wed Mar 04 2026 - 18:01:14 EST
On Mon, 23 Feb 2026 15:55:11 +0000
<ankita@xxxxxxxxxx> wrote:
> From: Ankit Agrawal <ankita@xxxxxxxxxx>
>
> nvgrace-egm module stores the list of retired page offsets to be made
> available for usermode processes. Introduce an ioctl to share the
> information with the userspace.
>
> The ioctl is called by usermode apps such as QEMU to get the retired
> page offsets. The usermode apps are expected to take appropriate action
> to communicate the list to the VM.
>
> Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
> ---
> MAINTAINERS | 1 +
> drivers/vfio/pci/nvgrace-gpu/egm.c | 67 ++++++++++++++++++++++++++++++
> include/uapi/linux/egm.h | 28 +++++++++++++
> 3 files changed, 96 insertions(+)
> create mode 100644 include/uapi/linux/egm.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1fc551d7d667..94cf15a1e82c 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -27389,6 +27389,7 @@ M: Ankit Agrawal <ankita@xxxxxxxxxx>
> L: kvm@xxxxxxxxxxxxxxx
> S: Supported
> F: drivers/vfio/pci/nvgrace-gpu/egm.c
> +F: include/uapi/linux/egm.h
>
> VFIO PCI DEVICE SPECIFIC DRIVERS
> R: Jason Gunthorpe <jgg@xxxxxxxxxx>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
> index 077de3833046..918979d8fcd4 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/egm.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
> @@ -5,6 +5,7 @@
>
> #include <linux/vfio_pci_core.h>
> #include <linux/nvgrace-egm.h>
> +#include <linux/egm.h>
>
> #define MAX_EGM_NODES 4
>
> @@ -119,11 +120,77 @@ static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma)
> vma->vm_page_prot);
> }
>
> +static long nvgrace_egm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> + unsigned long minsz = offsetofend(struct egm_retired_pages_list, count);
> + struct egm_retired_pages_list info;
> + void __user *uarg = (void __user *)arg;
> + struct chardev *egm_chardev = file->private_data;
> +
> + if (copy_from_user(&info, uarg, minsz))
> + return -EFAULT;
> +
> + if (info.argsz < minsz || !egm_chardev)
> + return -EINVAL;
How could we get here with !egm_chardev?
> +
> + switch (cmd) {
> + case EGM_RETIRED_PAGES_LIST:
> + int ret;
> + unsigned long retired_page_struct_size = sizeof(struct egm_retired_pages_info);
> + struct egm_retired_pages_info tmp;
> + struct h_node *cur_page;
> + struct hlist_node *tmp_node;
> + unsigned long bkt;
> + int count = 0, index = 0;
Declarations directly under a case label need enclosing braces. Ordering could be improved.
> +
> + hash_for_each_safe(egm_chardev->htbl, bkt, tmp_node, cur_page, node)
> + count++;
Why not keep track of the count as they're added?
Neither loop needs the _safe variant here since we're not removing
entries.
> +
> + if (info.argsz < (minsz + count * retired_page_struct_size)) {
> + info.argsz = minsz + count * retired_page_struct_size;
> + info.count = 0;
vfio returns success when there's not enough space for compatibility
for new capabilities. For a new ioctl just set argsz and count and
return -ENOSPC.
> + goto done;
> + } else {
We don't need an else if the previous branch unconditionally goes
somewhere else.
> + hash_for_each_safe(egm_chardev->htbl, bkt, tmp_node, cur_page, node) {
> + /*
> + * This check fails if there was an ECC error
> + * after the usermode app read the count of
> + * bad pages through this ioctl.
> + */
> + if (minsz + index * retired_page_struct_size >= info.argsz) {
> + info.argsz = minsz + index * retired_page_struct_size;
> + info.count = index;
If only we had locking to prevent such races...
> + goto done;
> + }
> +
> + tmp.offset = cur_page->mem_offset;
> + tmp.size = PAGE_SIZE;
Is firmware recording 4K or 64K pages in this table?
The above comment alludes to runtime ECC faults; are those a different
page size from the granularity firmware reports in the table?
> +
> + ret = copy_to_user(uarg + minsz +
> + index * retired_page_struct_size,
> + &tmp, retired_page_struct_size);
> + if (ret)
> + return -EFAULT;
> + index++;
> + }
> +
> + info.count = index;
> + }
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> +done:
> + return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
> +}
> +
> static const struct file_operations file_ops = {
> .owner = THIS_MODULE,
> .open = nvgrace_egm_open,
> .release = nvgrace_egm_release,
> .mmap = nvgrace_egm_mmap,
> + .unlocked_ioctl = nvgrace_egm_ioctl,
> };
>
> static void egm_chardev_release(struct device *dev)
> diff --git a/include/uapi/linux/egm.h b/include/uapi/linux/egm.h
> new file mode 100644
> index 000000000000..4d3a2304d4f0
> --- /dev/null
> +++ b/include/uapi/linux/egm.h
> @@ -0,0 +1,28 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/*
> + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
2026
> + */
> +
> +#ifndef _UAPI_LINUX_EGM_H
> +#define _UAPI_LINUX_EGM_H
> +
> +#include <linux/types.h>
> +
> +#define EGM_TYPE ('E')
Arbitrarily chosen? Update ioctl-number.rst?
> +
> +struct egm_retired_pages_info {
> + __aligned_u64 offset;
> + __aligned_u64 size;
> +};
> +
> +struct egm_retired_pages_list {
> + __u32 argsz;
> + /* out */
> + __u32 count;
> + /* out */
> + struct egm_retired_pages_info retired_pages[];
> +};
I imagine you want some uapi description of this ioctl. Thanks,
Alex
> +
> +#define EGM_RETIRED_PAGES_LIST _IO(EGM_TYPE, 100)
> +
> +#endif /* _UAPI_LINUX_EGM_H */