Re: [PATCH RFC v2 11/15] vfio/nvgrace-egm: Fetch EGM region retired pages list
From: Alex Williamson
Date: Wed Mar 04 2026 - 17:37:30 EST
On Mon, 23 Feb 2026 15:55:10 +0000
<ankita@xxxxxxxxxx> wrote:
> From: Ankit Agrawal <ankita@xxxxxxxxxx>
>
> It is possible for some system memory pages on the EGM to
> have retired pages with uncorrectable ECC errors. A list of
> pages known with such errors (referred as retired pages) are
> maintained by the Host UEFI. The Host UEFI populates such list
> in a reserved region. It communicates the SPA of this region
> through a ACPI DSDT property.
>
> nvgrace-egm module is responsible to store the list of retired page
> offsets to be made available for usermode processes. The module:
> 1. Get the reserved memory region SPA and maps to it to fetch
> the list of bad pages.
> 2. Calculate the retired page offsets in the EGM and stores it.
>
> Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
> ---
> drivers/vfio/pci/nvgrace-gpu/egm.c | 75 ++++++++++++++++++++++++++
> drivers/vfio/pci/nvgrace-gpu/egm_dev.c | 32 +++++++++--
> drivers/vfio/pci/nvgrace-gpu/egm_dev.h | 5 +-
> drivers/vfio/pci/nvgrace-gpu/main.c | 8 +--
> include/linux/nvgrace-egm.h | 2 +
> 5 files changed, 112 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
> index de7771a4145d..077de3833046 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/egm.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
> @@ -8,6 +8,11 @@
>
> #define MAX_EGM_NODES 4
>
> +struct h_node {
> + unsigned long mem_offset;
> + struct hlist_node node;
> +};
> +
> static dev_t dev;
> static struct class *class;
> static DEFINE_XARRAY(egm_chardevs);
> @@ -16,6 +21,7 @@ struct chardev {
> struct device device;
> struct cdev cdev;
> atomic_t open_count;
> + DECLARE_HASHTABLE(htbl, 0x10);
> };
>
> static struct nvgrace_egm_dev *
> @@ -174,20 +180,88 @@ static void del_egm_chardev(struct chardev *egm_chardev)
> put_device(&egm_chardev->device);
> }
>
> +static void cleanup_retired_pages(struct chardev *egm_chardev)
> +{
> + struct h_node *cur_page;
> + unsigned long bkt;
> + struct hlist_node *temp_node;
> +
> + hash_for_each_safe(egm_chardev->htbl, bkt, temp_node, cur_page, node) {
> + hash_del(&cur_page->node);
> + kvfree(cur_page);
> + }
> +}
> +
> +static int nvgrace_egm_fetch_retired_pages(struct nvgrace_egm_dev *egm_dev,
> + struct chardev *egm_chardev)
> +{
> + u64 count;
> + void *memaddr;
> + int index, ret = 0;
> +
> + memaddr = memremap(egm_dev->retiredpagesphys, PAGE_SIZE, MEMREMAP_WB);
We're reading some data structure in physical memory, how does that
data structure have any relation to the kernel PAGE_SIZE, which might
be 4K or 64K?
> + if (!memaddr)
> + return -ENOMEM;
> +
> + count = *(u64 *)memaddr;
So the first 8-bytes contains a page count.
> + if (count > PAGE_SIZE / sizeof(count))
> + return -EINVAL;
So if it's a 64K table and we're on a 4K host, this can unnecessarily
fail, or fail to incorporate the vast majority of pages.
Also the 0th index is the count itself, so there can only be 511
entries with 4K page, not 512. This is off-by-one and the loop below
can exceed the map range.
AI also tells me that the hash table is vastly oversized for containing
either 511 or 8191 entries.
Also we didn't menunmap on this error condition.
> +
> + for (index = 0; index < count; index++) {
> + struct h_node *retired_page;
> +
> + /*
> + * Since the EGM is linearly mapped, the offset in the
> + * carveout is the same offset in the VM system memory.
> + *
> + * Calculate the offset to communicate to the usermode
> + * apps.
> + */
> + retired_page = kzalloc(sizeof(*retired_page), GFP_KERNEL);
> + if (!retired_page) {
> + ret = -ENOMEM;
> + break;
> + }
> +
> + retired_page->mem_offset = *((u64 *)memaddr + index + 1) -
> + egm_dev->egmphys;
> + hash_add(egm_chardev->htbl, &retired_page->node,
> + retired_page->mem_offset);
> + }
> +
> + memunmap(memaddr);
> +
> + if (ret)
> + cleanup_retired_pages(egm_chardev);
> +
> + return ret;
> +}
> +
> static int egm_driver_probe(struct auxiliary_device *aux_dev,
> const struct auxiliary_device_id *id)
> {
> struct nvgrace_egm_dev *egm_dev =
> container_of(aux_dev, struct nvgrace_egm_dev, aux_dev);
> struct chardev *egm_chardev;
> + int ret;
>
> egm_chardev = setup_egm_chardev(egm_dev);
> if (!egm_chardev)
> return -EINVAL;
>
> + hash_init(egm_chardev->htbl);
> +
> + ret = nvgrace_egm_fetch_retired_pages(egm_dev, egm_chardev);
> + if (ret)
> + goto error_exit;
> +
> xa_store(&egm_chardevs, egm_dev->egmpxm, egm_chardev, GFP_KERNEL);
>
> return 0;
> +
> +error_exit:
> + del_egm_chardev(egm_chardev);
> + return ret;
> }
>
> static void egm_driver_remove(struct auxiliary_device *aux_dev)
> @@ -199,6 +273,7 @@ static void egm_driver_remove(struct auxiliary_device *aux_dev)
> if (!egm_chardev)
> return;
>
> + cleanup_retired_pages(egm_chardev);
> del_egm_chardev(egm_chardev);
> }
>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
> index 20291504aca8..6d716c3a3257 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
> @@ -18,22 +18,41 @@ int nvgrace_gpu_has_egm_property(struct pci_dev *pdev, u64 *pegmpxm)
> }
>
> int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
> - u64 *pegmlength)
> + u64 *pegmlength, u64 *pretiredpagesphys)
> {
> int ret;
>
> /*
> - * The memory information is present in the system ACPI tables as DSD
> - * properties nvidia,egm-base-pa and nvidia,egm-size.
> + * The EGM memory information is present in the system ACPI tables
> + * as DSD properties nvidia,egm-base-pa and nvidia,egm-size.
> */
> ret = device_property_read_u64(&pdev->dev, "nvidia,egm-size",
> pegmlength);
> if (ret)
> - return ret;
> + goto error_exit;
>
> ret = device_property_read_u64(&pdev->dev, "nvidia,egm-base-pa",
> pegmphys);
> + if (ret)
> + goto error_exit;
> +
> + /*
> + * SBIOS puts the list of retired pages on a region. The region
> + * SPA is exposed as "nvidia,egm-retired-pages-data-base".
> + */
> + ret = device_property_read_u64(&pdev->dev,
> + "nvidia,egm-retired-pages-data-base",
> + pretiredpagesphys);
> + if (ret)
> + goto error_exit;
> +
> + /* Catch firmware bug and avoid a crash */
> + if (*pretiredpagesphys == 0) {
> + dev_err(&pdev->dev, "Retired pages region is not setup\n");
> + ret = -EINVAL;
> + }
>
> +error_exit:
> return ret;
> }
>
> @@ -74,7 +93,8 @@ static void nvgrace_gpu_release_aux_device(struct device *device)
>
> struct nvgrace_egm_dev *
> nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
> - u64 egmphys, u64 egmlength, u64 egmpxm)
> + u64 egmphys, u64 egmlength, u64 egmpxm,
> + u64 retiredpagesphys)
> {
> struct nvgrace_egm_dev *egm_dev;
> int ret;
> @@ -86,6 +106,8 @@ nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
> egm_dev->egmpxm = egmpxm;
> egm_dev->egmphys = egmphys;
> egm_dev->egmlength = egmlength;
> + egm_dev->retiredpagesphys = retiredpagesphys;
> +
> INIT_LIST_HEAD(&egm_dev->gpus);
>
> egm_dev->aux_dev.id = egmpxm;
> diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
> index 2e1612445898..2f329a05685d 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
> +++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
> @@ -16,8 +16,9 @@ void remove_gpu(struct nvgrace_egm_dev *egm_dev, struct pci_dev *pdev);
>
> struct nvgrace_egm_dev *
> nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
> - u64 egmphys, u64 egmlength, u64 egmpxm);
> + u64 egmphys, u64 egmlength, u64 egmpxm,
> + u64 retiredpagesphys);
>
> int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
> - u64 *pegmlength);
> + u64 *pegmlength, u64 *pretiredpagesphys);
> #endif /* EGM_DEV_H */
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index 0bb427cca31f..11bbecda1ad2 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -78,7 +78,7 @@ static struct list_head egm_dev_list;
> static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
> {
> struct nvgrace_egm_dev_entry *egm_entry = NULL;
> - u64 egmphys, egmlength, egmpxm;
> + u64 egmphys, egmlength, egmpxm, retiredpagesphys;
> int ret = 0;
> bool is_new_region = false;
>
> @@ -91,7 +91,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
> if (nvgrace_gpu_has_egm_property(pdev, &egmpxm))
> goto exit;
>
> - ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength);
> + ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength,
> + &retiredpagesphys);
> if (ret)
> goto exit;
>
> @@ -114,7 +115,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
>
> egm_entry->egm_dev =
> nvgrace_gpu_create_aux_device(pdev, NVGRACE_EGM_DEV_NAME,
> - egmphys, egmlength, egmpxm);
> + egmphys, egmlength, egmpxm,
> + retiredpagesphys);
> if (!egm_entry->egm_dev) {
> ret = -EINVAL;
> goto free_egm_entry;
> diff --git a/include/linux/nvgrace-egm.h b/include/linux/nvgrace-egm.h
> index b9956e7e5a0e..9e0d190c7da0 100644
> --- a/include/linux/nvgrace-egm.h
> +++ b/include/linux/nvgrace-egm.h
> @@ -7,6 +7,7 @@
> #define NVGRACE_EGM_H
>
> #include <linux/auxiliary_bus.h>
> +#include <linux/hashtable.h>
This is implementation, it should be in the c file not the public
header. Thanks,
Alex
>
> #define NVGRACE_EGM_DEV_NAME "egm"
> #define EGM_OFFSET_SHIFT 40
> @@ -20,6 +21,7 @@ struct nvgrace_egm_dev {
> struct auxiliary_device aux_dev;
> phys_addr_t egmphys;
> size_t egmlength;
> + phys_addr_t retiredpagesphys;
> u64 egmpxm;
> struct list_head gpus;
> };