[PATCH RFC v2 11/15] vfio/nvgrace-egm: Fetch EGM region retired pages list

From: ankita

Date: Mon Feb 23 2026 - 10:57:03 EST


From: Ankit Agrawal <ankita@xxxxxxxxxx>

It is possible for some system memory pages on the EGM to
have retired pages with uncorrectable ECC errors. A list of
pages known with such errors (referred as retired pages) are
maintained by the Host UEFI. The Host UEFI populates such list
in a reserved region. It communicates the SPA of this region
through a ACPI DSDT property.

nvgrace-egm module is responsible to store the list of retired page
offsets to be made available for usermode processes. The module:
1. Get the reserved memory region SPA and maps to it to fetch
the list of bad pages.
2. Calculate the retired page offsets in the EGM and stores it.

Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
---
drivers/vfio/pci/nvgrace-gpu/egm.c | 75 ++++++++++++++++++++++++++
drivers/vfio/pci/nvgrace-gpu/egm_dev.c | 32 +++++++++--
drivers/vfio/pci/nvgrace-gpu/egm_dev.h | 5 +-
drivers/vfio/pci/nvgrace-gpu/main.c | 8 +--
include/linux/nvgrace-egm.h | 2 +
5 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index de7771a4145d..077de3833046 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -8,6 +8,11 @@

#define MAX_EGM_NODES 4

+struct h_node {
+ unsigned long mem_offset;
+ struct hlist_node node;
+};
+
static dev_t dev;
static struct class *class;
static DEFINE_XARRAY(egm_chardevs);
@@ -16,6 +21,7 @@ struct chardev {
struct device device;
struct cdev cdev;
atomic_t open_count;
+ DECLARE_HASHTABLE(htbl, 0x10);
};

static struct nvgrace_egm_dev *
@@ -174,20 +180,88 @@ static void del_egm_chardev(struct chardev *egm_chardev)
put_device(&egm_chardev->device);
}

+static void cleanup_retired_pages(struct chardev *egm_chardev)
+{
+ struct h_node *cur_page;
+ unsigned long bkt;
+ struct hlist_node *temp_node;
+
+ hash_for_each_safe(egm_chardev->htbl, bkt, temp_node, cur_page, node) {
+ hash_del(&cur_page->node);
+ kvfree(cur_page);
+ }
+}
+
+static int nvgrace_egm_fetch_retired_pages(struct nvgrace_egm_dev *egm_dev,
+ struct chardev *egm_chardev)
+{
+ u64 count;
+ void *memaddr;
+ int index, ret = 0;
+
+ memaddr = memremap(egm_dev->retiredpagesphys, PAGE_SIZE, MEMREMAP_WB);
+ if (!memaddr)
+ return -ENOMEM;
+
+ count = *(u64 *)memaddr;
+ if (count > PAGE_SIZE / sizeof(count))
+ return -EINVAL;
+
+ for (index = 0; index < count; index++) {
+ struct h_node *retired_page;
+
+ /*
+ * Since the EGM is linearly mapped, the offset in the
+ * carveout is the same offset in the VM system memory.
+ *
+ * Calculate the offset to communicate to the usermode
+ * apps.
+ */
+ retired_page = kzalloc(sizeof(*retired_page), GFP_KERNEL);
+ if (!retired_page) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ retired_page->mem_offset = *((u64 *)memaddr + index + 1) -
+ egm_dev->egmphys;
+ hash_add(egm_chardev->htbl, &retired_page->node,
+ retired_page->mem_offset);
+ }
+
+ memunmap(memaddr);
+
+ if (ret)
+ cleanup_retired_pages(egm_chardev);
+
+ return ret;
+}
+
static int egm_driver_probe(struct auxiliary_device *aux_dev,
const struct auxiliary_device_id *id)
{
struct nvgrace_egm_dev *egm_dev =
container_of(aux_dev, struct nvgrace_egm_dev, aux_dev);
struct chardev *egm_chardev;
+ int ret;

egm_chardev = setup_egm_chardev(egm_dev);
if (!egm_chardev)
return -EINVAL;

+ hash_init(egm_chardev->htbl);
+
+ ret = nvgrace_egm_fetch_retired_pages(egm_dev, egm_chardev);
+ if (ret)
+ goto error_exit;
+
xa_store(&egm_chardevs, egm_dev->egmpxm, egm_chardev, GFP_KERNEL);

return 0;
+
+error_exit:
+ del_egm_chardev(egm_chardev);
+ return ret;
}

static void egm_driver_remove(struct auxiliary_device *aux_dev)
@@ -199,6 +273,7 @@ static void egm_driver_remove(struct auxiliary_device *aux_dev)
if (!egm_chardev)
return;

+ cleanup_retired_pages(egm_chardev);
del_egm_chardev(egm_chardev);
}

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
index 20291504aca8..6d716c3a3257 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.c
@@ -18,22 +18,41 @@ int nvgrace_gpu_has_egm_property(struct pci_dev *pdev, u64 *pegmpxm)
}

int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
- u64 *pegmlength)
+ u64 *pegmlength, u64 *pretiredpagesphys)
{
int ret;

/*
- * The memory information is present in the system ACPI tables as DSD
- * properties nvidia,egm-base-pa and nvidia,egm-size.
+ * The EGM memory information is present in the system ACPI tables
+ * as DSD properties nvidia,egm-base-pa and nvidia,egm-size.
*/
ret = device_property_read_u64(&pdev->dev, "nvidia,egm-size",
pegmlength);
if (ret)
- return ret;
+ goto error_exit;

ret = device_property_read_u64(&pdev->dev, "nvidia,egm-base-pa",
pegmphys);
+ if (ret)
+ goto error_exit;
+
+ /*
+ * SBIOS puts the list of retired pages on a region. The region
+ * SPA is exposed as "nvidia,egm-retired-pages-data-base".
+ */
+ ret = device_property_read_u64(&pdev->dev,
+ "nvidia,egm-retired-pages-data-base",
+ pretiredpagesphys);
+ if (ret)
+ goto error_exit;
+
+ /* Catch firmware bug and avoid a crash */
+ if (*pretiredpagesphys == 0) {
+ dev_err(&pdev->dev, "Retired pages region is not setup\n");
+ ret = -EINVAL;
+ }

+error_exit:
return ret;
}

@@ -74,7 +93,8 @@ static void nvgrace_gpu_release_aux_device(struct device *device)

struct nvgrace_egm_dev *
nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
- u64 egmphys, u64 egmlength, u64 egmpxm)
+ u64 egmphys, u64 egmlength, u64 egmpxm,
+ u64 retiredpagesphys)
{
struct nvgrace_egm_dev *egm_dev;
int ret;
@@ -86,6 +106,8 @@ nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
egm_dev->egmpxm = egmpxm;
egm_dev->egmphys = egmphys;
egm_dev->egmlength = egmlength;
+ egm_dev->retiredpagesphys = retiredpagesphys;
+
INIT_LIST_HEAD(&egm_dev->gpus);

egm_dev->aux_dev.id = egmpxm;
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
index 2e1612445898..2f329a05685d 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
+++ b/drivers/vfio/pci/nvgrace-gpu/egm_dev.h
@@ -16,8 +16,9 @@ void remove_gpu(struct nvgrace_egm_dev *egm_dev, struct pci_dev *pdev);

struct nvgrace_egm_dev *
nvgrace_gpu_create_aux_device(struct pci_dev *pdev, const char *name,
- u64 egmphys, u64 egmlength, u64 egmpxm);
+ u64 egmphys, u64 egmlength, u64 egmpxm,
+ u64 retiredpagesphys);

int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
- u64 *pegmlength);
+ u64 *pegmlength, u64 *pretiredpagesphys);
#endif /* EGM_DEV_H */
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 0bb427cca31f..11bbecda1ad2 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -78,7 +78,7 @@ static struct list_head egm_dev_list;
static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
{
struct nvgrace_egm_dev_entry *egm_entry = NULL;
- u64 egmphys, egmlength, egmpxm;
+ u64 egmphys, egmlength, egmpxm, retiredpagesphys;
int ret = 0;
bool is_new_region = false;

@@ -91,7 +91,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)
if (nvgrace_gpu_has_egm_property(pdev, &egmpxm))
goto exit;

- ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength);
+ ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength,
+ &retiredpagesphys);
if (ret)
goto exit;

@@ -114,7 +115,8 @@ static int nvgrace_gpu_create_egm_aux_device(struct pci_dev *pdev)

egm_entry->egm_dev =
nvgrace_gpu_create_aux_device(pdev, NVGRACE_EGM_DEV_NAME,
- egmphys, egmlength, egmpxm);
+ egmphys, egmlength, egmpxm,
+ retiredpagesphys);
if (!egm_entry->egm_dev) {
ret = -EINVAL;
goto free_egm_entry;
diff --git a/include/linux/nvgrace-egm.h b/include/linux/nvgrace-egm.h
index b9956e7e5a0e..9e0d190c7da0 100644
--- a/include/linux/nvgrace-egm.h
+++ b/include/linux/nvgrace-egm.h
@@ -7,6 +7,7 @@
#define NVGRACE_EGM_H

#include <linux/auxiliary_bus.h>
+#include <linux/hashtable.h>

#define NVGRACE_EGM_DEV_NAME "egm"
#define EGM_OFFSET_SHIFT 40
@@ -20,6 +21,7 @@ struct nvgrace_egm_dev {
struct auxiliary_device aux_dev;
phys_addr_t egmphys;
size_t egmlength;
+ phys_addr_t retiredpagesphys;
u64 egmpxm;
struct list_head gpus;
};
--
2.34.1