RE: [PATCH v5 7/7] vfio/nvgrace-gpu: wait for the GPU mem to be ready

From: Shameer Kolothum
Date: Mon Nov 24 2025 - 13:42:08 EST




> -----Original Message-----
> From: Ankit Agrawal <ankita@xxxxxxxxxx>
> Sent: 24 November 2025 11:59
> To: Ankit Agrawal <ankita@xxxxxxxxxx>; jgg@xxxxxxxx; Yishai Hadas
> <yishaih@xxxxxxxxxx>; Shameer Kolothum <skolothumtho@xxxxxxxxxx>;
> kevin.tian@xxxxxxxxx; alex@xxxxxxxxxxx; Aniket Agashe
> <aniketa@xxxxxxxxxx>; Vikram Sethi <vsethi@xxxxxxxxxx>; Matt Ochs
> <mochs@xxxxxxxxxx>
> Cc: Yunxiang.Li@xxxxxxx; yi.l.liu@xxxxxxxxx;
> zhangdongdong@xxxxxxxxxxxxxxxxxx; Avihai Horon <avihaih@xxxxxxxxxx>;
> bhelgaas@xxxxxxxxxx; peterx@xxxxxxxxxx; pstanner@xxxxxxxxxx; Alistair
> Popple <apopple@xxxxxxxxxx>; kvm@xxxxxxxxxxxxxxx; linux-
> kernel@xxxxxxxxxxxxxxx; Neo Jia <cjia@xxxxxxxxxx>; Kirti Wankhede
> <kwankhede@xxxxxxxxxx>; Tarun Gupta (SW-GPU) <targupta@xxxxxxxxxx>;
> Zhi Wang <zhiw@xxxxxxxxxx>; Dan Williams <danw@xxxxxxxxxx>; Dheeraj
> Nigam <dnigam@xxxxxxxxxx>; Krishnakant Jaju <kjaju@xxxxxxxxxx>
> Subject: [PATCH v5 7/7] vfio/nvgrace-gpu: wait for the GPU mem to be ready
>
> From: Ankit Agrawal <ankita@xxxxxxxxxx>
>
> Speculative prefetches from the CPU to GPU memory before the GPU is
> ready after reset can cause harmless corrected RAS events to
> be logged on Grace systems. It is thus preferred that the
> mapping not be re-established until the GPU is ready post reset.
>
> The GPU readiness can be checked through BAR0 registers similar
> to the checking at the time of device probe.
>
> It can take several seconds for the GPU to be ready. So it is
> desirable that the time overlaps as much of the VM startup as
> possible to reduce impact on the VM bootup time. The GPU
> readiness state is thus checked on the first fault/huge_fault
> request or read/write access which amortizes the GPU readiness
> time.
>
> The first fault and read/write access check the GPU state when the
> reset_done flag - which denotes that the GPU has just been
> reset - is set. The memory_lock is taken across map/access to avoid
> races with GPU reset.
>
> cc: Alex Williamson <alex@xxxxxxxxxxx>
> cc: Jason Gunthorpe <jgg@xxxxxxxx>
> cc: Vikram Sethi <vsethi@xxxxxxxxxx>
> Suggested-by: Alex Williamson <alex@xxxxxxxxxxx>
> Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
> ---
> drivers/vfio/pci/nvgrace-gpu/main.c | 79 ++++++++++++++++++++++++++-
> --
> 1 file changed, 72 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-
> gpu/main.c
> index bef9f25bf8f3..fbc19fe688ca 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -104,6 +104,17 @@ static int nvgrace_gpu_open_device(struct
> vfio_device *core_vdev)
> mutex_init(&nvdev->remap_lock);
> }
>
> + /*
> + * GPU readiness is checked by reading the BAR0 registers.
> + *
> + * ioremap BAR0 to ensure that the BAR0 mapping is present before
> + * register reads on first fault before establishing any GPU
> + * memory mapping.
> + */
> + ret = vfio_pci_core_setup_barmap(vdev, 0);
> + if (ret)
> + return ret;
> +
> vfio_pci_core_finish_enable(vdev);
>
> return 0;
> @@ -150,6 +161,26 @@ static int nvgrace_gpu_wait_device_ready(void
> __iomem *io)
> return ret;
> }
>
> +static int
> +nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device
> *nvdev)
> +{
> + struct vfio_pci_core_device *vdev = &nvdev->core_device;
> + int ret;
> +
> + lockdep_assert_held_read(&vdev->memory_lock);
> +
> + if (!nvdev->reset_done)
> + return 0;
> +
> + ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
> + if (ret)
> + return ret;
> +
> + nvdev->reset_done = false;
> +
> + return 0;
> +}
> +
> static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
> unsigned int order)
> {
> @@ -173,8 +204,18 @@ static vm_fault_t
> nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
> pfn & ((1 << order) - 1)))
> return VM_FAULT_FALLBACK;
>
> - scoped_guard(rwsem_read, &nvdev->core_device.memory_lock)
> + scoped_guard(rwsem_read, &nvdev->core_device.memory_lock) {
> + /*
> + * If the GPU memory is accessed by the CPU while the GPU is
> + * not ready after reset, it can cause harmless corrected RAS
> + * events to be logged. Make sure the GPU is ready before
> + * establishing the mappings.
> + */
> + if (nvgrace_gpu_check_device_ready(nvdev))
> + return ret;
> +
> ret = vfio_pci_vmf_insert_pfn(vmf, pfn, order);
> + }
>
> return ret;
> }
> @@ -593,9 +634,21 @@ nvgrace_gpu_read_mem(struct
> nvgrace_gpu_pci_core_device *nvdev,
> else
> mem_count = min(count, memregion->memlength -
> (size_t)offset);
>
> - ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos);
> - if (ret)
> - return ret;
> + scoped_guard(rwsem_read, &nvdev->core_device.memory_lock) {
> + /*
> + * If the GPU memory is accessed by the CPU while the GPU is
> + * not ready after reset, it can cause harmless corrected RAS
> + * events to be logged. Make sure the GPU is ready before
> + * establishing the mappings.
> + */
> + ret = nvgrace_gpu_check_device_ready(nvdev);
> + if (ret)
> + return ret;
> +
> + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count,
> ppos);
> + if (ret)
> + return ret;
> + }
>
> /*
> * Only the device memory present on the hardware is mapped, which
> may
> @@ -713,9 +766,21 @@ nvgrace_gpu_write_mem(struct
> nvgrace_gpu_pci_core_device *nvdev,
> */
> mem_count = min(count, memregion->memlength - (size_t)offset);
>
> - ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos);
> - if (ret)
> - return ret;
> + scoped_guard(rwsem_read, &nvdev->core_device.memory_lock) {
> + /*
> + * If the GPU memory is accessed by the CPU while the GPU is
> + * not ready after reset, it can cause harmless corrected RAS
> + * events to be logged. Make sure the GPU is ready before
> + * establishing the mappings.
> + */

The comment above is now repeated three times. It would be better to
consolidate it into a single comment above nvgrace_gpu_check_device_ready().

Thanks,
Shameer

> + ret = nvgrace_gpu_check_device_ready(nvdev);
> + if (ret)
> + return ret;
> +
> + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count,
> ppos);
> + if (ret)
> + return ret;
> + }
>
> exitfn:
> *ppos += count;
> --
> 2.34.1