Re: [PATCH v11 6/9] s390/pci: Store PCI error information for passthrough devices

From: Alex Williamson

Date: Wed Mar 25 2026 - 13:09:21 EST

On Mon, 16 Mar 2026 12:15:41 -0700
Farhan Ali <alifm@xxxxxxxxxxxxx> wrote:

> For a passthrough device we need cooperation from user space to recover
> the device. This requires bubbling up any error information to user
> space. Let's store this error information for passthrough devices, so it
> can be retrieved later.
>
> Reviewed-by: Niklas Schnelle <schnelle@xxxxxxxxxxxxx>
> Signed-off-by: Farhan Ali <alifm@xxxxxxxxxxxxx>
> ---
> arch/s390/include/asm/pci.h | 28 ++++++++++
> arch/s390/pci/pci.c | 1 +
> arch/s390/pci/pci_event.c | 94 +++++++++++++++++++-------------
> drivers/vfio/pci/vfio_pci_zdev.c | 2 +
> 4 files changed, 87 insertions(+), 38 deletions(-)
>
> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
> index ec8a772bf526..383f6483b656 100644
> --- a/arch/s390/include/asm/pci.h
> +++ b/arch/s390/include/asm/pci.h
> @@ -118,6 +118,31 @@ struct zpci_bus {
> enum pci_bus_speed max_bus_speed;
> };
>
> +/* Content Code Description for PCI Function Error */
> +struct zpci_ccdf_err {
> + u32 reserved1;
> + u32 fh; /* function handle */
> + u32 fid; /* function id */
> + u32 ett : 4; /* expected table type */
> + u32 mvn : 12; /* MSI vector number */
> + u32 dmaas : 8; /* DMA address space */
> + u32 reserved2 : 6;
> + u32 q : 1; /* event qualifier */
> + u32 rw : 1; /* read/write */
> + u64 faddr; /* failing address */
> + u32 reserved3;
> + u16 reserved4;
> + u16 pec; /* PCI event code */
> +} __packed;
> +
> +#define ZPCI_ERR_PENDING_MAX 4
> +struct zpci_ccdf_pending {
> + u8 count;
> + u8 head;
> + u8 tail;
> + struct zpci_ccdf_err err[ZPCI_ERR_PENDING_MAX];
> +};
> +
> /* Private data per function */
> struct zpci_dev {
> struct zpci_bus *zbus;
> @@ -193,6 +218,8 @@ struct zpci_dev {
> struct iommu_domain *s390_domain; /* attached IOMMU domain */
> struct kvm_zdev *kzdev;
> struct mutex kzdev_lock;
> + struct zpci_ccdf_pending pending_errs;
> + struct mutex pending_errs_lock;
> spinlock_t dom_lock; /* protect s390_domain change */
> };
>
> @@ -331,6 +358,7 @@ void zpci_debug_exit_device(struct zpci_dev *);
> int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *);
> int zpci_clear_error_state(struct zpci_dev *zdev);
> int zpci_reset_load_store_blocked(struct zpci_dev *zdev);
> +void zpci_cleanup_pending_errors(struct zpci_dev *zdev);
>
> #ifdef CONFIG_NUMA
>
> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> index 87077e510266..bc253cc52056 100644
> --- a/arch/s390/pci/pci.c
> +++ b/arch/s390/pci/pci.c
> @@ -915,6 +915,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
> mutex_init(&zdev->state_lock);
> mutex_init(&zdev->fmb_lock);
> mutex_init(&zdev->kzdev_lock);
> + mutex_init(&zdev->pending_errs_lock);
>
> return zdev;
>
> diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
> index de504925f709..5b24f3a9fe23 100644
> --- a/arch/s390/pci/pci_event.c
> +++ b/arch/s390/pci/pci_event.c
> @@ -17,23 +17,6 @@
> #include "pci_bus.h"
> #include "pci_report.h"
>
> -/* Content Code Description for PCI Function Error */
> -struct zpci_ccdf_err {
> - u32 reserved1;
> - u32 fh; /* function handle */
> - u32 fid; /* function id */
> - u32 ett : 4; /* expected table type */
> - u32 mvn : 12; /* MSI vector number */
> - u32 dmaas : 8; /* DMA address space */
> - u32 : 6;
> - u32 q : 1; /* event qualifier */
> - u32 rw : 1; /* read/write */
> - u64 faddr; /* failing address */
> - u32 reserved3;
> - u16 reserved4;
> - u16 pec; /* PCI event code */
> -} __packed;
> -
> /* Content Code Description for PCI Function Availability */
> struct zpci_ccdf_avail {
> u32 reserved1;
> @@ -75,6 +58,41 @@ static bool is_driver_supported(struct pci_driver *driver)
> return true;
> }
>
> +static void zpci_store_pci_error(struct pci_dev *pdev,
> + struct zpci_ccdf_err *ccdf)
> +{
> + struct zpci_dev *zdev = to_zpci(pdev);
> + int i;
> +
> + mutex_lock(&zdev->pending_errs_lock);
> + if (zdev->pending_errs.count >= ZPCI_ERR_PENDING_MAX) {
> + pr_err("%s: Maximum number (%d) of pending error events queued",
> + pci_name(pdev), ZPCI_ERR_PENDING_MAX);
> + mutex_unlock(&zdev->pending_errs_lock);
> + return;
> + }
> +
> + i = zdev->pending_errs.tail % ZPCI_ERR_PENDING_MAX;
> + memcpy(&zdev->pending_errs.err[i], ccdf, sizeof(struct zpci_ccdf_err));
> + zdev->pending_errs.tail++;
> + zdev->pending_errs.count++;
> + mutex_unlock(&zdev->pending_errs_lock);
> +}
> +
> +void zpci_cleanup_pending_errors(struct zpci_dev *zdev)
> +{
> + struct pci_dev *pdev = NULL;
> +
> + guard(mutex)(&zdev->pending_errs_lock);
> + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
> + if (zdev->pending_errs.count)
> + pr_info("%s: Unhandled PCI error events count=%d",
> + pci_name(pdev), zdev->pending_errs.count);
> + memset(&zdev->pending_errs, 0, sizeof(struct zpci_ccdf_pending));
> + pci_dev_put(pdev);
> +}
> +EXPORT_SYMBOL_GPL(zpci_cleanup_pending_errors);
> +
> static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev,
> struct pci_driver *driver)
> {
> @@ -169,7 +187,8 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
> * and the platform determines which functions are affected for
> * multi-function devices.
> */
> -static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev,
> + struct zpci_ccdf_err *ccdf)
> {
> pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
> struct zpci_dev *zdev = to_zpci(pdev);
> @@ -188,13 +207,6 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> }
> pdev->error_state = pci_channel_io_frozen;
>
> - if (needs_mediated_recovery(pdev)) {
> - pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
> - pci_name(pdev));
> - status_str = "failed (pass-through)";
> - goto out_unlock;
> - }
> -
> driver = to_pci_driver(pdev->dev.driver);
> if (!is_driver_supported(driver)) {
> if (!driver) {
> @@ -210,12 +222,23 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> goto out_unlock;
> }
>
> + if (needs_mediated_recovery(pdev))
> + zpci_store_pci_error(pdev, ccdf);
> +
> ers_res = zpci_event_notify_error_detected(pdev, driver);
> if (ers_result_indicates_abort(ers_res)) {
> status_str = "failed (abort on detection)";
> goto out_unlock;
> }
>
> + if (needs_mediated_recovery(pdev)) {
> + pr_info("%s: Leaving recovery of pass-through device to user-space\n",
> + pci_name(pdev));
> + ers_res = PCI_ERS_RESULT_RECOVERED;
> + status_str = "in progress";
> + goto out_unlock;
> + }
> +
> if (ers_res != PCI_ERS_RESULT_NEED_RESET) {
> ers_res = zpci_event_do_error_state_clear(pdev, driver);
> if (ers_result_indicates_abort(ers_res)) {
> @@ -260,25 +283,20 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
> * @pdev: PCI function for which to report
> * @es: PCI channel failure state to report
> */
> -static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es)
> +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es,
> + struct zpci_ccdf_err *ccdf)
> {
> struct pci_driver *driver;
>
> pci_dev_lock(pdev);
> pdev->error_state = es;
> - /**
> - * While vfio-pci's error_detected callback notifies user-space QEMU
> - * reacts to this by freezing the guest. In an s390 environment PCI
> - * errors are rarely fatal so this is overkill. Instead in the future
> - * we will inject the error event and let the guest recover the device
> - * itself.
> - */
> +
> if (needs_mediated_recovery(pdev))
> - goto out;
> + zpci_store_pci_error(pdev, ccdf);
> driver = to_pci_driver(pdev->dev.driver);
> if (driver && driver->err_handler && driver->err_handler->error_detected)
> driver->err_handler->error_detected(pdev, pdev->error_state);
> -out:
> +
> pci_dev_unlock(pdev);
> }
>
> @@ -324,12 +342,12 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
> break;
> case 0x0040: /* Service Action or Error Recovery Failed */
> case 0x003b:
> - zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
> + zpci_event_io_failure(pdev, pci_channel_io_perm_failure, ccdf);
> break;
> default: /* PCI function left in the error state attempt to recover */
> - ers_res = zpci_event_attempt_error_recovery(pdev);
> + ers_res = zpci_event_attempt_error_recovery(pdev, ccdf);
> if (ers_res != PCI_ERS_RESULT_RECOVERED)
> - zpci_event_io_failure(pdev, pci_channel_io_perm_failure);
> + zpci_event_io_failure(pdev, pci_channel_io_perm_failure, ccdf);
> break;
> }
> pci_dev_put(pdev);
> diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
> index a7bc23ce8483..2be37eab9279 100644
> --- a/drivers/vfio/pci/vfio_pci_zdev.c
> +++ b/drivers/vfio/pci/vfio_pci_zdev.c
> @@ -168,6 +168,8 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
>
> zdev->mediated_recovery = false;
>
> + zpci_cleanup_pending_errors(zdev);
> +
> if (!vdev->vdev.kvm)
> return;
>

It begins to look here like the mediated_recovery flag should be
protected by pending_errs_lock, and perhaps there should be
zpci_{start,stop}_mediated_recovery() helpers where we set and clear the
flag under the mutex, while also clearing pending errors in the latter
case. The various needs_mediated_recovery() tests could be pulled in so
that they are performed under the mutex as well. Thanks,

Alex