Re: [PATCH v16 5/9] PCI/AER: Factor out error reporting from AER
From: Bjorn Helgaas
Date: Tue May 15 2018 - 20:06:32 EST
On Fri, May 11, 2018 at 06:43:24AM -0400, Oza Pawandeep wrote:
> This patch factors out error reporting callbacks, which are currently
> tightly coupled with AER.
>
> DPC should be able to register callbacks and attempt recovery when DPC
> trigger event occurs.
>
> Signed-off-by: Oza Pawandeep <poza@xxxxxxxxxxxxxx>
> +static int report_error_detected(struct pci_dev *dev, void *data)
> +{
> + pci_ers_result_t vote;
> + const struct pci_error_handlers *err_handler;
> + struct aer_broadcast_data *result_data;
> +
> + result_data = (struct aer_broadcast_data *) data;
> +
> + device_lock(&dev->dev);
> + dev->error_state = result_data->state;
> +
> + if (!dev->driver ||
> + !dev->driver->err_handler ||
> + !dev->driver->err_handler->error_detected) {
> + if (result_data->state == pci_channel_io_frozen &&
> + dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
> + /*
> + * In case of fatal recovery, if one of down-
> + * stream device has no driver. We might be
> + * unable to recover because a later insmod
> + * of a driver for this device is unaware of
> + * its hw state.
> + */
> + pci_printk(KERN_DEBUG, dev, "device has %s\n",
> + dev->driver ?
> + "no AER-aware driver" : "no driver");
> + }
> +
> + /*
> + * If there's any device in the subtree that does not
> + * have an error_detected callback, returning
> + * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
> + * the subsequent mmio_enabled/slot_reset/resume
> + * callbacks of "any" device in the subtree. All the
> + * devices in the subtree are left in the error state
> + * without recovery.
> + */
> +
> + if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
> + vote = PCI_ERS_RESULT_NO_AER_DRIVER;
> + else
> + vote = PCI_ERS_RESULT_NONE;
> + } else {
> + err_handler = dev->driver->err_handler;
> + vote = err_handler->error_detected(dev, result_data->state);
> +#if defined(CONFIG_PCIEAER)
> + pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
> +#endif
> +static int report_slot_reset(struct pci_dev *dev, void *data)
> +{
> + pci_ers_result_t vote;
> + const struct pci_error_handlers *err_handler;
> + struct aer_broadcast_data *result_data;
> +
> + result_data = (struct aer_broadcast_data *) data;
> +
> + device_lock(&dev->dev);
> + if (!dev->driver ||
> + !dev->driver->err_handler ||
> + !dev->driver->err_handler->slot_reset)
> + goto out;
> +
> + err_handler = dev->driver->err_handler;
> + vote = err_handler->slot_reset(dev);
> + result_data->result = merge_result(result_data->result, vote);
> +out:
> + device_unlock(&dev->dev);
> + return 0;
> +}
> +
> +static int report_resume(struct pci_dev *dev, void *data)
> +{
> + const struct pci_error_handlers *err_handler;
> +
> + device_lock(&dev->dev);
> + dev->error_state = pci_channel_io_normal;
> +
> + if (!dev->driver ||
> + !dev->driver->err_handler ||
> + !dev->driver->err_handler->resume)
> + goto out;
> +
> + err_handler = dev->driver->err_handler;
> + err_handler->resume(dev);
> +#if defined(CONFIG_PCIEAER)
> + pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
> +#endif
> +void pcie_do_fatal_recovery(struct pci_dev *dev)
> +{
> + struct pci_dev *udev;
> + struct pci_bus *parent;
> + struct pci_dev *pdev, *temp;
> + pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
> + struct aer_broadcast_data result_data;
> +
> + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
> + udev = dev;
> + else
> + udev = dev->bus->self;
> +
> + parent = udev->subordinate;
> + pci_lock_rescan_remove();
> + list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
> + bus_list) {
> + pci_dev_get(pdev);
> + pci_dev_set_disconnected(pdev, NULL);
> + if (pci_has_subordinate(pdev))
> + pci_walk_bus(pdev->subordinate,
> + pci_dev_set_disconnected, NULL);
> + pci_stop_and_remove_bus_device(pdev);
> + pci_dev_put(pdev);
> + }
> +
> + result = reset_link(udev);
> +
> + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
> + /*
> + * If the error is reported by a bridge, we think this error
> + * is related to the downstream link of the bridge, so we
> + * do error recovery on all subordinates of the bridge instead
> + * of the bridge and clear the error status of the bridge.
> + */
> + pci_walk_bus(dev->subordinate, report_resume, &result_data);
> + pci_cleanup_aer_uncorrect_error_status(dev);
> + }
> +
> + if (result == PCI_ERS_RESULT_RECOVERED) {
> + if (pcie_wait_for_link(udev, true))
> + pci_rescan_bus(udev->bus);
> + pci_info(dev, "Device recovery successful\n");
> + } else {
> +#if defined(CONFIG_PCIEAER)
> + pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
> +#endif
I don't think wrapping each pci_uevent_ers() call in "#if defined(CONFIG_PCIEAER)" is the best resolution for this problem.
It is true that we only call this function if either
CONFIG_PCIEAER=y or
CONFIG_PCIE_DPC=y
and furthermore that CONFIG_PCIE_DPC depends on CONFIG_PCIEAER, so in
either case, pci_uevent_ers() is present, since it is conditional on
#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH)
But the #ifdef here seems unnecessarily complicated. I think it would be
better to change the #ifdef around the definition of pci_uevent_ers().
Then we wouldn't need the several "#if defined(CONFIG_PCIEAER)" guards
around its callers in this file.