Re: [Qemu-devel] [PATCH v2 2/2] vfio : add aer process

From: Alex Williamson
Date: Mon Aug 01 2016 - 11:37:01 EST


On Mon, 1 Aug 2016 10:14:06 +0800
Zhou Jie <zhoujie2011@xxxxxxxxxxxxxx> wrote:

> Hi, Alex
>
> On 2016/7/30 1:12, Alex Williamson wrote:
> > On Tue, 19 Jul 2016 15:32:43 +0800
> > Zhou Jie <zhoujie2011@xxxxxxxxxxxxxx> wrote:
> >
> >> From: Chen Fan <chen.fan.fnst@xxxxxxxxxxxxxx>
> >>
> >> During aer err occurs and resume do following to
> >> protect device from being accessed.
> >> 1. Make config space read only.
> >> 2. Disable INTx/MSI Interrupt.
> >> 3. Do nothing for bar regions.
> >>
> >> Signed-off-by: Zhou Jie <zhoujie2011@xxxxxxxxxxxxxx>
> >> ---
> >> drivers/vfio/pci/vfio_pci.c | 30 ++++++++++++++++++++++++++++++
> >> drivers/vfio/pci/vfio_pci_private.h | 2 ++
> >> include/uapi/linux/vfio.h | 2 ++
> >> 3 files changed, 34 insertions(+)
> >>
> >> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> >> index 2d12b03..dd96b60 100644
> >> --- a/drivers/vfio/pci/vfio_pci.c
> >> +++ b/drivers/vfio/pci/vfio_pci.c
> >> @@ -318,6 +318,7 @@ static int vfio_pci_open(void *device_data)
> >> return -ENODEV;
> >>
> >> mutex_lock(&driver_lock);
> >> + init_completion(&vdev->aer_error_completion);
> >>
> >> if (!vdev->refcnt) {
> >> ret = vfio_pci_enable(vdev);
> >> @@ -571,6 +572,16 @@ static long vfio_pci_ioctl(void *device_data,
> >> struct vfio_pci_device *vdev = device_data;
> >> unsigned long minsz;
> >>
> >> + if (vdev->aer_error_in_progress && (cmd == VFIO_DEVICE_SET_IRQS ||
> >> + cmd == VFIO_DEVICE_RESET || cmd == VFIO_DEVICE_PCI_HOT_RESET)) {
> >> + int ret;
> >> + ret = wait_for_completion_interruptible(
> >> + &vdev->aer_error_completion);
> >> + if (ret) {
> >> + return ret;
> >> + }
> >
> > No brackets necessary.
> >
> >> + }
> >> +
> >> if (cmd == VFIO_DEVICE_GET_INFO) {
> >> struct vfio_device_info info;
> >>
> >> @@ -587,6 +598,10 @@ static long vfio_pci_ioctl(void *device_data,
> >> if (vdev->reset_works)
> >> info.flags |= VFIO_DEVICE_FLAGS_RESET;
> >>
> >> + info.flags |= VFIO_DEVICE_FLAGS_AERPROCESS;
> >> + if (vdev->aer_error_in_progress)
> >> + info.flags |= VFIO_DEVICE_FLAGS_INAERPROCESS;
> >> +
> >> info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
> >> info.num_irqs = VFIO_PCI_NUM_IRQS;
> >>
> >> @@ -996,6 +1011,14 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
> >>
> >> switch (index) {
> >> case VFIO_PCI_CONFIG_REGION_INDEX:
> >> + if (vdev->aer_error_in_progress && iswrite) {
> >> + int ret;
> >> + ret = wait_for_completion_interruptible(
> >> + &vdev->aer_error_completion);
> >> + if (ret) {
> >> + return ret;
> >> + }
> >> + }
> >> return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
> >>
> >> case VFIO_PCI_ROM_REGION_INDEX:
> >> @@ -1226,6 +1249,10 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
> >>
> >> mutex_lock(&vdev->igate);
> >>
> >> + vdev->aer_error_in_progress = true;
> >> + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
> >> + VFIO_IRQ_SET_ACTION_TRIGGER,
> >> + vdev->irq_type, 0, 0, NULL);
> >> if (vdev->err_trigger)
> >> eventfd_signal(vdev->err_trigger, 1);
> >>
> >> @@ -1252,6 +1279,9 @@ static void vfio_pci_aer_resume(struct pci_dev *pdev)
> >> }
> >>
> >> mutex_lock(&vdev->igate);
> >> +
> >> + vdev->aer_error_in_progress = false;
> >> + complete_all(&vdev->aer_error_completion);
> >> if (vdev->resume_trigger)
> >> eventfd_signal(vdev->resume_trigger, 1);
> >>
> >> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> >> index 80d4ddd..2f151f5 100644
> >> --- a/drivers/vfio/pci/vfio_pci_private.h
> >> +++ b/drivers/vfio/pci/vfio_pci_private.h
> >> @@ -84,6 +84,8 @@ struct vfio_pci_device {
> >> bool has_vga;
> >> bool needs_reset;
> >> bool nointx;
> >> + bool aer_error_in_progress;
> >> + struct completion aer_error_completion;
> >> struct pci_saved_state *pci_saved_state;
> >> int refcnt;
> >> struct eventfd_ctx *err_trigger;
> >> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >> index 34ab138..276ce50 100644
> >> --- a/include/uapi/linux/vfio.h
> >> +++ b/include/uapi/linux/vfio.h
> >> @@ -198,6 +198,8 @@ struct vfio_device_info {
> >> #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */
> >> #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */
> >> #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
> >> +#define VFIO_DEVICE_FLAGS_AERPROCESS (1 << 4) /* support aer error progress */
> >> +#define VFIO_DEVICE_FLAGS_INAERPROCESS (1 << 5)/* status in aer error progress */
> >> __u32 num_regions; /* Max region index + 1 */
> >> __u32 num_irqs; /* Max IRQ index + 1 */
> >> };
> >
> > Clearly this has only been tested for a single instance of an AER error
> > event and resume per device. Are the things you're intending to block
> > actually blocked for subsequent events? Note how complete_all() fills
> > the done field to let all current and future waiters go through and
> > nowhere is there a call to reinit_completion() to drain that path.
> > Thanks,
> >
> > Alex
>
> Do you mean this condition?
>
> For device 1:
> error1 occurs ---- error1 resumes
> error2 occurs ---- error2 resumes
> error3 occurs ---- error3 resumes
>
> In current code, I do complete_all() when error1 resumes.
> And this will unblock the device
> when error2 and error3 are still be processed.

So walk me through how this works. On vfio_pci_open() we call
init_completion(), which sets aer_error_completion.done equal to zero
(BTW, a user can open the device file descriptor multiple times, so
there's already a bug here). Let's assume that an error occurs and the
user stalls a single access on wait_for_completion_interruptible().
The bulk of this function happens here:

static inline long __sched
do_wait_for_common(struct completion *x,
long (*action)(long), long timeout, int state)
{
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);

__add_wait_queue_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = action(timeout);
spin_lock_irq(&x->wait.lock);
} while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
if (!x->done)
return timeout;
}
x->done--;
return timeout ?: 1;
}

So it waits within that do{}while loop for a completion, interruption,
or timeout. Then we have:

void complete_all(struct completion *x)
{
unsigned long flags;

spin_lock_irqsave(&x->wait.lock, flags);
x->done += UINT_MAX/2;
__wake_up_locked(&x->wait, TASK_NORMAL, 0);
spin_unlock_irqrestore(&x->wait.lock, flags);
}

So aer_error_completion.done gets incremented to let a couple billion
completion waiters through... Show me how another call to
wait_for_completion_interruptible() will ever block again within our
lifetime when the actual wait of do_wait_for_common() is only entered
when 'done' count is equal to zero. This seems to be why
reinit_completion() exists, but it's not used here. Thanks,

Alex