Re: [PATCH 2/2] vfio_pci: make use of update_irq_devid and optimize irq ops

From: Yunsheng Lin
Date: Mon Aug 12 2019 - 01:45:09 EST


On 2019/8/8 20:07, Ben Luo wrote:
> When userspace (e.g. qemu) triggers a switch between KVM
> irqfd and userspace eventfd, only dev_id of irq action
> (i.e. the "trigger" in this patch's context) will be
> changed, but a free-then-request-irq action is taken in
> current code. And, irq affinity setting in VM will also
> trigger a free-then-request-irq action, which actully
> changes nothing, but only fires a producer re-registration
> to update irte in case that posted-interrupt is in use.
>
> This patch makes use of update_irq_devid() and optimize
> both cases above, which reduces the risk of losing interrupt
> and also cuts some overhead.
>
> Signed-off-by: Ben Luo <luoben@xxxxxxxxxxxxxxxxx>
> Reviewed-by: Liu Jiang <gerry@xxxxxxxxxxxxxxxxx>
> ---
> drivers/vfio/pci/vfio_pci_intrs.c | 100 +++++++++++++++++++++++---------------
> 1 file changed, 62 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 3fa3f72..1323dc8 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -285,69 +285,93 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
> int vector, int fd, bool msix)
> {
> struct pci_dev *pdev = vdev->pdev;
> - struct eventfd_ctx *trigger;
> + struct eventfd_ctx *trigger = NULL;

struct eventfd_ctx *trigger = NULL;
struct pci_dev *pdev = vdev->pdev;

to maintain reverse christmas tree?

> int irq, ret;
>
> if (vector < 0 || vector >= vdev->num_ctx)
> return -EINVAL;
>
> + if (fd >= 0) {
> + trigger = eventfd_ctx_fdget(fd);
> + if (IS_ERR(trigger))
> + return PTR_ERR(trigger);

It seems vdev->ctx[vector].trigger is freed even if fd < 0 before
this patch. If it return here, vdev->ctx[vector].trigger is not free?

> + }
> +
> irq = pci_irq_vector(pdev, vector);
>
> if (vdev->ctx[vector].trigger) {
> - free_irq(irq, vdev->ctx[vector].trigger);
> - irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
> - kfree(vdev->ctx[vector].name);
> - eventfd_ctx_put(vdev->ctx[vector].trigger);
> - vdev->ctx[vector].trigger = NULL;
> + if (vdev->ctx[vector].trigger != trigger) {
> + if (trigger) {
> + ret = update_irq_devid(irq,
> + vdev->ctx[vector].trigger, trigger);
> + if (unlikely(ret)) {
> + dev_info(&pdev->dev,
> + "update_irq_devid %d (token %p) fails: %d\n",
> + irq, vdev->ctx[vector].trigger, ret);
> + eventfd_ctx_put(trigger);
> + return ret;
> + }
> + irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
> + eventfd_ctx_put(vdev->ctx[vector].trigger);
> + vdev->ctx[vector].producer.token = trigger;
> + vdev->ctx[vector].trigger = trigger;
> + } else {
> + free_irq(irq, vdev->ctx[vector].trigger);
> + irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
> + kfree(vdev->ctx[vector].name);
> + eventfd_ctx_put(vdev->ctx[vector].trigger);
> + vdev->ctx[vector].trigger = NULL;
> + }
> + } else
> + irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
> }

Maybe adjust it a litte to reduce indent and to improve readability?

if (vdev->ctx[vector].trigger && vdev->ctx[vector].trigger == trigger) {
irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
} else if (vdev->ctx[vector].trigger && !trigger) {
free_irq(irq, vdev->ctx[vector].trigger);
irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
kfree(vdev->ctx[vector].name);
eventfd_ctx_put(vdev->ctx[vector].trigger);
vdev->ctx[vector].trigger = NULL;
} else if (vdev->ctx[vector].trigger) {
ret = update_irq_devid(irq, vdev->ctx[vector].trigger, trigger);
if (unlikely(ret)) {
dev_info(&pdev->dev,
"update_irq_devid %d (token %p) fails: %d\n",
irq, vdev->ctx[vector].trigger, ret);
eventfd_ctx_put(trigger);
return ret;
}
irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
eventfd_ctx_put(vdev->ctx[vector].trigger);
vdev->ctx[vector].producer.token = trigger;
vdev->ctx[vector].trigger = trigger;
}


>
> if (fd < 0)
> return 0;
>
> - vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "vfio-msi%s[%d](%s)",
> - msix ? "x" : "", vector,
> - pci_name(pdev));
> - if (!vdev->ctx[vector].name)
> - return -ENOMEM;
> + if (vdev->ctx[vector].trigger == NULL) {

It may be common to use the below check to do NULL checking:
If (!vdev->ctx[vector].trigger)


> + vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "vfio-msi%s[%d](%s)",
> + msix ? "x" : "", vector,
> + pci_name(pdev));
> + if (!vdev->ctx[vector].name) {
> + eventfd_ctx_put(trigger);
> + return -ENOMEM;
> + }
>
> - trigger = eventfd_ctx_fdget(fd);
> - if (IS_ERR(trigger)) {
> - kfree(vdev->ctx[vector].name);
> - return PTR_ERR(trigger);
> - }
> + /*
> + * The MSIx vector table resides in device memory which may be cleared
> + * via backdoor resets. We don't allow direct access to the vector
> + * table so even if a userspace driver attempts to save/restore around
> + * such a reset it would be unsuccessful. To avoid this, restore the
> + * cached value of the message prior to enabling.
> + */
> + if (msix) {
> + struct msi_msg msg;
>
> - /*
> - * The MSIx vector table resides in device memory which may be cleared
> - * via backdoor resets. We don't allow direct access to the vector
> - * table so even if a userspace driver attempts to save/restore around
> - * such a reset it would be unsuccessful. To avoid this, restore the
> - * cached value of the message prior to enabling.
> - */
> - if (msix) {
> - struct msi_msg msg;
> + get_cached_msi_msg(irq, &msg);
> + pci_write_msi_msg(irq, &msg);
> + }
>
> - get_cached_msi_msg(irq, &msg);
> - pci_write_msi_msg(irq, &msg);
> - }
> + ret = request_irq(irq, vfio_msihandler, 0,
> + vdev->ctx[vector].name, trigger);
> + if (ret) {
> + kfree(vdev->ctx[vector].name);
> + eventfd_ctx_put(trigger);
> + return ret;
> + }
>
> - ret = request_irq(irq, vfio_msihandler, 0,
> - vdev->ctx[vector].name, trigger);
> - if (ret) {
> - kfree(vdev->ctx[vector].name);
> - eventfd_ctx_put(trigger);
> - return ret;
> + vdev->ctx[vector].producer.token = trigger;
> + vdev->ctx[vector].producer.irq = irq;
> + vdev->ctx[vector].trigger = trigger;
> }
>
> - vdev->ctx[vector].producer.token = trigger;
> - vdev->ctx[vector].producer.irq = irq;
> + /* always update irte for posted mode */
> ret = irq_bypass_register_producer(&vdev->ctx[vector].producer);
> if (unlikely(ret))
> dev_info(&pdev->dev,
> "irq bypass producer (token %p) registration fails: %d\n",
> vdev->ctx[vector].producer.token, ret);
>
> - vdev->ctx[vector].trigger = trigger;
> -
> return 0;
> }
>
>