Re: [KVM PATCH v3 3/3] KVM: Fix races in irqfd using neweventfd_kref_get interface
From: Michael S. Tsirkin
Date: Mon Jun 22 2009 - 12:58:04 EST
On Mon, Jun 22, 2009 at 12:05:57PM -0400, Gregory Haskins wrote:
> This patch fixes all known races in irqfd, and paves the way to restore
> DEASSIGN support. For details of the eventfd races, please see the patch
> presumably commited immediately prior to this one.
>
> In a nutshell, we use eventfd_kref_get/put() to properly manage the
> lifetime of the underlying eventfd. We also use careful coordination
> with our workqueue to ensure that all irqfd objects have terminated
> before we allow kvm to shutdown. The logic used for shutdown walks
> all open irqfds and releases them. This logic can be generalized in
> the future to allow a subset of irqfds to be released, thus allowing
> DEASSIGN support.
>
> Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
I think this patch is a shade too tricky. Some explanation why below.
But I think irqfd_pop is a good idea.
Here's an alternative design sketch: add a list of irqfds to be shutdown
in kvm, and create a single-threaded workqueue. To kill an irqfd, move
it from list of live irqfds to list of dead irqfds, then schedule work
on a workqueue that walks this list and kills irqfds.
> ---
>
> virt/kvm/eventfd.c | 144 ++++++++++++++++++++++++++++++++++++++++------------
> 1 files changed, 110 insertions(+), 34 deletions(-)
>
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 9656027..67985cd 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -28,6 +28,7 @@
> #include <linux/file.h>
> #include <linux/list.h>
> #include <linux/eventfd.h>
> +#include <linux/kref.h>
>
> /*
> * --------------------------------------------------------------------
> @@ -36,26 +37,68 @@
> * Credit goes to Avi Kivity for the original idea.
> * --------------------------------------------------------------------
> */
> +
> +enum {
> + irqfd_flags_shutdown,
> +};
> +
> struct _irqfd {
> struct kvm *kvm;
> + struct kref *eventfd;
Yay, kref.
> int gsi;
> struct list_head list;
> poll_table pt;
> wait_queue_head_t *wqh;
> wait_queue_t wait;
> - struct work_struct inject;
> + struct work_struct work;
> + unsigned long flags;
Just make it "int shutdown"?
> };
>
> static void
> -irqfd_inject(struct work_struct *work)
> +irqfd_release(struct _irqfd *irqfd)
> +{
> + eventfd_kref_put(irqfd->eventfd);
> + kfree(irqfd);
> +}
> +
> +static void
> +irqfd_work(struct work_struct *work)
> {
> - struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> + struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
> struct kvm *kvm = irqfd->kvm;
>
> - mutex_lock(&kvm->irq_lock);
> - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> - mutex_unlock(&kvm->irq_lock);
> + if (!test_bit(irqfd_flags_shutdown, &irqfd->flags)) {
Why is it safe to test this bit outside of any lock?
> + /* Inject an interrupt */
> + mutex_lock(&kvm->irq_lock);
> + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> + mutex_unlock(&kvm->irq_lock);
> + } else {
Not much shared code here - create a separate showdown work struct?
They are cheap ...
> + /* shutdown the irqfd */
> + struct _irqfd *_irqfd = NULL;
> +
> + mutex_lock(&kvm->lock);
> +
> + if (!list_empty(&irqfd->list))
> + _irqfd = irqfd;
> +
> + if (_irqfd)
> + list_del(&_irqfd->list);
> +
> + mutex_unlock(&kvm->lock);
> +
> + /*
> + * If the item is not currently on the irqfds list, we know
> + * we are running concurrently with the KVM side trying to
> + * remove this item as well.
We do? How? As far as I can see list is only empty after it has been
created. Generally, it would be better to either use a flag or use
list_empty as an indication of going down, but not both.
> Since the KVM side should be
> + * holding the reference now, and will block behind a
> + * flush_work(), lets just let them do the release() for us
> + */
> + if (!_irqfd)
> + return;
> +
> + irqfd_release(_irqfd);
> + }
> }
>
> static int
> @@ -65,25 +108,20 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> unsigned long flags = (unsigned long)key;
>
> /*
> - * Assume we will be called with interrupts disabled
> + * called with interrupts disabled
> */
> - if (flags & POLLIN)
> - /*
> - * Defer the IRQ injection until later since we need to
> - * acquire the kvm->lock to do so.
> - */
> - schedule_work(&irqfd->inject);
> -
> if (flags & POLLHUP) {
> /*
> - * for now, just remove ourselves from the list and let
> - * the rest dangle. We will fix this up later once
> - * the races in eventfd are fixed
> + * ordering is important: shutdown flag must be visible
> + * before we schedule
> */
> __remove_wait_queue(irqfd->wqh, &irqfd->wait);
> - irqfd->wqh = NULL;
> + set_bit(irqfd_flags_shutdown, &irqfd->flags);
So what happens if a previously scheduled work runs on irqfd
and sees this flag? And note that multiple works can run on irqfd
in parallel.
> }
>
> + if (flags & (POLLHUP | POLLIN))
> + schedule_work(&irqfd->work);
> +
> return 0;
> }
>
> @@ -102,6 +140,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> {
> struct _irqfd *irqfd;
> struct file *file = NULL;
> + struct kref *kref = NULL;
> int ret;
> unsigned int events;
>
> @@ -112,7 +151,7 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> irqfd->kvm = kvm;
> irqfd->gsi = gsi;
> INIT_LIST_HEAD(&irqfd->list);
> - INIT_WORK(&irqfd->inject, irqfd_inject);
> + INIT_WORK(&irqfd->work, irqfd_work);
>
> file = eventfd_fget(fd);
> if (IS_ERR(file)) {
> @@ -133,11 +172,13 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> list_add_tail(&irqfd->list, &kvm->irqfds);
> mutex_unlock(&kvm->lock);
>
> - /*
> - * Check if there was an event already queued
> - */
> - if (events & POLLIN)
> - schedule_work(&irqfd->inject);
> + kref = eventfd_kref_get(file);
> + if (IS_ERR(file)) {
> + ret = PTR_ERR(file);
> + goto fail;
> + }
> +
> + irqfd->eventfd = kref;
>
> /*
> * do not drop the file until the irqfd is fully initialized, otherwise
> @@ -145,9 +186,18 @@ kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> */
> fput(file);
>
> + /*
> + * Check if there was an event already queued
> + */
This comment seems to confuse more that it clarifies:
queued where? eventfd only counts... Just kill the comment?
> + if (events & POLLIN)
> + schedule_work(&irqfd->work);
> +
> return 0;
>
> fail:
> + if (kref && !IS_ERR(kref))
> + eventfd_kref_put(kref);
> +
> if (file && !IS_ERR(file))
> fput(file);
let's add a couple more labels and avoid the kref/file check
and the initialization above?
>
> @@ -161,21 +211,47 @@ kvm_irqfd_init(struct kvm *kvm)
> INIT_LIST_HEAD(&kvm->irqfds);
> }
>
> +static struct _irqfd *
> +irqfd_pop(struct kvm *kvm)
> +{
> + struct _irqfd *irqfd = NULL;
> +
> + mutex_lock(&kvm->lock);
> +
> + if (!list_empty(&kvm->irqfds)) {
> + irqfd = list_first_entry(&kvm->irqfds, struct _irqfd, list);
> + list_del(&irqfd->list);
> + }
> +
> + mutex_unlock(&kvm->lock);
> +
> + return irqfd;
> +}
> +
> void
> kvm_irqfd_release(struct kvm *kvm)
> {
> - struct _irqfd *irqfd, *tmp;
> + struct _irqfd *irqfd;
>
> - list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
> - if (irqfd->wqh)
> - remove_wait_queue(irqfd->wqh, &irqfd->wait);
> + while ((irqfd = irqfd_pop(kvm))) {
>
> - flush_work(&irqfd->inject);
> + remove_wait_queue(irqfd->wqh, &irqfd->wait);
>
> - mutex_lock(&kvm->lock);
> - list_del(&irqfd->list);
> - mutex_unlock(&kvm->lock);
> + /*
> + * We guarantee there will be no more notifications after
> + * the remove_wait_queue returns. Now lets make sure we
> + * synchronize behind any outstanding work items before
> + * releasing the resources
> + */
> + flush_work(&irqfd->work);
>
> - kfree(irqfd);
> + irqfd_release(irqfd);
> }
> +
> + /*
> + * We need to wait in case there are any outstanding work-items
> + * in flight that had already removed themselves from the list
> + * prior to entry to this function
> + */
Looks scary. Why doesn't the flush above cover all cases?
> + flush_scheduled_work();
> }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/