Re: [PATCH v7 2/2] kvm: KVM_EOIFD, an eventfd for EOIs
From: Michael S. Tsirkin
Date: Sun Jul 29 2012 - 10:54:27 EST
On Tue, Jul 24, 2012 at 02:43:22PM -0600, Alex Williamson wrote:
> This new ioctl enables an eventfd to be triggered when an EOI is
> written for a specified irqchip pin. The first user of this will
> be external device assignment through VFIO, using a level irqfd
> for asserting a PCI INTx interrupt and this interface for de-assert
> and notification once the interrupt is serviced.
>
> Here we make use of the reference counting of the _irq_source
> object allowing us to share it with an irqfd and cleanup regardless
> of the release order.
>
> Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
> ---
>
> Documentation/virtual/kvm/api.txt | 21 ++
> arch/x86/kvm/x86.c | 2
> include/linux/kvm.h | 15 ++
> include/linux/kvm_host.h | 13 +
> virt/kvm/eventfd.c | 336 +++++++++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 11 +
> 6 files changed, 398 insertions(+)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 3911e62..8cd6b36 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1989,6 +1989,27 @@ return the hash table order in the parameter. (If the guest is using
> the virtualized real-mode area (VRMA) facility, the kernel will
> re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
>
> +4.77 KVM_EOIFD
> +
> +Capability: KVM_CAP_EOIFD
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_eoifd (in)
> +Returns: 0 on success, < 0 on error
> +
> +KVM_EOIFD allows userspace to receive interrupt EOI notification
> +through an eventfd.
I thought about it some more, and I think it should be renamed to an
interrupt ack notification than eoi notification.
For example, consider userspace that uses threaded interrupts.
Currently what will happen is each interrupt will be injected
twice, since on eoi device is still asserting it.
One fix would be to delay event until interrupt is re-enabled.
Now I am not asking you to fix this immediately,
but I think we should make the interface generic by
saying we report an ack to userspace and not specifically EOI.
> kvm_eoifd.fd specifies the eventfd used for
> +notification. KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
> +once assigned. KVM_EOIFD also requires additional bits set in
> +kvm_eoifd.flags to bind to the proper interrupt line. The
> +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided
> +and is a key from a level triggered interrupt (configured from
> +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL). The EOI notification is bound
> +to the same GSI and irqchip input as the irqfd. Both kvm_eoifd.key
> +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and
> +de-assignment of KVM_EOIFD. A level irqfd may only be bound to a
> +single eoifd. KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of
> +KVM_EOIFD_FLAG_LEVEL_IRQFD.
>
Hmm returning the key means we'll need to keep refcounting for source
IDs around forever. I liked passing the fd better: make implementation
match interface and not the other way around.
> 5. The kvm_run structure
> ------------------------
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 9ded39d..8f3164e 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2171,6 +2171,8 @@ int kvm_dev_ioctl_check_extension(long ext)
> case KVM_CAP_PCI_2_3:
> case KVM_CAP_KVMCLOCK_CTRL:
> case KVM_CAP_IRQFD_LEVEL:
> + case KVM_CAP_EOIFD:
> + case KVM_CAP_EOIFD_LEVEL_IRQFD:
> r = 1;
> break;
> case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index b2e6e4f..effb916 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -619,6 +619,8 @@ struct kvm_ppc_smmu_info {
> #define KVM_CAP_S390_COW 79
> #define KVM_CAP_PPC_ALLOC_HTAB 80
> #define KVM_CAP_IRQFD_LEVEL 81
> +#define KVM_CAP_EOIFD 82
> +#define KVM_CAP_EOIFD_LEVEL_IRQFD 83
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -694,6 +696,17 @@ struct kvm_irqfd {
> __u8 pad[20];
> };
>
> +#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0)
> +/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */
> +#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1)
> +
> +struct kvm_eoifd {
> + __u32 fd;
> + __u32 flags;
> + __u32 key;
> + __u8 pad[20];
> +};
> +
> struct kvm_clock_data {
> __u64 clock;
> __u32 flags;
> @@ -834,6 +847,8 @@ struct kvm_s390_ucas_mapping {
> #define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info)
> /* Available with KVM_CAP_PPC_ALLOC_HTAB */
> #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32)
> +/* Available with KVM_CAP_EOIFD */
> +#define KVM_EOIFD _IOW(KVMIO, 0xa8, struct kvm_eoifd)
>
> /*
> * ioctls for vcpu fds
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index c73f071..01e72a6 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -289,6 +289,10 @@ struct kvm {
> struct mutex lock;
> struct list_head items;
> } irqsources;
> + struct {
> + spinlock_t lock;
> + struct list_head items;
> + } eoifds;
> #endif
> struct kvm_vm_stat stat;
> struct kvm_arch arch;
> @@ -832,6 +836,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
> void kvm_irqfd_release(struct kvm *kvm);
> void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
> int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args);
> +void kvm_eoifd_release(struct kvm *kvm);
>
> #else
>
> @@ -857,6 +863,13 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> return -ENOSYS;
> }
>
> +static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> +{
> + return -ENOSYS;
> +}
> +
> +static inline void kvm_eoifd_release(struct kvm *kvm) {}
> +
> #endif /* CONFIG_HAVE_KVM_EVENTFD */
>
> #ifdef CONFIG_KVM_APIC_ARCHITECTURE
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 878cb52..3aa2d62 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -95,6 +95,25 @@ static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi)
> return source;
> }
>
> +static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key)
> +{
> + struct _irq_source *tmp, *source = ERR_PTR(-ENOENT);
> +
> + mutex_lock(&kvm->irqsources.lock);
> +
> + list_for_each_entry(tmp, &kvm->irqsources.items, list) {
> + if (tmp->id == key) {
> + source = tmp;
> + kref_get(&source->kref);
> + break;
> + }
> + }
> +
> + mutex_unlock(&kvm->irqsources.lock);
> +
> + return source;
> +}
> +
> /*
> * --------------------------------------------------------------------
> * irqfd: Allows an fd to be used to inject an interrupt to the guest
> @@ -406,6 +425,8 @@ kvm_eventfd_init(struct kvm *kvm)
> INIT_LIST_HEAD(&kvm->ioeventfds);
> mutex_init(&kvm->irqsources.lock);
> INIT_LIST_HEAD(&kvm->irqsources.items);
> + spin_lock_init(&kvm->eoifds.lock);
> + INIT_LIST_HEAD(&kvm->eoifds.items);
> }
>
> /*
> @@ -772,3 +793,318 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>
> return kvm_assign_ioeventfd(kvm, args);
> }
> +
> +/*
> + * --------------------------------------------------------------------
> + * eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal.
> + *
> + * userspace can register with an eventfd for receiving
> + * notification when an EOI occurs.
> + * --------------------------------------------------------------------
> + */
> +
> +struct _eoifd {
> + /* eventfd triggered on EOI */
> + struct eventfd_ctx *eventfd;
> + /* irq source ID de-asserted on EOI */
> + struct _irq_source *source;
> + wait_queue_t wait;
> + /* EOI notification from KVM */
> + struct kvm_irq_ack_notifier notifier;
> + struct list_head list;
> + poll_table pt;
> + struct work_struct shutdown;
> +};
> +
> +/* Called under eoifds.lock */
> +static void eoifd_shutdown(struct work_struct *work)
> +{
> + struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown);
> + struct kvm *kvm = eoifd->source->kvm;
> + u64 cnt;
> +
> + /*
> + * Stop EOI signaling
> + */
> + kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
> +
> + /*
> + * Synchronize with the wait-queue and unhook ourselves to prevent
> + * further events.
> + */
> + eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt);
> +
> + /*
> + * Release resources
> + */
> + eventfd_ctx_put(eoifd->eventfd);
> + _irq_source_put(eoifd->source);
> + kfree(eoifd);
> +}
> +
> +/* assumes kvm->eoifds.lock is held */
> +static bool eoifd_is_active(struct _eoifd *eoifd)
> +{
> + return list_empty(&eoifd->list) ? false : true;
> +}
> +
> +/*
> + * Mark the eoifd as inactive and schedule it for removal
> + *
> + * assumes kvm->eoifds.lock is held
> + */
> +static void eoifd_deactivate(struct _eoifd *eoifd)
> +{
> + BUG_ON(!eoifd_is_active(eoifd));
> +
> + list_del_init(&eoifd->list);
> +
> + queue_work(irqfd_cleanup_wq, &eoifd->shutdown);
> +}
> +
> +/*
> + * Called with wqh->lock held and interrupts disabled
> + */
> +static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> +{
> + unsigned long flags = (unsigned long)key;
> +
> + if (unlikely(flags & POLLHUP)) {
> + /* The eventfd is closing, detach from KVM */
> + struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait);
> + struct kvm *kvm = eoifd->source->kvm;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&kvm->eoifds.lock, flags);
> +
> + /*
> + * We must check if someone deactivated the eoifd before
> + * we could acquire the eoifds.lock since the item is
> + * deactivated from the KVM side before it is unhooked from
> + * the wait-queue. If it is already deactivated, we can
> + * simply return knowing the other side will cleanup for us.
> + * We cannot race against the eoifd going away since the
> + * other side is required to acquire wqh->lock, which we hold
> + */
> + if (eoifd_is_active(eoifd))
> + eoifd_deactivate(eoifd);
> +
> + spin_unlock_irqrestore(&kvm->eoifds.lock, flags);
> + }
> +
> + return 0;
> +}
Looks like there is a bug here: if I close irqfd, then close eoifd,
the key is not immediately released so an attempt to create
an irqfd can fail to get the source id.
Maybe we should simply document that userspace should deassign
eoifd before closing it? This is what we do for ioeventfd.
If we do this, the whole polling code can go away completely.
> +
> +static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> + poll_table *pt)
> +{
> + struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt);
> + add_wait_queue(wqh, &eoifd->wait);
> +}
> +
> +/*
> + * This function is called as the kvm VM fd is being released. Shutdown all
> + * eoifds that still remain open
> + */
> +void kvm_eoifd_release(struct kvm *kvm)
> +{
> + struct _eoifd *tmp, *eoifd;
> +
> + spin_lock_irq(&kvm->eoifds.lock);
> +
> + list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list)
> + eoifd_deactivate(eoifd);
> +
> + spin_unlock_irq(&kvm->eoifds.lock);
> +
> + flush_workqueue(irqfd_cleanup_wq);
> +}
> +
> +static void eoifd_event(struct kvm_irq_ack_notifier *notifier)
> +{
> + struct _eoifd *eoifd;
> +
> + eoifd = container_of(notifier, struct _eoifd, notifier);
> +
> + if (unlikely(!eoifd->source))
> + return;
> +
> + /*
> + * De-assert and send EOI, user needs to re-assert if
> + * device still requires service.
> + */
I'm not sure why did you drop filtering by source id.
This means userspace gets events even if it did not send an interrupt.
So
1. Should be documented that you can get spurious events
2. when an interrupt is shared with an emulated device,
and said device uses EOI, this will not
perform well as we will wake up userspace on each EOI.
3. Just sharing interrupt with virtio means we are polling
assigned device on each virtio interrupt.
> + kvm_set_irq(eoifd->source->kvm,
> + eoifd->source->id, eoifd->source->gsi, 0);
> + eventfd_signal(eoifd->eventfd, 1);
> +}
> +
> +static int kvm_assign_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> +{
> + struct file *file = NULL;
> + struct eventfd_ctx *eventfd = NULL;
> + struct _eoifd *eoifd = NULL, *tmp;
> + struct _irq_source *source = NULL;
> + int ret;
> + u64 cnt;
> +
> + if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD))
> + return -EINVAL;
> +
> + file = eventfd_fget(args->fd);
> + if (IS_ERR(file)) {
> + ret = PTR_ERR(file);
> + goto fail;
> + }
> +
> + eventfd = eventfd_ctx_fileget(file);
> + if (IS_ERR(eventfd)) {
> + ret = PTR_ERR(eventfd);
> + goto fail;
> + }
> +
> + eoifd = kzalloc(sizeof(*eoifd), GFP_KERNEL);
> + if (!eoifd) {
> + ret = -ENOMEM;
> + goto fail;
> + }
> +
> + source = _irq_source_get_from_key(kvm, args->key);
> + if (IS_ERR(source)) {
> + ret = PTR_ERR(source);
> + goto fail;
> + }
> +
> + INIT_LIST_HEAD(&eoifd->list);
> + INIT_WORK(&eoifd->shutdown, eoifd_shutdown);
> + eoifd->eventfd = eventfd;
> + eoifd->notifier.gsi = source->gsi;
> + eoifd->notifier.irq_acked = eoifd_event;
> +
> + /*
> + * Install our own custom wake-up handling so we are notified via
> + * a callback whenever someone releases the underlying eventfd
> + */
> + init_waitqueue_func_entry(&eoifd->wait, eoifd_wakeup);
> + init_poll_funcptr(&eoifd->pt, eoifd_ptable_queue_proc);
> +
> + /*
> + * Clear out any previously released eoifds that might conflict
> + */
> + flush_workqueue(irqfd_cleanup_wq);
> +
> + /*
> + * This can sleep, so register before acquiring spinlock, notifier
> + * becomes a nop until we finish.
> + */
> + kvm_register_irq_ack_notifier(kvm, &eoifd->notifier);
> +
> + /*
> + * Install the wait queue function to allow cleanup when the
> + * eventfd is closed by the user. This grabs the wqh lock, so
> + * we do it out of spinlock, holding the file reference ensures
> + * we won't see a POLLHUP until setup is complete.
> + */
> + file->f_op->poll(file, &eoifd->pt);
> +
> + spin_lock_irq(&kvm->eoifds.lock);
> +
> + /*
> + * Enforce a one-to-one relationship between irq source and eoifd so
> + * that this interface can't be used to consume all kernel memory.
> + * NB. single eventfd can still be used by multiple eoifds.
> + */
> + list_for_each_entry(tmp, &kvm->eoifds.items, list) {
> + if (tmp->source == source) {
> + spin_unlock_irq(&kvm->eoifds.lock);
> + ret = -EBUSY;
> + goto fail_unregister;
> + }
> + }
> +
> + list_add_tail(&eoifd->list, &kvm->eoifds.items);
> + eoifd->source = source; /* Enable ack notifier */
> +
> + spin_unlock_irq(&kvm->eoifds.lock);
> +
> + fput(file); /* Enable POLLHUP */
> +
> + return 0;
> +
> +fail_unregister:
> + eventfd_ctx_remove_wait_queue(eventfd, &eoifd->wait, &cnt);
> + kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
> +fail:
> + if (source && !IS_ERR(source))
> + _irq_source_put(source);
> +
> + if (eventfd && !IS_ERR(eventfd))
> + eventfd_ctx_put(eventfd);
> +
> + if (file && !IS_ERR(file))
> + fput(file);
> +
> + kfree(eoifd);
> + return ret;
> +}
> +
> +static int kvm_deassign_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> +{
> + struct eventfd_ctx *eventfd = NULL;
> + struct _irq_source *source = NULL;
> + struct _eoifd *eoifd;
> + int ret = -ENOENT;
> +
> + if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD))
> + return -EINVAL;
> +
> + eventfd = eventfd_ctx_fdget(args->fd);
> + if (IS_ERR(eventfd)) {
> + ret = PTR_ERR(eventfd);
> + goto fail;
> + }
> +
> + source = _irq_source_get_from_key(kvm, args->key);
> + if (IS_ERR(source)) {
> + ret = PTR_ERR(source);
> + goto fail;
> + }
> +
> + spin_lock_irq(&kvm->eoifds.lock);
> +
> + list_for_each_entry(eoifd, &kvm->eoifds.items, list) {
> + if (eoifd->eventfd == eventfd && eoifd->source == source) {
> + eoifd_deactivate(eoifd);
> + ret = 0;
> + break;
> + }
> + }
> +
> + spin_unlock_irq(&kvm->eoifds.lock);
> +
> +fail:
> + if (source && !IS_ERR(source))
> + _irq_source_put(source);
> + if (eventfd && !IS_ERR(eventfd))
> + eventfd_ctx_put(eventfd);
> +
> + /*
> + * Block until we know all outstanding shutdown jobs have completed
> + * so that we guarantee there will not be any more EOIs signaled on
> + * this eventfd once this deassign function returns.
> + */
> + flush_workqueue(irqfd_cleanup_wq);
> +
> + return ret;
> +}
> +
> +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
> +{
> + if (args->flags & ~(KVM_EOIFD_FLAG_DEASSIGN |
> + KVM_EOIFD_FLAG_LEVEL_IRQFD))
> + return -EINVAL;
> +
> + if (args->flags & KVM_EOIFD_FLAG_DEASSIGN)
> + return kvm_deassign_eoifd(kvm, args);
> +
> + return kvm_assign_eoifd(kvm, args);
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2468523..0b241bf 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -620,6 +620,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
>
> kvm_irqfd_release(kvm);
>
> + kvm_eoifd_release(kvm);
> +
> kvm_put_kvm(kvm);
> return 0;
> }
> @@ -2093,6 +2095,15 @@ static long kvm_vm_ioctl(struct file *filp,
> break;
> }
> #endif
> + case KVM_EOIFD: {
> + struct kvm_eoifd data;
> +
> + r = -EFAULT;
> + if (copy_from_user(&data, argp, sizeof data))
> + goto out;
> + r = kvm_eoifd(kvm, &data);
> + break;
> + }
> default:
> r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> if (r == -ENOTTY)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/