[PATCH v10 2/2] kvm: Add resampling irqfds for level triggeredinterrupts

From: Alex Williamson
Date: Mon Sep 17 2012 - 23:16:22 EST


To emulate level triggered interrupts, add a resample option to
KVM_IRQFD. When specified, a new resamplefd is provided that notifies
the user when the irqchip has been resampled by the VM. This may, for
instance, indicate an EOI. Also in this mode, injection of an
interrupt through an irqfd only asserts the interrupt. On resampling,
the interrupt is automatically de-asserted prior to user notification.
This enables level triggered interrupts to be injected and re-enabled
from vfio with no userspace intervention.

All resampling irqfds can make use of a single irq source ID, so we
reserve a new one for this interface.

Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
---

Documentation/virtual/kvm/api.txt | 13 +++
arch/x86/kvm/x86.c | 4 +
include/linux/kvm.h | 12 ++-
include/linux/kvm_host.h | 4 +
virt/kvm/eventfd.c | 175 ++++++++++++++++++++++++++++++++++++-
virt/kvm/irq_comm.c | 6 +
virt/kvm/kvm_main.c | 2
7 files changed, 210 insertions(+), 6 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index bf33aaa..d30af74 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1946,6 +1946,19 @@ the guest using the specified gsi pin. The irqfd is removed using
the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd
and kvm_irqfd.gsi.

+With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify
+mechanism allowing emulation of level-triggered, irqfd-based
+interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an
+additional eventfd in the kvm_irqfd.resamplefd field. When operating
+in resample mode, injection of an interrupt through kvm_irq.fd asserts
+the specified gsi in the irqchip. When the irqchip is resampled, such
+as from an EOI, the gsi is de-asserted and the user is notifed via
+kvm_irqfd.resamplefd. It is the user's respnosibility to re-inject
+the interrupt if the device making use of it still requires service.
+Note that closing the resamplefd is not sufficient to disable the
+irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
+and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
+
4.76 KVM_PPC_ALLOCATE_HTAB

Capability: KVM_CAP_PPC_ALLOC_HTAB
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2966c84..56f9002 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2177,6 +2177,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_PCI_2_3:
case KVM_CAP_KVMCLOCK_CTRL:
+ case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -6268,6 +6269,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)

/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+ /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+ set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ &kvm->arch.irq_sources_bitmap);

raw_spin_lock_init(&kvm->arch.tsc_write_lock);

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 2ce09aa..a01a3d5 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_PPC_GET_SMMU_INFO 78
#define KVM_CAP_S390_COW 79
#define KVM_CAP_PPC_ALLOC_HTAB 80
+#define KVM_CAP_IRQFD_RESAMPLE 81

#ifdef KVM_CAP_IRQ_ROUTING

@@ -683,12 +684,21 @@ struct kvm_xen_hvm_config {
#endif

#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
+/*
+ * Available with KVM_CAP_IRQFD_RESAMPLE
+ *
+ * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies
+ * the irqfd to operate in resampling mode for level triggered interrupt
+ * emlation. See Documentation/virtual/kvm/api.txt.
+ */
+#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1)

struct kvm_irqfd {
__u32 fd;
__u32 gsi;
__u32 flags;
- __u8 pad[20];
+ __u32 resamplefd;
+ __u8 pad[16];
};

struct kvm_clock_data {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 84f6950..d7bc79c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -70,7 +70,8 @@
#define KVM_REQ_PMU 16
#define KVM_REQ_PMI 17

-#define KVM_USERSPACE_IRQ_SOURCE_ID 0
+#define KVM_USERSPACE_IRQ_SOURCE_ID 0
+#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1

struct kvm;
struct kvm_vcpu;
@@ -283,6 +284,7 @@ struct kvm {
struct {
spinlock_t lock;
struct list_head items;
+ struct list_head resamplers;
} irqfds;
struct list_head ioeventfds;
#endif
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 7d7e2aa..822f50a 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -43,6 +43,31 @@
* --------------------------------------------------------------------
*/

+/*
+ * Resamper irqfds are a special variety of irqfds used to emulate
+ * level triggered interrupts. The interrupt is asserted on eventfd
+ * trigger. On acknowledgement through the irq ack notifier, the
+ * interrupt is de-asserted and userspace is notified through the
+ * resamplefd. All resamplers on the same gsi are de-asserted
+ * together, so we don't need to track the state of each individual
+ * user. We can also therefore share the same irq source ID.
+ */
+struct _irqfd_resampler {
+ struct kvm *kvm;
+ /*
+ * List of resampling irqfds sharing this gsi.
+ * RCU list modified under kvm->irqfds.lock
+ */
+ struct list_head irqfds;
+ /* The irq ack notifier for this gsi */
+ struct kvm_irq_ack_notifier notifier;
+ /*
+ * Entry in list of kvm->irqfd.resamplers for shutdown cleanup.
+ * Accessed and modified under kvm->irqfds.lock
+ */
+ struct list_head list;
+};
+
struct _irqfd {
/* Used for MSI fast-path */
struct kvm *kvm;
@@ -52,6 +77,12 @@ struct _irqfd {
/* Used for level IRQ fast-path */
int gsi;
struct work_struct inject;
+ /* Eventfd notified on resample (resampler-only) */
+ struct eventfd_ctx *resamplefd;
+ /* Entry in list of irqfds for a resampler (resampler-only) */
+ struct list_head resampler_list;
+ /* The resampler used by this irqfd (resampler-only) */
+ struct _irqfd_resampler *resampler;
/* Used for setup/shutdown */
struct eventfd_ctx *eventfd;
struct list_head list;
@@ -71,6 +102,39 @@ irqfd_inject(struct work_struct *work)
kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
}

+static void
+irqfd_resampler_inject(struct work_struct *work)
+{
+ struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+
+ kvm_set_irq(irqfd->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ irqfd->gsi, 1);
+}
+
+/*
+ * Since resampler irqfds share an IRQ source ID, we de-assert once
+ * then notify all of the resampler irqfds using this GSI. We can't
+ * do multiple de-asserts or we risk racing with incoming re-asserts.
+ */
+static void
+irqfd_resampler_notify(struct kvm_irq_ack_notifier *kian)
+{
+ struct _irqfd_resampler *resampler;
+ struct _irqfd *irqfd;
+
+ resampler = container_of(kian, struct _irqfd_resampler, notifier);
+
+ rcu_read_lock();
+
+ kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+
+ list_for_each_entry_rcu(irqfd, &resampler->irqfds, resampler_list)
+ eventfd_signal(irqfd->resamplefd, 1);
+
+ rcu_read_unlock();
+}
+
/*
* Race-free decouple logic (ordering is critical)
*/
@@ -92,6 +156,43 @@ irqfd_shutdown(struct work_struct *work)
*/
flush_work_sync(&irqfd->inject);

+ if (irqfd->resampler) {
+ struct _irqfd_resampler *resampler = irqfd->resampler;
+ struct kvm *kvm = resampler->kvm;
+
+ mutex_lock(&kvm->irq_lock);
+ spin_lock_irq(&irqfd->kvm->irqfds.lock);
+
+ list_del_rcu(&irqfd->resampler_list);
+
+ /*
+ * On removal of the last irqfd in the resampler list,
+ * remove the resampler and unregister the irq ack
+ * notifier. It's possible to race the ack of the final
+ * injection here, so manually de-assert the gsi to avoid
+ * leaving an unmanaged, asserted interrupt line.
+ */
+ if (list_empty(&resampler->irqfds)) {
+ list_del(&resampler->list);
+ __kvm_unregister_irq_ack_notifier(kvm,
+ &resampler->notifier);
+ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ resampler->notifier.gsi, 0);
+ kfree(resampler);
+ }
+
+ spin_unlock_irq(&irqfd->kvm->irqfds.lock);
+ mutex_unlock(&kvm->irq_lock);
+
+ /*
+ * Both list_del_rcu & __kvm_unregister_irq_ack_notifier
+ * require an rcu grace period/
+ */
+ synchronize_rcu();
+
+ eventfd_ctx_put(irqfd->resamplefd);
+ }
+
/*
* It is now safe to release the object's resources
*/
@@ -202,8 +303,9 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct kvm_irq_routing_table *irq_rt;
struct _irqfd *irqfd, *tmp;
+ struct _irqfd_resampler *resampler = NULL;
struct file *file = NULL;
- struct eventfd_ctx *eventfd = NULL;
+ struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
unsigned int events;

@@ -214,7 +316,40 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
irqfd->kvm = kvm;
irqfd->gsi = args->gsi;
INIT_LIST_HEAD(&irqfd->list);
- INIT_WORK(&irqfd->inject, irqfd_inject);
+ INIT_LIST_HEAD(&irqfd->resampler_list);
+
+ if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
+ resamplefd = eventfd_ctx_fdget(args->resamplefd);
+ if (IS_ERR(resamplefd)) {
+ ret = PTR_ERR(resamplefd);
+ goto fail;
+ }
+
+ irqfd->resamplefd = resamplefd;
+
+ /*
+ * We may be able to share an existing resampler, but we
+ * won't know that until we search under spinlock. Setup
+ * one here to avoid an atomic allocation later.
+ */
+ resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+ if (!resampler) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ resampler->kvm = kvm;
+ INIT_LIST_HEAD(&resampler->irqfds);
+ resampler->notifier.gsi = irqfd->gsi;
+ resampler->notifier.irq_acked = irqfd_resampler_notify;
+ INIT_LIST_HEAD(&resampler->list);
+
+ irqfd->resampler = resampler;
+
+ INIT_WORK(&irqfd->inject, irqfd_resampler_inject);
+ } else
+ INIT_WORK(&irqfd->inject, irqfd_inject);
+
INIT_WORK(&irqfd->shutdown, irqfd_shutdown);

file = eventfd_fget(args->fd);
@@ -238,6 +373,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

+ mutex_lock(&kvm->irq_lock);
spin_lock_irq(&kvm->irqfds.lock);

ret = 0;
@@ -247,9 +383,36 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
/* This fd is used for another irq already. */
ret = -EBUSY;
spin_unlock_irq(&kvm->irqfds.lock);
+ mutex_unlock(&kvm->irq_lock);
goto fail;
}

+ if (resampler) {
+ struct _irqfd_resampler *resampler_tmp;
+
+ /*
+ * Re-use a resampler if it's already registered on the
+ * gsi we need. Free the one we created above if found,
+ * otherwise add to the list and setup the irq ack notifier.
+ */
+ list_for_each_entry(resampler_tmp,
+ &kvm->irqfds.resamplers, list) {
+ if (resampler_tmp->notifier.gsi == irqfd->gsi) {
+ irqfd->resampler = resampler_tmp;
+ break;
+ }
+ }
+
+ if (irqfd->resampler == resampler) {
+ list_add(&resampler->list, &kvm->irqfds.resamplers);
+ __kvm_register_irq_ack_notifier(kvm,
+ &resampler->notifier);
+ } else
+ kfree(resampler);
+
+ list_add_rcu(&irqfd->resampler_list, &irqfd->resampler->irqfds);
+ }
+
irq_rt = rcu_dereference_protected(kvm->irq_routing,
lockdep_is_held(&kvm->irqfds.lock));
irqfd_update(kvm, irqfd, irq_rt);
@@ -266,6 +429,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
schedule_work(&irqfd->inject);

spin_unlock_irq(&kvm->irqfds.lock);
+ mutex_unlock(&kvm->irq_lock);

/*
* do not drop the file until the irqfd is fully initialized, otherwise
@@ -276,12 +440,16 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
return 0;

fail:
+ if (resamplefd && !IS_ERR(resamplefd))
+ eventfd_ctx_put(resamplefd);
+
if (eventfd && !IS_ERR(eventfd))
eventfd_ctx_put(eventfd);

if (!IS_ERR(file))
fput(file);

+ kfree(resampler);
kfree(irqfd);
return ret;
}
@@ -291,6 +459,7 @@ kvm_eventfd_init(struct kvm *kvm)
{
spin_lock_init(&kvm->irqfds.lock);
INIT_LIST_HEAD(&kvm->irqfds.items);
+ INIT_LIST_HEAD(&kvm->irqfds.resamplers);
INIT_LIST_HEAD(&kvm->ioeventfds);
}

@@ -340,7 +509,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
- if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
+ if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
return -EINVAL;

if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index dd0cbf6..5d29c0b 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -237,6 +237,9 @@ int kvm_request_irq_source_id(struct kvm *kvm)
}

ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
set_bit(irq_source_id, bitmap);
unlock:
mutex_unlock(&kvm->irq_lock);
@@ -247,6 +250,9 @@ unlock:
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
{
ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif

mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d617f69..3f3fe0c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -67,7 +67,7 @@ MODULE_LICENSE("GPL");
/*
* Ordering of locks:
*
- * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
+ * kvm->lock --> kvm->slots_lock --> kvm->irq_lock --> kvm->irqfds.lock
*/

DEFINE_RAW_SPINLOCK(kvm_lock);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/