Re: [PATCH v4] kvm/irqchip: Speed up KVM_SET_GSI_ROUTING

From: Christian Borntraeger
Date: Mon May 05 2014 - 10:21:41 EST


On 28/04/14 18:39, Paolo Bonzini wrote:
> From: Christian Borntraeger <borntraeger@xxxxxxxxxx>

Given all your work, what about From: Paolo Bonzini <pbonzini@xxxxxxxxxx>
plus
"Based on an initial patch from Christian Borntraeger"?
>
> When starting lots of dataplane devices the bootup takes very long on
> Christian's s390 with irqfd patches. With larger setups he is even
> able to trigger some timeouts in some components. Turns out that the
> KVM_SET_GSI_ROUTING ioctl takes very long (strace claims up to 0.1 sec)
> when having multiple CPUs. This is caused by the synchronize_rcu and
> the HZ=100 of s390. By changing the code to use a private srcu we can
> speed things up. This patch reduces the boot time till mounting root
> from 8 to 2 seconds on my s390 guest with 100 disks.
>
> Uses of hlist_for_each_entry_rcu, hlist_add_head_rcu, hlist_del_init_rcu
> are fine because they do not have lockdep checks (hlist_for_each_entry_rcu
> uses rcu_dereference_raw rather than rcu_dereference, and write-sides
> do not do rcu lockdep at all).
>
> Note that we're hardly relying on the "sleepable" part of srcu. We just
> want SRCU's faster detection of grace periods.
>
> Testing was done by Andrew Theurer using NETPERF. The difference between
> results "before" and "after" the patch has mean -0.2% and standard deviation
> 0.6%. Using a paired t-test on the data points says that there is a 2.5%
> probability that the patch is the cause of the performance difference
> (rather than a random fluctuation).
>
> Cc: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
> Cc: Michael S. Tsirkin <mst@xxxxxxxxxx>
> Signed-off-by: Christian Borntraeger <borntraeger@xxxxxxxxxx>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>

Some questions regarding expedited vs. non-expedited, and one comment that requires no action.

Otherwise
Reviewed-by: Christian Borntraeger <borntraeger@xxxxxxxxxx>
Tested-by: Christian Borntraeger <borntraeger@xxxxxxxxxx> # on s390

> ---
> include/linux/kvm_host.h | 1 +
> virt/kvm/eventfd.c | 25 +++++++++++++++----------
> virt/kvm/irq_comm.c | 17 +++++++++--------
> virt/kvm/irqchip.c | 31 ++++++++++++++++---------------
> virt/kvm/kvm_main.c | 16 ++++++++++------
> 5 files changed, 51 insertions(+), 39 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 820fc2e1d9df..cd0df9a9352d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -368,6 +368,7 @@ struct kvm {
> struct mm_struct *mm; /* userspace tied to this vm */
> struct kvm_memslots *memslots;
> struct srcu_struct srcu;
> + struct srcu_struct irq_srcu;
> #ifdef CONFIG_KVM_APIC_ARCHITECTURE
> u32 bsp_vcpu_id;
> #endif
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 912ec5a95e2c..20c3af7692c5 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -31,6 +31,7 @@
> #include <linux/list.h>
> #include <linux/eventfd.h>
> #include <linux/kernel.h>
> +#include <linux/srcu.h>
> #include <linux/slab.h>
>
> #include "iodev.h"
> @@ -118,19 +119,22 @@ static void
> irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
> {
> struct _irqfd_resampler *resampler;
> + struct kvm *kvm;
> struct _irqfd *irqfd;
> + int idx;
>
> resampler = container_of(kian, struct _irqfd_resampler, notifier);
> + kvm = resampler->kvm;
>
> - kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> + kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
> resampler->notifier.gsi, 0, false);
>
> - rcu_read_lock();
> + idx = srcu_read_lock(&kvm->irq_srcu);
>
> list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
> eventfd_signal(irqfd->resamplefd, 1);
>
> - rcu_read_unlock();
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> }
>
> static void
> @@ -142,7 +146,7 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
> mutex_lock(&kvm->irqfds.resampler_lock);
>
> list_del_rcu(&irqfd->resampler_link);
> - synchronize_rcu();
> + synchronize_srcu(&kvm->irq_srcu);
>
> if (list_empty(&resampler->list)) {
> list_del(&resampler->link);
> @@ -221,17 +225,18 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> unsigned long flags = (unsigned long)key;
> struct kvm_kernel_irq_routing_entry *irq;
> struct kvm *kvm = irqfd->kvm;
> + int idx;
>
> if (flags & POLLIN) {
> - rcu_read_lock();
> - irq = rcu_dereference(irqfd->irq_entry);
> + idx = srcu_read_lock(&kvm->irq_srcu);
> + irq = srcu_dereference(irqfd->irq_entry, &kvm->irq_srcu);
> /* An event has been signaled, inject an interrupt */
> if (irq)
> kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
> false);
> else
> schedule_work(&irqfd->inject);
> - rcu_read_unlock();
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> }
>
> if (flags & POLLHUP) {
> @@ -363,7 +368,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
> }
>
> list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
> - synchronize_rcu();
> + synchronize_srcu(&kvm->irq_srcu);

No idea what the resampler is. Can this become time-critical as well - in other words, do we need the expedited variant here?


>
> mutex_unlock(&kvm->irqfds.resampler_lock);
> }
> @@ -465,7 +470,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
> * another thread calls kvm_irq_routing_update before
> * we flush workqueue below (we synchronize with
> * kvm_irq_routing_update using irqfds.lock).
> - * It is paired with synchronize_rcu done by caller
> + * It is paired with synchronize_srcu done by caller
> * of that function.
> */
> rcu_assign_pointer(irqfd->irq_entry, NULL);
> @@ -524,7 +529,7 @@ kvm_irqfd_release(struct kvm *kvm)
>
> /*
> * Change irq_routing and irqfd.
> - * Caller must invoke synchronize_rcu afterwards.
> + * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
> */
> void kvm_irq_routing_update(struct kvm *kvm,
> struct kvm_irq_routing_table *irq_rt)
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index e2e6b4473a96..ced4a542a031 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -163,6 +163,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
> struct kvm_kernel_irq_routing_entry *e;
> int ret = -EINVAL;
> struct kvm_irq_routing_table *irq_rt;
> + int idx;
>
> trace_kvm_set_irq(irq, level, irq_source_id);
>
> @@ -174,8 +175,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
> * Since there's no easy way to do this, we only support injecting MSI
> * which is limited to 1:1 GSI mapping.
> */
> - rcu_read_lock();
> - irq_rt = rcu_dereference(kvm->irq_routing);
> + idx = srcu_read_lock(&kvm->irq_srcu);
> + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
> if (irq < irq_rt->nr_rt_entries)
> hlist_for_each_entry(e, &irq_rt->map[irq], link) {
> if (likely(e->type == KVM_IRQ_ROUTING_MSI))
> @@ -184,7 +185,7 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
> ret = -EWOULDBLOCK;
> break;
> }
> - rcu_read_unlock();
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> return ret;
> }
>
> @@ -253,22 +254,22 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
> mutex_lock(&kvm->irq_lock);
> hlist_del_rcu(&kimn->link);
> mutex_unlock(&kvm->irq_lock);
> - synchronize_rcu();
> + synchronize_srcu(&kvm->irq_srcu);
> }
>
> void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
> bool mask)
> {
> struct kvm_irq_mask_notifier *kimn;
> - int gsi;
> + int idx, gsi;
>
> - rcu_read_lock();
> - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
> + idx = srcu_read_lock(&kvm->irq_srcu);
> + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
> if (gsi != -1)
> hlist_for_each_entry_rcu(kimn, &kvm->mask_notifier_list, link)
> if (kimn->irq == gsi)
> kimn->func(kimn, mask);
> - rcu_read_unlock();
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> }
>
> int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 20dc9e4a8f6c..58bf5ba1aab1 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -26,6 +26,7 @@
>
> #include <linux/kvm_host.h>
> #include <linux/slab.h>
> +#include <linux/srcu.h>
> #include <linux/export.h>
> #include <trace/events/kvm.h>
> #include "irq.h"
> @@ -33,19 +34,19 @@
> bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
> {
> struct kvm_irq_ack_notifier *kian;
> - int gsi;
> + int gsi, idx;
>
> - rcu_read_lock();
> - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
> + idx = srcu_read_lock(&kvm->irq_srcu);
> + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
> if (gsi != -1)
> hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
> link)
> if (kian->gsi == gsi) {
> - rcu_read_unlock();
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> return true;
> }
>
> - rcu_read_unlock();
> + srcu_read_unlock(&kvm->irq_srcu, idx);
>
> return false;
> }
> @@ -54,18 +55,18 @@ EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
> void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
> {
> struct kvm_irq_ack_notifier *kian;
> - int gsi;
> + int gsi, idx;
>
> trace_kvm_ack_irq(irqchip, pin);
>
> - rcu_read_lock();
> - gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
> + idx = srcu_read_lock(&kvm->irq_srcu);
> + gsi = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu)->chip[irqchip][pin];
> if (gsi != -1)
> hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
> link)
> if (kian->gsi == gsi)
> kian->irq_acked(kian);
> - rcu_read_unlock();
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> }
>
> void kvm_register_irq_ack_notifier(struct kvm *kvm,
> @@ -85,7 +86,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> mutex_lock(&kvm->irq_lock);
> hlist_del_init_rcu(&kian->link);
> mutex_unlock(&kvm->irq_lock);
> - synchronize_rcu();
> + synchronize_srcu_expedited(&kvm->irq_srcu);

Hmm, looks like all callers are slow paths (shutdown, deregister assigned dev). Couldn't
we use the non-expedited variant?


> #ifdef __KVM_HAVE_IOAPIC
> kvm_vcpu_request_scan_ioapic(kvm);
> #endif
> @@ -115,7 +116,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> bool line_status)
> {
> struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
> - int ret = -1, i = 0;
> + int ret = -1, i = 0, idx;
> struct kvm_irq_routing_table *irq_rt;
>
> trace_kvm_set_irq(irq, level, irq_source_id);
> @@ -124,12 +125,12 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> * IOAPIC. So set the bit in both. The guest will ignore
> * writes to the unused one.
> */
> - rcu_read_lock();
> - irq_rt = rcu_dereference(kvm->irq_routing);
> + idx = srcu_read_lock(&kvm->irq_srcu);
> + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
> if (irq < irq_rt->nr_rt_entries)
> hlist_for_each_entry(e, &irq_rt->map[irq], link)
> irq_set[i++] = *e;
> - rcu_read_unlock();
> + srcu_read_unlock(&kvm->irq_srcu, idx);
>
> while(i--) {
> int r;
> @@ -226,7 +227,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
> kvm_irq_routing_update(kvm, new);
> mutex_unlock(&kvm->irq_lock);
>
> - synchronize_rcu();
> + synchronize_srcu_expedited(&kvm->irq_srcu);
>
> new = old;
> r = 0;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index fa70c6e642b4..95b4c2b3906a 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -457,11 +457,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
>
> r = kvm_arch_init_vm(kvm, type);
> if (r)
> - goto out_err_nodisable;
> + goto out_err_no_disable;
>
> r = hardware_enable_all();
> if (r)
> - goto out_err_nodisable;
> + goto out_err_no_disable;
>
> #ifdef CONFIG_HAVE_KVM_IRQCHIP
> INIT_HLIST_HEAD(&kvm->mask_notifier_list);
> @@ -473,10 +473,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
> r = -ENOMEM;
> kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
> if (!kvm->memslots)
> - goto out_err_nosrcu;
> + goto out_err_no_srcu;
> kvm_init_memslots_id(kvm);
> if (init_srcu_struct(&kvm->srcu))
> - goto out_err_nosrcu;
> + goto out_err_no_srcu;
> + if (init_srcu_struct(&kvm->irq_srcu))
> + goto out_err_no_irq_srcu;
> for (i = 0; i < KVM_NR_BUSES; i++) {
> kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
> GFP_KERNEL);
> @@ -505,10 +507,12 @@ static struct kvm *kvm_create_vm(unsigned long type)
> return kvm;
>
> out_err:
> + cleanup_srcu_struct(&kvm->irq_srcu);
> +out_err_no_irq_srcu:
> cleanup_srcu_struct(&kvm->srcu);
> -out_err_nosrcu:
> +out_err_no_srcu:
> hardware_disable_all();
> -out_err_nodisable:
> +out_err_no_disable:


The patch would be smaller without this change, but it makes the naming more consistent, so OK.


> for (i = 0; i < KVM_NR_BUSES; i++)
> kfree(kvm->buses[i]);
> kfree(kvm->memslots);
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/