Re: [PATCH 05/13] KVM: Update IRTE according to guest interrupt configuration changes

From: Alex Williamson
Date: Mon Nov 10 2014 - 16:58:17 EST


On Mon, 2014-11-10 at 14:26 +0800, Feng Wu wrote:
> When guest changes its interrupt configuration (such as, vector, etc.)
> for direct-assigned devices, we need to update the associated IRTE
> with the new guest vector, so external interrupts from the assigned
> devices can be injected to guests without VM-Exit.
>
> The current method of handling guest lowest priority interrtups
> is to use a counter 'apic_arb_prio' for each VCPU, we choose the
> VCPU with smallest 'apic_arb_prio' and then increase it by 1.
> However, for VT-d PI, we cannot re-use this, since we no longer
> have control to 'apic_arb_prio' with posted interrupt direct
> delivery by Hardware.
>
> Here, we introduce a similiar way with 'apic_arb_prio' to handle
> guest lowest priority interrtups when VT-d PI is used. Here is the
> ideas:
> - Each VCPU has a counter 'round_robin_counter'.
> - When guests sets an interrupts to lowest priority, we choose
> the VCPU with smallest 'round_robin_counter' as the destination,
> then increase it.
>
> Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx>
> ---
> arch/x86/include/asm/irq_remapping.h | 6 ++
> arch/x86/include/asm/kvm_host.h | 2 +
> arch/x86/kvm/vmx.c | 12 +++
> arch/x86/kvm/x86.c | 11 +++
> drivers/iommu/amd_iommu.c | 6 ++
> drivers/iommu/intel_irq_remapping.c | 28 +++++++
> drivers/iommu/irq_remapping.c | 9 ++
> drivers/iommu/irq_remapping.h | 3 +
> include/linux/dmar.h | 26 ++++++
> include/linux/kvm_host.h | 22 +++++
> include/uapi/linux/kvm.h | 1 +
> virt/kvm/assigned-dev.c | 141 ++++++++++++++++++++++++++++++++++
> virt/kvm/irq_comm.c | 4 +-
> virt/kvm/irqchip.c | 11 ---
> 14 files changed, 269 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
> index a3cc437..32d6cc4 100644
> --- a/arch/x86/include/asm/irq_remapping.h
> +++ b/arch/x86/include/asm/irq_remapping.h
> @@ -51,6 +51,7 @@ extern void compose_remapped_msi_msg(struct pci_dev *pdev,
> unsigned int irq, unsigned int dest,
> struct msi_msg *msg, u8 hpet_id);
> extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id);
> +extern int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector);
> extern void panic_if_irq_remap(const char *msg);
> extern bool setup_remapped_irq(int irq,
> struct irq_cfg *cfg,
> @@ -88,6 +89,11 @@ static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
> return -ENODEV;
> }
>
> +static inline int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector)
> +{
> + return -ENODEV;
> +}
> +
> static inline void panic_if_irq_remap(const char *msg)
> {
> }
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 6ed0c30..0630161 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -358,6 +358,7 @@ struct kvm_vcpu_arch {
> struct kvm_lapic *apic; /* kernel irqchip context */
> unsigned long apic_attention;
> int32_t apic_arb_prio;
> + int32_t round_robin_counter;
> int mp_state;
> u64 ia32_misc_enable_msr;
> bool tpr_access_reporting;
> @@ -771,6 +772,7 @@ struct kvm_x86_ops {
> int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
>
> void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
> + u64 (*get_pi_desc_addr)(struct kvm_vcpu *vcpu);
> };
>
> struct kvm_arch_async_pf {
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index a4670d3..ae91b72 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -544,6 +544,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
> return container_of(vcpu, struct vcpu_vmx, vcpu);
> }
>
> +struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
> +{
> + return &(to_vmx(vcpu)->pi_desc);
> +}
> +
> #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
> #define FIELD(number, name) [number] = VMCS12_OFFSET(name)
> #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \
> @@ -4280,6 +4285,11 @@ static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
> return;
> }
>
> +static u64 vmx_get_pi_desc_addr(struct kvm_vcpu *vcpu)
> +{
> + return __pa((u64)vcpu_to_pi_desc(vcpu));
> +}
> +
> /*
> * Set up the vmcs's constant host-state fields, i.e., host-state fields that
> * will not change in the lifetime of the guest.
> @@ -9232,6 +9242,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
> .check_nested_events = vmx_check_nested_events,
>
> .sched_in = vmx_sched_in,
> +
> + .get_pi_desc_addr = vmx_get_pi_desc_addr,
> };
>
> static int __init vmx_init(void)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b447a98..0c19d15 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7735,6 +7735,17 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
> }
> EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
>
> +int kvm_update_pi_irte_common(struct kvm *kvm, struct kvm_vcpu *vcpu,
> + u32 guest_vector, int host_irq)
> +{
> + u64 pi_desc_addr = kvm_x86_ops->get_pi_desc_addr(vcpu);
> +
> + if (update_pi_irte(host_irq, pi_desc_addr, guest_vector))
> + return -1;
> +
> + return 0;
> +}
> +
> EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
> EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
> EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
> diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
> index 505a9ad..a36fdc7 100644
> --- a/drivers/iommu/amd_iommu.c
> +++ b/drivers/iommu/amd_iommu.c
> @@ -4280,6 +4280,11 @@ static int alloc_hpet_msi(unsigned int irq, unsigned int id)
> return 0;
> }
>
> +static int dummy_update_pi_irte(int irq, u64 pi_desc_addr, u32 vector)
> +{
> + return -EINVAL;
> +}
> +
> struct irq_remap_ops amd_iommu_irq_ops = {
> .supported = amd_iommu_supported,
> .prepare = amd_iommu_prepare,
> @@ -4294,5 +4299,6 @@ struct irq_remap_ops amd_iommu_irq_ops = {
> .msi_alloc_irq = msi_alloc_irq,
> .msi_setup_irq = msi_setup_irq,
> .alloc_hpet_msi = alloc_hpet_msi,
> + .update_pi_irte = dummy_update_pi_irte,
> };
> #endif
> diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
> index 776da10..87c02fe 100644
> --- a/drivers/iommu/intel_irq_remapping.c
> +++ b/drivers/iommu/intel_irq_remapping.c
> @@ -1172,6 +1172,33 @@ static int intel_alloc_hpet_msi(unsigned int irq, unsigned int id)
> return ret;
> }
>
> +static int intel_update_pi_irte(int irq, u64 pi_desc_addr, u32 vector)
> +{
> + struct irte irte;
> +
> + if (get_irte(irq, &irte))
> + return -1;
> +
> + irte.irq_post_low.urg = 0;
> + irte.irq_post_low.vector = vector;
> + irte.irq_post_low.pda_l = (pi_desc_addr >> (32 - PDA_LOW_BIT)) &
> + ~(-1UL << PDA_LOW_BIT);
> + irte.irq_post_high.pda_h = (pi_desc_addr >> 32) &
> + ~(-1UL << PDA_HIGH_BIT);
> +
> + irte.irq_post_low.__reserved_1 = 0;
> + irte.irq_post_low.__reserved_2 = 0;
> + irte.irq_post_low.__reserved_3 = 0;
> + irte.irq_post_high.__reserved_4 = 0;
> +
> + irte.irq_post_low.pst = 1;
> +
> + if (modify_irte(irq, &irte))
> + return -1;
> +
> + return 0;
> +}
> +
> struct irq_remap_ops intel_irq_remap_ops = {
> .supported = intel_irq_remapping_supported,
> .prepare = dmar_table_init,
> @@ -1186,4 +1213,5 @@ struct irq_remap_ops intel_irq_remap_ops = {
> .msi_alloc_irq = intel_msi_alloc_irq,
> .msi_setup_irq = intel_msi_setup_irq,
> .alloc_hpet_msi = intel_alloc_hpet_msi,
> + .update_pi_irte = intel_update_pi_irte,

Extending irq_remap_ops should really be a separate patch from it's use
by KVM.

> };
> diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
> index 2f8ee00..0e36860 100644
> --- a/drivers/iommu/irq_remapping.c
> +++ b/drivers/iommu/irq_remapping.c
> @@ -362,6 +362,15 @@ int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
> return default_setup_hpet_msi(irq, id);
> }
>
> +int update_pi_irte(unsigned int irq, u64 pi_desc_addr, u32 vector)
> +{
> + if (!remap_ops || !remap_ops->update_pi_irte)
> + return -ENODEV;
> +
> + return remap_ops->update_pi_irte(irq, pi_desc_addr, vector);
> +}
> +EXPORT_SYMBOL_GPL(update_pi_irte);
> +
> void panic_if_irq_remap(const char *msg)
> {
> if (irq_remapping_enabled)
> diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
> index 7bb5913..2d8f740 100644
> --- a/drivers/iommu/irq_remapping.h
> +++ b/drivers/iommu/irq_remapping.h
> @@ -84,6 +84,9 @@ struct irq_remap_ops {
>
> /* Setup interrupt remapping for an HPET MSI */
> int (*alloc_hpet_msi)(unsigned int, unsigned int);
> +
> + /* Update IRTE for posted-interrupt */
> + int (*update_pi_irte)(int irq, u64 pi_desc_addr, u32 vector);
> };
>
> extern struct irq_remap_ops intel_irq_remap_ops;
> diff --git a/include/linux/dmar.h b/include/linux/dmar.h
> index 8be5d42..e1ff4f7 100644
> --- a/include/linux/dmar.h
> +++ b/include/linux/dmar.h
> @@ -160,6 +160,20 @@ struct irte {
> __reserved_2 : 8,
> dest_id : 32;
> } irq_remap_low;
> +
> + struct {
> + __u64 present : 1,
> + fpd : 1,
> + __reserved_1 : 6,
> + avail : 4,
> + __reserved_2 : 2,
> + urg : 1,
> + pst : 1,
> + vector : 8,
> + __reserved_3 : 14,
> + pda_l : 26;
> + } irq_post_low;
> +
> __u64 low;
> };
>
> @@ -170,10 +184,22 @@ struct irte {
> svt : 2,
> __reserved_3 : 44;
> } irq_remap_high;
> +
> + struct {
> + __u64 sid: 16,
> + sq: 2,
> + svt: 2,
> + __reserved_4: 12,
> + pda_h: 32;
> + } irq_post_high;
> +
> __u64 high;
> };
> };
>
> +#define PDA_LOW_BIT 26
> +#define PDA_HIGH_BIT 32
> +
> enum {
> IRQ_REMAP_XAPIC_MODE,
> IRQ_REMAP_X2APIC_MODE,
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ea53b04..6bb8287 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -335,6 +335,25 @@ struct kvm_kernel_irq_routing_entry {
> struct hlist_node link;
> };
>
> +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
> +
> +struct kvm_irq_routing_table {
> + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
> + struct kvm_kernel_irq_routing_entry *rt_entries;
> + u32 nr_rt_entries;
> + /*
> + * Array indexed by gsi. Each entry contains list of irq chips
> + * the gsi is connected to.
> + */
> + struct hlist_head map[0];
> +};
> +
> +#else
> +
> +struct kvm_irq_routing_table {};
> +
> +#endif
> +
> #ifndef KVM_PRIVATE_MEM_SLOTS
> #define KVM_PRIVATE_MEM_SLOTS 0
> #endif
> @@ -766,6 +785,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> struct kvm_irq_ack_notifier *kian);
> int kvm_request_irq_source_id(struct kvm *kvm);
> void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
> + struct kvm_lapic_irq *irq);
> +bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq);
>
> #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
> int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 7593c52..509223a 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1027,6 +1027,7 @@ struct kvm_s390_ucas_mapping {
> #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config)
> #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data)
> #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data)
> +#define KVM_ASSIGN_DEV_PI_UPDATE _IOR(KVMIO, 0x7d, __u32)
> /* Available with KVM_CAP_PIT_STATE2 */
> #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2)
> #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)

Needs an accompanying Documentation/virtual/kvm/api.txt update.

> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index e05000e..e154009 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c


Since legacy KVM device assignment is effectively deprecated, have you
considered how we might do this with VFIO? Thanks,

Alex


> @@ -326,6 +326,135 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
> }
> }
>
> +int __weak kvm_update_pi_irte_common(struct kvm *kvm, struct kvm_vcpu *vcpu,
> + u32 guest_vector, int host_irq)
> +{
> + return 0;
> +}
> +
> +int kvm_compare_rr_counter(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
> +{
> + return vcpu1->arch.round_robin_counter -
> + vcpu2->arch.round_robin_counter;
> +}
> +
> +bool kvm_pi_find_dest_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
> + struct kvm_vcpu **dest_vcpu)
> +{
> + int i, r = 0;
> + struct kvm_vcpu *vcpu, *dest = NULL;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (!kvm_apic_present(vcpu))
> + continue;
> +
> + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
> + irq->dest_id, irq->dest_mode))
> + continue;
> +
> + if (!kvm_is_dm_lowest_prio(irq)) {
> + r++;
> + *dest_vcpu = vcpu;
> + } else if (kvm_lapic_enabled(vcpu)) {
> + if (!dest)
> + dest = vcpu;
> + else if (kvm_compare_rr_counter(vcpu, dest) < 0)
> + dest = vcpu;
> + }
> + }
> +
> + if (dest) {
> + dest->arch.round_robin_counter++;
> + *dest_vcpu = dest;
> + return true;
> + } else if (r == 1)
> + return true;
> +
> + return false;
> +}
> +
> +static int __kvm_update_pi_irte(struct kvm *kvm, int host_irq, int guest_irq)
> +{
> + struct kvm_kernel_irq_routing_entry *e;
> + struct kvm_irq_routing_table *irq_rt;
> + struct kvm_lapic_irq irq;
> + struct kvm_vcpu *vcpu;
> + int idx, ret = -EINVAL;
> +
> + idx = srcu_read_lock(&kvm->irq_srcu);
> + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
> + ASSERT(guest_irq < irq_rt->nr_rt_entries);
> +
> + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
> + if (e->type != KVM_IRQ_ROUTING_MSI)
> + continue;
> + /*
> + * VT-d posted-interrupt has the following
> + * limitations:
> + * - No support for posting multicast/broadcast
> + * interrupts to a VCPU
> + * Still use interrupt remapping for these
> + * kind of interrupts
> + */
> +
> + kvm_set_msi_irq(e, &irq);
> + if (!kvm_pi_find_dest_vcpu(kvm, &irq, &vcpu)) {
> + printk(KERN_INFO "%s: can not find the target VCPU\n",
> + __func__);
> + ret = -EINVAL;
> + goto out;
> + }
> +
> + if (kvm_update_pi_irte_common(kvm, vcpu, irq.vector,
> + host_irq)) {
> + printk(KERN_INFO "%s: failed to update PI IRTE\n",
> + __func__);
> + ret = -EINVAL;
> + goto out;
> + }
> + }
> +
> + ret = 0;
> +out:
> + srcu_read_unlock(&kvm->irq_srcu, idx);
> + return ret;
> +}
> +
> +int kvm_update_pi_irte(struct kvm *kvm, u32 dev_id)
> +{
> + int i, rc = -1;
> + struct kvm_assigned_dev_kernel *dev;
> +
> + mutex_lock(&kvm->lock);
> + dev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, dev_id);
> + if (!dev) {
> + printk(KERN_INFO "%s: cannot find the assigned dev.\n",
> + __func__);
> + rc = -1;
> + goto out;
> + }
> +
> + BUG_ON(dev->irq_requested_type == 0);
> +
> + if ((dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) &&
> + (dev->dev->msi_enabled == 1)) {
> + __kvm_update_pi_irte(kvm,
> + dev->host_irq, dev->guest_irq);
> + } else if ((dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) &&
> + (dev->dev->msix_enabled == 1)) {
> + for (i = 0; i < dev->entries_nr; i++) {
> + __kvm_update_pi_irte(kvm,
> + dev->host_msix_entries[i].vector,
> + dev->guest_msix_entries[i].vector);
> + }
> + }
> +
> +out:
> + rc = 0;
> + mutex_unlock(&kvm->lock);
> + return rc;
> +}
> +
> static int assigned_device_enable_host_intx(struct kvm *kvm,
> struct kvm_assigned_dev_kernel *dev)
> {
> @@ -1017,6 +1146,18 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
> r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
> break;
> }
> + case KVM_ASSIGN_DEV_PI_UPDATE: {
> + u32 dev_id;
> +
> + r = -EFAULT;
> + if (copy_from_user(&dev_id, argp, sizeof(dev_id)))
> + goto out;
> + r = kvm_update_pi_irte(kvm, dev_id);
> + if (r)
> + goto out;
> + break;
> +
> + }
> default:
> r = -ENOTTY;
> break;
> diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
> index 963b899..f51aed3 100644
> --- a/virt/kvm/irq_comm.c
> +++ b/virt/kvm/irq_comm.c
> @@ -55,7 +55,7 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
> line_status);
> }
>
> -inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
> +bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
> {
> #ifdef CONFIG_IA64
> return irq->delivery_mode ==
> @@ -106,7 +106,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
> return r;
> }
>
> -static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
> +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
> struct kvm_lapic_irq *irq)
> {
> trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index 7f256f3..cdf29a6 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -31,17 +31,6 @@
> #include <trace/events/kvm.h>
> #include "irq.h"
>
> -struct kvm_irq_routing_table {
> - int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
> - struct kvm_kernel_irq_routing_entry *rt_entries;
> - u32 nr_rt_entries;
> - /*
> - * Array indexed by gsi. Each entry contains list of irq chips
> - * the gsi is connected to.
> - */
> - struct hlist_head map[0];
> -};
> -
> int kvm_irq_map_gsi(struct kvm *kvm,
> struct kvm_kernel_irq_routing_entry *entries, int gsi)
> {



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/